# Web Scraping using BeautifulSoup4

In [9]:
import requests
from bs4 import BeautifulSoup
import pandas # I am using pandas to generate a csv file at the end of the script

## What site should I scrape?

### For now I will use apartments.com and scrape the listings for the San Diego area

In [2]:
# Landing page of the San Diego search results
base_url = "https://www.apartments.com/san-diego-ca/"

# Header sent with every request: a desktop Chrome User-Agent string.
# The literal is split across lines purely for readability; the pieces are
# concatenated by the parser into the exact same single string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/61.0.3163.100 Safari/537.36"
    ),
}

## Next, we use the requests library to hit the base_url
### We can take the content and set it to variable c

In [3]:
# To get the html contents of the base_url page.
# An explicit timeout is passed because requests otherwise waits indefinitely,
# so a stalled connection would hang the notebook.
r = requests.get(base_url, headers=headers, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page as if it were listings
r.raise_for_status()
c = r.content  # raw (bytes) body of the response

In [10]:
# Peek at the beginning of the raw response only; echoing the whole body
# floods the notebook output without adding information
c[:1000]

b'\r\n\r\n<!DOCTYPE html>\r\n<html lang="en" data-placeholder-focus="false">\r\n<head>\r\n    <meta charset="utf-8" />\r\n    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" />\r\n    <meta name="format-detection" content="telephone=no" />\r\n    <meta name="format-detection" content="email=no" />\r\n    <meta name="HandheldFriendly" content="true" />\r\n    <title> Apartments for Rent in San Diego CA - Page 28 | Apartments.com</title>\r\n    <link rel="icon" href="/a/8a55f9/favicon.ico">\r\n        <link rel="canonical" href="https://www.apartments.com/san-diego-ca/28/" />\r\n        <meta name="keywords" content="San Diego apartments for rent, condos, townhomes, houses for rent" />\r\n    <link href="https://plus.google.com/+apartmentscom/" rel="publisher" />\r\n    <meta name="description" content="See all 6,516 apartments in San Diego, CA currently available for rent. Each Apartments.com listing has verified availability, r

## So while we have the actual content, it is HIDEOUS
### here is where Beautiful Soup comes into play

In [12]:
# To parse the html
soup = BeautifulSoup(c, "html.parser")
# Show only the start of the parsed document; the full page is far too long
# to be readable as cell output
print(str(soup)[:1000])


<!DOCTYPE html>

<html data-placeholder-focus="false" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
<meta content="telephone=no" name="format-detection"/>
<meta content="email=no" name="format-detection"/>
<meta content="true" name="HandheldFriendly"/>
<title> Apartments for Rent in San Diego CA - Page 28 | Apartments.com</title>
<link href="/a/8a55f9/favicon.ico" rel="icon"/>
<link href="https://www.apartments.com/san-diego-ca/28/" rel="canonical">
<meta content="San Diego apartments for rent, condos, townhomes, houses for rent" name="keywords"/>
<link href="https://plus.google.com/+apartmentscom/" rel="publisher"/>
<meta content="See all 6,516 apartments in San Diego, CA currently available for rent. Each Apartments.com listing has verified availability, rental rates, photos, floor plans and more." name="description"/>
<meta content="en" name="language"/>
<meta content="en" http-

## Beautiful Soup parses the content, and we can see that it produces what looks like real HTML
### PHEW

In [14]:
# To extract the first and last page numbers.
# Since we want more than just one page of listings, we set the first and last
# page numbers to variables and loop through them later.
listing_container = soup.find("div", {"id": "placardContainer"})
pager = listing_container.find("div", {"id": "paging"}) if listing_container is not None else None
if pager is None:
    # find() returns None when nothing matches; stop here with a readable message
    # instead of an AttributeError on None further down the chain.
    raise RuntimeError(
        "Could not find div#paging inside div#placardContainer; "
        "the page layout may have changed or the request may have been blocked."
    )

paging = pager.find_all("a")
if len(paging) < 2:
    raise RuntimeError(
        "Expected at least two paging links but found {}.".format(len(paging))
    )

# The first page number is read from the second link and the last page number
# from the second-to-last link.
# NOTE(review): presumably the outermost links are previous/next controls - confirm against the live markup.
start_page = paging[1].text
last_page = paging[-2].text
web_content_list = []  # instantiated here so the loop can add content to it

Now the looping:

For each page, ranging from the start page to the last page, we will:
- request content
- parse with bs4
- find content of interest
- put content in instantiated web_content_list

In [6]:
def _text_or_none(parent, tag_name, css_class):
    """Return the text of the first `tag_name` tag with class `css_class` under `parent`.

    Returns None when `parent` is None or when no such tag exists, so that a
    listing with a missing field yields an empty value instead of aborting the
    whole scrape with an AttributeError.
    """
    if parent is None:
        return None
    match = parent.find(tag_name, {"class": css_class})
    return match.text if match is not None else None


# Start from an empty list so that re-running this cell does not append
# every listing a second time.
web_content_list = []

for page_number in range(int(start_page), int(last_page) + 1):

    # To form the url based on page numbers
    page_url = base_url + str(page_number) + "/"

    # Page-specific names are used on purpose: rebinding r / c / soup here would
    # silently replace the first-page objects that the earlier cells display.
    # The timeout keeps a stalled connection from hanging the loop forever.
    page_response = requests.get(page_url, headers=headers, timeout=30)
    page_response.raise_for_status()  # do not parse an error page as listings
    page_soup = BeautifulSoup(page_response.content, "html.parser")

    # To extract the Title and the Location
    placard_header = page_soup.find_all("header", {"class": "placardHeader"})

    # To extract the Rent, No of Beds and Phone Number
    placard_content = page_soup.find_all("section", {"class": "placardContent"})

    # To process property by property by looping.
    # NOTE(review): zip() pairs headers and content sections by position and stops at
    # the shorter list - assumes every listing has exactly one of each; confirm.
    for item_header, item_content in zip(placard_header, placard_content):
        title = _text_or_none(item_header, "a", "placardTitle")

        # The phone number sits in a <span> nested inside div.phone
        phone_box = item_content.find("div", {"class": "phone"})
        phone_span = phone_box.find("span") if phone_box is not None else None

        # To store the information to a dictionary
        web_content_dict = {
            "Title": title.replace("\r", "").replace("\n", "") if title is not None else None,
            "Address": _text_or_none(item_header, "div", "location"),
            "Price": _text_or_none(item_content, "span", "altRentDisplay"),
            "Beds": _text_or_none(item_content, "span", "unitLabel"),
            "Phone": phone_span.text if phone_span is not None else None,
        }

        # To store the dictionary into the list
        web_content_list.append(web_content_dict)

## Lastly, now that we have our content:
### We will put it into a pandas dataframe
### and convert to csv file

In [7]:
# Turn the collected list of per-listing dictionaries into a single table
df = pandas.DataFrame(data=web_content_list)

# Write the table out as a CSV file
df.to_csv(path_or_buf="EXAMPLE.csv")