## Scraping real-estate property data

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [4]:
# Search-results page for Rock Springs, WY listings on century21.com.
LISTINGS_URL = 'https://www.century21.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/?k=1'

# requests applies no timeout by default, so without one a stalled server
# would hang this cell indefinitely.
r = requests.get(LISTINGS_URL, timeout=30)
# Fail here on a 4xx/5xx reply rather than parsing an error page further down.
r.raise_for_status()

In [7]:
# Raw response body as bytes; BeautifulSoup takes care of decoding it.
c = r.content
# Build the parse tree with Python's built-in HTML parser.
soup = BeautifulSoup(markup=c, features='html.parser')

### Analyse the HTML and find out which element holds the desired data

In [27]:
# Every listing card keeps its headline details (price, beds, baths, sq. ft,
# address) inside a <div class="property-card-primary-info">.
inf_cont = soup.find_all("div", class_="property-card-primary-info")

In [12]:
# Number of listing cards matched on the page.
len(inf_cont)

12

In [13]:
# Inspect the first card's raw HTML to see which child elements hold the data.
inf_cont[0]

<div class="property-card-primary-info">
<div class="pdp-listing-type sale">FOR SALE</div>
<a class="listing-price" href="/property/806-moccasin-lane-rock-springs-wy-82901-CBR50172228">
         
            
               $58,900
            
            
         
      </a>
<div class="col-wrap-mid">
<div class="property-beds">
<strong>3</strong> beds
         </div>
<div class="property-baths">
<strong>1</strong> bath
         </div>
</div>
<div class="col-wrap-last">
<div class="property-half-baths">
<strong>1</strong> half bath
         </div>
<div class="property-sqft">
<strong>1,204</strong> sq. ft
         </div>
</div>
<div class="property-address-info">
<div class="property-address" title="806 Moccasin Lane">
               806 Moccasin Lane
            </div>
<div class="property-city">
            Rock Springs WY 82901
         </div>
</div>
<div class="property-card-attribution">
               Courtesy Of Coldwell Banker Sweetwater Realty
            
         </div>
</

In [14]:
# Inspect the second card too: cards do not all carry the same child elements
# (this one has no half-baths div), which matters when extracting fields.
inf_cont[1]

<div class="property-card-primary-info">
<div class="pdp-listing-type sale">FOR SALE</div>
<a class="listing-price" href="/property/1620-w-2nd-st-19-rock-springs-wy-82901-CBR50146466">
         
            
               $37,000
            
            
         
      </a>
<div class="col-wrap-mid">
<div class="property-beds">
<strong>3</strong> beds
         </div>
<div class="property-baths">
<strong>2</strong> baths
         </div>
</div>
<div class="col-wrap-last">
<div class="property-sqft">
<strong>1,296</strong> sq. ft
         </div>
</div>
<div class="property-address-info">
<div class="property-address" title="1620 W 2nd St #19">
               1620 W 2nd St #19
            </div>
<div class="property-city">
            Rock Springs WY 82901
         </div>
</div>
<div class="property-card-attribution">
               Courtesy Of Coldwell Banker Sweetwater Realty
            
         </div>
</div>

### Get the Property price and property square feet

#### Price

In [21]:
# The price is the text of the card's first <a>; it is padded with
# whitespace in the markup, so trim it.
price_link = inf_cont[1].find("a")
price_link.get_text().strip()

'$37,000'

#### Sq. feet

In [23]:
# Locate the element that wraps the floor area of the second card.
inf_cont[1].find("div", class_="property-sqft")

<div class="property-sqft">
<strong>1,296</strong> sq. ft
         </div>

In [25]:
# The number itself sits in the <strong> child of the sq. ft div.
sqft_div = inf_cont[1].find("div", class_="property-sqft")
sqft_div.find("strong").get_text()

'1,296'

#### Make sure to deal with missing data

In [33]:
# Print the price and floor area of every listing card. A card that lacks
# either element yields the placeholder "NA" for that field instead of raising.
for item in inf_cont:
    # find() returns None when the element is absent, so test for that
    # explicitly rather than relying on the AttributeError it would cause.
    price_tag = item.find("a")
    if price_tag is not None:
        price = price_tag.text.strip().replace("$", "")
    else:
        # Same sentinel as the other fields; a numeric 0 would read as a real price.
        price = "NA"

    sqft_div = item.find("div", {"class": "property-sqft"})
    sqft_value = sqft_div.find("strong") if sqft_div is not None else None
    area = sqft_value.text if sqft_value is not None else "NA"

    print(price, area)
    

58,900 1,204
37,000 1,296
430,000 4,560
120,800 1,344
150,000 NA
230,000 NA
200,000 NA
219,000 NA
220,000 NA
179,900 960
15,000 NA
99,900 1,324


#### Getting other info: beds, baths, city

In [34]:
# Show the second card again to locate the beds, baths and city elements.
inf_cont[1]

<div class="property-card-primary-info">
<div class="pdp-listing-type sale">FOR SALE</div>
<a class="listing-price" href="/property/1620-w-2nd-st-19-rock-springs-wy-82901-CBR50146466">
         
            
               $37,000
            
            
         
      </a>
<div class="col-wrap-mid">
<div class="property-beds">
<strong>3</strong> beds
         </div>
<div class="property-baths">
<strong>2</strong> baths
         </div>
</div>
<div class="col-wrap-last">
<div class="property-sqft">
<strong>1,296</strong> sq. ft
         </div>
</div>
<div class="property-address-info">
<div class="property-address" title="1620 W 2nd St #19">
               1620 W 2nd St #19
            </div>
<div class="property-city">
            Rock Springs WY 82901
         </div>
</div>
<div class="property-card-attribution">
               Courtesy Of Coldwell Banker Sweetwater Realty
            
         </div>
</div>

In [41]:
# Pull beds, baths and city out of the second card as a one-off check before
# generalising the extraction to every card.
card = inf_cont[1]
beds = card.find("div", {"class": "property-beds"}).find("strong").text
baths = card.find("div", {"class": "property-baths"}).find("strong").text
city = card.find("div", {"class": "property-city"}).text.strip()
# An assignment displays nothing; end on the bare name so the cell shows the value.
city

'Rock Springs WY 82901'

#### Growing a pandas DataFrame row by row is expensive, so build a list of dicts and convert it to a DataFrame once

In [53]:
def _text_or_na(card, tag, css_class=None, child=None):
    """Return the stripped text of an element inside ``card``, or "NA".

    Args:
        card: Parsed listing card to search in.
        tag: Name of the tag to look for (e.g. "div", "a").
        css_class: If given, the tag must carry this CSS class.
        child: If given, the text is read from this child tag of the match
            (e.g. the "strong" that holds the number) rather than the match.

    Returns:
        The element's text with surrounding whitespace removed, or the
        string "NA" when the element (or the requested child) is absent.
    """
    node = card.find(tag, {"class": css_class}) if css_class else card.find(tag)
    if node is not None and child is not None:
        node = node.find(child)
    # find() yields None for a missing element; map that to the placeholder.
    return node.text.strip() if node is not None else "NA"


# One dict per listing card. Every missing field uses the same "NA"
# placeholder so a column never mixes a numeric 0 with strings.
data_list = []
for item in inf_cont:
    data_list.append({
        "price ($)": _text_or_na(item, "a").replace("$", ""),
        "area (sq.feet)": _text_or_na(item, "div", "property-sqft", "strong"),
        "beds": _text_or_na(item, "div", "property-beds", "strong"),
        "baths": _text_or_na(item, "div", "property-baths", "strong"),
        "city": _text_or_na(item, "div", "property-city"),
    })

# Build the frame in a single call from the collected records.
df = pd.DataFrame(data_list)
df

Unnamed: 0,area (sq.feet),baths,beds,city,price ($)
0,1204.0,1.0,3.0,Rock Springs WY 82901,58900
1,1296.0,2.0,3.0,Rock Springs WY 82901,37000
2,4560.0,2.0,6.0,Rock Springs WY 82901,430000
3,1344.0,2.0,3.0,Rock Springs WY 82901,120800
4,,1.0,4.0,Rock Springs WY 82901,150000
5,,,4.0,Rock Springs WY 82901,230000
6,,,3.0,Rock Springs WY 82901,200000
7,,2.0,3.0,Rock Springs WY 82901,219000
8,,,4.0,Rock Springs WY 82901,220000
9,960.0,2.0,5.0,Rock Springs WY 82901,179900


#### Write to CSV file

In [55]:
# Persist the scraped table. index=True is the pandas default, spelled out
# here because it writes the row index as an extra, unnamed first column.
df.to_csv("Real_estate_data.csv", index=True)