# Using Zillow API and Scrapy to Extract Housing Information Data

**Author:** Brendan McDonnell

I built a script that pulls information (slowly, very slowly) from Zillow.com using Scrapy by feeding a list of zip codes into a brute force algorithm. The final data is messy and raw and will need some extensive EDA before we can use it in our model.

In [2]:
import pandas as pd
import requests
import scrapy
from scrapy_splash import SplashRequest
import time

In [3]:
csv = pd.read_csv('data/mhi_classified_zips.csv')

In [4]:
zip_list = list(csv['Zip'])

In [6]:
len(zip_list)

319

In [8]:
# headers so the website doesn't hit me with the captcha

header = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'accept-encoding': 'gzip, deflate, sdch, br',
               'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
               'cache-control': 'max-age=0',
               'upgrade-insecure-requests': '1',
               'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}

# initialize house_dict to build dataframe

house_dict = {'address': [],
              'type': [],
              'price': [],
              'features': [],
              'labels': [],
             }

# build the base url & zip code list

url_base = 'https://www.zillow.com/'

for zip_code in zip_list:
    url_zip = url_base + f'homes/{zip_code}_rb/'
    time.sleep(1)
    for j in range(1,100):
        url = url_zip + f'p_{j}/'
        res = requests.get(url=url, headers=header)
        if res.status_code == 200: # is the website giving me a response
            sel = scrapy.Selector(text=res.content)
            for house in sel.css('div.result-list-container>ul>li>article>a'):
                house_dict['address'].append(house.css('h3::text').extract_first())
                house_dict['type'].append(house.css('div.list-card-type::text').extract_first())
                house_dict['price'].append(house.css('div.list-card-heading>div.list-card-price::text').extract_first())
                list_of_feat = house.css('div.list-card-heading>ul>li::text').extract()
                list_of_labels = list(set(house.css('div.list-card-heading>ul>li>span::text').extract()))
                list_of_labels.remove(' ')
                house_dict['features'].append(list_of_feat)
                house_dict['labels'].append(list_of_labels)
#                 while len(list_of_feat) != 3:
#                     list_of_feat.append(' ')
#                     list_of_labels.append(' ')
#                 length = 3
#                 while length > 0:
#                     house_dict[f'feat{length}'].append(list_of_feat[3-length] + ' ' + list_of_labels[length-4])
#                     length -= 1
                time.sleep(1)                    
        else:
            break

In [9]:
df = pd.DataFrame(house_dict)

In [10]:
df.to_csv('data/raw_zillow_data_full_raw.csv')

In [13]:
df.head()

Unnamed: 0,address,type,price,features,labels
0,"9177 Collington Sq, Allison Park, PA 15101",Townhouse for rent,"$1,550/mo","[3, 1.5, 1,404]","[sqft, ba, bds]"
1,"400 Chapel Rd, Center Township, PA 15101",House for sale,"$235,000","[3, 2, 3,648]","[sqft, ba, bds]"
2,"9003 Elm St, Allison Park, PA 15101",House for sale,"$199,900","[3, 2, 1,238]","[sqft, ba, bds]"
3,"1111 Burchfield Rd, Allison Park, PA 15101",House for sale,"$625,000","[3, 4, 2,527]","[sqft, ba, bds]"
4,"1865 Concord Dr, Allison Park, PA 15101",House for sale,"$269,900","[4, 3, 2,214]","[sqft, ba, bds]"
