# Zillow Web Scraping

In [8]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time, os
import numpy as np
import random
import re
import pickle
import urllib.request, urllib.error, urllib.parse
from fake_useragent import UserAgent
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

### Approach #1 - Scrape All Property Links and Download Individual Page HTML

In [2]:
#Pull all links via zipcodes and iterate through individual property links
#Module-level accumulator: get_all_links appends every href it finds to this list
link_list = []

def get_all_links(link, zipcode):
    """Collect the property links from every search-results page of one zipcode.

    The hrefs are appended to the module-level ``link_list``; the function
    itself returns None.

    Parameters:
        link: base search URL ending in '/'. The trailing character is dropped
            and '-<zipcode>/' is appended to build the zipcode search URL.
        zipcode: zipcode to scrape.
    """
    #Page size the pagination arithmetic assumes
    listings_per_page = 40

    #Append the href of every listing card found on one parsed search page
    def get_page_links(search_page):
        for div in search_page.findAll('div', attrs={'class' : 'list-card-info'}):
            #Skip cards without a usable anchor instead of raising mid-scrape
            if div.a is not None and div.a.get('href'):
                link_list.append(div.a['href'])
        return link_list

    #`ua` was referenced but never created anywhere in the file
    ua = UserAgent()

    zipcode_specific_link = link[:-1] + '-{zc}/'.format(zc=zipcode)
    options = webdriver.ChromeOptions()
    options.add_argument("--user-agent=New User Agent")
    driver = webdriver.Chrome(chrome_path,options=options)
    try:
        driver.get(zipcode_specific_link)
        first_soup = BeautifulSoup(driver.page_source, 'lxml')
        total_listings = int(first_soup.find('div', attrs={'class' : 'total-text'}).text.replace(',', ''))
        #Ceiling division: an exact multiple of the page size must not add a page.
        #Page 1 is already loaded, so it is always processed.
        total_pages = max(1, -(-total_listings // listings_per_page))

        #link_list is deliberately never re-assigned in this function: doing so
        #would make it a local name and break the lookup inside get_page_links
        get_page_links(first_soup)
        print('On page', 1, '& Scraped', len(link_list), 'links so far')
        time.sleep(random.uniform(3,6))

        for i in range(2, total_pages + 1):
            #Restart the browser with a fresh random user agent on every even page
            if (i % 2 == 0):
                driver.quit()
                options = webdriver.ChromeOptions()
                user_agent = ua.random
                options.add_argument(f'user-agent={user_agent}')
                options.add_argument('--disable-blink-features=AutomationControlled')
                driver = webdriver.Chrome(chrome_path,options=options)
            #Long pause on every fifth page
            if (i % 5 == 0):
                time.sleep(random.uniform(300,320))
            iterable_link = zipcode_specific_link + '{pn}_p/'.format(pn=i)
            driver.get(iterable_link)
            page_soup = BeautifulSoup(driver.page_source, 'lxml')
            get_page_links(page_soup)
            print('On page', i, '& Scraped', len(link_list), 'links so far')
            time.sleep(random.uniform(3,6))
    finally:
        #Close the browser even when a page fails to load or parse
        driver.quit()
    print('All complete')

# Run the link scraper once per zipcode
def zipcode_apply(link,zipcode_list):
    """Call get_all_links for each zipcode in zipcode_list, announcing each one.

    Prints 'Starting <zipcode>' before every call and 'Done!' at the end.
    Returns None.
    """
    for current_zip in zipcode_list:
        print('Starting', current_zip)
        get_all_links(link, current_zip)
    print('Done!')
    return None

#Serialize the collected links and write them to disk
with open('philly_property_lists', mode='wb') as philly_links:
    philly_links.write(pickle.dumps(link_list))
    
#Iterate through properties and download HTML
def download_properties_html(link_list, zipcode='all', page=0):
    """Open every property link in Chrome and save the rendered HTML to disk.

    Parameters:
        link_list: iterable of property URLs to visit.
        zipcode: label used for the zipcode part of each saved file name.
        page: label used for the page part of each saved file name.

    Each page is written to the current working directory as
    'zillow<zipcode>-pg<page>-<n>.html', where n counts the links from 1.
    Prints how many links were processed and returns None.
    """
    links = list(link_list)
    if not links:
        #Nothing to fetch: do not launch a browser just to close it again
        print(0," total properties downloaded")
        return None

    options = webdriver.ChromeOptions()
    options.add_argument("--user-agent=New User Agent")
    driver = webdriver.Chrome(chrome_path,options=options)
    try:
        for count,link in enumerate(links, start=1):
            time.sleep(random.uniform(20.25, 40.75))
            #Extra randomized pause so requests are not evenly spaced
            if (count % random.randint(2,8) == 0):
                time.sleep(random.uniform(30,60))
            driver.get(link)
            time.sleep(random.uniform(10.75, 30.25))
            page_to_save = driver.page_source
            file_name = 'zillow{zc}-pg{pg}-{n}.html'.format(zc=zipcode, pg=page, n=count)
            #Context manager closes the file; explicit UTF-8 avoids encode errors
            #on platforms whose default encoding cannot represent the page
            with open(file_name, 'w', encoding='utf-8') as saved_html:
                saved_html.write(page_to_save)
            driver.back()
    finally:
        #Always release the browser, even if a page load fails
        driver.quit()
    print(len(links)," total properties downloaded")
    return None

### Approach #2 - Use Selenium to Click Each Individual Property on Each Search Page and Download HTML

In [10]:
#Go through search pages manually
def get_all_html(link, zipcode):
    """Walk the search-results pages of one zipcode and save each property's HTML.

    Parameters:
        link: base search URL ending in '/'. The trailing character is dropped
            and '-<zipcode>/' is appended to build the zipcode search URL.
        zipcode: zipcode to scrape; also used in the saved file names.

    Saved pages are written to the current working directory as
    'zillow<zipcode>-pg<page>-<n>.html'. Returns None.
    """
    #Imported here so this function does not depend on the file's import cell
    from selenium.common.exceptions import WebDriverException

    #Page size the pagination arithmetic assumes
    listings_per_page = 40

    #Number of result pages, read from the listing total shown on the page
    def find_total_pages(soup):
        total_listings = int(soup.find('div', attrs={'class' : 'total-text'}).text.replace(',', ''))
        #Ceiling division: an exact multiple of the page size must not add a page
        return max(1, -(-total_listings // listings_per_page))

    #Click every listing card on the current search page and save what opens
    def download_properties_html(zipcode,page):
        property_clicks_list = driver.find_elements_by_class_name('list-card-top')
        saved = 0
        for count,card in enumerate(property_clicks_list, start=1):
            time.sleep(random.uniform(20.25, 40.75))
            if (count % random.randint(2,8) == 0):
                time.sleep(random.uniform(30,60))
            #Up to two attempts; stop as soon as one click succeeds
            clicked = False
            for _ in range(2):
                try:
                    card.find_element_by_xpath('.//a/img').click()
                except WebDriverException:
                    time.sleep(random.uniform(10,15))
                else:
                    clicked = True
                    break
            if not clicked:
                #Nothing opened: saving would store the search page and
                #driver.back() would navigate away from it
                print('Could not open property', count, 'on page', page)
                continue
            time.sleep(random.uniform(10.75, 30.25))
            page_to_save = driver.page_source
            file_name = 'zillow{zc}-pg{pg}-{n}.html'.format(zc=zipcode, pg=page, n=count)
            with open(file_name, 'w', encoding='utf-8') as saved_html:
                saved_html.write(page_to_save)
            saved += 1
            driver.back()
        print(saved," total properties downloaded")

    #NOTE(review): this clicks every pagination entry li[1]..li[11] that can be
    #found, in order, rather than one specific "next" control - confirm against
    #the live page that this ends up on the intended page.
    def click_next_zillow_page():
        list_of_xpaths = ['//*[@id="grid-search-results"]/div[2]/nav/ul/li[{n}]/a'.format(n=i) for i in range(1,12)]
        for xpath in list_of_xpaths:
            try:
                nextpage = driver.find_element_by_xpath(xpath)
                nextpage.click()
            except WebDriverException:
                #Entry missing or not clickable: try the next xpath
                continue
        print('Moved to next page')

    options = webdriver.ChromeOptions()
    options.add_argument("--user-agent=New User Agent")
    driver = webdriver.Chrome(chrome_path ,options=options)
    try:
        zipcode_specific_link = link[:-1] + '-{zc}/'.format(zc=zipcode)
        driver.get(zipcode_specific_link)
        first_soup = BeautifulSoup(driver.page_source, 'lxml')
        total_pages = find_total_pages(first_soup)
        print('On page 1')
        download_properties_html(zipcode,1)
        time.sleep(random.uniform(3,6))
        for i in range(2,total_pages + 1):
            #Randomized pauses of varying length between pages
            if (i % random.randint(2,8) == 0):
                time.sleep(random.uniform(30,60))
            elif (i % random.randint(2,15) == 0):
                time.sleep(random.uniform(300,320))
            elif (i % random.randint(2,20) == 0):
                time.sleep(random.uniform(400,500))
            time.sleep(np.random.lognormal(0,1))
            click_next_zillow_page()
            if (i % random.randint(2,5) == 0):
                time.sleep(random.uniform(5,10))
            print('On page', i)
            download_properties_html(zipcode,i)
            time.sleep(random.uniform(3,6))
    finally:
        #Close the browser even when a page fails to load or parse
        driver.quit()
    print('All complete')

#Run the click-through scraper once per zipcode
def apply_all_zipcodes(philly_link,zipcode_list):
    """Call get_all_html for each zipcode, sleeping 600-800 seconds after each one.

    Prints 'Finished with all zipcodes' at the end and returns None.
    """
    for current_zip in zipcode_list:
        get_all_html(philly_link, current_zip)
        cooldown = random.uniform(600,800)
        time.sleep(cooldown)
    print('Finished with all zipcodes')
    return None

In [9]:
#Location of the chromedriver binary passed to webdriver.Chrome
chrome_path = '/Applications/chromedriver'

#Base Zillow search URL for the Philadelphia region
philly_link = 'https://www.zillow.com/philadelphia-pa/'

#Load the pickled zipcodes from disk
with open('philly_zipcodes', mode='rb') as read_file:
    philly_zipcodes = pickle.loads(read_file.read())

#Update the start index based on the last zipcode successfully scraped
zipcodes_remaining = philly_zipcodes[slice(3, None)]

In [15]:
#Start the Approach #2 scrape over every zipcode still outstanding
apply_all_zipcodes(philly_link=philly_link, zipcode_list=zipcodes_remaining)

### Scrape Property HTML Files - Need to Fix
Please note, since my project moved away from Zillow scraping, this section of code is not fully complete.

In [None]:
#Field names recorded for each property
property_headers = [
    'total_bedrooms',
    'total_bathrooms',
    'zip_code',
    'sqft',
    'property_type',
    'year_built',
    'cooling',
    'heating',
    'parking',
    'lot_size_acres',
    'walk_score',
    'transit_score',
    'price',
    'tax_value',
]

#Rows of scraped property data are collected here
property_data = []

#Placeholder dictionary: each header maps to its own name
property_dict = {header: header for header in property_headers}

In [None]:
    #NOTE(review): this cell is indented as if it were inside a function body and
    #repeats the first three statements of scrape_a_page; it looks like a scratch
    #copy used to get a `property_soup` to experiment with - confirm before keeping.
    #`html_file` is not assigned in the visible code, so it must be set beforehand.
    #Read a saved property page from disk
    with open(html_file) as page:
        property_html = page.read()
    #Parse the page with the lxml parser
    property_soup = BeautifulSoup(property_html, "lxml")

In [None]:
#Scrape a single page

def scrape_a_page(html_file):
    """Parse one saved property page and print the facts extracted from it.

    Parameters:
        html_file: path of a saved property HTML file.

    Prints bedrooms, bathrooms, square footage, zipcode, property type, year
    built, cooling, heating, parking, lot size and price. Returns None.

    NOTE(review): the fixed list positions and slice offsets below assume the
    fact list always holds the same entries in the same order - confirm against
    real pages before relying on the output.
    """
    with open(html_file, encoding='utf-8') as page:
        property_html = page.read()
    property_soup = BeautifulSoup(property_html, "lxml")

    #Number of bedrooms/bathrooms/sqft of property:
    #strip letters and commas, then split the remaining numbers on spaces
    top_str = property_soup.find('span', attrs={'class' : 'ds-bed-bath-living-area-container'}).text
    top_str = re.sub('[A-Za-z,]','',top_str).strip().split(' ')
    total_bedrooms = top_str[0]
    total_bathrooms = top_str[1]
    sqft = top_str[2]

    #Zipcode: last five characters of the address heading
    zip_code = property_soup.find('h1', attrs={'id' : 'ds-chip-property-address'}).text[-5:]

    #Property Type/year built/cooling/heating/parking:
    #the fact list text is split on ':' and each value is trimmed by a fixed offset
    info_sect_str = property_soup.find('ul', attrs={'class' : 'ds-home-fact-list'}).text.replace(',','').split(':')
    property_type = info_sect_str[1][:-10]
    year_built = re.sub('[A-Za-z]','',info_sect_str[2])
    cooling = info_sect_str[4][:-7]
    heating = info_sect_str[3][:-7]
    #TODO: an unfinished `if property_soup.find(text=)` check stood here and did
    #not parse; the condition it was meant to test was never written
    parking = info_sect_str[5][:-3]
    lot_size_acres = info_sect_str[6][:-16]

    #Price: the summary row text minus the trailing bed/bath/area header text
    header_str = property_soup.find('div', attrs={'class' : 'ds-summary-row'}).text
    total_not_needed = -len(property_soup.find('div', attrs={'class' : 'ds-bed-bath-living-area-header'}).text)
    price = int(header_str[:total_not_needed].replace('$','').replace(',',''))

    #price was computed but never reported; it is now part of the summary
    print('total_bedrooms:', total_bedrooms,
          ', total_bathrooms:', total_bathrooms,
          ', sqft:', sqft,
          ', zip_code:', zip_code,
          ', property_type:', property_type,
          ', year_built:', year_built,
          ', cooling:', cooling,
          ', heating:', heating,
          ', parking:', parking,
          ', lot_size_acres:', lot_size_acres,
          ', price:', price)
    return None

In [None]:
#Scrape individual page
listings_without_tax_info = []

#find() returns None when the notice is absent, so a non-None result means the
#page carries no tax history and the listing should be recorded
if property_soup.find(text='Tax history is unavailable.') is not None:
    listings_without_tax_info.append('INSERT LINK HERE')

In [None]:
#Walk score

#Transit score

#Tax value