In [83]:
import os
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import sys
import numpy as np
import pandas as pd
import regex as re
from selenium import webdriver
import pickle
import json


class ZillowData:
    """
    Zillow data scraper.

    Uses a selenium driver to open Zillow listing pages, parses the page source with
    BeautifulSoup and collects one row of features per house.
    Field getters return the string 'None' for anything they cannot find on a page.
    """

    # column order of every scraped row and of the resulting dataframe
    COLUMNS = ['bedroom', 'bathroom', 'size', 'address', 'city', 'zillow_days', 'views', 'price', 'ad_type',
               'company', 'agent', 'link']
    # seconds slept after navigating, before the page source is read
    # (presumably to let the page finish loading)
    PAGE_LOAD_WAIT = 3
    # seconds slept between two consecutive page visits
    REQUEST_PAUSE = 4
    # errors raised when an expected tag or text fragment is missing from a page
    _PARSE_ERRORS = (AttributeError, IndexError, TypeError)

    def __init__(self, driver, chromedriver, url):
        """
        Constructor.
        :param driver: selenium driver instance used for all navigation.
        :param chromedriver: path to the chromedriver executable.
        :param url: base url of the search result listing.
        """
        self.driver = driver
        self.chromedriver = chromedriver
        self.url = url
        # html of the most recently loaded house page, set by get_html_data
        self.soup = None

    @staticmethod
    def _page_url(base_url, page_number):
        """
        Build the url of one result page, e.g. https://www.zillow.com/new-york-ny/2_p/
        :param base_url: url of the listing, with or without a trailing slash.
        :param page_number: 1-based page number.
        :return: url of the requested result page.
        """
        return base_url.rstrip('/') + '/' + str(page_number) + '_p/'

    def _select(self, soup, tag, css_class):
        """
        Find all tags with the given class in soup.
        :param soup: parsed page; when None the page stored by get_html_data is used.
        :param tag: tag name to look for.
        :param css_class: value of the class attribute to match.
        :return: result of find_all; raises AttributeError when soup is not a parsed page.
        """
        if soup is None:
            soup = getattr(self, 'soup', None)
        return soup.find_all(tag, class_=css_class)

    def get_house_links(self, url, driver, pages=20):
        """
        Function to fetch list of house links on all the defined pages.
        :param url: base link of the listing; the page suffix is appended per page.
        :param driver: selenium driver instance.
        :param pages: how many pages to scrape the listing.
        :return: flat list of house links; on an error the links collected so far
                 (possibly an empty list) are returned.
        """
        house_links = []
        try:
            for page_number in range(1, pages + 1):
                # each page gets its own url built from the unmodified base url,
                # so the suffixes never pile up across iterations
                page_url = self._page_url(url, page_number)

                # navigate to page and read its source
                driver.get(page_url)
                time.sleep(self.PAGE_LOAD_WAIT)
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                # get the tags for listings
                listings = soup.find_all("a", class_="list-card-link list-card-link-top-margin list-card-img")

                # get href or link from listings
                page_links = [row['href'] for row in listings]
                print("listing of houses on page  " + str(page_number) + " is " + str(len(page_links)))

                house_links.extend(page_links)
                time.sleep(self.REQUEST_PAUSE)
        except Exception as err:
            # best effort: keep what was collected instead of returning a non-list value
            print('collecting house links stopped early: {}'.format(err))
        return house_links

    def get_html_data(self, base_url):
        """
        Function to extract html data of the page using BeautifulSoup.
        The parsed page is also stored in self.soup.
        :param base_url: link of the particular house on the page listings.
        :return: extracted data of html page with all tags, or 'None' on failure.
        """
        try:
            # navigate to page
            self.driver.get(base_url)
            time.sleep(self.PAGE_LOAD_WAIT)
            # extract html data of the page using BeautifulSoup
            self.soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            return self.soup
        except Exception:
            # drop the previous page so its data cannot be mistaken for this house
            self.soup = None
            return 'None'

    def get_num_beds(self, soup):
        """
        Function to get number of bed, bathroom and size of apartment in sqfeet.
        :param soup: extracted html data of a page with all tags.
        :return: number of bed, bathroom and size of apartment in sqfeet,
                 or ('None', 'None', 'None') when not found.
        """
        try:
            # spans holding, in order, the beds, bathrooms and flat size
            spans = self._select(soup, "span", 'ds-bed-bath-living-area')
            bedroom = spans[0].text.split()[0]
            bathroom = spans[1].text.split()[0]
            size = spans[2].text.split()[0]
            return bedroom, bathroom, size
        except self._PARSE_ERRORS:
            return 'None', 'None', 'None'

    def get_addresss(self, soup):
        """
        Function to get address of the listing.
        Expects a heading of the form '<street>,<nbsp><city>,<state and zip>'.
        :param soup: extracted html data of a page with all tags.
        :return: address and city of the apartment, or ('None', 'None') when not found.
        """
        try:
            headings = self._select(
                soup, "h1", 'Text-c11n-8-18-0__aiai24-0 StyledHeading-c11n-8-18-0__ktujwe-0 efSAZl')
            parts = headings[0].text.split(',')
            # the city follows a non-breaking space in the second comma separated part
            city = parts[1].rsplit('\xa0')[1]
            address = parts[0] + ' ' + city + parts[2]
            return address, city
        except self._PARSE_ERRORS:
            return 'None', 'None'

    def get_ad_days_views(self, soup):
        """
        Function to get number of days and views for the ad.
        :param soup: extracted html data of a page with all tags.
        :return: number of days and views for the ad, or ('None', 'None') when not found.
        """
        try:
            # first entry: how long the ad is listed, second entry: number of views
            entries = self._select(soup, "div", 'Text-c11n-8-18-0__aiai24-0 einFCw')
            zillow_days = entries[0].text.split()[0]
            views = entries[1].text.split()[0]
            return zillow_days, views
        except self._PARSE_ERRORS:
            return 'None', 'None'

    def get_price(self, soup):
        """
        Function to get price of the apartment.
        :param soup: extracted html data of a page with all tags.
        :return: price of the apartment without the leading '$', or 'None' when not found.
        """
        try:
            headings = self._select(
                soup, "h4",
                'Text-c11n-8-18-0__aiai24-0 StyledHeading-c11n-8-18-0__ktujwe-0 gcaUyc sc-pHIBf jLwdeZ')
            # first word of the heading is the price, the text after '$' is the amount
            return headings[0].text.split()[0].rsplit('$')[1]
        except self._PARSE_ERRORS:
            return 'None'

    def get_type_of_ad(self, soup):
        """
        Function to get ad type.
        :param soup: extracted html data of a page with all tags.
        :return: ad type, or 'None' when not found.
        """
        try:
            return self._select(soup, "span", 'sc-pYA-dN ivRwcz ds-status-details')[0].text
        except self._PARSE_ERRORS:
            return 'None'

    def get_company(self, soup):
        """
        Function to get company or broker who listed the ad.
        :param soup: extracted html data of a page with all tags.
        :return: company or broker who listed the ad, or 'None' when not found.
        """
        try:
            return self._select(soup, "strong", 'Text-c11n-8-18-0__aiai24-0 dokllX')[0].text
        except self._PARSE_ERRORS:
            return 'None'

    def get_agent(self, soup):
        """
        Function to get agent or owner who listed the ad.
        :param soup: extracted html data of a page with all tags.
        :return: agent or owner who listed the ad, or 'None' when not found.
        """
        try:
            return self._select(soup, "p", 'Text-c11n-8-18-0__aiai24-0 foiYRz')[0].text
        except self._PARSE_ERRORS:
            return 'None'

    def get_zillow_data(self, soup, link):
        """
        Function to get all the features of a house.
        :param soup: extracted html data of a page with all tags; a failed page load
                     (the 'None' string from get_html_data) yields 'None' for every feature.
        :param link: link of the house from which we extract data.
        :return: list holding one row (a list ordered like COLUMNS) with all the features of a house.
        """
        bedroom, bathroom, size = self.get_num_beds(soup)
        address, city = self.get_addresss(soup)
        zillow_days, views = self.get_ad_days_views(soup)
        price = self.get_price(soup)
        ad_type = self.get_type_of_ad(soup)
        company = self.get_company(soup)
        agent = self.get_agent(soup)

        # the row stays wrapped in an outer list; get_all_data flattens it again
        return [[bedroom, bathroom, size, address, city, zillow_days, views, price, ad_type, company, agent, link]]

    def get_all_data(self, chromedriver, driver, pages=20,
                     pickle_path='zillow_data_1000.pkl', csv_path='zillow_data_26.csv'):
        """
        Scrape the listing pages and every house found on them.
        The driver is closed when the method finishes, also when it fails.
        :param chromedriver: chrome drive path; kept for compatibility, the path given to the constructor is used.
        :param driver: driver instance; kept for compatibility, the driver given to the constructor is used.
        :param pages: number of pages to scrape.
        :param pickle_path: file the raw rows are pickled to; None skips the pickle.
        :param csv_path: file the dataframe is written to as csv; None skips the csv.
        :return: pandas dataframe containing houses information.
        """
        print('chromedriver path: {}'.format(self.chromedriver))
        # NOTE(review): sys.path only affects python imports; kept because the original did it
        if self.chromedriver not in sys.path:
            sys.path.append(self.chromedriver)
        try:
            houses = self.get_house_links(self.url, self.driver, pages)
            print("------calculations house links done--------")
            zillow_data = []
            for link in houses:
                soup = self.get_html_data(link)
                zillow_data.append(self.get_zillow_data(soup, link))
                time.sleep(self.REQUEST_PAUSE)

            if pickle_path is not None:
                with open(pickle_path, 'wb') as f:
                    pickle.dump(zillow_data, f)
            # passing the columns to the constructor also works when no house was found
            df = pd.DataFrame([row for rows in zillow_data for row in rows], columns=self.COLUMNS)
            if csv_path is not None:
                df.to_csv(csv_path, index=False)
            print("-----dataframe crated ------")
            return df
        finally:
            self.driver.close()

    @staticmethod
    def _to_number(series):
        """
        Convert scraped number strings such as '1,250,000' to numbers.
        :param series: pandas series of scraped values.
        :return: numeric series; values that are not numbers ('None', '--', ...) become NaN.
        """
        cleaned = series.astype(str).str.replace(',', '', regex=False)
        return pd.to_numeric(cleaned, errors='coerce')

    @staticmethod
    def _mean(series):
        """
        :param series: numeric pandas series without missing values.
        :return: average of the series, NaN for an empty series.
        """
        if series.empty:
            return float('nan')
        return np.average(series.to_numpy())

    def stat_of_data(self, df, results_path='results.txt', listings_per_page=40):
        """
        Function to print statistics about the scraped data and store them in a text file.
        The dataframe passed in is not modified.
        :param df: dataframe which holds all the scraped data.
        :param results_path: file the results are written to; None skips the file.
        :param listings_per_page: number reported as unique properties per page.
        :return: dict with the printed statistics.
        """
        unique_links = df['link'].nunique()
        ad_type_counts = df.ad_type.value_counts()
        # NOTE(review): the label says company/broker but the agent column is counted
        agent_counts = df.agent.value_counts()

        print('Number of unique properties per  page {}'.format(listings_per_page))
        print('Number of unique properties on all  pages {}'.format(unique_links))
        print('Number of properties per type of the ad   ')
        print(ad_type_counts)
        print('        ')

        print('Number of properties per company/broker  ')
        print(agent_counts)

        # houses without a price are left out of the average instead of raising
        average_price = self._mean(self._to_number(df['price']).dropna())
        # NOTE(review): a missing size ('None' or '--') counts as 0 sq.ft as before,
        # which lowers the average size - confirm this is intended
        average_size = self._mean(self._to_number(df['size']).fillna(0))
        if average_size:
            price_per_sqft = round(average_price / average_size, 4)
        else:
            price_per_sqft = float('nan')

        print('Average price (in total): ${} '.format(average_price))
        print('Average price per sq.ft: ${} '.format(price_per_sqft))
        # keys are kept exactly as before for existing consumers;
        # 'Number of properties per page ' holds the unique properties on all pages
        results = {'Number of unique properties per  page ': listings_per_page,
                   'Number of properties per page ': unique_links,
                   'Number of properties per type of the ad   ': ad_type_counts,
                   'Number of properties per company/broker  ': agent_counts,
                   'Average price (in total)$: ': average_price,
                   'Average price per sq.ft$: ': price_per_sqft}
        if results_path is not None:
            with open(results_path, 'w') as file:
                file.write(str(results))
        return results


# Listing on a single page

In [84]:
# path to the chromedriver executable (expanduser resolves a leading ~, if any)
chromedriver = os.path.expanduser(
    r"C:\Users\49176\OneDrive\Desktop\chromedriver_win32/chromedriver.exe"
)
sys.path.append(chromedriver)

# start the browser and scrape only the first result page of the New York listing
driver = webdriver.Chrome(chromedriver)
z = ZillowData(
    driver,
    chromedriver,
    'https://www.zillow.com/new-york-ny/',
)
df = z.get_all_data(chromedriver, driver, pages=1)

# print the statistics of the scraped page; the returned dict is the cell output
z.stat_of_data(df)

chromedriver path: C:\Users\49176\OneDrive\Desktop\chromedriver_win32/chromedriver.exe
listing of houses on page  1 is 40
------calculations house links done--------
-----dataframe crated ------
Number of unique properties per  page 40
Number of unique properties on all  pages 40
Number of properties per type of the ad   
For sale             37
For sale by owner     3
Name: ad_type, dtype: int64
        
Number of properties per company/broker  
Property Owner                       3
Joelle Pergolotti                    2
Elayne Reimer                        2
Louis Belisario                      1
Keith Jacoby                         1
Danielle Stout                       1
Jorge Mendoza                        1
Kim Mc Keller                        1
Robyn Diament                        1
Amy Williamson                       1
Nicholas Venturini                   1
Karen Cantor                         1
Eva Penson                           1
Ryan Stenta at Eklund|Gomes Team     1
Jos

{'Number of unique properties per  page ': 40,
 'Number of properties per page ': 40,
 'Number of properties per type of the ad   ': For sale             37
 For sale by owner     3
 Name: ad_type, dtype: int64,
 'Number of properties per company/broker  ': Property Owner                       3
 Joelle Pergolotti                    2
 Elayne Reimer                        2
 Louis Belisario                      1
 Keith Jacoby                         1
 Danielle Stout                       1
 Jorge Mendoza                        1
 Kim Mc Keller                        1
 Robyn Diament                        1
 Amy Williamson                       1
 Nicholas Venturini                   1
 Karen Cantor                         1
 Eva Penson                           1
 Ryan Stenta at Eklund|Gomes Team     1
 Josh Rubin                           1
 Michael S. Rastegar                  1
 Richard J Steinberg                  1
 Craig Dix                            1
 Alina Yukhtman        

# Listings on 25 pages

In [85]:
# Listings of all 25 pages: restore the rows pickled by an earlier scraping run.
with open('zillow_data_25Pages.pkl', 'rb') as f:
    data = pickle.load(f)

# every entry of data wraps the row(s) of one house, so unwrap them into one flat list
rows = []
for house_rows in data:
    rows.extend(house_rows)

# build the dataframe and name its columns in the order the scraper stored the features
df = pd.DataFrame(rows)
columns = [
    'bedroom', 'bathroom', 'size', 'address', 'city', 'zillow_days',
    'views', 'price', 'ad_type', 'company', 'agent', 'link',
]
df.columns = columns

# show the first rows
df.head()

Unnamed: 0,bedroom,bathroom,size,address,city,zillow_days,views,price,ad_type,company,agent,link
0,1,1,900,100 W 57th St #11R New York NY 10019,New York,190,41236,215000,For sale,Douglas Elliman,Phyllis Pei,https://www.zillow.com/homedetails/100-W-57th-...
1,1,1,900,100 W 57th St APT 5R New York NY 10019,New York,162,8479,185000,For sale,Pergolotti Realty 917-415-6609,Joelle Pergolotti,https://www.zillow.com/homedetails/100-W-57th-...
2,5,4,2268,14 W 184th St Bronx NY 10468,Bronx,23,10499,399000,For sale,RE/MAX Distinguished Hms.&Prop 914-346-8255,Hermi Aquino,https://www.zillow.com/homedetails/14-W-184th-...
3,2,3,1800,303 E 57th St APT 32B New York NY 10022,New York,46,43519,445000,For sale,Brown Harris Stevens 212-381-3372,Elayne Reimer,https://www.zillow.com/homedetails/303-E-57th-...
4,4,6,7130,111 W 57th St PENTHOUSE 72 New York NY 10019,New York,148,42892,66000000,For sale,Douglas Elliman 212-203-5401,Amy Williamson,https://www.zillow.com/homedetails/111-W-57th-...


In [86]:
# Statistics for the 25-page dataframe built in the previous cell: stat_of_data prints them,
# writes them to results.txt and returns them as a dict (shown as the cell output).
z.stat_of_data(df)

Number of unique properties per  page 40
Number of unique properties on all  pages 800
Number of properties per type of the ad   
For sale             956
For sale by owner     31
New construction       7
Foreclosure            2
Contingent             1
Pending                1
Lot/land               1
Auction                1
Name: ad_type, dtype: int64
        
Number of properties per company/broker  
Property Owner                          31
Joelle Pergolotti                       24
Elayne Reimer                           15
Eric Lee                                13
Keystone Realty Of Greater New York     10
                                        ..
Nicholas Isra                            1
Deedee Weiss                             1
Dave Constantino                         1
Deanna Kory                              1
Judith McKenna                           1
Name: agent, Length: 650, dtype: int64
Average price (in total): $5535262.854 
Average price per sq.ft: $1127.9802 
<c

{'Number of unique properties per  page ': 40,
 'Number of properties per page ': 800,
 'Number of properties per type of the ad   ': For sale             956
 For sale by owner     31
 New construction       7
 Foreclosure            2
 Contingent             1
 Pending                1
 Lot/land               1
 Auction                1
 Name: ad_type, dtype: int64,
 'Number of properties per company/broker  ': Property Owner                          31
 Joelle Pergolotti                       24
 Elayne Reimer                           15
 Eric Lee                                13
 Keystone Realty Of Greater New York     10
                                         ..
 Nicholas Isra                            1
 Deedee Weiss                             1
 Dave Constantino                         1
 Deanna Kory                              1
 Judith McKenna                           1
 Name: agent, Length: 650, dtype: int64,
 'Average price (in total)$: ': 5535262.854,
 'Average pric