In [23]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
import json
import numpy as np
import pandas as pd

In [28]:
driver = webdriver.Chrome('./chromedriver') 

In [29]:
prices=[]
beds=[]
baths=[]
square_feets=[]
addresses=[]
states=[]
cities=[]
zipcodes=[]
urls=[]
property_type=[]

def web_scrap(url):
    driver.get(url)
    time.sleep(5)
    html=driver.page_source
    result_page=BeautifulSoup(html,"html.parser")
    houses=result_page.find_all("div",{'class':'bottomV2'})

    for house in houses:
        price=house.find("span",{'class':'homecardV2Price'}).get_text().strip('$')
        prices.append(int("".join(price.split(","))))

        stats=house.find_all('div',{'class':"stats"})
        bed=stats[0].get_text().strip('Beds')
        if bed=='—':
            beds.append(0)
        else:
            beds.append(int(bed))

        bath=stats[1].get_text().strip("Baths")
        if bath=='—':
            baths.append(0)
        else:
            baths.append(float(bath))

        square_feet=stats[2].get_text()
        square_feet="".join(square_feet.strip('Sq. Ft.').split(","))
        if square_feet=='' or square_feet=='—':
            square_feets.append(0)
        else:   
            square_feets.append(int(square_feet))

        more_information=house.find('script', {'type':"application/ld+json"})
        if more_information:
            str_=more_information.text[1:-1]
            list_of_strs=str_.split('},{"@context"')
            str_1=list_of_strs[0]+'}'
            dict_1 = json.loads(str_1)

            if dict_1['address'].get('streetAddress'):
                addresses.append(dict_1['address']['streetAddress'])
            else:
                address.append(0)

            if dict_1['address'].get('addressLocality'):
                cities.append(dict_1['address']['addressLocality'])
            else:
                cities.append(0)

            if dict_1['address'].get('addressRegion'):
                states.append(dict_1['address']['addressRegion'])
            else:
                states.append(0)

            if dict_1['address'].get('postalCode'):
                zipcodes.append(dict_1['address']['postalCode'])
            else:
                zipcodes.append(0)

            if dict_1.get('url'):
                urls.append("https://www.redfin.com"+dict_1['url'])
            else:
                urls.append(0)

            if dict_1.get('@type'):
                property_type.append(dict_1['@type'])
            else:
                property_type.append(0)

        else:
            address_list=house.find("div", {"class":"homeAddressV2"}).get_text().split(",")
            addresses.append(address_list[0].strip())
            states.append(address_list[2].strip()[:3])
            cities.append(address_list[1].strip())
            zipcodes.append(address_list[2].strip()[-5:])
            urls.append("https://www.redfin.com"+house.find('a').get('href'))
            
    res_array=np.concatenate((np.array([property_type]).T, np.array([addresses]).T, 
                      np.array([cities]).T, np.array([states]).T,
                      np.array([zipcodes]).T, np.array([prices]).T, 
                      np.array([beds]).T, np.array([baths]).T, 
                      np.array([square_feets]).T, np.array([urls]).T),axis=1) 
    return res_array

In [30]:
def web_scrap_rolling():
    url="https://www.redfin.com/city/30749/NY/New-York"
    res_array = web_scrap(url)
    res=np.array([np.empty(10)])
    res=np.concatenate((res,res_array),axis=0)
    for i in range(1,18):
        url_new="https://www.redfin.com/city/30749/NY/New-York/page-"+str(i)
        res_array = web_scrap(url_new)
        res=np.concatenate((res,res_array),axis=0)
    return pd.DataFrame(res[1:],columns=["Property_Type","Address","City","State","Zip_or_Postal_Code",
                               "Price", "Beds", "Baths","Square_Feet", "URLs"])

In [32]:
result_df=web_scrap_rolling()

In [33]:
result_df.head()

Unnamed: 0,Property_Type,Address,City,State,Zip_or_Postal_Code,Price,Beds,Baths,Square_Feet,URLs
0,SingleFamilyResidence,245-49 76th Ave Unit B,Bellerose,NY,11426,309000,1,1.0,644,https://www.redfin.com/NY/Jamaica/245-49-76th-...
1,SingleFamilyResidence,2104 Haight Ave,BRONX,NY,10461,699000,3,2.5,2040,https://www.redfin.com/NY/The-Bronx/2104-Haigh...
2,SingleFamilyResidence,Unit at 220 E 65th St,New York,NY,10065,899000,1,1.0,0,https://www.redfin.com/NY/New-York/220-E-65th-...
3,SingleFamilyResidence,Unit at 118 E 60th St,New York,NY,10022,589000,1,1.0,0,https://www.redfin.com/NY/New-York/118-E-60th-...
4,SingleFamilyResidence,105 W 70th St Unit 3F,New York,NY,10023,750000,1,1.0,0,https://www.redfin.com/NY/New-York/105-W-70th-...


In [35]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3584 entries, 0 to 3583
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Property_Type       3584 non-null   object
 1   Address             3584 non-null   object
 2   City                3584 non-null   object
 3   State               3584 non-null   object
 4   Zip_or_Postal_Code  3584 non-null   object
 5   Price               3584 non-null   object
 6   Beds                3584 non-null   object
 7   Baths               3584 non-null   object
 8   Square_Feet         3584 non-null   object
 9   URLs                3584 non-null   object
dtypes: object(10)
memory usage: 280.1+ KB


In [36]:
result_df.describe()

Unnamed: 0,Property_Type,Address,City,State,Zip_or_Postal_Code,Price,Beds,Baths,Square_Feet,URLs
count,3584,3584,3584,3584,3584,3584,3584,3584.0,3584,3584
unique,2,342,48,1,137,243,13,14.0,113,350
top,SingleFamilyResidence,145 Mcguinness Blvd Unit 3A,New York,NY,11355,899000,2,1.0,0,https://www.redfin.com/NY/The-Bronx/2104-Haigh...
freq,2914,35,952,3584,152,103,945,1299.0,2125,35
