In [1]:
#RentHop Web Scrape Program
#This file will save rental postings from the RentHop website
#The data will be saved as a list of dictionaries where each dictionary represents a rental posting
import time
import pandas as pd

import requests
from bs4 import BeautifulSoup
import re

In [2]:
#This function receives a page number (integer) and returns the content of a RentHop search page

def get_main_listings(page):
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    url = 'https://www.renthop.com/search/nyc?max_price=50000&min_price=0&sort=hopscore&q=&search=0&page={}'.format(page)
    page = requests.get(url, headers=headers, timeout=50) #it will keep trying at this stage, the loop will not continue to iterate until a response is given
    c = page.content
    return BeautifulSoup(c, 'html.parser')
    

In [3]:
#This code tests the above function and sections off the page to the specific content that represents the
#rent postings. There are twenty postings per page
one_page = get_main_listings(1).find_all('div', class_ = 'search-listing font-size-10 my-4 my-md-0 py-0 py-md-4')

In [4]:
len(one_page)

20

In [5]:
#This function receives a specific posting URL and returns the amenities and nearest
#subway stop (in miles - integer). There is a try and except block of code because the amenities are not
#posted uniform across the site

def get_amenities_subway(listing_url):
    
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    listing_page = requests.get(listing_url, headers=headers, timeout=50) #it will keep trying at this stage, the loop will not continue to iterate until a response is given
    c = listing_page.content
    
    
    try:
        amenities = BeautifulSoup(c, 'html.parser').find_all('div', class_ = 'px-3 px-lg-0')[1].get_text().split('\n\nFeatures & Amenities\n\n\n\n\n')[1].split('\n\n\n')

    except:
        amenities = []
        
        
    try:
        subway = BeautifulSoup(c, 'html.parser').find_all('div', class_ = 'px-3 px-lg-0')[3].get_text().replace('\n\n', ',').replace('\n', '').split(',')
        subway_mi = float(subway[3:4][0].split(' mi')[0])

    except:
        try:
            subway = BeautifulSoup(c, 'html.parser').find_all('div', class_ = 'px-3 px-lg-0')[4].get_text().replace('\n\n', ',').replace('\n', '').split(',')
            subway_mi = float(subway[3:4][0].split(' mi')[0])
            
        except:
            subway_mi = 'NaN'
        
    time.sleep(.05)
        
    return [amenities[:-1], subway_mi]

In [6]:
#This function receives the summary posting and obtains the Asking Rent, Specific Posting URL, Number of Bedrooms and
#Bathrooms, and initial amenities list. This function accesses the individual posting's main page through the URL and uses the previous function to
#obtain the other amenities list and nearest subway information.
#The information is returned as a dictionary
def get_main_listing_dict(listing):
    
    #Creates an initial dictionary with neat data
    
    listing_dict = {
        'url' : listing.find('a', href=True)['href'],
        'rent' : listing.find("td", {"class": "font-size-11 bold"}).get_text().replace("\n","").replace("$","").replace(",",""),
        'bedrooms' : listing.find("td", {"class": "font-size-11 bold"}).find_next().get_text().replace("\n","").replace(" Bed",""),
        'bath' : listing.find("td", {"class": "font-size-11 bold"}).find_next_sibling().find_next_sibling().get_text().replace("\n","").replace(" Bath","")
    }
    
    #Separates the address and unit number
    try:
        listing_dict['address'] = listing.find('a', class_ = 'font-size-11 listing-title-link b').get_text().split(', ')[0]
        listing_dict['unit'] = listing.find('a', class_ = 'font-size-11 listing-title-link b').get_text().split(', ')[1]
    except:
        listing_dict['address'] = listing.find('a', class_ = 'font-size-11 listing-title-link b').get_text()
        listing_dict['unit'] = 'NaN'
    
    #Separates the borough
    listing_dict['borough'] = listing.find("div", {"class": "font-size-9 overflow-ellipsis"}).get_text().replace("\n","").split(', ')[-1]
    listing_dict['neighborhood'] = listing.find("div", {"class": "font-size-9 overflow-ellipsis"}).get_text().replace("\n","").split(', ')[0]
    
    #Cleans the amenities data in the 'extras' column
    try:
        extra = listing.find_all("div", {"class": "font-size-9"})[-1].get_text().replace("\n","")
        listing_dict['extra'] = extra.replace("\xa0"," · ")

        
        try:
            listing_dict['extra'] = listing_dict['extra'].split(' · ')
        except:
            listing_dict['extra'] = [listing_dict['extra']]
    except:
        listing_dict['extra'] = [listing_dict['extra']]
        
    
    #Adds the amenities from the specific listing page
    
    listing_dict['extra'] = listing_dict['extra'] + get_amenities_subway(listing_dict['url'])[0]
    
    
    listing_dict['nearest_subway'] = get_amenities_subway(listing_dict['url'])[1]
    
    try:
        listing_dict['sqft'] = list(filter(lambda x: ' ft²' in x, listing_dict['extra']))[0].split(' ')[0]
        listing_dict['sqft'] = int(listing_dict['sqft'])
    except:
        listing_dict['sqft'] = 0
        
        
    listing_dict['extra'] = list(filter(lambda x: ' ft²' not in x, listing_dict['extra']))
        
        
        
    return listing_dict
    
    

In [8]:
#This line of code tests the above function for one listing
get_main_listing_dict(one_page[0])

{'url': 'https://www.renthop.com/listings/1-blue-slip/na/14452182',
 'rent': '2863',
 'bedrooms': 'Studio',
 'bath': '1',
 'address': '1 Blue Slip',
 'unit': 'NaN',
 'borough': 'Manhattan',
 'neighborhood': 'Governors Island',
 'extra': ['No Fee',
  'Doorman',
  'Elevator',
  'Laundry in Unit',
  'No Fee',
  'Floorplans Available',
  'Featured',
  'Laundry In Unit',
  'Doorman',
  'Elevator',
  'Fitness Center',
  'Laundry In Building',
  'Common Outdoor Space',
  'Storage Facility',
  'Cats Allowed',
  'Dogs Allowed'],
 'nearest_subway': 0.96,
 'sqft': 0}

In [9]:
#This line of code tests the above function for a page of postings
list(map(lambda x: get_main_listing_dict(x), one_page))

[{'url': 'https://www.renthop.com/listings/1-blue-slip/na/14452182',
  'rent': '2863',
  'bedrooms': 'Studio',
  'bath': '1',
  'address': '1 Blue Slip',
  'unit': 'NaN',
  'borough': 'Manhattan',
  'neighborhood': 'Governors Island',
  'extra': ['No Fee',
   'Doorman',
   'Elevator',
   'Laundry in Unit',
   'No Fee',
   'Floorplans Available',
   'Featured',
   'Laundry In Unit',
   'Doorman',
   'Elevator',
   'Fitness Center',
   'Laundry In Building',
   'Common Outdoor Space',
   'Storage Facility',
   'Cats Allowed',
   'Dogs Allowed'],
  'nearest_subway': 0.96,
  'sqft': 0},
 {'url': 'https://www.renthop.com/listings/163-e-92nd-st/17/14481897',
  'rent': '3195',
  'bedrooms': '2',
  'bath': '1',
  'address': '163 E 92nd St',
  'unit': 'Apt 17',
  'borough': 'Manhattan',
  'neighborhood': 'Upper East Side',
  'extra': ['Exclusive', '', 'Featured', 'Exclusive'],
  'nearest_subway': 0.19,
  'sqft': 0},
 {'url': 'https://www.renthop.com/listings/543-east-5th-street/13/14573400',
  

In [10]:
#This is the main Web Scrape block of code. This code utilizes the above function to create a list of dictionaries
#that represent rent postings. There si a time.sleep function to limit the speed of the code and some print lines
#to monitor the success of the code.

total_pages = 2988
all_listing_dict = []

    

for page in range(1, 750):
    full_page = get_main_listings(page).find_all('div', class_ = 'search-listing font-size-10 my-4 my-md-0 py-0 py-md-4')
        
    for listing in full_page:
        all_listing_dict.append(get_main_listing_dict(listing))
    
    time.sleep(.05)
    
    if len(all_listing_dict) %100 == 0:
        print('{} rent listings have been added so far.'.format(len(all_listing_dict)))
            
    if len(all_listing_dict) %250 == 0:
        print('This is dictionary #{}: '.format(len(all_listing_dict)))
        print(all_listing_dict[-1])
        
        
        
        

100 rent listings have been added so far.
200 rent listings have been added so far.
300 rent listings have been added so far.
400 rent listings have been added so far.
500 rent listings have been added so far.
This is dictionary #500: 
{'url': 'https://www.renthop.com/listings/bushwick-avenue/c5/14554014', 'rent': '2975', 'bedrooms': '2', 'bath': '1.5', 'address': 'Bushwick Avenue', 'unit': 'NaN', 'borough': 'Brooklyn', 'neighborhood': 'Bushwick', 'extra': ['No Fee', 'Hardwood Floors', 'No Fee', 'Deck', 'Laundry In Building', 'Dishwasher', 'Hardwood Floors', 'New Construction', 'Bicycle Room', 'Common Outdoor Space', 'Courtyard', 'Light', 'Live In Super', 'Microwave', 'Receiving Room', 'Renovated', 'Stainless Steel Appliances', 'Virtual Doorman'], 'nearest_subway': 0.11, 'sqft': 0}
600 rent listings have been added so far.
700 rent listings have been added so far.
800 rent listings have been added so far.
900 rent listings have been added so far.
1000 rent listings have been added so f

5100 rent listings have been added so far.
5200 rent listings have been added so far.
5300 rent listings have been added so far.
5400 rent listings have been added so far.
5500 rent listings have been added so far.
This is dictionary #5500: 
{'url': 'https://www.renthop.com/listings/east-78th-street/1a/13970064', 'rent': '2300', 'bedrooms': '1', 'bath': '1', 'address': 'East 78th Street', 'unit': 'NaN', 'borough': 'Manhattan', 'neighborhood': 'Upper East Side', 'extra': ['No Fee', 'Laundry in Unit', 'Hardwood Floors', 'No Fee', 'Laundry In Unit', 'Dishwasher', 'Hardwood Floors', 'Brownstone', 'Diplomats Ok', 'Light', 'Renovated', 'Subway', 'Dogs Allowed', 'Cats Allowed'], 'nearest_subway': 0.39, 'sqft': 0}
5600 rent listings have been added so far.
5700 rent listings have been added so far.
5800 rent listings have been added so far.
5900 rent listings have been added so far.
6000 rent listings have been added so far.
This is dictionary #6000: 
{'url': 'https://www.renthop.com/listings/

ConnectionError: ('Connection aborted.', OSError("(54, 'ECONNRESET')"))

In [18]:
#This code tests an individual listing of the list of postings
all_listing_dict[1234]

{'url': 'https://www.renthop.com/listings/delavan-street/na/14551301',
 'rent': '3600',
 'bedrooms': 'Studio',
 'bath': '1',
 'address': 'Delavan street',
 'unit': 'NaN',
 'borough': 'Brooklyn',
 'neighborhood': 'Red Hook',
 'extra': ['Elevator',
  'Featured',
  'Elevator',
  'Laundry In Building',
  'Common Outdoor Space',
  'Storage Facility',
  'Cats Allowed',
  'Dogs Allowed'],
 'nearest_subway': 0.58,
 'sqft': 0}

In [12]:
#This line of code puts the list of dictionary postings into a pandas dataframe
df_summary_listings = pd.DataFrame(all_listing_dict)

In [16]:
#This line of code views the final entries of the dataframe
df_summary_listings.tail()

Unnamed: 0,address,bath,bedrooms,borough,extra,nearest_subway,neighborhood,rent,sqft,unit,url
6654,247 W 87th St,1,Studio,Manhattan,"[No Fee, Exclusive, Doorman, Elevator, No Fee,...",0.04,Upper West Side,3450,0,Apt 11H,https://www.renthop.com/listings/247-w-87th-st...
6655,Essex street studio!!,1,Studio,Manhattan,[],0.12,Chinatown,1850,0,,https://www.renthop.com/listings/essex-street-...
6656,West 30th st,2,3,Manhattan,"[Elevator, Private Outdoor Space, Elevator, La...",0.22,Chelsea,5000,0,,https://www.renthop.com/listings/west-30th-st/...
6657,Center Boulevard,1,1,Queens,"[Doorman, Elevator, Doorman, Fitness Center, E...",0.34,Hunters Point,3387,0,,https://www.renthop.com/listings/center-boulev...
6658,East 2nd Street,1,2,Manhattan,"[No Fee, Doorman, Elevator, Laundry in Unit, ...",0.42,Alphabet City,3700,0,,https://www.renthop.com/listings/east-2nd-stre...


In [19]:
#This line of code saves the dataframe as a csv file
df_summary_listings.to_csv("cleaner_list_of_summary_rent_listings.csv", index=False)