## 1- Data Gathering

In [0]:
# Essentials
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline

# Web scraping tools
import requests
from bs4 import BeautifulSoup

In [0]:
# Simple HTTP request for the first page of NYC rental listings.
# A timeout (in seconds) is passed so that a stalled connection raises an
# exception instead of blocking the kernel indefinitely.
r = requests.get('https://www.renthop.com/nyc/apartments-for-rent', timeout=30)

In [0]:
# Creating a soup object to parse the apartment data.
# The raw response body (r.content) is handed to the "html5lib" parser.
soup = BeautifulSoup(r.content, "html5lib")

In [0]:
# Looking for all <div> elements that have a class attribute value containing "search-info".
# Each matched element is handled as one apartment listing in the following cells.
listing_divs = soup.select('div[class*=search-info]') 

In [5]:
# Sanity check: the length of "listing_divs" should be 20, matching the number of listings shown on the webpage
len(listing_divs)

20

In [0]:
# Isolating the first element of the listing to prototype the information extraction on a single item
current_listing = listing_divs[0]

In [7]:
# Pulling the url and the address out of the title anchor of the listing
href, addy = (
    current_listing.select('a[id*=title]')[0]['href'],
    current_listing.select('a[id*=title]')[0].string,
)
# The neighborhood text has its newline characters removed
hood = current_listing.select('div[id*=hood]')[0].string.replace('\n', '')
# One value per line: url, address, neighborhood
print(href, addy, hood, sep='\n')

https://www.renthop.com/listings/382-wadsworth-ave/4g/15761811
382 Wadsworth Ave, Apt 4G
Fort George, Washington Heights, Upper Manhattan, Manhattan


In [8]:
# Grabbing the rows of the table that holds the price, number of beds and number of baths
listing_specs = current_listing.select('table[id*=info] tr')
# Tokenising the first row: inner spaces are turned into underscores so that
# whitespace splitting keeps e.g. "2 Bed" as one token, then every token that
# starts with an underscore is discarded
spec_data = [
    token
    for token in listing_specs[0].text.strip().replace(' ', '_').split()
    if token[0] != '_'
]
print(spec_data)

['$2,300', '2_Bed', '1_Bath']


In [0]:
# Creating a function to extract information from each element of the listing
def extract_info(current_listing):
    """Extract the url, address, neighborhood and specs of a single listing.

    Parameters
    ----------
    current_listing : object exposing ``select(css_selector)``
        One listing element, e.g. an item of
        ``soup.select('div[class*=search-info]')``.

    Returns
    -------
    list
        ``[url, address, neighborhood, *specs]``. ``specs`` are the tokens of
        the first row of the info table (e.g. ``'$2,300'``, ``'2_Bed'``,
        ``'1_Bath'``). When that row is missing, three ``np.nan`` placeholders
        are appended instead so the row keeps the same width.

    Raises
    ------
    IndexError
        If the title anchor or the neighborhood div is not found.
    """
    # The same anchor carries both the url (href attribute) and the address (text)
    title_link = current_listing.select('a[id*=title]')[0]
    hood_div = current_listing.select('div[id*=hood]')[0]
    indv_listing = [
        title_link['href'],
        title_link.string,
        hood_div.string.replace('\n', ''),
    ]
    listing_specs = current_listing.select('table[id*=info] tr')
    try:
        # Spaces become underscores so that "2 Bed" survives split() as one token
        spec_data = listing_specs[0].text.strip().replace(' ', '_').split()
    except (IndexError, AttributeError):
        # list.extend() needs an iterable; passing the bare float np.nan raised
        # TypeError. Three placeholders keep the row aligned with the rent,
        # beds and baths columns used when the DataFrame is built.
        indv_listing.extend([np.nan] * 3)
    else:
        # Dropping separator-only tokens ("_", "/...", "_/...")
        indv_listing.extend(
            token for token in spec_data
            if token != '_' and token[0] != '/' and token[:2] != '_/'
        )
    return indv_listing

In [10]:
# Running the extraction over every listing element of the page
listing_list = [extract_info(listing) for listing in listing_divs]
# Previewing the first three extracted rows
listing_list[:3]

[['https://www.renthop.com/listings/382-wadsworth-ave/4g/15761811',
  '382 Wadsworth Ave, Apt 4G',
  'Fort George, Washington Heights, Upper Manhattan, Manhattan',
  '$2,300',
  '2_Bed',
  '1_Bath'],
 ['https://www.renthop.com/listings/162-east-33rd-street/3tw/15775226',
  '162 East 33rd Street, Apt 3TW',
  'Rose Hill, Kips Bay, Midtown Manhattan, Manhattan',
  '$3,500',
  '2_Bed',
  '1_Bath'],
 ['https://www.renthop.com/listings/400-east-71st-street/12def/15632907',
  '400 East 71st Street, Apt 12DE...',
  'Upper East Side, Upper Manhattan, Manhattan',
  '$5,795',
  '3_Bed',
  '2_Bath']]

In [0]:
# Defining a function that can parse a whole page
def parse_page(page_number, timeout=30):
    """Download one page of search results and extract every listing on it.

    Parameters
    ----------
    page_number : int or str
        Value appended to the ``page=`` query parameter of the search url.
    timeout : float, optional
        Seconds to wait for the server before the request raises instead of
        hanging forever. Defaults to 30.

    Returns
    -------
    list of list
        One ``extract_info`` result per element matching
        ``div[class*=search-info]`` on the page (empty if nothing matches).
    """
    url_prefix = "https://www.renthop.com/search/nyc?max_price=50000&min_price=0&page="
    # Without a timeout a single stalled connection would block a long crawl indefinitely
    r = requests.get(url_prefix + str(page_number), timeout=timeout)
    soup = BeautifulSoup(r.content, "html5lib")
    listing_divs = soup.select('div[class*=search-info]')
    return [extract_info(listing) for listing in listing_divs]

In [12]:
# Crawling result pages 1 through 500 and pooling the listings of every page
all_pages_parsed = []
for page_number in range(1, 501):
    listing_list_page = parse_page(page_number)
    all_pages_parsed.extend(listing_list_page)
    # Progress report after every 100th page
    if not page_number % 100:
        print("{} pages parsed.".format(page_number),
              "------------------------------------",
              sep='\n')
print('Parsing done!')
print('Gathered data about {} apartments.'.format(len(all_pages_parsed)))

100 pages parsed.
------------------------------------
200 pages parsed.
------------------------------------
300 pages parsed.
------------------------------------
400 pages parsed.
------------------------------------
500 pages parsed.
------------------------------------
Parsing done!
Gathered data about 10000 apartments.


In [13]:
# Transforming the parsed rows into a pandas DataFrame with one named column per field
df = pd.DataFrame(
    data=all_pages_parsed,
    columns=['url', 'address', 'neighborhood', 'rent', 'beds', 'baths'],
)
# Previewing the first five rows
df.head(n=5)

Unnamed: 0,url,address,neighborhood,rent,beds,baths
0,https://www.renthop.com/listings/382-wadsworth...,"382 Wadsworth Ave, Apt 4G","Fort George, Washington Heights, Upper Manhatt...","$2,300",2_Bed,1_Bath
1,https://www.renthop.com/listings/162-east-33rd...,"162 East 33rd Street, Apt 3TW","Rose Hill, Kips Bay, Midtown Manhattan, Manhattan","$3,500",2_Bed,1_Bath
2,https://www.renthop.com/listings/400-east-71st...,"400 East 71st Street, Apt 12DE...","Upper East Side, Upper Manhattan, Manhattan","$5,795",3_Bed,2_Bath
3,https://www.renthop.com/listings/814-10th-aven...,"814 10th Avenue, Apt 7C","Hell's Kitchen, Midtown Manhattan, Manhattan","$2,225",Studio,1_Bath
4,https://www.renthop.com/listings/100-west-31st...,"100 West 31st Street, Apt 36G","Chelsea, Midtown Manhattan, Manhattan","$5,030",1_Bed,1_Bath


In [0]:
# Saving the raw DataFrame as CSV for easy access later.
# to_csv is called with its defaults, so the row index is written as the first, unnamed column.
df.to_csv('Apparements_raw.csv')