In [645]:
import csv 
import json 
import re
import sys 
import datetime 
import requests 
import os 
import time 
import numpy as np 
import pandas as pd
from bs4 import BeautifulSoup 

In [627]:
# get url for each apartment
def get_apt_url(main_web):
    """
    Collect the detail-page url of every listing reachable from a search page.

    main_web: main web url for apartments.com, i.e., 'https://www.apartments.com/boston-ma/'
              (a missing trailing '/' is tolerated)
    return: list of urls, one per apartment placard found on the result pages
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}

    # find total pages from the paging links of the first result page
    response = requests.get(main_web, headers=headers)
    soup = BeautifulSoup(response.content, 'lxml')
    container = soup.find('div', class_='placardContainer')
    listing = container.ul if container is not None else None
    paging = listing.find('nav', class_='paging') if listing is not None else None

    # keep only links that carry a numeric data-page attribute; int(None)
    # would otherwise abort the whole run on the first link without one
    pages = []
    if paging is not None:
        for link in paging.find_all('a'):
            page_no = link.get('data-page')
            if page_no is not None and str(page_no).strip().isdigit():
                pages.append(int(page_no))

    # no usable paging links found -> fall back to the first page only
    last_page = max(pages) if pages else 1
    print('{} relevant numbers found, max page number is {}'.format(len(pages), last_page))

    # add suffix to get whole url for all pages
    base = main_web if main_web.endswith('/') else main_web + '/'
    urls = ['{}{}/'.format(base, page_no) for page_no in range(1, last_page + 1)]
    print('Example url looks like {}'.format(urls[0]))

    # get url for each apartment in each page
    apt_urls = []
    for page_idx, url in enumerate(urls):
        # random gap between page requests; placards are parsed from the
        # already-downloaded page, so no pause is needed between them
        if page_idx > 0:
            time.sleep(np.random.uniform(low=5, high=20))

        # one request per page: report its status code and parse the same response
        response = requests.get(url, headers=headers)
        print('For url {}, status code is {}'.format(url, response.status_code))

        container = BeautifulSoup(response.content, 'lxml').find('div', class_='placardContainer')
        if container is None:
            print('No placardContainer found in {}, skipped'.format(url))
            continue

        for item in container.find_all('article', class_='placard'):
            title = item.find('a', class_='placardTitle js-placardTitle ')
            href = title.get('href') if title is not None else None
            if not href:
                continue
            apt_urls.append(href)
            print('Parsing in progress: {}'.format(len(apt_urls)))
        print('{} apt urls got!'.format(len(apt_urls)))

    return apt_urls
            

In [633]:
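# NOTE: the scrape is slow, so the call is left commented out; run it once to define `apt_urls` for the next cell.
#apt_urls = get_apt_url('https://www.apartments.com/boston-ma/')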

In [635]:
# One-column frame of listing urls. `apt_urls` must already exist in the
# kernel: it is the return value of get_apt_url().
df_url = pd.DataFrame(data={'apt_url': apt_urls})

In [615]:
# build dataset
def build_dataset(apt_urls):
    """
    Scrape every listing page in `apt_urls` and collect its attributes.

    Fields that cannot be found on a page are stored as the string 'missing'.
    Columns that are declared in `cols` but not extracted yet (subway, rail,
    shop, park, airport) are present in the result and left as NaN.

    apt_urls: urls of each apartment/house/townhouse/condo
    return: data frame populated with entity information, one row per distinct
            bd/ba/sqft combination of every listing, columns ordered as `cols`
    """
    MISSING = 'missing'

    # feature names
    cols = [
        'name',
        'address',
        'bd',
        'ba',
        'sqft',
        'rent',
        'desc',
        'pet',
        'parking',
        'pubSchool',
        'privSchool',
        'walk',
        'transit',
        'ptype',
        'numPOI',
        'nearCollege',
        'distNearCollege',
        'numCollege',
        'distNearSubway',
        'numSubway',
        'distNearRail',
        'numRail',
        'distNearShop',
        'numShop',
        'distNearPark',
        'numPark',
        'distAirport'
    ]

    # used for requests.get()
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}

    # class string shared by the public and private school cells
    school_cell = 'cell-sm-6 cell-xs-12 paddingReset'

    def text_of(node):
        """Stripped text of a tag, or 'missing' when the tag is absent."""
        return node.getText().strip() if node is not None else MISSING

    def long_text(cell):
        """Stripped text of the 'longText' span inside a table cell."""
        if cell is None:
            return MISSING
        return text_of(cell.find('span', class_='longText'))

    def count_schools(box):
        """Number of school cells inside a container, or 'missing' without one."""
        if box is None:
            return MISSING
        return len(box.find_all('div', class_=school_cell))

    def score_of(box):
        """Value of the data-score attribute, or 'missing' when unavailable."""
        if box is None:
            return MISSING
        return box.get('data-score', MISSING)

    # one frame per listing; concatenated once at the end instead of growing
    # a frame inside the loop (DataFrame.append no longer exists in pandas 2)
    frames = []
    total_rows = 0

    # extract information from each apt/house
    for i, apt_url in enumerate(apt_urls, start=1):
        # read web content
        response = requests.get(apt_url, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')

        print('processing: {}'.format(apt_url))

        # get name
        name = text_of(soup.find('h1', class_='propertyName'))

        # get address: every span of the address header, comma terminated
        addr_box = soup.find('div', class_='propertyAddress')
        addr_head = addr_box.h2 if addr_box is not None else None
        if addr_head is not None:
            address = ''.join(span.getText() + ',' for span in addr_head.find_all('span'))
        else:
            address = MISSING

        # bed, bath, rent, sqft: one entry per row of the rental grid
        bd = []
        ba = []
        sqft = []
        rent = []

        for row in soup.find_all('tr', class_='rentalGridRow'):
            bd.append(long_text(row.find('td', class_='beds')))
            ba.append(long_text(row.find('td', class_='baths')))
            sqft.append(text_of(row.find('td', class_='sqft')))
            rent.append(text_of(row.find('td', class_='rent')))

        # a page without rental rows still yields one row; empty lists would
        # produce an empty frame and silently drop the listing
        if not bd:
            bd, ba, sqft, rent = [MISSING], [MISSING], [MISSING], [MISSING]

        # description
        desc_soup = soup.find('section', class_='descriptionSection js-viewAnalyticsSection')
        if desc_soup is None or desc_soup.p is None:
            desc = MISSING
        else:
            desc = desc_soup.p.getText()

        # pet
        pet_box = soup.find('div', class_='petPolicyDetails')
        pet = text_of(pet_box.find('span')) if pet_box is not None else MISSING

        # parking
        parking = MISSING
        parking_box = soup.find('div', class_='parkingDetails')
        if parking_box is not None:
            fee_box = parking_box.find('div', class_='parkingTypeFeeContainer')
            if fee_box is not None:
                parking = text_of(fee_box.h4)

        # school section; each count is taken from its own container
        pubSchool = count_schools(soup.find('div', class_='schoolsPublicContainer'))
        privSchool = count_schools(soup.find('div', class_='schoolsPrivateContainer'))

        # walk score
        walk = score_of(soup.find('div', class_='ratingCol walkScore'))

        # transit score
        transit = score_of(soup.find('div', class_='ratingCol transitScore'))

        # get property type from the first breadcrumb
        ptype = MISSING
        crumbs = soup.find('div', class_='crumbs')
        if crumbs is not None:
            first_crumb = crumbs.find('span', 'crumb')
            if first_crumb is not None and first_crumb.a is not None:
                ptype = first_crumb.a.get('data-type', MISSING)

        # point of interest
        poi = soup.find('section', class_='pointsOfInterestSection')
        if poi is not None:
            numPOI = len(poi.find_all('div', class_='transportationDetail '))
        else:
            numPOI = MISSING

        # nearby college: first entry of the block that holds the school icon
        nearCollege = MISSING
        distNearCollege = MISSING
        numCollege = MISSING
        icon = soup.find('span', class_='poiSchoolIcon')
        if icon is not None:
            block = icon.find_previous('div', class_='transportationDetail')
            colleges = block.find_all('div', class_='transportationName') if block is not None else []
            if colleges:
                # number of transportationName entries in the college block
                numCollege = len(colleges)
                nearCollege = text_of(colleges[0].a)
                try:
                    # second cell after the one holding the college name
                    distNearCollege = colleges[0].find_previous('td').find_next_siblings()[1].getText().strip()
                except (AttributeError, IndexError):
                    distNearCollege = MISSING

        # TODO: subway / rail / shop / park / airport features are declared in
        # `cols` but are not extracted yet.

        # populate a dataframe (scalars are broadcast over the rental rows)
        df_new = pd.DataFrame(data={'name': name,
                                    'address': address,
                                    'bd': bd,
                                    'ba': ba,
                                    'sqft': sqft,
                                    'rent': rent,
                                    'desc': desc,
                                    'pet': pet,
                                    'parking': parking,
                                    'pubSchool': pubSchool,
                                    'privSchool': privSchool,
                                    'walk': walk,
                                    'transit': transit,
                                    'ptype': ptype,
                                    'numPOI': numPOI,
                                    'nearCollege': nearCollege,
                                    'distNearCollege': distNearCollege,
                                    'numCollege': numCollege})

        # keep one row per bd/ba/sqft combination (first rent wins); this also
        # removes fully duplicated rows
        df_new = df_new.drop_duplicates(subset=['bd', 'ba', 'sqft'])

        print('#{}, \n new df: {} \n total df: {}'.format(i, df_new.shape, (total_rows, len(cols))))

        frames.append(df_new)
        total_rows += len(df_new)

        # gap between each parsing
        gap = np.random.uniform(low=5, high=20)
        print('Wait for {} second'.format(gap))
        time.sleep(gap)

    if not frames:
        return pd.DataFrame(columns=cols)

    # reindex so that every declared column is present, in `cols` order
    return pd.concat(frames, ignore_index=True).reindex(columns=cols)

In [None]:
# Column names copied from the tail of `cols` in build_dataset (everything
# after 'numPOI'); held in a named list so this scratch cell is valid Python.
extra_cols = [
    'nearCollege',
    'distNearCollege',
    'numCollege',
    'distNearSubway',
    'numSubway',
    'distNearRail',
    'numRail',
    'distNearShop',
    'numShop',
    'distNearPark',
    'numPark',
    'distAirport',
]

In [None]:
# Scratch data: one hard-coded description string (ends with a blank line).
# NOTE(review): presumably pasted from a listing page to experiment with the
# `desc` field -- confirm before relying on it.
desc = """Lantera is a word meaning "beacon", and it's the heart and soul of the new 15-acre Boston Landing neighborhood. Apartments feature floor-to-ceiling windows, premium finishes with stainless steel appliances, and technology touches throughout. The heart of Boston is less than five miles away and is easily accessible via the Boston Landing commuter rail stop - just a three-minute walk away. Lantera is surrounded by eclectic restaurants, curated retail shops and world-class athletic facilities.

"""