In [2]:
# import get to call a get request on the site
from requests import get
from bs4 import BeautifulSoup

In [3]:
# get the first page of the Vancouver housing prices
response = get('https://vancouver.craigslist.org/search/apa?availabilityMode=0&hasPic=1')  # removed posts without images

html_soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
# get the macro-container for the housing posts
posts = html_soup.find_all('li', class_='result-row')
print(type(posts))
print(len(posts)) # should show 120 results for first page

<class 'bs4.element.ResultSet'>
120


Practice with extracting one post

In [5]:
post_one = posts[0]

In [6]:
# grab the price of the first post
post_one_price = post_one.a.text
post_one_price.strip() # strip removes the whitespace before and after a string

'$3,300'

In [7]:
# grab the time and datetime it was posted
post_one_time = post_one.find('time', class_= 'result-date')
post_one_datetime = post_one_time['datetime']

In [8]:
# title is 'a', link is grabbing the href attribute of that variable
post_one_title = post_one.find('a', class_='result-title hdrlnk')
post_one_link = post_one_title['href']

post_one_title_text = post_one_title.text

In [9]:
# number of bedrooms, sqft
post_one_num_bedrooms = post_one.find('span', class_='housing').text.split()[0]

post_one_sqft = post_one.find('span', class_='housing').text.split()[2][:-3]

post_one_hood = post_one.find('span', class_='result-hood').text

Now we are going to build out the loop.
There isn't always information on square footage and number of bedrooms - we will use if/else statement to address this.

In [10]:
# import required packages

from time import sleep
from time import time
import re
from random import randint
from warnings import warn
from IPython.core.display import clear_output
import numpy as np


In [11]:
# find the total number of posts to find the limit of the pagination
results_num = html_soup.find('div', class_= 'search-legend')
results_total = int(results_num.find('span', class_='totalcount').text) #pulled the total count of posts as the upper bound of the pages array

# each page has 119 posts so each new page is defined as follows: s=120, s=240, s=360, and so on. So we need to step in size 120 in the np.arange function
pages = np.arange(0, results_total+1, 120)

iterations = 0

post_timing = []
post_hoods = []
post_title_texts = []
bedroom_counts = []
sqfts = []
post_links = []
post_prices = []

for page in pages:
    
    #get request
    response = get("https://vancouver.craigslist.org/search/apa?availabilityMode=0&hasPic=1&" 
                   + "s=" #the parameter for defining the page number 
                   + str(page) #the page number in the pages array from earlier
                   )

    sleep(randint(1,5))
     
    #throw warning for status codes that are not 200
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        
    #define the html text
    page_html = BeautifulSoup(response.text, 'html.parser')
    
    #define the posts
    posts = html_soup.find_all('li', class_= 'result-row')
        
    #extract data item-wise
    for post in posts:

        if post.find('span', class_ = 'result-hood') is not None:

            #posting date
            #grab the datetime element 0 for date and 1 for time
            post_datetime = post.find('time', class_= 'result-date')['datetime']
            post_timing.append(post_datetime)

            #neighborhoods
            post_hood = post.find('span', class_= 'result-hood').text
            post_hoods.append(post_hood)

            #title text
            post_title = post.find('a', class_='result-title hdrlnk')
            post_title_text = post_title.text
            post_title_texts.append(post_title_text)

            #post link
            post_link = post_title['href']
            post_links.append(post_link)
            
            #removes the \n whitespace from each side, removes the currency symbol, and turns it into an int
            post_price = int(post.a.text.strip().replace("$", "").replace(",","")) 
            post_prices.append(post_price)
            
            if post.find('span', class_ = 'housing') is not None:
                
                #if the first element is accidentally square footage
                if 'ft2' in post.find('span', class_ = 'housing').text.split()[0]:
                    
                    #make bedroom nan
                    bedroom_count = np.nan
                    bedroom_counts.append(bedroom_count)
                    
                    #make sqft the first element
                    sqft = int(post.find('span', class_ = 'housing').text.split()[0][:-3])
                    sqfts.append(sqft)
                    
                #if the length of the housing details element is more than 2
                elif len(post.find('span', class_ = 'housing').text.split()) > 2:
                    
                    #therefore element 0 will be bedroom count
                    bedroom_count = post.find('span', class_ = 'housing').text.replace("br", "").split()[0]
                    bedroom_counts.append(bedroom_count)
                    
                    #and sqft will be number 3, so set these here and append
                    sqft = int(post.find('span', class_ = 'housing').text.split()[2][:-3])
                    sqfts.append(sqft)
                    
                #if there is num bedrooms but no sqft
                elif len(post.find('span', class_ = 'housing').text.split()) == 2:
                    
                    #therefore element 0 will be bedroom count
                    bedroom_count = post.find('span', class_ = 'housing').text.replace("br", "").split()[0]
                    bedroom_counts.append(bedroom_count)
                    
                    #and sqft will be number 3, so set these here and append
                    sqft = np.nan
                    sqfts.append(sqft)                    
                
                else:
                    bedroom_count = np.nan
                    bedroom_counts.append(bedroom_count)
                
                    sqft = np.nan
                    sqfts.append(sqft)
                
            #if none of those conditions catch, make bedroom nan, this won't be needed    
            else:
                bedroom_count = np.nan
                bedroom_counts.append(bedroom_count)
                
                sqft = np.nan
                sqfts.append(sqft)
            #    bedroom_counts.append(bedroom_count)
                
            #    sqft = np.nan
            #    sqfts.append(sqft)
                
    iterations += 1
    print("Page " + str(iterations) + " scraped successfully!")

print("\n")

print("Scrape complete!")

Page 1 scraped successfully!
Page 2 scraped successfully!
Page 3 scraped successfully!
Page 4 scraped successfully!
Page 5 scraped successfully!
Page 6 scraped successfully!
Page 7 scraped successfully!
Page 8 scraped successfully!
Page 9 scraped successfully!
Page 10 scraped successfully!
Page 11 scraped successfully!
Page 12 scraped successfully!
Page 13 scraped successfully!
Page 14 scraped successfully!
Page 15 scraped successfully!
Page 16 scraped successfully!
Page 17 scraped successfully!
Page 18 scraped successfully!
Page 19 scraped successfully!
Page 20 scraped successfully!
Page 21 scraped successfully!
Page 22 scraped successfully!
Page 23 scraped successfully!
Page 24 scraped successfully!
Page 25 scraped successfully!
Page 26 scraped successfully!


Scrape complete!


In [12]:
# create a dataframe
import pandas as pd

van_housing = pd.DataFrame({'posted': post_timing, 'neighborhood': post_hoods,
                            'post title': post_title_texts, 'num_bedrooms': bedroom_counts,
                            'sqft': sqfts, 'URL': post_links, 'price': post_prices})

print(van_housing.info())
van_housing.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3120 entries, 0 to 3119
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   posted        3120 non-null   object 
 1   neighborhood  3120 non-null   object 
 2   post title    3120 non-null   object 
 3   num_bedrooms  3068 non-null   object 
 4   sqft          2652 non-null   float64
 5   URL           3120 non-null   object 
 6   price         3120 non-null   int64  
dtypes: float64(1), int64(1), object(5)
memory usage: 170.8+ KB
None


Unnamed: 0,posted,neighborhood,post title,num_bedrooms,sqft,URL,price
0,2022-06-13 20:26,(Vancouver),[NextGen PM] 2BDR + Den Corner Unit at ELEMENTS,2,846.0,https://vancouver.craigslist.org/van/apa/d/van...,3300
1,2022-06-13 20:25,"(Yaletown, Vancouver)","1 Bedroom, Den, 1 Bathroom Corner Suite with B...",1,795.0,https://vancouver.craigslist.org/van/apa/d/van...,3695
2,2022-06-13 20:25,"(Fairview, Vancouver)",Charming 2 Bedroom Corner Suite With Views,2,950.0,https://vancouver.craigslist.org/van/apa/d/van...,3995
3,2022-06-13 20:24,"(Olympic Village, Vancouver)",2 Level Luxury Penthouse for Rent - 4 Bedroom ...,4,2454.0,https://vancouver.craigslist.org/van/apa/d/van...,12250
4,2022-06-13 20:24,(north Vancouver),Spacious 2 bedroom plus den with stunning fals...,2,1100.0,https://vancouver.craigslist.org/van/apa/d/van...,3650


Now we are going to clean the data.

In [13]:
# drop duplicate URLs
van_housing = van_housing.drop_duplicates(subset='URL')
len(van_housing.drop_duplicates(subset='URL'))

120

In [14]:
# make the number of bedrooms to float
van_housing['num_bedrooms'] = van_housing['num_bedrooms'].apply(lambda x: float(x))

In [15]:
# convert datetime string into datetime object
from datetime import datetime

van_housing['posted'] = pd.to_datetime(van_housing['posted'])

In [16]:
van_housing['neighborhood'].unique()

array([' (Vancouver)', ' (Yaletown, Vancouver)', ' (Fairview, Vancouver)',
       ' (Olympic Village, Vancouver)', ' (north Vancouver)',
       ' (Vancouver-Yaletown)', ' ((North Vancouver Upper Lonsdale))',
       ' (Vancouver - Coal Harbour)', ' (989 Nelson Street)',
       ' (19xx E.35th Avenue)', ' (Sullivan)', ' (5325 West Boulevard)',
       ' (Fernie)', ' (Burnaby Heights)',
       ' (Coquitlam - Burke Mountain area)', ' (West Cambie, Richmond)',
       ' (Burnaby)', ' (Maple Ridge)', ' (Steveston North, Richmond)',
       ' (Kitsilano, Vancouver)',
       ' (West Vancouver / British Properties)',
       ' (Downtown Vancouver / Coal Harbour / Fairmont Pacific Rim)',
       ' (SURREY, WEST NEWTON)', ' (Dunbar)',
       ' (777 richards st and robson st)', ' (West End)',
       ' (Upper Lonsdale, North Vancouver)', ' (Eagle Ridge, Coquitlam)',
       ' (Port Moody)', ' (city of vancouver)', ' (2nd Ave & Bayswater)',
       ' (East Vancouver)', ' (Port Coquitlam)', ' (Burnaby North)

In [17]:
van_housing['neighborhood'] = van_housing['neighborhood'].map(lambda x: x.strip())

In [18]:
# remove the parenthesis from the left and right of the neighborhoods
van_housing['neighborhood'] = van_housing['neighborhood'].map(lambda x: x.lstrip('(').rstrip(')'))

In [19]:
# titlecase them
van_housing['neighborhood'] = van_housing['neighborhood'].str.title()

In [20]:
van_housing['neighborhood'].unique()

array(['Vancouver', 'Yaletown, Vancouver', 'Fairview, Vancouver',
       'Olympic Village, Vancouver', 'North Vancouver',
       'Vancouver-Yaletown', 'North Vancouver Upper Lonsdale',
       'Vancouver - Coal Harbour', '989 Nelson Street',
       '19Xx E.35Th Avenue', 'Sullivan', '5325 West Boulevard', 'Fernie',
       'Burnaby Heights', 'Coquitlam - Burke Mountain Area',
       'West Cambie, Richmond', 'Burnaby', 'Maple Ridge',
       'Steveston North, Richmond', 'Kitsilano, Vancouver',
       'West Vancouver / British Properties',
       'Downtown Vancouver / Coal Harbour / Fairmont Pacific Rim',
       'Surrey, West Newton', 'Dunbar', '777 Richards St And Robson St',
       'West End', 'Upper Lonsdale, North Vancouver',
       'Eagle Ridge, Coquitlam', 'Port Moody', 'City Of Vancouver',
       '2Nd Ave & Bayswater', 'East Vancouver', 'Port Coquitlam',
       'Burnaby North', 'Commercial Drive', 'West Vancouver',
       'Willoughby, Langley', '1705-3096 Windsor Gate Coquitlam',
    

In [21]:
# just take the first name of the neighborhood list, splitting on the '/' delimiter
van_housing['neighborhood'] = van_housing['neighborhood'].apply(lambda x: x.split('/')[0])

In [22]:
van_housing['neighborhood'].unique()

array(['Vancouver', 'Yaletown, Vancouver', 'Fairview, Vancouver',
       'Olympic Village, Vancouver', 'North Vancouver',
       'Vancouver-Yaletown', 'North Vancouver Upper Lonsdale',
       'Vancouver - Coal Harbour', '989 Nelson Street',
       '19Xx E.35Th Avenue', 'Sullivan', '5325 West Boulevard', 'Fernie',
       'Burnaby Heights', 'Coquitlam - Burke Mountain Area',
       'West Cambie, Richmond', 'Burnaby', 'Maple Ridge',
       'Steveston North, Richmond', 'Kitsilano, Vancouver',
       'West Vancouver ', 'Downtown Vancouver ', 'Surrey, West Newton',
       'Dunbar', '777 Richards St And Robson St', 'West End',
       'Upper Lonsdale, North Vancouver', 'Eagle Ridge, Coquitlam',
       'Port Moody', 'City Of Vancouver', '2Nd Ave & Bayswater',
       'East Vancouver', 'Port Coquitlam', 'Burnaby North',
       'Commercial Drive', 'West Vancouver', 'Willoughby, Langley',
       '1705-3096 Windsor Gate Coquitlam', '207 10237 133 Street',
       '5470 Ormidale Street', '15477 93A Av

In [23]:
van_housing['neighborhood'].replace('Fraser And 37Th', 'Fraser', inplace = True)
van_housing['neighborhood'].replace('Clayton Heights, Surrey', 'Surrey', inplace = True)
van_housing['neighborhood'].replace('Mount Plesant', 'Mount Pleasant', inplace = True)
van_housing['neighborhood'].replace('Vancouver, Mount Pleasant East', 'Mount Pleasant', inplace = True)
van_housing['neighborhood'].replace('7428 14Th Avenue, Burnaby', 'Burnaby', inplace = True)
van_housing['neighborhood'].replace('Central Surrey', 'Surrey', inplace = True)
van_housing['neighborhood'].replace('Surrey Central Gateway Skytrain', 'Surrey', inplace = True)
van_housing['neighborhood'].replace('Marpole-Oakridge, Vancouver', 'Marpole', inplace = True)
van_housing['neighborhood'].replace('103-747 3 Street East, North Vancouver, Bc', 'North Vancouver', inplace = True)
van_housing['neighborhood'].replace('Cloverdale (69 Ave And 184 Street', 'Marpole', inplace = True)
van_housing['neighborhood'].replace('Central Lonsdale', 'North Vancouver', inplace = True)
van_housing['neighborhood'].replace('City Of Vancouver', 'Vancouver', inplace = True)
van_housing['neighborhood'].replace('Coquitlam Town Centre', 'Coquitlam', inplace = True)


In [25]:
van_housing["neighborhood"].unique()

array(['Vancouver', 'Yaletown, Vancouver', 'Fairview, Vancouver',
       'Olympic Village, Vancouver', 'North Vancouver',
       'Vancouver-Yaletown', 'North Vancouver Upper Lonsdale',
       'Vancouver - Coal Harbour', '989 Nelson Street',
       '19Xx E.35Th Avenue', 'Sullivan', '5325 West Boulevard', 'Fernie',
       'Burnaby Heights', 'Coquitlam - Burke Mountain Area',
       'West Cambie, Richmond', 'Burnaby', 'Maple Ridge',
       'Steveston North, Richmond', 'Kitsilano, Vancouver',
       'West Vancouver ', 'Downtown Vancouver ', 'Surrey, West Newton',
       'Dunbar', '777 Richards St And Robson St', 'West End',
       'Upper Lonsdale, North Vancouver', 'Eagle Ridge, Coquitlam',
       'Port Moody', '2Nd Ave & Bayswater', 'East Vancouver',
       'Port Coquitlam', 'Burnaby North', 'Commercial Drive',
       'West Vancouver', 'Willoughby, Langley',
       '1705-3096 Windsor Gate Coquitlam', '207 10237 133 Street',
       '5470 Ormidale Street', '15477 93A Avenue', 'Tricities',
 

In [26]:
for item in van_housing["neighborhood"]:
    if "West Vancouver" in item:
        van_housing["neighborhood"].replace(item, "West Vancouver", inplace = True)
    elif "North Vancouver" in item:
        van_housing["neighborhood"].replace(item, "North Vancouver", inplace = True)
    elif "East Vancouver" in item:
        van_housing["neighborhood"].replace(item, "East Vancouver", inplace=True)
    elif "Surrey" in item:
        van_housing["neighborhood"].replace(item, "Surrey", inplace=True)
    elif "Burnaby" in item:
        van_housing["neighborhood"].replace(item, "Burnaby", inplace=True)
    elif "Coquitlam" in item:
        van_housing["neighborhood"].replace(item, "Coquitlam", inplace=True)
    elif "Richmond" in item:
        van_housing["neighborhood"].replace(item, "Richmond", inplace=True)
    elif "Vancouver West" in item:
        van_housing["neighborhood"].replace(item, "Vancouver", inplace=True)
    elif "Vancouver" in item:
        van_housing["neighborhood"].replace(item, "Vancouver", inplace= True)

In [27]:
van_housing["neighborhood"].unique()

array(['Vancouver', 'North Vancouver', '989 Nelson Street',
       '19Xx E.35Th Avenue', 'Sullivan', '5325 West Boulevard', 'Fernie',
       'Burnaby', 'Coquitlam', 'Richmond', 'Maple Ridge',
       'West Vancouver', 'Surrey', 'Dunbar',
       '777 Richards St And Robson St', 'West End', 'Port Moody',
       '2Nd Ave & Bayswater', 'East Vancouver', 'Commercial Drive',
       'Willoughby, Langley', '207 10237 133 Street',
       '5470 Ormidale Street', '15477 93A Avenue', 'Tricities',
       'Main And 45Th Avenue', 'Westwood Plateau', 'Como Lake & Clarke',
       'Sullivan Heights', 'Hope', 'Point Grey', 'Fleetwood',
       'British Properties', 'South Cambie', 'Calverhall', 'Room Share',
       'Delta', 'Coquitam', 'Downtown', 'Horseshoe Bay', 'Glenmore',
       'Eagle Harbour', 'Fairview', 'Lower Lonsdale', 'Gibsons',
       'Dundarave', 'Altamont', 'Caulfeild', 'University Vw',
       'Tsawwassen Delta', 'Kerrisdale', 'Pemberton', 'Marpole'],
      dtype=object)

In [28]:
van_housing['neighborhood'].replace('989 Nelson Street', 'Vancouver', inplace = True)
van_housing['neighborhood'].replace('19Xx E.35Th Avenue', 'East Vancouver', inplace = True)
van_housing['neighborhood'].replace('Sullivan', 'Surrey', inplace = True)
van_housing['neighborhood'].replace('Dunbar', 'Vancouver', inplace = True)
van_housing['neighborhood'].replace('777 Richards St And Robson St', 'Vancouver', inplace = True)
van_housing['neighborhood'].replace('West End', 'Vancouver', inplace = True)
van_housing['neighborhood'].replace('2Nd Ave & Bayswater', 'Vancouver', inplace = True)
van_housing['neighborhood'].replace('Commercial Drive', 'Vancouver', inplace = True)
van_housing['neighborhood'].replace('Willoughby, Langley', 'Langley', inplace = True)
van_housing['neighborhood'].replace('207 10237 133 Street', 'Vancouver', inplace = True)
van_housing['neighborhood'].replace('207 10237 133 Street', 'Surrey', inplace = True)
van_housing['neighborhood'].replace('5470 Ormidale Street', 'Vancouver', inplace = True)
van_housing['neighborhood'].replace('15477 93A Avenue', 'Surrey', inplace = True)
van_housing['neighborhood'].replace('Main And 45Th Avenue', 'Vancouver', inplace = True)
van_housing['neighborhood'].replace('Westwood Plateau', 'Coquitlam', inplace = True)
van_housing['neighborhood'].replace('Como Lake & Clarke', 'Coquitlam', inplace = True)
van_housing['neighborhood'].replace('Sullivan Heights', 'Surrey', inplace = True)
van_housing['neighborhood'].replace('Point Grey', 'Vancouver', inplace = True)
van_housing['neighborhood'].replace('Fleetwood', 'Surrey', inplace = True)
van_housing['neighborhood'].replace('British Properties', 'West Vancouver', inplace = True)
van_housing['neighborhood'].replace('South Cambie', 'Vancouver', inplace = True)
van_housing['neighborhood'].replace('Calverhall', 'North Vancouver', inplace = True)
van_housing['neighborhood'].replace('Downtown', 'Vancouver', inplace = True)
van_housing['neighborhood'].replace('Eagle Harbour', 'West Vancouver', inplace = True)
van_housing['neighborhood'].replace('Fairview', 'Vancouver', inplace = True)
van_housing['neighborhood'].replace('Lower Lonsdale', 'North Vancouver', inplace = True)
van_housing['neighborhood'].replace('University Vw', 'Vancouver', inplace = True)
van_housing['neighborhood'].replace('Tsawwassen Delta', 'Delta', inplace = True)


In [29]:
van_housing['neighborhood'].unique()

array(['Vancouver', 'North Vancouver', 'East Vancouver', 'Surrey',
       '5325 West Boulevard', 'Fernie', 'Burnaby', 'Coquitlam',
       'Richmond', 'Maple Ridge', 'West Vancouver', 'Port Moody',
       'Langley', 'Tricities', 'Hope', 'Room Share', 'Delta', 'Coquitam',
       'Horseshoe Bay', 'Glenmore', 'Gibsons', 'Dundarave', 'Altamont',
       'Caulfeild', 'Kerrisdale', 'Pemberton', 'Marpole'], dtype=object)

In [30]:
# save to csv
van_housing.to_csv("van_housing_Jun_13_21_clean.csv", index=False)