# Project 2: Web Scraping & Linear Regresssion

Apartment rental prices in Portland
- Scrape Craiglist
- Predict Price

Features: 
- # bedrooms
- Square footage
- 

In [1]:
from bs4 import BeautifulSoup
import requests

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib as plt

In [2]:
# PDX Housing/Apartment Search Page
url = 'https://portland.craigslist.org/search/apa'

In [3]:
response = requests.get(url)
response.status_code

200

In [4]:
page = response.text
soup = BeautifulSoup(page, 'html.parser')

In [5]:
postings = soup.find_all('li', class_='result-row')

In [6]:
type(postings)

bs4.element.ResultSet

In [7]:
print("Number of postings: {}".format(len(postings)))

Number of postings: 120


In [8]:
# First posting - Raw HTML
postings[0]

<li class="result-row" data-pid="7205725894" data-repost-of="6975672005">
<a class="result-image gallery" data-ids="1:00z0z_2K6gAdXIf8a,3:01111_blwa6SZfFAG_0CI0lM,1:00x0x_ivJoPphmd5z,1:00G0G_fYMjCsSR9w2,1:01717_dQq69SMztUa,1:00909_6QyUxbZ3o8M,1:00h0h_4AfEAUWUR8J,1:00D0D_W8GvJQcwPx,1:00a0a_24dfg5bHZf6,1:00a0a_d9eXEb93556,1:00L0L_lrkYoEwZpJG,1:00r0r_4Mrbc2nrhtj,3:00K0K_c3BRFope6sr_0CI0lM" href="https://portland.craigslist.org/mlt/apa/d/portland-spacious-luxury-studio-floor/7205725894.html">
<span class="result-price">$1,350</span>
</a>
<div class="result-info">
<span class="icon icon-star" role="button">
<span class="screen-reader-text">favorite this post</span>
</span>
<time class="result-date" datetime="2020-09-30 10:13" title="Wed 30 Sep 10:13:18 AM">Sep 30</time>
<h2>
<a class="result-title hdrlnk" data-id="7205725894" href="https://portland.craigslist.org/mlt/apa/d/portland-spacious-luxury-studio-floor/7205725894.html" id="postid_7205725894">Spacious &amp; Luxury Studio! Floor To Ce

First loook at how to scrape each individual item: 

Create a function to get all the details from a post:

In [9]:
def get_listing_details(post):
    
    # Present in each post
    date = post.find('time', class_='result-date').text
    title = post.find('a', class_='result-title hdrlnk').text
    link = post.find('a', class_='result-title hdrlnk')['href']
    price = int(post.find('span', class_='result-price').text.strip().replace("$","").replace(",",""))
    
    
    if post.find('span', class_='result-hood') is None:
        hood = np.nan
    else:
        hood = post.find('span', class_='result-hood').text.strip()[1:][:-1]
    
    # Test to for BR - SQFT info
    if post.find('span', class_='housing') is None:
        # Set both BRs and sq-ft to np.nan
        brs = np.nan
        sqft = np.nan
        
    elif len(post.find('span', class_='housing').text.split()) < 3:
        # test to see if we have BR
        if post.find('span', class_='housing').text.split()[0][-2:] == 'br':
            brs = post.find('span', class_='housing').text.split()[0][:-2]
            sqft = np.nan
        else:
            sqft = int(post.find('span', class_='housing').text.split()[0][:-3])
            brs = np.nan
             
    else:
        # We have both BRs and sq-ft      
        brs = post.find('span', class_='housing').text.split()[0][:-2]
        sqft = int(post.find('span', class_='housing').text.split()[2][:-3])     
    
    # Order of elements to be returned
    post_elements = [date, title, link, price, brs, sqft, hood]
    
    return post_elements

Example of scraped listing using function above

In [10]:
get_listing_details(postings[0])

['Sep 30',
 'Spacious & Luxury Studio! Floor To Ceiling Windows',
 'https://portland.craigslist.org/mlt/apa/d/portland-spacious-luxury-studio-floor/7205725894.html',
 1350,
 nan,
 493,
 '2014 SE 11th Ave - 303']

Function to scrape the entire page of postings

In [11]:
def get_page_listings(page):
    
    post_counter = 0
    page_results = []
    
    for post in page:
        listing = get_listing_details(post)
        page_results.append(listing)
        post_counter += 1
        
    print("Scrape Complete!")
    print("Number of Postings Scraped: {}".format(post_counter))        
    return page_results            

In [12]:
# Scrape the page, return postings as variable 'data'
data = get_page_listings(postings)

Scrape Complete!
Number of Postings Scraped: 120


In [13]:
# Assigning headrs for DF
headers = ['date', 'title', 'link', 'price', 'brs', 'sqft', 'hood']

In [14]:
# Creating the dataframe from scraped page
df = pd.DataFrame(data, columns=headers)

In [15]:
df.head()

Unnamed: 0,date,title,link,price,brs,sqft,hood
0,Sep 30,Spacious & Luxury Studio! Floor To Ceiling Win...,https://portland.craigslist.org/mlt/apa/d/port...,1350,,493.0,2014 SE 11th Ave - 303
1,Sep 30,1x1 with Carport & W/D Included! 24-hr Fitness...,https://portland.craigslist.org/clc/apa/d/wils...,1234,1.0,620.0,Wilsonville
2,Sep 30,Charming Upstairs 1BDRM Apartment,https://portland.craigslist.org/mlt/apa/d/port...,1295,1.0,,King
3,Sep 30,LOVE WHERE YOU LIVE!,https://portland.craigslist.org/wsc/apa/d/port...,1199,2.0,756.0,Tigard
4,Sep 30,1 Bedroom/ 1 Bathroom With 3 Months Free Parki...,https://portland.craigslist.org/wsc/apa/d/beav...,1265,1.0,647.0,"6523 NE Cherry Drive Hillsboro, OR"


Testing another page:

In [16]:
url_2 = 'https://portland.craigslist.org/search/apa?s=120'

In [17]:
response = requests.get(url_2)
response.status_code

200

In [18]:
page2 = response.text
soup2 = BeautifulSoup(page2, 'html.parser')

In [19]:
def clpage_to_df(soup_obj):
    '''
    Function to create a DataFrame from one entire craiglist page
    --> Nested Functions: 
            get_page_listings(postings)
                get_listing_details(post)
    '''
    
    postings = soup.find_all('li', class_='result-row')
    data = get_page_listings(postings)
    df = pd.DataFrame(data, columns=headers)
    
    return df

In [20]:
df = clpage_to_df(soup)
df.head()

Scrape Complete!
Number of Postings Scraped: 120


Unnamed: 0,date,title,link,price,brs,sqft,hood
0,Sep 30,Spacious & Luxury Studio! Floor To Ceiling Win...,https://portland.craigslist.org/mlt/apa/d/port...,1350,,493.0,2014 SE 11th Ave - 303
1,Sep 30,1x1 with Carport & W/D Included! 24-hr Fitness...,https://portland.craigslist.org/clc/apa/d/wils...,1234,1.0,620.0,Wilsonville
2,Sep 30,Charming Upstairs 1BDRM Apartment,https://portland.craigslist.org/mlt/apa/d/port...,1295,1.0,,King
3,Sep 30,LOVE WHERE YOU LIVE!,https://portland.craigslist.org/wsc/apa/d/port...,1199,2.0,756.0,Tigard
4,Sep 30,1 Bedroom/ 1 Bathroom With 3 Months Free Parki...,https://portland.craigslist.org/wsc/apa/d/beav...,1265,1.0,647.0,"6523 NE Cherry Drive Hillsboro, OR"
