# Vancouver Real Estate

Brandon Tu

May 3, 2021

---

## Overview

This project is for the purposes of web scraping the available information of property listings off of a real estate website, [REW](https://www.rew.ca/).

In [87]:
# Import Packages

# Fundamentals
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Web Scraping
from selenium import webdriver
from bs4 import BeautifulSoup
import requests

## Web Scraping

In [46]:
# Customize the options
options = webdriver.ChromeOptions()

# Add arguement to ignore certificate errors
options.add_argument("--ignore-certificate-errors")

# Add argument to access web browser in incognito mode
options.add_argument("--incognito")

# Add argument to use Selenium without opening a browser
options.add_argument("--headless")

# Use chrome webdriver as the driver
path = "/Users/Brandon 1/desktop/chromedriver"
driver = webdriver.Chrome(path, chrome_options=options)

  driver = webdriver.Chrome(path, chrome_options=options)


In [47]:
# Get REW webpage
import time

# Enter the website url
url = "https://www.rew.ca"
driver.get(url)

In [48]:
# Click in to the view all listings in Vancouver
driver.find_element_by_xpath("/html/body/footer/div[1]/div/div/div/div[1]/div/ul/li[1]/a").click()

# Create alias to use BeautifulSoup
page_source = driver.page_source

# Use beautifulsoup to scrape the page
soup = BeautifulSoup(page_source, 'lxml')

In [59]:
# Add all of the links for each listing in the website
condition = True
rows_list = []

In [60]:
# Use while there is a next page button to click, scrape all of the urls and then click next
while condition:
    # Get the URLs of all of the listings to enter each listing for scraping
    listings = soup.find_all('a', class_=False, title=True, href=True)
    
    # Use for loop to get all of the listings on the page
    for listing in listings:
    
        # Create the dictionary
        listings_dict = {}

        # Use if, not in statement to add the dictionary to the rows_list if not in the list already
        if {"Address":listing["title"], "href":listing["href"]} not in rows_list:
            listings_dict.update({"Address":listing["title"], "href":listing["href"]})
            rows_list.append(listings_dict)

        # Else do not include
        else:
            pass

    # Click into the next page
    try:
        driver.find_element_by_class_name("paginator-next_page.paginator-control").click()
        
        # Create alias to use BeautifulSoup
        page_source = driver.page_source

        # Use beautifulsoup to scrape the page
        soup = BeautifulSoup(page_source, 'lxml')
    
    # If there are no more next pages, then change condition to False and stop the loop
    except:
        condition=False

In [63]:
# Create new dataframe containing each of the listings and links
listings_df = pd.DataFrame(rows_list, columns=["Address", "href"])
listings_df

Unnamed: 0,Address,href
0,"313-2890 Point Grey Road, Vancouver, BC, V6K 1A9",/properties/3297882/313-2890-point-grey-road-v...
1,"2816 E 8th Avenue, Vancouver, BC, V5M 1W9",/properties/3283919/2816-e-8th-avenue-vancouve...
2,"305-5693 Elizabeth Street, Vancouver, BC, V5Y 3K1",/properties/3310199/305-5693-elizabeth-street-...
3,"3242 W 29th Avenue, Vancouver, BC, V6L 1Y6",/properties/3310191/3242-w-29th-avenue-vancouv...
4,"413-3588 Sawmill Crescent, Vancouver, BC, V5S 0H5",/properties/3309974/413-3588-sawmill-crescent-...
...,...,...
478,"804-138 W 1st Avenue, Vancouver, BC, V5Y 0H5",/properties/3295977/804-138-w-1st-avenue-vanco...
479,"801-528 Beatty Street, Vancouver, BC, V6B 2L3",/properties/3295830/801-528-beatty-street-vanc...
480,"202-3595 W 18th Avenue, Vancouver, BC, V6S 1A9",/properties/3295787/202-3595-w-18th-avenue-van...
481,"408-3480 Main Street, Vancouver, BC, V5V 3N2",/properties/3295785/408-3480-main-street-vanco...


In [86]:
# Create arrays for all of the information that will be scraped from the website
property_sizes = np.array([])
descriptions = np.array([])
list_prices = np.array([])
bedrooms = np.array([])
bathrooms = np.array([])
property_types = np.array([])
lot_sizes = np.array([])
years_built = np.array([])
titles = np.array([])
styles = np.array([])
features = np.array([])
amenities = np.array([])
appliances = np.array([])
communities = np.array([])
days_on_rew = np.array([])

# Use for loop to index through each href and scrape the information off of each page
for idx in range(len(listings_df)-1):
    
    # Alias the hrefs and add with the original url to get the link
    href = listings_df["href"][idx]
    link = url + href
    
    # Use links with Selenium and start scraping the content off of the webpages
    driver.get(link)
    
    # Initialize all of the features as NaN to keep the indexing correct when appending to the arrays
    property_size = None
    description = None
    list_price = None
    num_bedrooms = None
    num_bathrooms = None
    property_type = None
    lot_size = None
    year_built = None
    title = None
    style = None
    num_features = None
    the_amenities = None
    the_appliances = None
    community = None
    days_on_website = None
    
    # Find the property size
    property_size = \
    driver.find_element_by_xpath("/html/body/section/section/div[1]/div[1]/div[1]/div[1]/div[1]/ul/li[3]").text
    
    # Find the description
    description = driver.find_element_by_class_name("listingoverview").text
    
    # Get the <tr> tags to get each individual line of information that is wanted
    body_lines = driver.find_elements_by_tag_name("tr")
    
    # Loop through each line that is found in the tags and alias the values accordingly
    for idx in range(len(body_lines)-1):
        split_body_lines = body_lines[idx].text.split("\n")
        
        # Listing Price
        if split_body_lines[0] == "List Price":
            list_price = split_body_lines[1]
        
        # Number of Bedrooms
        elif split_body_lines[0] == "Bedrooms":
            num_bedrooms = split_body_lines[1]
        
        # Number of Bathrooms
        elif split_body_lines[0] == "Bathrooms":
            num_bathrooms = split_body_lines[1]
        
        # Property Type
        elif split_body_lines[0] == "Property Type":
            property_type = split_body_lines[1]
        
        # Lot Size
        elif split_body_lines[0] == "Lot Size":
            lot_size = split_body_lines[1]
        
        # Year Built
        elif split_body_lines[0] == "Year Built":
            year_built = split_body_lines[1]
        
        # Title/type of property ownership
        elif split_body_lines[0] == "Title":
            title = split_body_lines[1]
            
        # Style of the Property
        elif split_body_lines[0] == "Style":
            style = split_body_lines[1]
        
        # Features
        elif split_body_lines[0] == "Features":
            the_features = split_body_lines[1]
        
        # Amenities found in/around the property
        elif split_body_lines[0] == "Amenities":
            the_amenities = split_body_lines[1]
        
        # Appliances
        elif split_body_lines[0] == "Appliances":
            the_appliances = split_body_lines[1]
        
        # Community/neighbourhood the property is situated in
        elif split_body_lines[0] == "Community":
            community = split_body_lines[1]
        
        # Days on the REW website
        elif split_body_lines[0] == "Days on REW":
            days_on_website = split_body_lines[1]
        
        # Pass if the line is none of the above choices
        else:
            pass
    
    # Append all of the values to the respective arrays
    property_sizes.append(property_size)
    descriptions.append(description)
    list_prices.append(list_price)
    bedrooms.append(num_bedrooms)
    bathrooms.append(num_bathrooms)
    property_types.append(property_type)
    lot_sizes.append(lot_size)
    years_built.append(year_built)
    titles.append(title)
    styles.append(style)
    features.append(the_features)
    amenities.append(the_amenities)
    appliances.append(the_appliances)
    communities.append(community)
    days_on_rew.append(days_on_website)

https://www.rew.ca/properties/3297882/313-2890-point-grey-road-vancouver-bc?page=2&search_params%5Bquery%5D=Vancouver%2C+BC&searchable_id=361&searchable_type=Geography&sort=latest
https://www.rew.ca/properties/3283919/2816-e-8th-avenue-vancouver-bc?page=2&search_params%5Bquery%5D=Vancouver%2C+BC&searchable_id=361&searchable_type=Geography&sort=latest
https://www.rew.ca/properties/3310199/305-5693-elizabeth-street-vancouver-bc?page=2&search_params%5Bquery%5D=Vancouver%2C+BC&searchable_id=361&searchable_type=Geography&sort=latest
https://www.rew.ca/properties/3310191/3242-w-29th-avenue-vancouver-bc?page=2&search_params%5Bquery%5D=Vancouver%2C+BC&searchable_id=361&searchable_type=Geography&sort=latest
https://www.rew.ca/properties/3309974/413-3588-sawmill-crescent-vancouver-bc?page=2&search_params%5Bquery%5D=Vancouver%2C+BC&searchable_id=361&searchable_type=Geography&sort=latest
https://www.rew.ca/properties/3309726/514-950-drake-street-vancouver-bc?page=2&search_params%5Bquery%5D=Vancouv

In [105]:
# Alias the hrefs and add with the original url to get the link
href = listings_df["href"][0]
link = url + href
    
# Use links with BeautifulSoup and start scraping the content off of the webpages
driver.get(link)
property_size.append()

In [161]:
body_lines = driver.find_elements_by_tag_name("tr")
body_lines[6].text.split("\n")

['Bathrooms', '1']