## Data Collection

In [1]:
# Import Python Libraries

# For HTML parsing
from bs4 import BeautifulSoup 
from selenium import webdriver

# For website connections
import requests 

# For data cleanup
import re

# For zipcode search
#!pip install opencage
from opencage.geocoder import OpenCageGeocode


# To prevent overwhelming the server between connections
import time
from time import sleep 

# Display the progress bar
from tqdm import tqdm

# For data wrangling
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# For creating plots
import matplotlib.pyplot as plt
import plotly.graph_objects as go


    


In [2]:
web_driver = webdriver.Chrome()

# Function to collect raw data from url:

def get_page(city, type, beds, page):
  
    url    = f'https://www.torontorentals.com/{city}/{type}?beds={beds}%20&p={page}'
    result = requests.get(url)
    # https://www.torontorentals.com/toronto/condos?beds=1%20&p=2
    # check HTTP response status codes to find if HTTP request has been successfully completed
    if result.status_code >= 100  and result.status_code <= 199:
        print('Informational response')
    if result.status_code >= 200  and result.status_code <= 299:
        print('Successful response')
        web_driver.get(url)
        time.sleep(2)
        web_driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)
        soup = BeautifulSoup(web_driver.page_source,'lxml')
    if result.status_code >= 300  and result.status_code <= 399:
        print('Redirect')
    if result.status_code >= 400  and result.status_code <= 499:
        print('Client error')
    if result.status_code >= 500  and result.status_code <= 599:
        print('Server error')
        
    return soup

#-----------------------------------------------------------------------------------------------------------------------------------

# Data that will be used in the function
house_type = ["Apartment","condo","room","house","studio","basement"]
bed_options = ["0","1","2","3","4","1-2","1-3"]

# Lists that will contain the clean data
listData = []
listingStreet = []
listingCity = []
listingZip = []
listingRent = []
listingBed = []
listingBath = []
listingDim = []
listingType = []
ListingID = []

# Code that implements the above function and the above lists to collect raw data          
  
for page_num in tqdm(range(1,100)):  # Range depends on how many pages you want to analyze
    soup_page                = get_page('toronto', house_type, bed_options, page_num)
            
  
  #Data Collection
     
    #This contains info on all datapoints needed, but will use other links instead to avoid mistakes during the clean up process
    data = soup_page.find_all("div",{"class":"r-listing-card-v"})
    listData.append(data)  
    
    
    # Street, Rent & House type had unique identifiers in the HTML 
    street                   = soup_page.find_all("div",{"class":"r-listing-address q-mb-md q-pl-md"})
    rent                     = soup_page.find_all("a",{"class":"r-listing-price q-my-md q-mr-md q-pl-md"})
    house_type               = soup_page.find_all("span",{"class":"r-listing-type"})

    # Bed, Bath and Dimensions had the same identifier from the HTML
    data_bed_bath_dimensions = soup_page.find_all("span",{"class":"r-listing-infos__label"})
   
    
    
    # Data Cleanup # Appending to Lists
    
    # Street & House type had unique identifiers in the HTML 

    # Address
    str_street = [str(item) for item in street]
    cleaned_street = [sub.replace('<div class="r-listing-address q-mb-md q-pl-md">',"")
                        .replace('</div>',"") for sub in str_street]
    for i in cleaned_street:
        listingStreet.append(i)
        
    # House Type    
    str_house_type = [str(item) for item in house_type]
    cleaned_house_type = [sub.replace('<span class="r-listing-type">',"</span>,")
                            .replace('</span>',"").replace(",","") for sub in str_house_type]
    for i in cleaned_house_type:
        listingType.append(i)    
        
        
    # Price    
    str_rent = [str(item) for item in rent]
    rent_1 = [sub.replace('<a class="r-listing-price q-my-md q-mr-md q-pl-md"','').replace('href="/toronto/','')
                        .replace('</a>',"").replace(",","") for sub in str_rent]
    rent_2 = [item.split(">") for item in rent_1]
    cleaned_rent = [' - '.join(item.split(' - ')[:2]) for _, item in rent_2]
    for i in cleaned_rent:
        listingRent.append(i)
        
    # Bed, Bath & Dimensions    
    str_data_bed_bath_dimensions    = [str(item) for item in data_bed_bath_dimensions]
    for html in str_data_bed_bath_dimensions:
        soup = BeautifulSoup(html, 'html.parser')
        text = soup.text.strip()
    
        if 'bed' in text:
            bed_text = text.split()[0]
            listingBed.append(bed_text)
        elif 'bath' in text:
            bath_text = text.split()[0]
            listingBath.append(bath_text)
        elif 'Ft' in text:
            sq_ft_text = text.split()[0]
            listingDim.append(sq_ft_text)
            
    # For IDs:
    
    # Find all <a> elements with the class "r-listing-price"
    anchor_elements = soup_page.find_all('a', class_='r-listing-price')

    # Iterate through the <a> elements and extract the IDs
    for anchor_element in anchor_elements:
        href = anchor_element['href']

        # Extract the ID from the href attribute using regular expressions
        id_match = re.search(r'id(\d+)', href)
        if id_match:
            listing_id = id_match.group(1)
            ListingID.append(listing_id)
    

  0%|          | 0/99 [00:00<?, ?it/s]

Successful response


  1%|          | 1/99 [00:09<14:49,  9.07s/it]

Successful response


  2%|▏         | 2/99 [00:17<13:51,  8.57s/it]

Successful response


  3%|▎         | 3/99 [00:25<13:27,  8.41s/it]

Successful response


  4%|▍         | 4/99 [00:33<13:13,  8.35s/it]

Successful response


  5%|▌         | 5/99 [00:41<12:57,  8.27s/it]

Successful response


  6%|▌         | 6/99 [00:50<12:45,  8.23s/it]

Successful response


  7%|▋         | 7/99 [00:57<12:28,  8.13s/it]

Successful response


  8%|▊         | 8/99 [01:05<12:15,  8.08s/it]

Successful response


  9%|▉         | 9/99 [01:13<12:00,  8.00s/it]

Successful response


 10%|█         | 10/99 [01:21<11:48,  7.96s/it]

Successful response


 11%|█         | 11/99 [01:29<11:37,  7.92s/it]

Successful response


 12%|█▏        | 12/99 [01:37<11:27,  7.90s/it]

Successful response


 13%|█▎        | 13/99 [01:45<11:19,  7.90s/it]

Successful response


 14%|█▍        | 14/99 [01:53<11:09,  7.88s/it]

Successful response


 15%|█▌        | 15/99 [02:00<11:02,  7.88s/it]

Successful response


 16%|█▌        | 16/99 [02:08<10:55,  7.90s/it]

Successful response


 17%|█▋        | 17/99 [02:17<10:56,  8.00s/it]

Successful response


 18%|█▊        | 18/99 [02:25<10:46,  7.98s/it]

Successful response


 19%|█▉        | 19/99 [02:33<10:37,  7.97s/it]

Successful response


 20%|██        | 20/99 [02:41<10:32,  8.01s/it]

Successful response


 21%|██        | 21/99 [02:49<10:22,  7.98s/it]

Successful response


 22%|██▏       | 22/99 [02:57<10:14,  7.98s/it]

Successful response


 23%|██▎       | 23/99 [03:04<10:02,  7.92s/it]

Successful response


 24%|██▍       | 24/99 [03:12<09:52,  7.90s/it]

Successful response


 25%|██▌       | 25/99 [03:21<09:55,  8.05s/it]

Successful response


 26%|██▋       | 26/99 [03:28<09:45,  8.02s/it]

Successful response


 27%|██▋       | 27/99 [03:36<09:36,  8.00s/it]

Successful response


 28%|██▊       | 28/99 [03:44<09:26,  7.98s/it]

Successful response


 29%|██▉       | 29/99 [03:52<09:15,  7.94s/it]

Successful response


 30%|███       | 30/99 [04:00<09:05,  7.91s/it]

Successful response


 31%|███▏      | 31/99 [04:08<08:57,  7.90s/it]

Successful response


 32%|███▏      | 32/99 [04:16<08:48,  7.89s/it]

Successful response


 33%|███▎      | 33/99 [04:24<08:40,  7.89s/it]

Successful response


 34%|███▍      | 34/99 [04:32<08:33,  7.90s/it]

Successful response


 35%|███▌      | 35/99 [04:39<08:24,  7.89s/it]

Successful response


 36%|███▋      | 36/99 [04:47<08:16,  7.88s/it]

Successful response


 37%|███▋      | 37/99 [04:55<08:08,  7.88s/it]

Successful response


 38%|███▊      | 38/99 [05:03<08:01,  7.89s/it]

Successful response


 39%|███▉      | 39/99 [05:11<07:52,  7.87s/it]

Successful response


 40%|████      | 40/99 [05:19<07:44,  7.87s/it]

Successful response


 41%|████▏     | 41/99 [05:27<07:35,  7.86s/it]

Successful response


 42%|████▏     | 42/99 [05:35<07:28,  7.86s/it]

Successful response


 43%|████▎     | 43/99 [05:42<07:19,  7.86s/it]

Successful response


 44%|████▍     | 44/99 [05:50<07:12,  7.87s/it]

Successful response


 45%|████▌     | 45/99 [05:58<07:05,  7.89s/it]

Successful response


 46%|████▋     | 46/99 [06:06<06:56,  7.86s/it]

Successful response


 47%|████▋     | 47/99 [06:14<06:48,  7.85s/it]

Successful response


 48%|████▊     | 48/99 [06:22<06:39,  7.83s/it]

Successful response


 49%|████▉     | 49/99 [06:30<06:32,  7.85s/it]

Successful response


 51%|█████     | 50/99 [06:37<06:24,  7.84s/it]

Successful response


 52%|█████▏    | 51/99 [06:45<06:19,  7.91s/it]

Successful response


 53%|█████▎    | 52/99 [06:53<06:10,  7.87s/it]

Successful response


 54%|█████▎    | 53/99 [07:01<06:01,  7.87s/it]

Successful response


 55%|█████▍    | 54/99 [07:09<05:53,  7.85s/it]

Successful response


 56%|█████▌    | 55/99 [07:17<05:45,  7.85s/it]

Successful response


 57%|█████▋    | 56/99 [07:25<05:36,  7.83s/it]

Successful response


 58%|█████▊    | 57/99 [07:32<05:28,  7.83s/it]

Successful response


 59%|█████▊    | 58/99 [07:40<05:21,  7.84s/it]

Successful response


 60%|█████▉    | 59/99 [07:48<05:13,  7.83s/it]

Successful response


 61%|██████    | 60/99 [07:56<05:06,  7.85s/it]

Successful response


 62%|██████▏   | 61/99 [08:04<04:58,  7.84s/it]

Successful response


 63%|██████▎   | 62/99 [08:12<04:50,  7.84s/it]

Successful response


 64%|██████▎   | 63/99 [08:19<04:43,  7.86s/it]

Successful response


 65%|██████▍   | 64/99 [08:27<04:36,  7.89s/it]

Successful response


 66%|██████▌   | 65/99 [08:35<04:27,  7.87s/it]

Successful response


 67%|██████▋   | 66/99 [08:43<04:19,  7.85s/it]

Successful response


 68%|██████▊   | 67/99 [08:51<04:11,  7.87s/it]

Successful response


 69%|██████▊   | 68/99 [08:59<04:04,  7.88s/it]

Successful response


 70%|██████▉   | 69/99 [09:07<03:56,  7.88s/it]

Successful response


 71%|███████   | 70/99 [09:15<03:48,  7.88s/it]

Successful response


 72%|███████▏  | 71/99 [09:23<03:40,  7.89s/it]

Successful response


 73%|███████▎  | 72/99 [09:30<03:33,  7.90s/it]

Successful response


 74%|███████▎  | 73/99 [09:38<03:25,  7.90s/it]

Successful response


 75%|███████▍  | 74/99 [09:46<03:19,  7.96s/it]

Successful response


 76%|███████▌  | 75/99 [09:54<03:09,  7.91s/it]

Successful response


 77%|███████▋  | 76/99 [10:02<03:01,  7.91s/it]

Successful response


 78%|███████▊  | 77/99 [10:10<02:53,  7.90s/it]

Successful response


 79%|███████▉  | 78/99 [10:18<02:46,  7.91s/it]

Successful response


 80%|███████▉  | 79/99 [10:26<02:38,  7.91s/it]

Successful response


 81%|████████  | 80/99 [10:34<02:30,  7.93s/it]

Successful response


 82%|████████▏ | 81/99 [10:42<02:23,  7.99s/it]

Successful response


 83%|████████▎ | 82/99 [10:50<02:16,  8.01s/it]

Successful response


 84%|████████▍ | 83/99 [10:58<02:09,  8.07s/it]

Successful response


 85%|████████▍ | 84/99 [11:06<02:01,  8.11s/it]

Successful response


 86%|████████▌ | 85/99 [11:15<01:53,  8.12s/it]

Successful response


 87%|████████▋ | 86/99 [11:23<01:45,  8.14s/it]

Successful response


 88%|████████▊ | 87/99 [11:31<01:37,  8.15s/it]

Successful response


 89%|████████▉ | 88/99 [11:39<01:29,  8.11s/it]

Successful response


 90%|████████▉ | 89/99 [11:47<01:20,  8.10s/it]

Successful response


 91%|█████████ | 90/99 [11:55<01:12,  8.09s/it]

Successful response


 92%|█████████▏| 91/99 [12:03<01:04,  8.10s/it]

Successful response


 93%|█████████▎| 92/99 [12:11<00:56,  8.09s/it]

Successful response


 94%|█████████▍| 93/99 [12:19<00:48,  8.10s/it]

Successful response


 95%|█████████▍| 94/99 [12:28<00:40,  8.14s/it]

Successful response


 96%|█████████▌| 95/99 [12:36<00:32,  8.11s/it]

Successful response


 97%|█████████▋| 96/99 [12:44<00:24,  8.12s/it]

Successful response


 98%|█████████▊| 97/99 [12:52<00:16,  8.24s/it]

Successful response


 99%|█████████▉| 98/99 [13:01<00:08,  8.24s/it]

Successful response


100%|██████████| 99/99 [13:09<00:00,  7.97s/it]


In [3]:
# To obtain the zipcodes for addresses

# Initialize the OpenCageGeocode API key
api_key = "4d9d18c5a56040578558ee2d57caaf6f"  #personal API key

# Initialize the geocoder
geocoder = OpenCageGeocode(api_key)


# Function to extract city and postal code from address
def extract_city_and_zip(address):
    parts = address.split('-')
    if len(parts) > 1:
        city_part = parts[1].strip()  # Get the part after the hyphen and remove leading/trailing spaces
        city = city_part.split(',')[0].strip()  # Extract the city name before the comma and remove spaces
        return city
    return None

# Search for the postal code for each address in the list and add to the list of dictionaries
for address in listingStreet:
    result = geocoder.geocode(address, countrycode="CA")
    if result and 'components' in result[0]:
        components = result[0]['components']
        postal_code = components.get('postcode', 'Postal code not found')
        listingZip.append(postal_code)
    else:
        listingZip.append('Postal code not found')

# Function to extract city and postal code from address
def extract_city_and_zip(address):
    parts = address.split('-')
    if len(parts) > 1:
        right_part = parts[1].strip()  # Get the part after the hyphen and remove leading/trailing spaces
        city_and_province = right_part.split(',')  # Split by comma to separate city and province
        if len(city_and_province) > 1:
            city = city_and_province[0].strip()  # Extract the city name and remove spaces
            return city
    return None

# Apply the function to each element in the ListingStreet list
for address in listingStreet:
    city = extract_city_and_zip(address)
    if city:
        listingCity.append(city)
    else:
        listingCity.append('City not found')

In [4]:
print(len(listingCity))
print(len(listingType))
print(len(listingBed))
print(len(listingBath))
print(len(listingDim))
print(len(listingStreet))
print(len(listingZip))
print(len(listingRent))


850
850
850
841
702
850
850
850


## Creating a Dataframe and Consolidating Cleaned Data

In [5]:
import pandas as pd
import numpy as np  # Import numpy for NaN values

# Find the maximum length among all lists
max_length = max(len(listingCity), len(listingType), len(listingBed), len(listingBath),
                 len(listingDim), len(listingStreet), len(listingZip), len(listingRent))

# Pad the shorter lists with None or NaN to match the maximum length
def pad_list(lst, length, pad_value=None):
    if len(lst) < length:
        return lst + [pad_value] * (length - len(lst))
    else:
        return lst

listingCity = pad_list(listingCity, max_length)
listingType = pad_list(listingType, max_length)
listingBed = pad_list(listingBed, max_length)
listingBath = pad_list(listingBath, max_length)
listingDim = pad_list(listingDim, max_length)
listingStreet = pad_list(listingStreet, max_length)
listingZip = pad_list(listingZip, max_length)
listingRent = pad_list(listingRent, max_length)

# Create the DataFrame
column_names = ["City", "Property Type", "Bedrooms", "Bathrooms", "Square Footage", "Address",
                "Zip code", "Price"]
data = {
    "City": listingCity,
    "Property Type": listingType,
    "Bedrooms": listingBed,
    "Bathrooms": listingBath,
    "Square Footage": listingDim,
    "Address": listingStreet,
    "Zip code": listingZip,
    "Price": listingRent
}

df = pd.DataFrame(data)

# Set the index (if needed)
df.set_index('City', inplace=True)

# Display the DataFrame
print(df)

        Property Type Bedrooms Bathrooms Square Footage                                   Address               Zip code          Price
City                                                                                                                                   
Toronto     apartment      0-3         3           1463     450 Front Street West  - Toronto , ON                M5V 2P1  $2400 - $7872
Toronto     apartment    1-2.5         2            963        118 Balliol Street  - Toronto , ON                M4S 3C4  $2540 - $4005
Toronto     apartment      1-3       2.5           1119           131 Mill Street  - Toronto , ON                M5A 3C4  $2546 - $4437
Toronto     apartment      0-3         2            886        200 Redpath Avenue  - Toronto , ON                M4P 1G4  $2570 - $4980
Toronto     apartment      1-2         2           1050                57 Spadina  - Toronto , ON                M5R 2X3  $3217 - $4536
...               ...      ...       ...        

In [6]:
num_rows = df.shape[0]
print(num_rows)

850


In [7]:
# Save to a csv file to compare with Ottawa
folder = r"D:\Projects\Rentals/"
df.to_excel("rental_data_toronto_1.xlsx")

In [8]:
"""
References:
1. Web Scraping Rentals Website Using Python Beautiful Soup: https://medium.com/swlh/web-scraping-rentals-website-using-beautiful-soup-and-pandas-99e255f27052
2. Chat GPT (For fine-tuning)
3. StackOverflow
4. Google
"""

'\nReferences:\n1. Web Scraping Rentals Website Using Python Beautiful Soup: https://medium.com/swlh/web-scraping-rentals-website-using-beautiful-soup-and-pandas-99e255f27052\n2. Chat GPT (For fine-tuning)\n3. StackOverflow\n4. Google\n'