## Data Collection

In [1]:
# Import Python Libraries

# For HTML parsing
from bs4 import BeautifulSoup 
from selenium import webdriver

# For website connections
import requests 

# For data cleanup
import re

# For zipcode search
#!pip install opencage
from opencage.geocoder import OpenCageGeocode


# To prevent overwhelming the server between connections
import time
from time import sleep 

# Display the progress bar
from tqdm import tqdm

# For data wrangling
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# For creating plots
import matplotlib.pyplot as plt
import plotly.graph_objects as go


    


In [2]:
web_driver = webdriver.Chrome()

# Function to collect raw data from url:

def get_page(city, type, beds, page):
  
    url    = f'https://www.torontorentals.com/{city}/{type}?beds={beds}%20&p={page}'
    result = requests.get(url)
    # https://www.torontorentals.com/toronto/condos?beds=1%20&p=2
    # check HTTP response status codes to find if HTTP request has been successfully completed
    if result.status_code >= 100  and result.status_code <= 199:
        print('Informational response')
    if result.status_code >= 200  and result.status_code <= 299:
        print('Successful response')
        web_driver.get(url)
        time.sleep(2)
        web_driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)
        soup = BeautifulSoup(web_driver.page_source,'lxml')
    if result.status_code >= 300  and result.status_code <= 399:
        print('Redirect')
    if result.status_code >= 400  and result.status_code <= 499:
        print('Client error')
    if result.status_code >= 500  and result.status_code <= 599:
        print('Server error')
        
    return soup

#-----------------------------------------------------------------------------------------------------------------------------------

# Data that will be used in the function
house_type = ["Apartment","condo","room","house","studio","basement"]
bed_options = ["0","1","2","3","4","1-2","1-3"]

# Lists that will contain the clean data
listData = []
listingStreet = []
listingCity = []
listingZip = []
listingRent = []
listingBed = []
listingBath = []
listingDim = []
listingType = []
ListingLink = []
ListingID = []

# Code that implements the above function and the above lists to collect raw data          
  
for page_num in tqdm(range(1,100)):  # Range depends on how many pages you want to analyze
    soup_page                = get_page('ottawa', house_type, bed_options, page_num)
            
  
  #Data Collection
     
    #This contains info on all datapoints needed, but will use other links instead to avoid mistakes during the clean up process
    data = soup_page.find_all("div",{"class":"r-listing-card-v"})
    listData.append(data)  
    
    
    # Street, Rent & House type had unique identifiers in the HTML 
    street                   = soup_page.find_all("div",{"class":"r-listing-address q-mb-md q-pl-md"})
    rent                     = soup_page.find_all("a",{"class":"r-listing-price q-my-md q-mr-md q-pl-md"})
    house_type               = soup_page.find_all("span",{"class":"r-listing-type"})

    # Bed, Bath and Dimensions had the same identifier from the HTML
    data_bed_bath_dimensions = soup_page.find_all("span",{"class":"r-listing-infos__label"})
   

    # Data Cleanup # Appending to Lists
    
    # Street & House type had unique identifiers in the HTML 

    # Address
    str_street              = [str(item) for item in street]
    cleaned_street          = [sub.replace('<div class="r-listing-address q-mb-md q-pl-md">',"")
                               .replace('</div>',"") for sub in str_street]
    for i in cleaned_street:
        listingStreet.append(i)
        
    # House Type    
    str_house_type          = [str(item) for item in house_type]
    cleaned_house_type      = [sub.replace('<span class="r-listing-type">',"</span>,")
                               .replace('</span>',"").replace(",","") for sub in str_house_type]
    for i in cleaned_house_type:
        listingType.append(i)    
        
        
    # Price    
    str_rent                = [str(item) for item in rent]
    rent_1                  = [sub.replace('<a class="r-listing-price q-my-md q-mr-md q-pl-md"','').replace('href="/toronto/','')
                                .replace('</a>',"").replace(",","") for sub in str_rent]
    rent_2                  = [item.split(">") for item in rent_1]
    cleaned_rent            = [' - '.join(item.split(' - ')[:2]) for _, item in rent_2]
    for i in cleaned_rent:
        listingRent.append(i)
        
    # Bed, Bath & Dimensions    
    str_data_bed_bath_dimensions    = [str(item) for item in data_bed_bath_dimensions]
    for html in str_data_bed_bath_dimensions:
        soup = BeautifulSoup(html, 'html.parser')
        text = soup.text.strip()
    
        if 'bed' in text:
            bed_text = text.split()[0]
            listingBed.append(bed_text)
        elif 'bath' in text:
            bath_text = text.split()[0]
            listingBath.append(bath_text)
        elif 'Ft' in text:
            sq_ft_text = text.split()[0]
            listingDim.append(sq_ft_text)
    
    
    # For IDs:
    
    # Find all <a> elements with the class "r-listing-price"
    anchor_elements = soup_page.find_all('a', class_='r-listing-price')

    # Iterate through the <a> elements and extract the IDs
    for anchor_element in anchor_elements:
        href = anchor_element['href']

        # Extract the ID from the href attribute using regular expressions
        id_match = re.search(r'id(\d+)', href)
        if id_match:
            listing_id = id_match.group(1)
            ListingID.append(listing_id)
    

  0%|          | 0/99 [00:00<?, ?it/s]

Successful response


  1%|          | 1/99 [00:09<14:48,  9.07s/it]

Successful response


  2%|▏         | 2/99 [00:17<13:49,  8.55s/it]

Successful response


  3%|▎         | 3/99 [00:25<13:37,  8.51s/it]

Successful response


  4%|▍         | 4/99 [00:33<13:19,  8.41s/it]

Successful response


  5%|▌         | 5/99 [00:42<13:02,  8.32s/it]

Successful response


  6%|▌         | 6/99 [00:50<12:49,  8.28s/it]

Successful response


  7%|▋         | 7/99 [00:58<12:35,  8.21s/it]

Successful response


  8%|▊         | 8/99 [01:06<12:21,  8.15s/it]

Successful response


  9%|▉         | 9/99 [01:14<12:15,  8.17s/it]

Successful response


 10%|█         | 10/99 [01:22<12:00,  8.10s/it]

Successful response


 11%|█         | 11/99 [01:30<11:44,  8.00s/it]

Successful response


 12%|█▏        | 12/99 [01:38<11:33,  7.97s/it]

Successful response


 13%|█▎        | 13/99 [01:46<11:23,  7.95s/it]

Successful response


 14%|█▍        | 14/99 [01:54<11:13,  7.93s/it]

Successful response


 15%|█▌        | 15/99 [02:01<11:06,  7.93s/it]

Successful response


 16%|█▌        | 16/99 [02:10<11:01,  7.97s/it]

Successful response


 17%|█▋        | 17/99 [02:18<10:53,  7.97s/it]

Successful response


 18%|█▊        | 18/99 [02:26<10:49,  8.02s/it]

Successful response


 19%|█▉        | 19/99 [02:34<10:41,  8.02s/it]

Successful response


 20%|██        | 20/99 [02:42<10:34,  8.03s/it]

Successful response


 21%|██        | 21/99 [02:50<10:25,  8.02s/it]

Successful response


 22%|██▏       | 22/99 [02:58<10:19,  8.04s/it]

Successful response


 23%|██▎       | 23/99 [03:06<10:08,  8.00s/it]

Successful response


 24%|██▍       | 24/99 [03:14<09:58,  7.99s/it]

Successful response


 25%|██▌       | 25/99 [03:22<10:05,  8.18s/it]

Successful response


 26%|██▋       | 26/99 [03:30<09:54,  8.14s/it]

Successful response


 27%|██▋       | 27/99 [03:38<09:42,  8.10s/it]

Successful response


 28%|██▊       | 28/99 [03:46<09:29,  8.02s/it]

Successful response


 29%|██▉       | 29/99 [03:54<09:18,  7.98s/it]

Successful response


 30%|███       | 30/99 [04:02<09:11,  7.99s/it]

Successful response


 31%|███▏      | 31/99 [04:10<09:01,  7.96s/it]

Successful response


 32%|███▏      | 32/99 [04:18<08:50,  7.92s/it]

Successful response


 33%|███▎      | 33/99 [04:27<08:59,  8.17s/it]

Successful response


 34%|███▍      | 34/99 [04:35<08:49,  8.15s/it]

Successful response


 35%|███▌      | 35/99 [04:43<08:40,  8.13s/it]

Successful response


 36%|███▋      | 36/99 [04:51<08:29,  8.08s/it]

Successful response


 37%|███▋      | 37/99 [04:59<08:17,  8.03s/it]

Successful response


 38%|███▊      | 38/99 [05:07<08:09,  8.02s/it]

Successful response


 39%|███▉      | 39/99 [05:15<08:07,  8.13s/it]

Successful response


 40%|████      | 40/99 [05:23<07:55,  8.05s/it]

Successful response


 41%|████▏     | 41/99 [05:31<07:45,  8.03s/it]

Successful response


 42%|████▏     | 42/99 [05:39<07:34,  7.97s/it]

Successful response


 43%|████▎     | 43/99 [05:46<07:23,  7.92s/it]

Successful response


 44%|████▍     | 44/99 [05:55<07:17,  7.96s/it]

Successful response


 45%|████▌     | 45/99 [06:02<07:07,  7.92s/it]

Successful response


 46%|████▋     | 46/99 [06:10<06:58,  7.91s/it]

Successful response


 47%|████▋     | 47/99 [06:18<06:52,  7.93s/it]

Successful response


 48%|████▊     | 48/99 [06:26<06:45,  7.96s/it]

Successful response


 49%|████▉     | 49/99 [06:34<06:39,  7.98s/it]

Successful response


 51%|█████     | 50/99 [06:42<06:31,  7.99s/it]

Successful response


 52%|█████▏    | 51/99 [06:50<06:21,  7.94s/it]

Successful response


 53%|█████▎    | 52/99 [06:58<06:11,  7.91s/it]

Successful response


 54%|█████▎    | 53/99 [07:06<06:07,  7.98s/it]

Successful response


 55%|█████▍    | 54/99 [07:14<05:57,  7.94s/it]

Successful response


 56%|█████▌    | 55/99 [07:22<05:48,  7.92s/it]

Successful response


 57%|█████▋    | 56/99 [07:30<05:42,  7.95s/it]

Successful response


 58%|█████▊    | 57/99 [07:38<05:32,  7.92s/it]

Successful response


 59%|█████▊    | 58/99 [07:46<05:26,  7.96s/it]

Successful response


 60%|█████▉    | 59/99 [07:54<05:19,  7.98s/it]

Successful response


 61%|██████    | 60/99 [08:02<05:09,  7.95s/it]

Successful response


 62%|██████▏   | 61/99 [08:10<05:01,  7.92s/it]

Successful response


 63%|██████▎   | 62/99 [08:17<04:52,  7.91s/it]

Successful response


 64%|██████▎   | 63/99 [08:25<04:46,  7.95s/it]

Successful response


 65%|██████▍   | 64/99 [08:33<04:37,  7.93s/it]

Successful response


 66%|██████▌   | 65/99 [08:41<04:29,  7.91s/it]

Successful response


 67%|██████▋   | 66/99 [08:49<04:22,  7.96s/it]

Successful response


 68%|██████▊   | 67/99 [08:57<04:14,  7.94s/it]

Successful response


 69%|██████▊   | 68/99 [09:05<04:05,  7.92s/it]

Successful response


 70%|██████▉   | 69/99 [09:13<03:57,  7.90s/it]

Successful response


 71%|███████   | 70/99 [09:21<03:48,  7.90s/it]

Successful response


 72%|███████▏  | 71/99 [09:29<03:44,  8.03s/it]

Successful response


 73%|███████▎  | 72/99 [09:37<03:36,  8.03s/it]

Successful response


 74%|███████▎  | 73/99 [09:46<03:31,  8.15s/it]

Successful response


 75%|███████▍  | 74/99 [09:53<03:21,  8.05s/it]

Successful response


 76%|███████▌  | 75/99 [10:01<03:11,  8.00s/it]

Successful response


 77%|███████▋  | 76/99 [10:09<03:02,  7.95s/it]

Successful response


 78%|███████▊  | 77/99 [10:17<02:54,  7.94s/it]

Successful response


 79%|███████▉  | 78/99 [10:25<02:47,  7.96s/it]

Successful response


 80%|███████▉  | 79/99 [10:33<02:39,  7.98s/it]

Successful response


 81%|████████  | 80/99 [10:41<02:31,  8.00s/it]

Successful response


 82%|████████▏ | 81/99 [10:49<02:24,  8.03s/it]

Successful response


 83%|████████▎ | 82/99 [10:58<02:19,  8.19s/it]

Successful response


 84%|████████▍ | 83/99 [11:06<02:10,  8.14s/it]

Successful response


 85%|████████▍ | 84/99 [11:14<02:01,  8.09s/it]

Successful response


 86%|████████▌ | 85/99 [11:22<01:52,  8.05s/it]

Successful response


 87%|████████▋ | 86/99 [11:30<01:44,  8.08s/it]

Successful response


 88%|████████▊ | 87/99 [11:38<01:36,  8.04s/it]

Successful response


 89%|████████▉ | 88/99 [11:46<01:28,  8.02s/it]

Successful response


 90%|████████▉ | 89/99 [11:54<01:20,  8.01s/it]

Successful response


 91%|█████████ | 90/99 [12:02<01:12,  8.03s/it]

Successful response


 92%|█████████▏| 91/99 [12:10<01:04,  8.06s/it]

Successful response


 93%|█████████▎| 92/99 [12:18<00:56,  8.07s/it]

Successful response


 94%|█████████▍| 93/99 [12:26<00:48,  8.07s/it]

Successful response


 95%|█████████▍| 94/99 [12:34<00:40,  8.05s/it]

Successful response


 96%|█████████▌| 95/99 [12:42<00:32,  8.09s/it]

Successful response


 97%|█████████▋| 96/99 [12:51<00:24,  8.15s/it]

Successful response


 98%|█████████▊| 97/99 [12:59<00:16,  8.20s/it]

Successful response


 99%|█████████▉| 98/99 [13:07<00:08,  8.19s/it]

Successful response


100%|██████████| 99/99 [13:15<00:00,  8.04s/it]


In [3]:
# To obtain the zipcodes for addresses

# Initialize the OpenCageGeocode API key
api_key = "4d9d18c5a56040578558ee2d57caaf6f"  #personal API key

# Initialize the geocoder
geocoder = OpenCageGeocode(api_key)


# Function to extract city and postal code from address
def extract_city_and_zip(address):
    parts = address.split('-')
    if len(parts) > 1:
        city_part = parts[1].strip()  # Get the part after the hyphen and remove leading/trailing spaces
        city = city_part.split(',')[0].strip()  # Extract the city name before the comma and remove spaces
        return city
    return None

# Search for the postal code for each address in the list and add to the list of dictionaries
for address in listingStreet:
    result = geocoder.geocode(address, countrycode="CA")
    if result and 'components' in result[0]:
        components = result[0]['components']
        postal_code = components.get('postcode', 'Postal code not found')
        listingZip.append(postal_code)
    else:
        listingZip.append('Postal code not found')

# Function to extract city and postal code from address
def extract_city_and_zip(address):
    parts = address.split('-')
    if len(parts) > 1:
        right_part = parts[1].strip()  # Get the part after the hyphen and remove leading/trailing spaces
        city_and_province = right_part.split(',')  # Split by comma to separate city and province
        if len(city_and_province) > 1:
            city = city_and_province[0].strip()  # Extract the city name and remove spaces
            return city
    return None

# Apply the function to each element in the ListingStreet list
for address in listingStreet:
    city = extract_city_and_zip(address)
    if city:
        listingCity.append(city)
    else:
        listingCity.append('City not found')

In [4]:
print(len(listingCity))
print(len(listingType))
print(len(listingBed))
print(len(listingBath))
print(len(listingDim))
print(len(listingStreet))
print(len(listingZip))
print(len(listingRent))
print(len(ListingID))

762
762
762
751
439
762
762
762
762


## Creating a Dataframe and Consolidating Cleaned Data

In [5]:
import pandas as pd
import numpy as np  # Import numpy for NaN values

# Find the maximum length among all lists
max_length = max(len(listingCity), len(listingType), len(listingBed), len(listingBath),
                 len(listingDim), len(listingStreet), len(listingZip), len(listingRent))

# Pad the shorter lists with None or NaN to match the maximum length
def pad_list(lst, length, pad_value=None):
    if len(lst) < length:
        return lst + [pad_value] * (length - len(lst))
    else:
        return lst

listingCity = pad_list(listingCity, max_length)
listingType = pad_list(listingType, max_length)
listingBed = pad_list(listingBed, max_length)
listingBath = pad_list(listingBath, max_length)
listingDim = pad_list(listingDim, max_length)
listingStreet = pad_list(listingStreet, max_length)
listingZip = pad_list(listingZip, max_length)
listingRent = pad_list(listingRent, max_length)

# Create the DataFrame
column_names = ["City", "Property Type", "Bedrooms", "Bathrooms", "Square Footage", "Address",
                "Zip code", "Price"]
data = {
    "City": listingCity,
    "Property Type": listingType,
    "Bedrooms": listingBed,
    "Bathrooms": listingBath,
    "Square Footage": listingDim,
    "Address": listingStreet,
    "Zip code": listingZip,
    "Price": listingRent
}

df = pd.DataFrame(data)

# Set the index (if needed)
df.set_index('City', inplace=True)

# Display the DataFrame
print(df)

               Property Type Bedrooms Bathrooms Square Footage                                    Address Zip code          Price
City                                                                                                                             
Ottawa             apartment      1-2         2            856                90 Champagne  - Ottawa , ON  K1S 4P1  $2100 - $2900
City not found     apartment      0-1         1            756       253 - 257 York Street  - Ottawa , ON  K1N 5T9  $1675 - $1850
City not found     apartment      0-1         1            470                86-92 Hinton  - Ottawa , ON  K1Y 2Z7  $1700 - $2200
Ottawa             apartment      0-2         1            167                  256 Rideau  - Ottawa , ON  K1N 0A9  $1285 - $2970
City not found     apartment        1         1           1010      150-152 Osgoode Street  - Ottawa , ON  K1N 8A4          $1400
...                      ...      ...       ...            ...                            

In [6]:
num_rows = df.shape[0]
print(num_rows)

762


In [7]:
folder = r"D:\Projects\Rentals/"
df.to_excel("rental_data_ottawa_1.xlsx")

In [8]:
"""
References:
1. Web Scraping Rentals Website Using Python Beautiful Soup: https://medium.com/swlh/web-scraping-rentals-website-using-beautiful-soup-and-pandas-99e255f27052
2. Chat GPT (For fine-tuning)
3. StackOverflow
4. Google
"""

'\nReferences:\n1. Web Scraping Rentals Website Using Python Beautiful Soup: https://medium.com/swlh/web-scraping-rentals-website-using-beautiful-soup-and-pandas-99e255f27052\n2. Chat GPT (For fine-tuning)\n3. StackOverflow\n4. Google\n'