## Data Collection

In [68]:
# Import Python Libraries

# For HTML parsing
from bs4 import BeautifulSoup 
from selenium import webdriver

# For website connections
import requests 

# For data cleanup
import re

# For zipcode search
#!pip install opencage
from opencage.geocoder import OpenCageGeocode


# To prevent overwhelming the server between connections
import time
from time import sleep 

# Display the progress bar
from tqdm import tqdm

# For data wrangling
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# For creating plots
import matplotlib.pyplot as plt
import plotly.graph_objects as go


    


In [69]:
web_driver = webdriver.Chrome()

# Function to collect raw data from url:

def get_page(city, type, beds, page):
  
    url    = f'https://www.torontorentals.com/{city}/{type}?beds={beds}%20&p={page}'
    result = requests.get(url)
    # https://www.torontorentals.com/toronto/condos?beds=1%20&p=2
    # check HTTP response status codes to find if HTTP request has been successfully completed
    if result.status_code >= 100  and result.status_code <= 199:
        print('Informational response')
    if result.status_code >= 200  and result.status_code <= 299:
        print('Successful response')
        web_driver.get(url)
        time.sleep(2)
        web_driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)
        soup = BeautifulSoup(web_driver.page_source,'lxml')
    if result.status_code >= 300  and result.status_code <= 399:
        print('Redirect')
    if result.status_code >= 400  and result.status_code <= 499:
        print('Client error')
    if result.status_code >= 500  and result.status_code <= 599:
        print('Server error')
        
    return soup

#-----------------------------------------------------------------------------------------------------------------------------------

# Data that will be used in the function
house_type = ["Apartment","condo","room","house","studio","basement"]
bed_options = ["0","1","2","3","4","1-2","1-3"]

# Lists that will contain the clean data
listData = []
listingStreet = []
listingCity = []
listingZip = []
listingRent = []
listingBed = []
listingBath = []
listingDim = []
listingType = []
ListingLink = []
ListingID = []

# Code that implements the above function and the above lists to collect raw data          
  
for page_num in tqdm(range(1,2)):  # Range depends on how many pages you want to analyze
    soup_page                = get_page('ottawa', house_type, bed_options, page_num)
            
  
  #Data Collection
     
    #This contains info on all datapoints needed, but will use other links instead to avoid mistakes during the clean up process
    data = soup_page.find_all("div",{"class":"r-listing-card-v"})
    listData.append(data)  
    
    
    # Street, Rent & House type had unique identifiers in the HTML 
    street                   = soup_page.find_all("div",{"class":"r-listing-address q-mb-md q-pl-md"})
    rent                     = soup_page.find_all("a",{"class":"r-listing-price q-my-md q-mr-md q-pl-md"})
    house_type               = soup_page.find_all("span",{"class":"r-listing-type"})

    # Bed, Bath and Dimensions had the same identifier from the HTML
    data_bed_bath_dimensions = soup_page.find_all("span",{"class":"r-listing-infos__label"})
   

    # Data Cleanup # Appending to Lists
    
    # Street & House type had unique identifiers in the HTML 

    # Address
    str_street              = [str(item) for item in street]
    cleaned_street          = [sub.replace('<div class="r-listing-address q-mb-md q-pl-md">',"")
                               .replace('</div>',"") for sub in str_street]
    for i in cleaned_street:
        listingStreet.append(i)
        
    # House Type    
    str_house_type          = [str(item) for item in house_type]
    cleaned_house_type      = [sub.replace('<span class="r-listing-type">',"</span>,")
                               .replace('</span>',"").replace(",","") for sub in str_house_type]
    for i in cleaned_house_type:
        listingType.append(i)    
        
        
    # Price    
    str_rent                = [str(item) for item in rent]
    rent_1                  = [sub.replace('<a class="r-listing-price q-my-md q-mr-md q-pl-md"','').replace('href="/toronto/','')
                                .replace('</a>',"").replace(",","") for sub in str_rent]
    rent_2                  = [item.split(">") for item in rent_1]
    cleaned_rent            = [' - '.join(item.split(' - ')[:2]) for _, item in rent_2]
    for i in cleaned_rent:
        listingRent.append(i)
        
    # Bed, Bath & Dimensions    
    str_data_bed_bath_dimensions    = [str(item) for item in data_bed_bath_dimensions]
    cleaned_dimensions_bath_bed     = [sub.replace('<span data-current-language="en-US"',"")
                                      .replace('<span class="r-listing-infos__label">',"")
                                      .replace('data-msgid=',"").replace('>bed</span></span>',"")
                                      .replace('>bath</span></span>',"").replace('>Ft</span></span>',"")
                                       for sub in str_data_bed_bath_dimensions]

    combined_info = []
    listingCombinedInfo = []
    for i in range(0, len(cleaned_dimensions_bath_bed), 3):
        bed                 = cleaned_dimensions_bath_bed[i].split()[0] if i < len(cleaned_dimensions_bath_bed) else "N/A"
        bath                = cleaned_dimensions_bath_bed[i + 1].split()[0] if i + 1 < len(cleaned_dimensions_bath_bed) else "N/A"
        sqft                = cleaned_dimensions_bath_bed[i + 2].split()[0] if i + 2 < len(cleaned_dimensions_bath_bed) else "N/A"

        combined_info.append(f"{bed} bed, {bath} bath, {sqft} ft")
        listingCombinedInfo.append(combined_info)

    # Now combined_info contains the desired combined strings with the specified order and "N/A" for missing values.

    for item in combined_info:
        parts               = item.split(', ')
        
        bed_part            = parts[0].split()[0]
        listingBed.append(bed_part)
        
        bath_part           = parts[1].split()[0]
        listingBath.append(bath_part)
        
        dim_part            = parts[2].split()[0]
        listingDim.append(dim_part)  
        
        
   
    # For IDs:
    
    # Find all <a> elements with the class "r-listing-price"
    anchor_elements = soup_page.find_all('a', class_='r-listing-price')

    # Iterate through the <a> elements and extract the IDs
    for anchor_element in anchor_elements:
        href = anchor_element['href']

        # Extract the ID from the href attribute using regular expressions
        id_match = re.search(r'id(\d+)', href)
        if id_match:
            listing_id = id_match.group(1)
            ListingID.append(listing_id)
    

  0%|          | 0/1 [00:00<?, ?it/s]

Successful response


100%|██████████| 1/1 [00:08<00:00,  8.56s/it]


In [23]:
# To obtain the zipcodes for addresses

# Initialize the OpenCageGeocode API key
api_key = "4d9d18c5a56040578558ee2d57caaf6f"  #personal API key

# Initialize the geocoder
geocoder = OpenCageGeocode(api_key)


# Function to extract city and postal code from address
def extract_city_and_zip(address):
    parts = address.split('-')
    if len(parts) > 1:
        city_part = parts[1].strip()  # Get the part after the hyphen and remove leading/trailing spaces
        city = city_part.split(',')[0].strip()  # Extract the city name before the comma and remove spaces
        return city
    return None

# Search for the postal code for each address in the list and add to the list of dictionaries
for address in listingStreet:
    result = geocoder.geocode(address, countrycode="CA")
    if result and 'components' in result[0]:
        components = result[0]['components']
        postal_code = components.get('postcode', 'Postal code not found')
        listingZip.append(postal_code)
    else:
        listingZip.append('Postal code not found')

# Function to extract city and postal code from address
def extract_city_and_zip(address):
    parts = address.split('-')
    if len(parts) > 1:
        right_part = parts[1].strip()  # Get the part after the hyphen and remove leading/trailing spaces
        city_and_province = right_part.split(',')  # Split by comma to separate city and province
        if len(city_and_province) > 1:
            city = city_and_province[0].strip()  # Extract the city name and remove spaces
            return city
    return None

# Apply the function to each element in the ListingStreet list
for address in listingStreet:
    city = extract_city_and_zip(address)
    if city:
        listingCity.append(city)
    else:
        listingCity.append('City not found')

In [73]:
print(len(listingCity))
print(len(listingType))
print(len(listingBed))
print(len(listingBath))
print(len(listingDim))
print(len(listingStreet))
print(len(listingZip))
print(len(listingRent))
print(len(listingCombinedInfo))
print(len(ListingID))

0
10
9
9
9
10
0
10
9
10
['708878', '408746', '518420', '580823', '561296', '392871', '612300', '344041', '255883', '582285']


### Cleaning Bed Bath and SQ Ft

In [88]:
# Create a dictionary with the data, using None for missing values
data = {
    "id": ListingID,
    "bedroom": listingBed + [None] * (len(ListingID) - len(listingBed)),
    "bathroom": listingBath + [None] * (len(ListingID) - len(listingBath)),
    "sq ft": listingDim + [None] * (len(ListingID) - len(listingDim))
}

# Create a DataFrame from the dictionary
df_incorrect = pd.DataFrame(data)

# Create dictionaries with the IDs and the data from bed, bath and sq ft

# Initialize empty dictionaries
bedroom_dict = {}
bathroom_dict = {}
sqft_dict = {}

# Iterate through the DataFrame rows and populate the dictionaries
for index, row in df_incorrect.iterrows():
    listing_id = row["id"]
    
    # Create dictionaries with ID and respective values
    bedroom_dict[listing_id] = row["bedroom"]
    bathroom_dict[listing_id] = row["bathroom"]
    sqft_dict[listing_id] = row["sq ft"]
    

# Clean-up
print(bedroom_dict)
print(bathroom_dict)
print(sqft_dict)

# Sample dictionaries
bedroom_dict = {'708878': '1-2', '408746': '0-1', '518420': '1', '580823': '1', '561296': '1', '392871': '2', '612300': '1-2', '344041': '1-2', '255883': '2', '582285': None}
bathroom_dict = {'708878': '2', '408746': '1', '518420': '756', '580823': '167', '561296': '470', '392871': '1-2.5', '612300': '2', '344041': '2', '255883': '954', '582285': None}
sqft_dict = {'708878': '856', '408746': '0-1', '518420': '1', '580823': '0-2', '561296': '1-2', '392871': '1', '612300': '1179', '344041': '1-2', '255883': 'N/A', '582285': None}

# Initialize a combined dictionary
combined_dict = {}

# Iterate through keys in the dictionaries and combine them
for key in bedroom_dict.keys():
    combined_dict[key] = {
        'bedroom': bedroom_dict.get(key),
        'bathroom': bathroom_dict.get(key),
        'sqft': sqft_dict.get(key)
    }

# Print the combined dictionary
print(combined_dict)

{'708878': '1-2', '408746': '0-1', '518420': '1', '580823': '1', '561296': '1', '392871': '2', '612300': '1-2', '344041': '1-2', '255883': '2', '582285': None}
{'708878': '2', '408746': '1', '518420': '756', '580823': '167', '561296': '470', '392871': '1-2.5', '612300': '2', '344041': '2', '255883': '954', '582285': None}
{'708878': '856', '408746': '0-1', '518420': '1', '580823': '0-2', '561296': '1-2', '392871': '1', '612300': '1179', '344041': '1-2', '255883': 'N/A', '582285': None}
{'708878': {'bedroom': '1-2', 'bathroom': '2', 'sqft': '856'}, '408746': {'bedroom': '0-1', 'bathroom': '1', 'sqft': '0-1'}, '518420': {'bedroom': '1', 'bathroom': '756', 'sqft': '1'}, '580823': {'bedroom': '1', 'bathroom': '167', 'sqft': '0-2'}, '561296': {'bedroom': '1', 'bathroom': '470', 'sqft': '1-2'}, '392871': {'bedroom': '2', 'bathroom': '1-2.5', 'sqft': '1'}, '612300': {'bedroom': '1-2', 'bathroom': '2', 'sqft': '1179'}, '344041': {'bedroom': '1-2', 'bathroom': '2', 'sqft': '1-2'}, '255883': {'b

In [92]:
# Initialize a combined dictionary
combined_dict = {}

# Helper function to move values between keys
def move_value(source_dict, dest_dict, key, field_name):
    if key in source_dict and source_dict[key] is not None:
        dest_dict[key][field_name] = source_dict[key]
        source_dict[key] = None

# Process each key and correct errors
for key in bedroom_dict.keys():
    combined_dict[key] = {'bedroom': None, 'bathroom': None, 'sqft': None}

    # Correct errors based on conditions
    if bedroom_dict[key] and '-' in bedroom_dict[key]:
        combined_dict[key]['bedroom'] = bedroom_dict[key]
        bedroom_dict[key] = None
    if bathroom_dict[key] and '-' in bathroom_dict[key]:
        combined_dict[key]['bedroom'] = bathroom_dict[key]
        bathroom_dict[key] = None
    if sqft_dict[key] and '-' in sqft_dict[key]:
        combined_dict[key]['bedroom'] = sqft_dict[key]
        sqft_dict[key] = None

    # Move values to the correct keys
    move_value(bedroom_dict, combined_dict, key, 'bedroom')
    move_value(bathroom_dict, combined_dict, key, 'bathroom')
    move_value(sqft_dict, combined_dict, key, 'sqft')

# Print the corrected combined dictionary
print(combined_dict)

{'708878': {'bedroom': None, 'bathroom': None, 'sqft': None}, '408746': {'bedroom': None, 'bathroom': None, 'sqft': None}, '518420': {'bedroom': None, 'bathroom': None, 'sqft': None}, '580823': {'bedroom': None, 'bathroom': None, 'sqft': None}, '561296': {'bedroom': None, 'bathroom': None, 'sqft': None}, '392871': {'bedroom': None, 'bathroom': None, 'sqft': None}, '612300': {'bedroom': None, 'bathroom': None, 'sqft': None}, '344041': {'bedroom': None, 'bathroom': None, 'sqft': None}, '255883': {'bedroom': None, 'bathroom': None, 'sqft': None}, '582285': {'bedroom': None, 'bathroom': None, 'sqft': None}}


## Creating a Dataframe and Consolidating Cleaned Data

In [32]:
import pandas as pd
import numpy as np  # Import numpy for NaN values

# Find the maximum length among all lists
max_length = max(len(listingCity), len(listingType), len(listingBed), len(listingBath),
                 len(listingDim), len(listingStreet), len(listingZip), len(listingRent))

# Pad the shorter lists with None or NaN to match the maximum length
def pad_list(lst, length, pad_value=None):
    if len(lst) < length:
        return lst + [pad_value] * (length - len(lst))
    else:
        return lst

listingCity = pad_list(listingCity, max_length)
listingType = pad_list(listingType, max_length)
listingBed = pad_list(listingBed, max_length)
listingBath = pad_list(listingBath, max_length)
listingDim = pad_list(listingDim, max_length)
listingStreet = pad_list(listingStreet, max_length)
listingZip = pad_list(listingZip, max_length)
listingRent = pad_list(listingRent, max_length)

# Create the DataFrame
column_names = ["City", "Property Type", "Bedrooms", "Bathrooms", "Square Footage", "Address",
                "Zip code", "Price"]
data = {
    "City": listingCity,
    "Property Type": listingType,
    "Bedrooms": listingBed,
    "Bathrooms": listingBath,
    "Square Footage": listingDim,
    "Address": listingStreet,
    "Zip code": listingZip,
    "Price": listingRent
}

df = pd.DataFrame(data)

# Set the index (if needed)
df.set_index('City', inplace=True)

# Display the DataFrame
print(df)

               Property Type Bedrooms Bathrooms Square Footage                                    Address Zip code          Price
City                                                                                                                             
Ottawa             apartment      1-2         2            856                90 Champagne  - Ottawa , ON  K1S 4P1  $2100 - $2900
City not found     apartment      0-1         1            0-1       253 - 257 York Street  - Ottawa , ON  K1N 5T9  $1675 - $1850
City not found     apartment        1       756              1                86-92 Hinton  - Ottawa , ON  K1Y 2Z7  $1700 - $2200
City not found     apartment        1       167            0-2      150-152 Osgoode Street  - Ottawa , ON  K1N 8A4          $1400
Ottawa             apartment        1       470            1-2                  256 Rideau  - Ottawa , ON  K1N 0A9  $1375 - $2500
...                      ...      ...       ...            ...                            

In [33]:
num_rows = df.shape[0]
print(num_rows)

805


In [28]:
folder = r"D:\Projects\Rentals/"
df.to_excel("rental_data_ottawa_1.xlsx")

In [29]:
"""
References:
1. Web Scraping Rentals Website Using Python Beautiful Soup: https://medium.com/swlh/web-scraping-rentals-website-using-beautiful-soup-and-pandas-99e255f27052
2. Chat GPT (For fine-tuning)
3. StackOverflow
4. Google
"""

'\nReferences:\n1. Web Scraping Rentals Website Using Python Beautiful Soup: https://medium.com/swlh/web-scraping-rentals-website-using-beautiful-soup-and-pandas-99e255f27052\n2. Chat GPT (For fine-tuning)\n3. StackOverflow\n4. Google\n'