# Initial step would be to ingest data from different sources through APIs.
#### The goal is to get the data for 
* Canadian Population data
* Employment rates and income levels
* Government Policies and Taxation, such as Mortgage rates
* Urbanization and Infrastructure Development 
* Economic Indicators such as GDP Growth, Inflation rates, trade agreements 
* Migration and demographics, such as immigration, inter-provincial migration, etc.
* Land and Zoning Laws
* Climate and Environmental Factors
* Rental market trends and short-term rentals, such as Airbnb, that inflate prices
* Energy cost and building materials

#### Importing libraries

In [8]:
import pandas as pd
import requests
from bs4 import BeautifulSoup


### Pulling population data via web scraping from WorldPopulationReview.com and exploring that data

In [9]:
# Download the WorldPopulationReview page that lists the Canadian provinces.
url = "https://worldpopulationreview.com/canadian-provinces"
# Without a timeout the request can block forever if the server never answers.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page later on.
response.raise_for_status()


In [41]:
# Build a parse tree from the downloaded HTML using Python's built-in parser.
soup = BeautifulSoup(markup=response.text, features="html.parser")


Upon inspecting the webpage, we find that the table we are looking to scrape is under the Div "m-0 rounded-xl border border-wpr-table_border max-h-[60vh] overflow-auto p-0" and the table class is "wpr-table !border-none"

In [40]:
# Locate the wrapper <div> first, then the population <table> nested inside it.
table_div = soup.find('div', class_='m-0 rounded-xl border border-wpr-table_border max-h-[60vh] overflow-auto p-0')
# soup.find returns None when nothing matches (for example after a site
# redesign). Leave `table` as None in that case so the "Table not found"
# check can report it, rather than failing here with an AttributeError.
table = (
    table_div.find('table', class_='wpr-table !border-none')
    if table_div is not None
    else None
)

In [50]:
# Turn the scraped population table into a DataFrame, one row per province.
if table is None:
    print("Table not found")
    # Bind an empty frame so later cells never run against a stale `df`
    # left over from an earlier execution of the notebook.
    df = pd.DataFrame()
else:
    # Extract the header of the table. Only the first 6 <th> cells are kept as
    # column titles; the province name of each data row is also read from a
    # <th> cell further down.
    headers = [th.text.strip() for th in table.find_all('th')[:6]]
    print("Headers:")
    print(headers)

    # Extract all the rows of the table
    rows = []
    for row in table.find_all('tr')[1:]:  # Skip the header row
        cols = row.find_all('td')
        if not cols:
            # No <td> cells: nothing to record for this row.
            continue

        # The province name sits in the row's <th> cell, normally inside an
        # <a> tag. Tolerate a missing link by falling back to the cell text,
        # and skip the row entirely when there is no <th> at all.
        label_cell = row.find('th')
        if label_cell is None:
            continue
        link = label_cell.find('a')
        name_source = link if link is not None else label_cell
        province = name_source.text.strip()

        data = [province] + [col.text.strip() for col in cols]

        # Skip rows that don't match the number of columns
        if len(data) == len(headers):
            rows.append(data)

    # Print out the rows
    print("\nRows:")
    for r in rows:
        print(r)

    # Create a DataFrame for easy data manipulation
    df = pd.DataFrame(rows, columns=headers)

    # Print the DataFrame to check the scraped data
    print(df)

Headers:
['Province', '2024 Pop.', '2022 Pop.', '2016 Pop.', '2011 Pop.', 'Change']

Rows:
['Ontario', '15,996,989', '15,262,660', '13,448,494', '12,851,821', '2.19%']
['Quebec', '9,030,684', '8,751,352', '8,164,361', '7,903,000', '1.27%']
['British Columbia', '5,646,467', '5,368,266', '4,648,055', '4,400,057', '2.46%']
['Alberta', '4,849,906', '4,601,314', '4,067,175', '3,645,257', '2.22%']
['Manitoba', '1,484,135', '1,420,228', '1,278,365', '1,208,268', '1.88%']
['Saskatchewan', '1,231,043', '1,205,119', '1,098,352', '1,053,960', '1.44%']
['Nova Scotia', '1,072,545', '1,030,953', '923,598', '921,727', '1.89%']
['New Brunswick', '850,894', '820,786', '747,101', '751,171', '1.64%']
['Newfoundland', '541,391', '528,818', '519,716', '514,536', '0.51%']
['Prince Edward Island', '177,081', '172,707', '142,907', '140,204', '2.72%']
['Yukon', '45,750', '43,964', '35,874', '33,897', '3.09%']
['Northwest Territory', '44,920', '45,602', '41,786', '41,462', '0.91%']
['Nunavut', '40,758', '40,586

In [55]:
# Every column after 'Province' arrives as text such as '15,996,989' or
# '2.19%'. Strip the thousands separators and percent signs, then parse
# what is left as numbers.
value_columns = df.columns[1:]
for name in value_columns:
    as_text = df[name].replace({',': '', '%':''}, regex=True)
    # Values that still cannot be parsed become NaN instead of raising.
    df[name] = pd.to_numeric(as_text, errors='coerce')

# Show the cleaned DataFrame
print(df)

                Province  2024 Pop.  2022 Pop.  2016 Pop.  2011 Pop.  Change
0                Ontario   15996989   15262660   13448494   12851821    2.19
1                 Quebec    9030684    8751352    8164361    7903000    1.27
2       British Columbia    5646467    5368266    4648055    4400057    2.46
3                Alberta    4849906    4601314    4067175    3645257    2.22
4               Manitoba    1484135    1420228    1278365    1208268    1.88
5           Saskatchewan    1231043    1205119    1098352    1053960    1.44
6            Nova Scotia    1072545    1030953     923598     921727    1.89
7          New Brunswick     850894     820786     747101     751171    1.64
8           Newfoundland     541391     528818     519716     514536    0.51
9   Prince Edward Island     177081     172707     142907     140204    2.72
10                 Yukon      45750      43964      35874      33897    3.09
11   Northwest Territory      44920      45602      41786      41462    0.91