# Toronto Clustering Part 1: Retrieving the data from Wikipedia and processing it

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd


In [6]:
import requests

# Download the raw HTML of the Wikipedia page listing the "M" postal codes.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error instead of
# letting an error page be parsed as if it were the postal-code table.
wiki_response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
wiki_response.raise_for_status()
wiki_page = wiki_response.text

In [7]:
from bs4 import BeautifulSoup

# Parse the downloaded HTML (using the 'html.parser' backend) so the page can
# be searched for its table and rows.
wiki_soup = BeautifulSoup(wiki_page, features='html.parser')

In [8]:
# Accumulators for the three table columns; the row-parsing loop appends to them.
postal_code, boroughs, nhoods = [], [], []

In [9]:
# Find the first table on the page and collect all of its rows once, instead
# of repeating the same lookup for every statement in this cell.
table_rows = wiki_soup.find('table').find_all('tr')

# For each row of the table, find all the table data (<td>) cells.
# After the loop, `cells` holds the cells of the last row.
for row in table_rows:
    cells = row.find_all('td')

In [11]:
# Start from empty lists every time so that re-running this cell does not
# append a second copy of every row.
postal_code, boroughs, nhoods = [], [], []

for row in wiki_soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    # Skip rows without <td> cells (e.g. a header row) and any row that does
    # not have all three columns, rather than failing with an IndexError.
    if len(cells) < 3:
        continue
    # Strip surrounding whitespace/newlines from every cell so the values
    # compare cleanly later (e.g. against "Not assigned").
    postal_code.append(cells[0].text.strip())
    boroughs.append(cells[1].text.strip())
    nhoods.append(cells[2].text.strip())


In [15]:
# Assemble the three parallel lists into one table, one column per list.
toronto_df = pd.DataFrame(
    dict(postal_code=postal_code, borough=boroughs, neighborhood=nhoods)
)

# Preview the first five rows.
toronto_df.head(5)

Unnamed: 0,postal_code,borough,neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [16]:
# Keep only the rows whose borough has been assigned, then renumber the index
# from zero so the removed rows leave no gaps.
has_borough = toronto_df["borough"].ne("Not assigned")
toronto_df_dropna = toronto_df.loc[has_borough].reset_index(drop=True)
toronto_df_dropna.head()

Unnamed: 0,postal_code,borough,neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [18]:
# Collapse rows that share a postal code and borough into a single row whose
# neighborhood field is the comma-separated list of the group's neighborhoods.
toronto_df_cleaned = (
    toronto_df_dropna
    .groupby(["postal_code", "borough"], as_index=False)
    .agg(lambda names: ", ".join(names))
)
toronto_df_cleaned.head()

Unnamed: 0,postal_code,borough,neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [19]:
# Where a postal code has a borough but no assigned neighborhood, use the
# borough name as the neighborhood.
#
# The write has to go through .loc on the DataFrame itself: the rows yielded
# by iterrows() are copies, so assigning to them leaves the frame unchanged.
unassigned = toronto_df_cleaned["neighborhood"] == "Not assigned"
toronto_df_cleaned.loc[unassigned, "neighborhood"] = toronto_df_cleaned.loc[unassigned, "borough"]

toronto_df_cleaned.head()

Unnamed: 0,postal_code,borough,neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [22]:
cols = ["postal_code", "borough", "neighborhood"]

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Look up each postal code in turn (so the result follows the order of
# test_list) and combine the matches with a single concat. DataFrame.append
# was removed in pandas 2.0, and growing a frame inside a loop re-copies it
# on every iteration.
test_df = pd.concat(
    [toronto_df_cleaned[toronto_df_cleaned["postal_code"] == postcode] for postcode in test_list],
    ignore_index=True,
)[cols]

test_df

Unnamed: 0,postal_code,borough,neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Woodbine Gardens, Parkview Hill"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Maryvale, Wexford"
7,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo..."


## Toronto Clustering Part 2: Loading the geographical coordinates for the neighborhood data

In [23]:
!wget https://cocl.us/Geospatial_data

In [25]:
# Load the coordinates from the notebook's working directory instead of an
# absolute path that only exists on one machine. If the file is not there,
# read it straight from the same URL the download cell uses.
geospatial_csv = Path("Geospatial_data.csv")
coord_data = pd.read_csv(
    geospatial_csv if geospatial_csv.exists() else "https://cocl.us/Geospatial_data"
)

In [26]:
# Rename the key column to "postal_code" so it matches the column name used
# in toronto_df_cleaned and the two frames can be merged on it.
coord_data = coord_data.rename(columns={"Postal Code": "postal_code"})

In [28]:
# Attach the coordinate columns to every postal code. A left join keeps all
# rows of toronto_df_cleaned even when no coordinates are found for a code.
toronto_df_new = pd.merge(
    toronto_df_cleaned,
    coord_data,
    how="left",
    on="postal_code",
)
toronto_df_new.head()

Unnamed: 0,postal_code,borough,neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [32]:
# Display the merged frame; the rendered output abbreviates the middle rows.
toronto_df_new

Unnamed: 0,postal_code,borough,neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


In [34]:
test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Look up each postal code in turn (so the result follows the order of
# test_list) and combine the matches with a single concat. DataFrame.append
# was removed in pandas 2.0, and growing a frame inside a loop re-copies it
# on every iteration.
test_df1 = pd.concat(
    [toronto_df_new[toronto_df_new["postal_code"] == postcode] for postcode in test_list],
    ignore_index=True,
)

test_df1

Unnamed: 0,postal_code,borough,neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Maryvale, Wexford",43.750072,-79.295849
7,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442


## Toronto Clustering part 3: 