# Segmenting and Clustering Neighborhoods in Toronto

### Install necessary libraries

In [1]:
!pip install requests bs4 pandas lxml geocoder



### Import necessary libraries

In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import geocoder



Get a request for the target table, and get it into dataframe format

In [3]:
req = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(req.content, "lxml")
table = soup.find_all("table")[0]
df = pd.read_html(str(table))[0]
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Filter all the rows where the borough is "Not assigned"

In [4]:
df_f = df.loc[~df.Borough.str.contains("Not assigned") ]
df_f.reset_index(drop=True, inplace=True)
df_f


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


### Verify if the postal codes are unique

In [40]:
postal_codes = df_f["Postal Code"].unique()
print(f"There are {len(postal_codes)} unique postal codes for {len(df_f)} rows.")
if len(postal_codes) == len(df_f):
    print("There are no duplicate postal codes.")    

There are 103 unique postal codes for 103 rows.
There are no duplicate postal codes.


In [41]:
df_f.loc[df_f["Postal Code"].str.contains(postal_codes[3])] 

Unnamed: 0,Postal Code,Borough,Neighbourhood
3,M6A,North York,"Lawrence Manor, Lawrence Heights"


In [42]:
print(len(df_f), len(postal_codes))

103 103


### Check for unassigned Burroughs

In [9]:
df_f.loc[df_f["Neighbourhood"].str.contains("Not assigned")]

Unnamed: 0,Postal Code,Borough,Neighbourhood


### Try to get data from geocoder 

In [12]:
postal_dict = {}

for postal_code in postal_codes:

    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.google(f'{postal_code}, Toronto, Ontario')
        lat_lng_coords = g.latlng
        print(".")
    print(postal_code, lat_lng_coords)
    postal_dict[postal_code] = lat_lng_coords

postal_dict


.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.


KeyboardInterrupt: 

In [13]:
g

<[REQUEST_DENIED] Google - Geocode [empty]>

Didn't work, better try the csv file

In [17]:
new_df = pd.read_csv("Geospatial_Coordinates.csv")
new_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [63]:
lats = []
longs = []
for postal_code in df["Postal Codes"]:
    row2 = new_df.loc[new_df["Postal Code"]==postal_code]
    lats.append(row.Latitude.values[0])
    longs.append(row.Longitude.values[0])

df_f["Latitude"] = lats
df_f["Longitude"] = longs
df_f.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
