# Neighborhood clustering in Toronto

In [96]:
import numpy as np 
import pandas as pd
import bs4
import requests
#!pip install geocoder
import geocoder 
#!conda install -c conda-forge folium=0.5.0 --yes
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

### Let's scrape the Wikipedia page and load the table into a pandas DataFrame

In [97]:
# Download the Wikipedia page listing the "M" postal codes and parse it with BeautifulSoup.
url ='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
toronto_wiki = response.text
soup_object_toronto = bs4.BeautifulSoup(toronto_wiki, 'lxml')

In [98]:
# Locate the first table on the page.
table = soup_object_toronto.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

# Parse every table cell and keep three fields: 'PostalCode', 'Borough', 'Neighborhood'.
Toronto_post_bor_neigh = []
for row in table.find_all('td'):
    # Cells without the expected <p>/<span> children are skipped instead of
    # failing with AttributeError on `None.text`.
    if row.span is None or row.p is None:
        continue

    span_text = row.span.text
    if span_text == 'Not assigned':
        continue

    # The span text has the form "Borough(Neighborhood / Neighborhood ...)".
    parts = span_text.split('(')
    if len(parts) > 1:
        neighborhood = parts[1].strip(')').replace(' /', ',').replace(')', ' ').strip(' ')
    else:
        # No parenthesised neighborhood list: use the 'Not assigned' sentinel that
        # the preprocessing step replaces with the borough name.
        neighborhood = 'Not assigned'

    cell = {
        'PostalCode': row.p.text[:3],  # first three characters of the <p> text
        'Borough': parts[0],
        'Neighborhood': neighborhood,
    }
    Toronto_post_bor_neigh.append(cell)

# Build the DataFrame from the list of dicts; naming the columns explicitly keeps
# the schema stable even if no cell was parsed.
df = pd.DataFrame(Toronto_post_bor_neigh, columns=['PostalCode', 'Borough', 'Neighborhood'])

### Let's preprocess the data as suggested

In [99]:
# Preprocessing rules:
#   1. Only keep the cells that have an assigned borough.
#   2. For a postal code with several neighborhoods, join them with ", ".
#   3. If a cell has a borough but a 'Not assigned' neighborhood, the
#      neighborhood becomes the borough name.

# Rule 1: discard rows whose borough is not assigned.
df1 = df.loc[df['Borough'] != 'Not assigned']

# Rule 2: one row per (PostalCode, Borough), keeping first-seen order.
df2 = (
    df1.groupby(['PostalCode', 'Borough'], sort=False)
       .agg(', '.join)
       .reset_index()
)

# Rule 3: fall back to the borough where the neighborhood is 'Not assigned'.
neighborhood_missing = df2['Neighborhood'] == 'Not assigned'
df2['Neighborhood'] = np.where(neighborhood_missing, df2['Borough'], df2['Neighborhood'])
df2.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Let's get the coordinates of the neighborhoods and define the two available methods

In [100]:
# get the latitude and the longitude coordinates of each neighborhood. 


###### takes to much time with the loop 
def get_lat_lon_neighborhood(names, max_attempts=None):
    """Look up latitude/longitude for each entry of ``names`` with ``geocoder.google``.

    Parameters
    ----------
    names : iterable
        Values to geocode (postal codes in this notebook). Each one is
        formatted with ``'{}'.format`` and sent as a single query.
    max_attempts : int or None, optional
        Maximum number of lookups per entry. ``None`` (the default) retries
        until the service returns coordinates, which can block indefinitely
        if it never answers.

    Returns
    -------
    list of list
        One ``[query, latitude, longitude]`` row per entry, in input order.

    Raises
    ------
    RuntimeError
        If ``max_attempts`` is set and no coordinates were returned for an
        entry within that many lookups.
    """
    Pos_lat_lon = []
    for post_id in names:
        print(post_id)

        # Retry while the geocoder reports no coordinates (latlng is None).
        lat_lng_coords = None
        attempts = 0
        while lat_lng_coords is None:
            if max_attempts is not None and attempts >= max_attempts:
                raise RuntimeError(
                    'No coordinates returned for {} after {} attempts'.format(post_id, attempts)
                )
            g = geocoder.google('{}'.format(post_id))
            lat_lng_coords = g.latlng
            attempts += 1

        latitude = lat_lng_coords[0]
        longitude = lat_lng_coords[1]
        # Store the row itself; the previous code indexed an unrelated global
        # named `row` here instead of building the list.
        Pos_lat_lon.append([post_id, latitude, longitude])

    return Pos_lat_lon

# Alternative to geocoding: read the postal-code coordinates from the CSV file.
geospatial_csv_url = 'https://cocl.us/Geospatial_data'
la_lo_csv = pd.read_csv(geospatial_csv_url)
la_lo_csv.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


### Let's match the postal codes between the Toronto postal-code table and the reference CSV file

In [101]:
# Join the borough/neighborhood table with the coordinates on the postal code,
# then drop 'Postal Code', which merely duplicates 'PostalCode' after the join.
df_lo_la_toronto = (
    df2.merge(la_lo_csv, left_on='PostalCode', right_on='Postal Code')
       .drop('Postal Code', axis=1)
)
df_lo_la_toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


### Let's keep only the entries whose Borough contains 'Toronto'

In [102]:
# Boolean mask of rows whose borough name contains "Toronto"; the index is
# renumbered from 0 after filtering.
is_toronto_borough = df_lo_la_toronto["Borough"].str.contains("Toronto")
df_containing_tornto = df_lo_la_toronto[is_toronto_borough].reset_index(drop=True)

In [103]:
# Preview the first five filtered rows (five is the default for head()).
df_containing_tornto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


### Let's plot all the neighborhoods whose borough contains 'Toronto'

In [104]:
# Base map centred on the coordinates below.
latitude = 43.653226
longitude = -79.3831843
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle marker per row; the popup text is "<neighborhood>, <borough>".
marker_rows = zip(
    df_containing_tornto['Latitude'],
    df_containing_tornto['Longitude'],
    df_containing_tornto['Borough'],
    df_containing_tornto['Neighborhood'],
)
for lat, lng, borough, neighbourhood in marker_rows:
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

### Let's try to cluster the neighborhoods based on their coordinates (latitude, longitude)

In [106]:
# Cluster on the coordinates only. Selecting the two columns by name (rather than
# dropping the others) keeps this cell re-runnable: columns added to the frame
# afterwards, such as 'CLuster Labels', can never end up in the feature matrix.
toronto_clustering_coordinates = df_containing_tornto[['Latitude', 'Longitude']]

# Fit on the frame defined just above; the previous code referenced an undefined
# name (`toronto_clustering`) and failed on a fresh kernel.
# n_init is set explicitly because its default differs between scikit-learn
# releases; 10 restarts is the long-standing value.
kmeans = KMeans(n_clusters=6, random_state=0, n_init=10).fit(toronto_clustering_coordinates)

# Attach the cluster index of each row to the main frame.
df_containing_tornto['CLuster Labels'] = kmeans.labels_

In [107]:
# Preview the first five rows of the frame.
df_containing_tornto.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,CLuster Labels
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,3
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0


### Let's plot the members of each cluster on a map

In [108]:
# Base map centred on the coordinates below.
latitude = 43.653226
longitude = -79.3831843
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One hex colour per distinct cluster label, evenly spaced along the rainbow
# colormap. (The earlier intermediate arrays were only ever used for their length.)
n_clusters = df_containing_tornto['CLuster Labels'].nunique()
colors_array = cm.rainbow(np.linspace(0, 1, n_clusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add one circle marker per row, coloured by its cluster label.
for lat, lon, poi, cluster in zip(
        df_containing_tornto['Latitude'],
        df_containing_tornto['Longitude'],
        df_containing_tornto['Neighborhood'],
        df_containing_tornto['CLuster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster - 1 is kept as the colour index: label 0 becomes index -1, i.e. the
    # last colour, so every label still gets its own colour.
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Let's group the neighborhoods by borough, visualize them, and compare the result with the coordinate-based clustering

In [109]:
# Number of rows per borough (non-null count of every other column).
df_containing_tornto.groupby(by='Borough').count()

Unnamed: 0_level_0,PostalCode,Neighborhood,Latitude,Longitude,CLuster Labels
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Central Toronto,9,9,9,9,9
Downtown Toronto,17,17,17,17,17
Downtown TorontoStn A PO Boxes25 The Esplanade,1,1,1,1,1
East Toronto,4,4,4,4,4
East TorontoBusiness reply mail Processing Centre969 Eastern,1,1,1,1,1
East YorkEast Toronto,1,1,1,1,1
West Toronto,6,6,6,6,6


In [110]:
# Encode each borough name as an integer category code.
borough_categories = pd.Categorical(df_containing_tornto['Borough'])
df_containing_tornto['boroough_num'] = borough_categories.codes
df_containing_tornto.dtypes

PostalCode         object
Borough            object
Neighborhood       object
Latitude          float64
Longitude         float64
CLuster Labels      int32
boroough_num         int8
dtype: object

In [111]:
# Plot the neighborhoods of each borough in a different colour.
# Note: this rebinds `map_clusters`, replacing the k-means map object.
latitude = 43.653226
longitude = -79.3831843
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One hex colour per distinct borough code, evenly spaced along the rainbow
# colormap. (The earlier intermediate arrays were only ever used for their length.)
n_boroughs = df_containing_tornto['boroough_num'].nunique()
colors_array = cm.rainbow(np.linspace(0, 1, n_boroughs))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add one circle marker per row, coloured by its borough code.
for lat, lon, poi, borough, cluster in zip(
        df_containing_tornto['Latitude'],
        df_containing_tornto['Longitude'],
        df_containing_tornto['Neighborhood'],
        df_containing_tornto['Borough'],
        df_containing_tornto['boroough_num']):
    # The popup names the borough: the colours on this map encode boroughs,
    # not k-means clusters, so the old "Cluster <n>" text was misleading.
    label = folium.Popup(str(poi) + ' Borough ' + str(borough), parse_html=True)
    # cluster - 1 is kept as the colour index: code 0 becomes index -1, i.e. the
    # last colour, so every borough still gets its own colour.
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## The results are very close: we can suppose that the clusters found from the coordinates can largely be recovered simply by grouping the neighborhoods by their original borough :D