# Clustering Neighborhoods in Toronto
---
### Scrape Wikipedia for Table

<b>Import necessary modules</b>

In [9]:
import pandas as pd
import requests
!pip install lxml
import lxml.html as lh
!pip install geopy
print('Success')

Success


<b>Scrape the Wikipedia page for table row (<code>tr</code>) elements</b>

In [10]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the table.
page.raise_for_status()
doc = lh.fromstring(page.content)
# Collect every <tr> element found anywhere in the document.
elements = doc.xpath('//tr')

<b>Get headers from first row</b>

In [11]:
# Column names are the cell texts of the first <tr>, with newline characters removed.
headers = [cell.text_content().replace('\n', '') for cell in elements[0]]

headers

['Postal code', 'Borough', 'Neighborhood']

<b>Fill new dataframe with data</b>

In [12]:
# Convert every row after the header row into a list of its cell texts,
# with newline characters removed from each cell.
pre_df = [
    [cell.text_content().replace('\n', '') for cell in row]
    for row in elements[1:]
]

# Drop the last four collected rows.
# NOTE(review): presumably these trailing <tr> elements are not postal-code rows;
# confirm against the current page layout, since this count is hard-coded.
pre_df = pre_df[:-4]

In [13]:
# Build the table from the scraped rows, discard rows whose borough is
# 'Not assigned', and renumber the index from zero.
df = (
    pd.DataFrame(pre_df, columns=headers)
    .loc[lambda frame: frame['Borough'] != 'Not assigned']
    .reset_index(drop=True)
)
df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...


<b>The size of the dataframe</b>

In [14]:
# Report the table size as a (rows, columns) tuple.
df.shape

(103, 3)

In [33]:
# Spot-check the geocoder on a single postal code before looping over the whole table.
# Imported in this cell because no earlier cell brings Nominatim into scope.
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="Toronto")
# The query is a literal string, so no `.format()` call (and no `postal_code`
# variable) is needed here.
g = geolocator.geocode('M4A, Toronto, Ontario', timeout=1000)
# Printing the type shows whether a location was returned or None.
print(type(g))

<class 'NoneType'>


<b>Add latitude and longitudes</b>

In [39]:
from geopy.geocoders import Nominatim # import geocoder

geolocator = Nominatim(user_agent="Toronto")

lat_list = []
long_list = []

# Populate the latitude and longitude lists, one entry per postal code, in table order.
for postal_code in df['Postal code']:
    g = geolocator.geocode('{}, Toronto, Ontario'.format(postal_code), timeout=1000)
    # Test for None directly rather than comparing the type's string representation.
    if g is not None:
        lat_list.append(g.latitude)
        long_list.append(g.longitude)
    else:
        # No result for this code: record 0 as a placeholder so both lists
        # stay the same length as the table.
        lat_list.append(0)
        long_list.append(0)

In [40]:
# Add the geocoded coordinates to the table as two columns.
df = df.assign(Latitude=lat_list, Longitude=long_list)

<b>Here, I tried to get the latitude and longitude from the geocoding API, but it failed on many of the postal codes (those rows are filled with 0).</b>

In [46]:
# Order the rows by postal code and renumber the index from zero, then show the table.
df = df.sort_values(by='Postal code', ignore_index=True)
df

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.653482,-79.383935
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.653482,-79.383935
2,M1E,Scarborough,Guildwood / Morningside / West Hill,0.000000,0.000000
3,M1G,Scarborough,Woburn,43.765717,-79.221898
4,M1H,Scarborough,Cedarbrae,0.000000,0.000000
...,...,...,...,...,...
98,M9N,York,Weston,0.000000,0.000000
99,M9P,Etobicoke,Westmount,0.000000,0.000000
100,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...,43.695166,-79.550890
101,M9V,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...,0.000000,0.000000


<b>Get latitudes and longitudes from given csv file</b>

In [70]:
# Load the provided coordinates file; the path is relative to the working directory.
latlong = pd.read_csv('Geospatial_Coordinates.csv')

In [71]:
# Order the coordinates by postal code and renumber the index from zero.
latlong = latlong.sort_values(by='Postal Code', ignore_index=True)

In [72]:
# Show the coordinates table as this cell's output.
latlong

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [73]:
# Attach coordinates by matching on postal code rather than by row position, so the
# result stays correct even if the two tables differ in order or in which codes they hold.
# Dropping duplicate codes keeps the lookup index unique, which `Series.map` requires.
coords_by_code = latlong.drop_duplicates(subset='Postal Code').set_index('Postal Code')

# Strip surrounding whitespace so scraped codes compare equal to the CSV's codes.
postal_codes = df['Postal code'].str.strip()

# Codes with no entry in the coordinates table become NaN.
df['Latitude'] = postal_codes.map(coords_by_code['Latitude'])
df['Longitude'] = postal_codes.map(coords_by_code['Longitude'])
df

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...,43.688905,-79.554724
101,M9V,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...,43.739416,-79.588437


<b>Start analysis and mapping</b>

In [79]:
# Keep only the rows whose borough name contains "Toronto".
is_toronto_borough = df['Borough'].str.contains("Toronto")
df_toronto = df.loc[is_toronto_borough]
df_toronto

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,The Danforth West / Riverdale,43.679557,-79.352188
42,M4L,East Toronto,India Bazaar / The Beaches West,43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,Moore Park / Summerhill East,43.689574,-79.38316
49,M4V,Central Toronto,Summerhill West / Rathnelly / South Hill / For...,43.686412,-79.400049


In [85]:
import folium

# Hard-coded centre point and zoom level for the initial map view.
map_center = [43.662301, -79.389494]
map_clusters = folium.Map(location=map_center, zoom_start=11)

# Not used in this cell.
markers_colors = []

# Draw one circle marker per row, with the neighborhood name as its popup.
marker_rows = zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighborhood'])
for latitude, longitude, neighborhood in marker_rows:
    marker = folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=folium.Popup(str(neighborhood), parse_html=True),
        fill=True,
        fill_opacity=0.7,
    )
    marker.add_to(map_clusters)

map_clusters