# Segmenting and Clustering Neighborhoods in Toronto

Alex Wilson

### Import Libraries

In [70]:
!pip install geocoder

import pandas as pd
import geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 7.2MB/s ta 0:00:011
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


### Retrieve data

In [95]:
# Download every HTML table on Wikipedia's page of "M" postal codes and keep
# the first one, which holds the postal code / borough / neighborhood list.
WIKI_POSTAL_CODES_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
wiki_tables = pd.read_html(WIKI_POSTAL_CODES_URL)
df = wiki_tables[0]
df.describe()

Unnamed: 0,Postal Code,Borough,Neighborhood
count,180,180,180
unique,180,11,100
top,M7K,Not assigned,Not assigned
freq,1,77,77


### Clean data

In [96]:
# Use a space-free column name; reassign instead of mutating with inplace=True.
df = df.rename(columns={"Postal Code": "PostalCode"})

# Drop rows whose borough is "Not assigned".
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)

# Sanity check: number of remaining rows whose neighborhood is "Not assigned".
# Summing the boolean mask counts rows (DataFrame.size would count cells, i.e. rows x columns).
# No further work is needed when this prints 0: Wikipedia has simplified the rest of the
# dataset by flattening Neighborhood values into a multi-valued, comma-separated field.
print((df['Neighborhood'] == 'Not assigned').sum())

df.head(12)

0


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [102]:
# (rows, columns) of the frame after the "Not assigned" boroughs were removed
df.shape

(103, 3)

### Add Geolocation to PostalCode

In [103]:
# This entire block is commented out because IBM seems to be permanently rate-limited by Google!
# Bing/OpenStreetMap/etc were the same!
#
# # A helper function for retrieving coordinates from Google
# def add_lat_lng(postal_code):
#     print(postal_code)
#     g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#     return pd.Series(g.latlng)

# # Create a new dataframe, iterate over it to retrieve Latitude & Longitude, and then join with the original
# postalcode_lat_lng = df['PostalCode'].apply(add_lat_lng)
# postalcode_lat_lng.columns = ['Latitude', 'Longitude']

# Backup - use some precomputed geo data instead of live geocoding.
postalcode_lat_lng = pd.read_csv("https://cocl.us/Geospatial_data")

# Index the coordinates by postal code so they can be looked up from df['PostalCode'].
geo_by_postal_code = postalcode_lat_lng.set_index('Postal Code')

# Left-join the coordinates onto df. Any coordinate columns left over from an
# earlier run of this cell are dropped first; without that, re-running the cell
# raises "ValueError: columns overlap but no suffix specified".
df = (
    df.drop(columns=geo_by_postal_code.columns, errors='ignore')
      .join(geo_by_postal_code, on='PostalCode')
)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
