# Toronto Neighborhood Mining

In [12]:
import pandas as pd
import numpy as np
import requests

from geocoder import enrich_neighborhoods_with_geocoder, map_neighborhoods

## 1. Wrangle and Clean Toronto Neighborhood Data
<a id="clean"></a>

The data for clustering Toronto's neighborhoods is sourced from Wikipedia. This dataset is indexed by postal code and needs to be scrubbed of unassigned zones.

In [2]:
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Download the page explicitly so that problems surface here:
#  - timeout: without it, requests waits indefinitely on a stalled connection
#  - raise_for_status: turns a 4xx/5xx reply into an exception instead of
#    letting pandas try to parse an error page
response = requests.get(wiki_url, timeout=30)
response.raise_for_status()
html_content = response.content

# read_html returns one DataFrame per <table> on the page; take the first one.
df = pd.read_html(html_content)[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [3]:
# Standardise the column labels (the first column becomes 'PostalCode').
df.columns = ['PostalCode', 'Borough', 'Neighborhood']

# Remove, in place, every row whose borough is 'Not assigned'.
borough_missing = df['Borough'].eq('Not assigned')
df.drop(index=df.index[borough_missing.values], inplace=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
...,...,...,...
281,M8Z,Etobicoke,Kingsway Park South West
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West


In [4]:
# List the rows that have a borough but whose neighborhood is still 'Not assigned'
df.loc[df['Neighborhood'].eq('Not assigned')]

Unnamed: 0,PostalCode,Borough,Neighborhood
9,M9A,Queen's Park,Not assigned


In [5]:
# Where the neighborhood is 'Not assigned', fall back to the borough name
unnamed = df['Neighborhood'] == 'Not assigned'
df.loc[unnamed, 'Neighborhood'] = df.loc[unnamed, 'Borough']

# Check that our transformation worked - the same filter should now return an empty dataframe
df[df['Neighborhood'] == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood


In [6]:
# Collapse to one row per (PostalCode, Borough) pair: the neighborhoods that
# share a pair are joined into a single comma-separated string.
# as_index=False keeps the group keys as ordinary columns, so the result
# already carries a fresh 0..n-1 index; no reset_index() call is needed
# (the previous bare `formatted_df.reset_index()` discarded its result).
formatted_df = df.groupby(['PostalCode', 'Borough'], as_index=False).agg({'Neighborhood': ', '.join})
formatted_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [7]:
# Row/column count after grouping: one row per (PostalCode, Borough) pair.
formatted_df.shape

(103, 3)

### Combine Wikipedia data with geocoder data

Since the Wikipedia dataset does not include coordinates, we enrich it with latitude and longitude from Geocoder so that it can be used to query Foursquare data.

In [8]:
# Summarise the un-grouped frame (one row per neighborhood entry); comparing
# `count` with `unique` shows whether any neighborhood name is repeated.
df.describe()

Unnamed: 0,PostalCode,Borough,Neighborhood
count,210,210,210
unique,103,11,207
top,M9V,Etobicoke,St. James Town
freq,8,44,2


In [9]:
# We expected every Neighborhood value to be unique, but some names repeat.
# Keep only the Neighborhood column, then drop the repeated names in place,
# printing a summary before and after so the change in row count is visible.
redundant_columns = ["PostalCode", "Borough"]
df.drop(columns=redundant_columns, inplace=True)
df.info()  # before de-duplication

df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)  # renumber rows 0..n-1
df.info()  # after de-duplication

<class 'pandas.core.frame.DataFrame'>
Int64Index: 210 entries, 2 to 285
Data columns (total 1 columns):
Neighborhood    210 non-null object
dtypes: object(1)
memory usage: 13.3+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 1 columns):
Neighborhood    207 non-null object
dtypes: object(1)
memory usage: 1.7+ KB


In [10]:
# Place string passed to both the geocoder helper and the map helper below.
# NOTE(review): presumably the city context for the lookups / map centre -
# confirm against the local geocoder module.
address = 'Toronto, Ontario'

In [11]:
# NOTE(review): the helper's return value is not captured, yet the preview
# below shows new Latitude/Longitude columns - it appears to modify `df` in
# place. Confirm against the local geocoder module.
enrich_neighborhoods_with_geocoder(df, address)
df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Parkwoods,43.7588,-79.320197
1,Victoria Village,43.732658,-79.311189
2,Harbourfront,43.64008,-79.38015
3,Lawrence Heights,43.722778,-79.450933
4,Lawrence Manor,43.722079,-79.437507


In [17]:
# Examine new columns: compare the non-null counts of Latitude/Longitude with
# the total row count to see how many neighborhoods have no coordinates.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 3 columns):
Neighborhood    207 non-null object
Latitude        197 non-null float64
Longitude       197 non-null float64
dtypes: float64(2), object(1)
memory usage: 5.0+ KB


In [18]:
# Discard, in place, every row that contains a missing value, then renumber
# the surviving rows from zero.
df.dropna(axis=0, how='any', inplace=True)
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Parkwoods,43.758800,-79.320197
1,Victoria Village,43.732658,-79.311189
2,Harbourfront,43.640080,-79.380150
3,Lawrence Heights,43.722778,-79.450933
4,Lawrence Manor,43.722079,-79.437507
...,...,...,...
192,Kingsway Park South West,43.650352,-79.500009
193,Mimico NW,43.616677,-79.496805
194,The Queensway West,43.623618,-79.514764
195,Royal York South West,43.648183,-79.511296


### Visualize Toronto neighborhoods
<a id="vis-neighborhoods"></a>

In [19]:
# Build the map and leave it as the cell's last expression so the notebook
# renders it.
# NOTE(review): presumably a folium map of the neighborhoods centred on
# `address` - confirm against the local geocoder module.
m = map_neighborhoods(df, address)
m

_If viewing on GitHub, the folium maps may not render; please view this notebook on nbviewer instead:_ https://nbviewer.jupyter.org/

### Save neighborhood coordinates dataset

In [20]:
# Write the neighborhood coordinates to disk. The path is relative to the
# notebook's working directory, and the row index is written as the first
# CSV column (to_csv default).
df.to_csv('data/toronto_neighborhood_coords.csv')

In [14]:
# Optional: uncomment to reload the saved coordinates from disk.
# index_col=0 reads the first CSV column back as the row index.
# df = pd.read_csv('data/toronto_neighborhood_coords.csv', index_col=0)
# df

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Parkwoods,43.758800,-79.320197
1,Victoria Village,43.732658,-79.311189
2,Harbourfront,43.640080,-79.380150
3,Lawrence Heights,43.722778,-79.450933
4,Lawrence Manor,43.722079,-79.437507
...,...,...,...
192,Kingsway Park South West,43.650352,-79.500009
193,Mimico NW,43.616677,-79.496805
194,The Queensway West,43.623618,-79.514764
195,Royal York South West,43.648183,-79.511296
