# Segmenting and Clustering Neighborhoods in Toronto

## Data Wrangling

In [1]:
import pandas as pd
import numpy as np
import requests # library to handle requests

First, retrieve the table of Toronto postal codes from Wikipedia.

In [78]:
# Wikipedia page listing Canadian postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read_html returns a list with one DataFrame per HTML table found on the page;
# the first table is used as the raw data set.
wiki_tables = pd.read_html(url)
data = wiki_tables[0]
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Remove the rows whose borough is "Not assigned", then sort the remaining rows by postal code.

In [79]:
# Flag rows whose Borough text contains "Not assigned".
# - regex=False: treat the text as a literal substring, not a pattern.
# - na=True: rows with a missing Borough are flagged as well, so they are
#   dropped just like they were by the previous `== False` comparison.
is_unassigned = data['Borough'].str.contains("Not assigned", regex=False, na=True)

# Filter, sort and renumber in a single chain that yields a new frame.
# Sorting a filtered slice with inplace=True can raise SettingWithCopyWarning;
# chaining avoids that and keeps the cell safe to re-run.
data = (
    data[~is_unassigned]
    .sort_values(by='Postcode', ascending=True)
    .reset_index(drop=True)
)

In [151]:
# Preview the first rows of `data` (head() shows five rows by default).
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Rouge
1,M1B,Scarborough,Malvern
2,M1C,Scarborough,Port Union
3,M1C,Scarborough,Rouge Hill
4,M1C,Scarborough,Highland Creek


In [71]:
# Count how many rows of `data` belong to each borough, most frequent first.
data['Borough'].value_counts()

Etobicoke           45
North York          38
Scarborough         37
Downtown Toronto    37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

More than one neighborhood can exist in one postal code area, so group the rows by postal code and combine their neighborhoods into a single comma-separated entry.

In [150]:
# Work on a copy so the row-level frame `data` is left untouched.
data1 = data.copy()

# A row that has a borough but a "Not assigned" neighbourhood takes the
# borough's name. This is decided per row with an exact match *before*
# grouping, so a postcode that also has real neighbourhoods keeps them
# (a substring test on the already-joined text would overwrite them all).
neighbourhood_filled = data1['Neighbourhood'].where(
    data1['Neighbourhood'] != "Not assigned",
    data1['Borough'],
)

# One row per postcode: keep the first borough seen for the postcode and join
# all of its neighbourhood names into one comma-separated string.
# groupby sorts by the key, so the result is ordered by Postcode.
canada_data = (
    data1.assign(Neighbourhood=neighbourhood_filled)
    .groupby('Postcode')
    .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
    .reset_index()
)
canada_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [152]:
# (rows, columns) of the grouped frame: one row per postcode.
canada_data.shape

(103, 3)

The shape of our new dataframe is (103, 3)

---

In [160]:
# URL of the CSV file with coordinates; the preview below shows the columns
# "Postal Code", "Latitude" and "Longitude".
csv_path = 'https://cocl.us/Geospatial_data'
# Download and parse the CSV into a DataFrame (needs network access).
geocoder = pd.read_csv(csv_path)
# Preview the first rows (head() shows five rows by default).
geocoder.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [162]:
# (rows, columns) of the coordinates table, to compare with canada_data.shape.
geocoder.shape

(103, 3)

In [163]:
# Sort by postal code and renumber the rows 0..n-1.
# Without the renumbering the sort only changes the display order: assigning a
# column from one frame to another aligns on index labels, so the old labels
# would still pair rows up in their pre-sort order.
# A new frame is produced instead of sorting in place, so the cell is safe to re-run.
geocoder = geocoder.sort_values(by='Postal Code', ascending=True).reset_index(drop=True)

In [172]:
# Look coordinates up by postal code rather than by row position/index label:
# plain column assignment between two frames aligns on the index, which is only
# correct if both frames happen to list the postcodes in exactly the same order.
coords_by_postcode = geocoder.set_index('Postal Code')
canada_data['Latitude'] = canada_data['Postcode'].map(coords_by_postcode['Latitude'])
canada_data['Longitude'] = canada_data['Postcode'].map(coords_by_postcode['Longitude'])

# Fail loudly if any postcode has no matching coordinates instead of carrying
# silent NaNs into later steps.
missing_coords = canada_data[['Latitude', 'Longitude']].isna().any(axis=1)
assert not missing_coords.any(), (
    "No coordinates found for postcodes: "
    + ", ".join(canada_data.loc[missing_coords, 'Postcode'])
)

In [175]:
# Show the first ten rows of canada_data, including Latitude and Longitude.
canada_data.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Oakridge, Clairlea",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Scarborough Village West, Cliffside",43.716316,-79.239476
9,M1N,Scarborough,"Cliffside West, Birch Cliff",43.692657,-79.264848
