# Segmenting and Clustering Neighborhoods in Toronto

## Part 1

### Step 1 - Import the required dependencies

In [1]:
# Pandas & Numpy
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# JSON files manipulation
# `json_normalize` has lived in the top-level pandas namespace since pandas 1.0;
# the `pandas.io.json` import path is deprecated and may be missing on newer
# releases. Prefer the current location and fall back to the legacy one so the
# notebook also runs on older pandas versions.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize
import json
# Requests library
import requests
# Clustering library
from sklearn.cluster import KMeans
# Plotting library
import matplotlib.cm as cm
import matplotlib.colors as colors
# Maps library
import folium
print('Dependencies imported')

Dependencies imported


### Step 2 - Extract, transform and scrape the data into a pandas DataFrame

#### a) Assign the URL variable and extract content

In [2]:
# Source page: Wikipedia's list of Canadian postal codes beginning with "M".
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# requests does not time out by default, so bound the wait explicitly
# instead of letting the cell hang on a stalled connection.
wiki=requests.get(url,timeout=30)
# Stop here on an HTTP error (4xx/5xx) rather than passing an error page
# to the HTML table parser in the next step.
wiki.raise_for_status()

#### b) Transform the content into a pandas DataFrame (`read_html` returns a list of tables, so `[0]` selects the first one)

In [3]:
# read_html parses the HTML tables in the downloaded page and returns a list of
# DataFrames; header=0 takes the column names from the first row, and [0] keeps
# the first table in that list.
df=pd.read_html(wiki.content,header=0)[0]

#### c) Start the cleaning process by filtering the DataFrame into a new one without 'Not assigned' values in the 'Borough' column

In [4]:
# Keep only the rows whose borough is something other than 'Not assigned'.
df1 = df.loc[df['Borough'].ne('Not assigned')]

#### d) Combine the 'Neighborhood' values that share the same postal code

In [5]:
# Collapse the table to one row per (postal code, borough) pair, joining the
# neighborhood names that share a pair into one comma-separated string, then
# turn the group keys back into ordinary columns and normalise the key name.
df2 = (
    df1.groupby(['Postal code', 'Borough'])['Neighborhood']
    .apply(','.join)
    .reset_index()
    .rename(columns={'Postal code': 'PostalCode'})
)
df2.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West
9,M1N,Scarborough,Birch Cliff / Cliffside West


#### e) Replace any 'Not assigned' values in the 'Neighborhood' column with the corresponding 'Borough'

In [6]:
# Where a row has no neighborhood name ('Not assigned'), fall back to the
# borough of the SAME row. The replacement values come from df2 itself so they
# are aligned row by row; df1.Borough belongs to the pre-grouping frame, whose
# index does not correspond to df2's rows. Assigning the column back (rather
# than an in-place replace on an attribute-accessed column) guarantees the
# change is actually stored in df2.
df2['Neighborhood']=df2['Neighborhood'].mask(df2['Neighborhood'].eq('Not assigned'),df2['Borough'])

#### f) Display the shape of the DF

In [7]:
# (rows, columns) of the cleaned table: one row per postal code / borough pair.
df2.shape

(103, 3)

## Part 2

### Step 3 GEOCODER

#### a) Assign the URL variable and extract content

In [8]:
# CSV with one row of coordinates per postal code
# (columns: Postal Code, Latitude, Longitude).
url2='https://cocl.us/Geospatial_data'
# read_csv downloads and parses the file directly from the URL.
dfg=pd.read_csv(url2)
# Preview the first rows to confirm the column layout.
dfg.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### b) Compare shape between DF2 and DFG

In [9]:
# Quick sanity check before merging: both tables have the same (rows, columns).
# NOTE(review): equal shapes do not prove the two tables contain the same
# postal codes — consider also comparing the key columns themselves.
df2.shape==dfg.shape

True

#### c) Merge the two DFs

In [10]:
# Attach Latitude/Longitude to each row by looking up its postal code.
# The coordinates table is indexed by its 'Postal Code' column so that join()
# can match it against df2's 'PostalCode' column.
geo_by_postal_code=dfg.set_index('Postal Code')
# df2 is reassigned from itself, so drop any coordinate columns left over from
# a previous run first; without this, re-running the cell fails because the
# joined column names overlap. On the first run the drop is a no-op.
df2=df2.drop(columns=geo_by_postal_code.columns,errors='ignore').join(geo_by_postal_code,on='PostalCode')
df2.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park,43.727929,-79.262029
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge,43.711112,-79.284577
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West,43.716316,-79.239476
9,M1N,Scarborough,Birch Cliff / Cliffside West,43.692657,-79.264848
