# Segmenting and Clustering Neighborhoods in Toronto

## Table of Contents



1. [Download and Explore Dataset](#1.-Download-and-Explore-Dataset)
2. [Explore and Visualize Neighborhoods in Toronto](#2.-Explore-and-Visualize-Neighborhoods-in-Toronto)
3. [Analyze Each Neighborhood](#3.-Analyze-each-Neighborhood)
4. [Cluster Neighborhoods](#4.-Cluster-Neighborhoods)
5. [Examine Clusters](#5.-Examine-Clusters)


In [44]:
import pandas as pd
import numpy as np
import geocoder
import requests
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
#packages.geo_utils.geocode
print('libraries imported')

libraries imported


In [45]:
print('done')

done


## 1. Download and Explore Dataset

#### Read table from html file

In [46]:
link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
res = requests.get(link) 
  
soup = BeautifulSoup(res.content, 'html5lib') 
results = soup.find('div', attrs = {'id':'container'})

#### obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe 

In [49]:
df_boroughs = results[0]
df_boroughs_drop_na = df_boroughs.drop(
    df_boroughs[df_boroughs.Borough == 'Not assigned'].index,
    axis=0).reset_index(drop=True)
print(df_boroughs_drop_na.shape)
df_boroughs_drop_na.head(12)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,Malvern / Rouge
7,M3B,North York,Don Mills
8,M4B,East York,Parkview Hill / Woodbine Gardens
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [50]:
postcodes = df_boroughs_drop_na['Postal code'].unique()


#### Combine Neighborghoods where more than one neighborhood exists in one postal code area

In [51]:
boroughs = []
neighborhoods = []

for code in postcodes:
    temp_postcode_df = df_boroughs_drop_na[df_boroughs_drop_na['Postal code'] == code]
    boroughs.append(temp_postcode_df['Borough'].sort_values().unique()[0])
    neighborhoods.append(', '.join(temp_postcode_df['Neighborhood'].sort_values().unique()))

In [52]:
tor_boroughs_combined = pd.DataFrame({'Postal code': postcodes, 'Borough': boroughs,
                                      'Neighborhood': neighborhoods}) 

In [54]:
print(tor_boroughs_combined.shape)
tor_boroughs_combined.head(10)

(103, 3)


Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,Malvern / Rouge
7,M3B,North York,Don Mills
8,M4B,East York,Parkview Hill / Woodbine Gardens
9,M5B,Downtown Toronto,"Garden District, Ryerson"


#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. 

In [55]:
# select all neighborhoods with Not Assigned and replace with value of Borough
tor_boroughs_combined.loc[
    tor_boroughs_combined[tor_boroughs_combined['Neighborhood'] == 'Not assigned'].index,'Neighborhood']= \
    tor_boroughs_combined[tor_boroughs_combined['Neighborhood'] == 'Not assigned']['Borough']

In [56]:
# Check sample if replaced
tor_boroughs_combined[tor_boroughs_combined['Borough'] == "Queen's Park"]

Unnamed: 0,Postal code,Borough,Neighborhood


In [57]:
tor_boroughs_combined.shape

(103, 3)

#### Define function to get latitude and longitude from postal code

In [58]:
def get_postal_code_lat_lon(postal_code):
    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
      g = geocoder.google('Toronto, ON {}, Canada'.format(postal_code))
      lat_lng_coords = g.latlng
    
    return lat_lng_coords

     latitude = lat_lng_coords[0]
     longitude = lat_lng_coords[1]

In [59]:
## Test function with sample postal code

In [60]:
# print('Sample latitude and longitude',get_postal_code_lat_lon('M4B'))

#### Use provide geocoded CSV due to issues with geocoder (failed above)

In [61]:
tor_boroughs_geocodes = pd.read_csv(r'https://cocl.us/Geospatial_data')
print('Shape geocodes', tor_boroughs_geocodes.shape)
tor_boroughs_geocodes.head(10)

Shape geocodes (103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


#### Join Geocoded Postal codes with previous dataframe records

In [69]:
tor_boroughs_geocoded =  tor_boroughs_combined.merge(dfgeocodes, how='left', left_on='Postcode', right_on='Postal Code')
tor_boroughs_geocoded.drop('Postal Code',axis=1,inplace=True)
print('Shape after joining lat lons geocodes', dfgeocodes.shape)
tor_boroughs_geocoded.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,Rouge,43.806686,-79.19435
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.16049
2,M1E,Scarborough,"West Hill, Morningside, Guildwood",43.763573,-79.18871
3,M1G,Scarborough,Woburn,43.770992,-79.21691
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.23947
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.26202
7,M1L,Scarborough,"Oakridge, Golden Mile, Clairlea",43.711112,-79.28457
8,M1M,Scarborough,"Scarborough Village West, Cliffside, Cliffcrest",43.716316,-79.239476
9,M1N,Scarborough,"Cliffside West, Birch Cliff",43.692657,-79.264848
