***
# Applied Data Science Capstone Project
## Week 3 -  Segmenting and Clustering Neighborhoods in Toronto
## Notebook 3 - Explore and cluster the neighborhoods in Toronto
***

### Import Packages

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## Part 1 - Preparing Canada Postal Data

### Read the postal codes of Canada using Wikipedia page

In [2]:
# Read Canada Postal Code from Wikipedia table
#
can_post_URL  = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df_can = pd.read_html(can_post_URL)[0]
df_can.rename(columns = {'Postcode':'PostalCode'}, inplace = True) 
df_can.rename(columns = {'Neighbourhood':'Neighborhood'}, inplace = True) 
print('Dimension of the Dataframe is', df_can.shape)
df_can.head()

Dimension of the Dataframe is (287, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Filter Data
- Ignore cells with a borough that is Not assigned
- Borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
- Combines neighborhoods that exist in one postal code area (Separated with a comma)

In [3]:
# Drop "Not assigned" Boroughs
#  
df_can_filter1 = df_can[(df_can['Borough'] != 'Not assigned')]
df_can_filter1.reset_index(drop=True, inplace=True)
print('The Dataframe Dimension after dropping Boroughs with "Not Assigned" values is',df_can_filter1.shape)
df_can_filter1.head()

The Dataframe Dimension after dropping Boroughs with "Not Assigned" values is (210, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [4]:
# Change "Not assigned" Neighborhood with the Borough Value
#
df_can_filter2=df_can_filter1.copy()
df_can_filter2.loc[df_can_filter2['Neighborhood'] == 'Not assigned', 'Neighborhood'] = df_can_filter2['Borough']
df_can_filter2.reset_index(drop=True, inplace=True)
df_can_filter2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [5]:
# Merge Rows with same PostalCode
#
df_can_filter2=df_can_filter2.sort_values(by ='Neighborhood', ascending=[False] )
df_can_group = df_can_filter2.groupby(['PostalCode']).agg({'Borough':'first','Neighborhood': ', '.join }) 
df_can_group = df_can_group.reset_index()
df_can_group.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"West Hill, Morningside, Guildwood"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Display the dimension (Shape) of the final Dataframe

In [6]:
# Display final Dataframe Dimension
#
table_dim = df_can_group.shape
print('The Final Dataframe Dimension is',table_dim)

The Final Dataframe Dimension is (103, 3)


***
## Part 2 - Obtain Latitude and Longitud Data
***

In [7]:
# Read Geo Data from the file provided in the instructions
#
geo_data='http://cocl.us/Geospatial_data'
df_geo=pd.read_csv(geo_data,float_precision='round_trip')
df_geo = df_geo.rename(columns={'Postal Code': 'PostalCode'})  #Rename the Column to match the postal code dataframe
df_geo.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the two dataframes

In [8]:
# Merge the two dataframes
#
df_geo_combine = pd.merge(df_can_group, df_geo)
df_geo_combine=df_geo_combine.sort_values(by ='PostalCode')
df_geo_combine.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"West Hill, Morningside, Guildwood",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Display the dimension (Shape) of the final Geo Dataframe

In [9]:
# Display the Geo Dataframe Dimension
table_geo_dim = df_geo_combine.shape
print('The Final Merged Dataframe Dimension is =',table_geo_dim)

The Final Merged Dataframe Dimension is = (103, 5)


## Part 3 - Explore and Cluster the neighborhoods in Toronto

### Create a Dataframe of Toronto Boroughs

In [10]:
#  FIlter Toronto Borough
#
df_toronto = df_geo_combine[df_geo_combine['Borough'].str.contains("Toronto")]
df_toronto = df_toronto.sort_values(by ='Borough')
df_toronto = df_toronto.reset_index(drop=True)
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5R,Central Toronto,"Yorkville, The Annex, North Midtown",43.67271,-79.405678
1,M5P,Central Toronto,"Forest Hill West, Forest Hill North",43.696948,-79.411307
2,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
3,M4P,Central Toronto,Davisville North,43.712751,-79.390197
4,M4R,Central Toronto,North Toronto West,43.715383,-79.405678


### In the following two cells I separate  the Neighborhood column and eliminate the Commas in the df_toronto dataframe only to get the  unique count of Neighborhoods 

In [11]:
def explode_str(df, col, sep):
    s = df[col]
    i = np.arange(len(s)).repeat(s.str.count(sep) + 1)
    return df.iloc[i].assign(**{col: sep.join(s).split(sep)})

def explode_list(df, col):
    s = df[col]
    i = np.arange(len(s)).repeat(s.str.len())
    return df.iloc[i].assign(**{col: np.concatenate(s)})

In [12]:
df_toronto_neigh = explode_str(df_toronto, 'Neighborhood', ',')
df_toronto_neigh = df_toronto_neigh.reset_index(drop=True)
df_toronto_neigh.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5R,Central Toronto,Yorkville,43.67271,-79.405678
1,M5R,Central Toronto,The Annex,43.67271,-79.405678
2,M5R,Central Toronto,North Midtown,43.67271,-79.405678
3,M5P,Central Toronto,Forest Hill West,43.696948,-79.411307
4,M5P,Central Toronto,Forest Hill North,43.696948,-79.411307


In [13]:
print('The Neighborhoods Dataframe Dimension is',table_geo_dim)

The Neighborhoods Dataframe Dimension is (103, 5)


### Summary of numbers of Postal Codes, Boroughs and Neigboorhoods

In [14]:
n_postals  = df_toronto_neigh['PostalCode'].nunique()
n_boroughs = df_toronto_neigh['Borough'].nunique()
n_neighs   = df_toronto_neigh['Neighborhood'].shape[0]
print('-------------------------------------------------------')
print('Number of postal codes   = ',n_postals)
print('Number of Boroughs       =  ',n_boroughs)
print('Number of Neighborhoods  = ',n_neighs)
print('-------------------------------------------------------')

-------------------------------------------------------
Number of postal codes   =  39
Number of Boroughs       =   4
Number of Neighborhoods  =  74
-------------------------------------------------------


### Use GeoPy to get the city of Toronto Geo Coordinates

In [15]:
address = 'Toronto, CA'
geolocator = Nominatim(user_agent="ca_explorer")
location = geolocator.geocode(address)
tor_latitude = location.latitude
tor_longitude = location.longitude
print('The geograpical coordinate of Toronto, Canada are {}, {}.'.format(tor_latitude, tor_longitude))

The geograpical coordinate of Toronto, Canada are 43.653963, -79.387207.


## Toronto Neighborhoods Map
### Note:  I use the Combine Neighborhood Dataframe because I have the Geo Coordinates by Postal Code thus I grouped the cluster by Borough and group of Neibourhoods pertaining to a Borough

In [16]:
# create map of Toronto using latitude and longitude values
#
map_toronto = folium.Map(location=[tor_latitude, tor_longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'],
                                           df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto