# Segmenting and Clustering Neighborhoods in Toronto

##### Install and import appropriate libraries

In [200]:
!pip -q install lxml
!pip -q install geopy
!pip -q install folium

In [201]:
import json # library to handle JSON files
import os # read API credentials from environment variables

import folium # map rendering library
import lxml
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests # library to handle requests
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans

##### Download Wikipedia data, rename columns appropriately, and ignore rows whose borough is "Not assigned"

In [202]:
# Read the first table of a pinned Wikipedia revision, keep only rows whose
# borough does not contain "Not assigned", renumber the rows and use the
# American spelling for the neighborhood column.
df_wiki = (
    pd.read_html('https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050')[0]
    .loc[lambda table: table['Borough'].str.find("Not assigned") == -1]
    .reset_index(drop=True)
    .rename(columns={'Neighbourhood': 'Neighborhood'})
)
df_wiki.head(1)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods


##### Download geospatial data

In [203]:
# Latitude/longitude lookup table, one row per postal code.
gs_data = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')
gs_data.head(n=1)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353


#####   Combine rows that share a postal code but list different neighborhoods. If a row's neighborhood is "Not assigned", use the borough name instead. Assign latitude/longitude by postcode.

In [204]:
# Work on a copy so df_wiki keeps the filtered Wikipedia rows untouched.
df_wiki_gs = df_wiki.copy()

# A neighborhood recorded as "Not assigned" takes the name of its borough.
unnamed = df_wiki_gs['Neighborhood'].str.contains('Not assigned', regex=False, na=False)
df_wiki_gs.loc[unnamed, 'Neighborhood'] = df_wiki_gs.loc[unnamed, 'Borough']

# Collapse to one row per postcode: keep the first borough and join all
# neighborhood names with ', '. sort=False keeps postcodes in order of first
# appearance. Grouping replaces the former 'Delete' sentinel rows, which would
# also have removed any real neighborhood whose name contains "Delete".
df_wiki_gs = (
    df_wiki_gs
    .groupby('Postcode', sort=False, as_index=False)
    .agg({'Borough': 'first', 'Neighborhood': ', '.join})
)

# Attach coordinates by postcode. A left merge keeps postcodes that have no
# geodata (their Latitude/Longitude stay NaN). If a postal code were listed
# more than once in gs_data, the last entry wins.
postcode_coordinates = (
    gs_data
    .drop_duplicates(subset='Postal Code', keep='last')
    .rename(columns={'Postal Code': 'Postcode'})
    [['Postcode', 'Latitude', 'Longitude']]
)
df_wiki_gs = df_wiki_gs.merge(postcode_coordinates, on='Postcode', how='left')

In [205]:
# Number of distinct boroughs in the Wikipedia table and number of rows
# (one per postcode) in the combined table.
borough_count = len(df_wiki['Borough'].unique())
combined_row_count = df_wiki_gs.shape[0]
summary_template = 'The dataframe has {} boroughs and {} neighborhoods.'
print(summary_template.format(borough_count, combined_row_count))

The dataframe has 10 boroughs and 103 neighborhoods.


# Deliverable #1

In [206]:
# First 12 rows of the combined table, without the coordinate columns.
df_wiki_gs[['Postcode', 'Borough', 'Neighborhood']].head(n=12)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


# Deliverable #2

In [207]:
# First 12 rows of the combined table, including latitude and longitude.
df_wiki_gs.head(n=12)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


##### Get Toronto's coordinates

In [208]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the address cannot be resolved; fail with a
# clear message instead of an AttributeError on `.latitude` below.
if location is None:
    raise ValueError(f'Could not geocode address: {address!r}')

latitude = location.latitude
longitude = location.longitude
print(f'The geograpical coordinate of {address} are latitude {latitude}, longitude {longitude}')

The geograpical coordinate of Toronto, Canada are latitude 43.653963, longitude -79.387207


##### I want to explore various areas of Toronto by postcode. Let's generate a map of Toronto with markers for the postcodes

In [209]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle marker per postcode; the popup shows "<postcode>, <borough>".
marker_fields = df_wiki_gs[['Latitude', 'Longitude', 'Borough', 'Postcode']]
for lat, lng, borough, postcode in marker_fields.itertuples(index=False, name=None):
    popup = folium.Popup(f'{postcode}, {borough}', parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [210]:
# Foursquare credentials are read from environment variables so that secrets
# are never stored in the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded here has been exposed
# in the notebook history and should be revoked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')

##### Create a function to get venues by coordinates and use it to get all venues for the coordinates we have

In [211]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    """Query the Foursquare "explore" endpoint around each coordinate.

    Parameters
    ----------
    names : iterable
        Label for each location (used here as the postcode); copied into the
        'Postcode' column of every venue found around that location.
    latitudes, longitudes : iterable
        Coordinates of each location, aligned with ``names``.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.
    LIMIT : int, default 100
        Value sent as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Postcode', 'Postcode Latitude',
        'Postcode Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude'
        and 'Venue Category'. The frame has these columns even when no venue
        is found, so downstream code can rely on them.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (e.g. invalid credentials).
    requests.Timeout
        If a request does not complete within the timeout.
    """
    venue_columns = ['Postcode',
                     'Postcode Latitude',
                     'Postcode Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string.
        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # A timeout keeps one stalled request from hanging the whole notebook;
        # raise_for_status surfaces API errors instead of a KeyError below.
        response = requests.get(endpoint, params=query, timeout=30)
        response.raise_for_status()

        groups = response.json()['response']['groups']
        results = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category gets a missing value
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names to the constructor also works for zero rows,
    # where assigning `.columns` afterwards would raise a length mismatch.
    return pd.DataFrame(venue_rows, columns=venue_columns)

In [212]:
# Fetch venues around every postcode; the postcode serves as the location name.
toronto_venues = getNearbyVenues(
    names=df_wiki_gs['Postcode'],
    latitudes=df_wiki_gs['Latitude'],
    longitudes=df_wiki_gs['Longitude'],
)

In [213]:
# (rows, columns) of the venue table, followed by its first row.
venue_table_shape = toronto_venues.shape
print(venue_table_shape)
toronto_venues.head(n=1)

(2226, 7)


Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park


##### Analyze Each area defined by a Postcode

In [214]:
# One indicator column per venue category (no prefix on the column names).
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Add the postcode of each venue back to the encoded frame.
toronto_onehot['Postcode'] = toronto_venues['Postcode']

# Move the last column to the front, keeping the other columns in order.
*leading_columns, trailing_column = toronto_onehot.columns
fixed_columns = [trailing_column, *leading_columns]
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.tail()

Unnamed: 0,Postcode,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
2221,M8Z,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2222,M8Z,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2223,M8Z,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2224,M8Z,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2225,M8Z,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### Group rows by postcode, taking the mean frequency of occurrence of each category

In [215]:
# Mean of each category indicator per postcode, i.e. the share of that
# postcode's venues falling into each category.
toronto_grouped = toronto_onehot.groupby('Postcode', as_index=False).mean()
toronto_grouped.head()

Unnamed: 0,Postcode,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [216]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first element of ``row`` is skipped (callers pass rows whose first
    entry is the postcode); the remaining entries are sorted in descending
    order and the index labels of the leading ones are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    labels = ranked.index.values
    return labels[:num_top_venues]

##### Let's see the 10 most common venues in each postcode

In [217]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Column headers: 'Postcode' followed by '1st', '2nd', '3rd', '4th', ... labels.
# Ranks beyond the explicit suffix list use 'th'. The explicit bounds check
# replaces a bare `except:`, which would also have hidden unrelated errors.
columns = ['Postcode']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# Rank the venue categories of every postcode, then build the table in one go
# instead of filling it row by row.
top_venues_per_postcode = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in range(toronto_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(
    top_venues_per_postcode,
    columns=columns[1:],
    index=toronto_grouped.index,
)
neighborhoods_venues_sorted.insert(0, 'Postcode', toronto_grouped['Postcode'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,Yoga Studio,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Donut Shop
1,M1C,Bar,Yoga Studio,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Donut Shop,Falafel Restaurant
2,M1E,Electronics Store,Mexican Restaurant,Spa,Medical Center,Bank,Rental Car Location,Intersection,Yoga Studio,Discount Store,Dim Sum Restaurant
3,M1G,Coffee Shop,Soccer Field,Korean Restaurant,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
4,M1H,Gas Station,Bakery,Fried Chicken Joint,Caribbean Restaurant,Athletics & Sports,Thai Restaurant,Bank,Hakka Restaurant,Drugstore,Donut Shop


##### Cluster postcodes

In [218]:
# set number of clusters
kclusters = 5

# Cluster on the venue-frequency columns only. `columns=` is used because the
# positional axis argument of DataFrame.drop was removed in pandas 2.0.
toronto_grouped_clustering = toronto_grouped.drop(columns='Postcode')

# Run k-means clustering. n_init is set explicitly so the number of
# initialisations does not depend on the installed scikit-learn version.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4,
       0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       4, 0, 0, 2, 0, 0, 0, 4, 0, 4, 0, 0])

In [219]:
# Add clustering labels as the first column. Any labels left from a previous
# run of this cell are dropped first, so the cell can be re-run without
# DataFrame.insert raising because the column already exists.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')

neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_.astype(int))

In [220]:
# Attach the cluster label and top-venue columns to each postcode's borough,
# neighborhood and coordinates, then discard every row that has a missing value
# (for example a postcode with no entry in the venue table).
toronto_merged = (
    df_wiki_gs
    .join(neighborhoods_venues_sorted.set_index('Postcode'), on='Postcode')
    .dropna(axis=0)
)
toronto_merged.head(1)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,4.0,Park,Food & Drink Shop,Yoga Studio,Dog Run,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Doner Restaurant


# Deliverable #3

In [221]:
# Base map centred on the geocoded Toronto coordinates.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One hex colour per cluster, sampled evenly from the rainbow colormap.
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]

colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = list(map(colors.rgb2hex, colors_array))

# One circle marker per postcode, coloured by cluster.
markers_colors = []
cluster_fields = toronto_merged[['Latitude', 'Longitude', 'Postcode', 'Cluster Labels']]
for lat, lon, poi, cluster in cluster_fields.itertuples(index=False, name=None):
    # The palette index is cluster - 1, so cluster 0 wraps around to the
    # last colour of the palette.
    cluster_color = rainbow[int(cluster)-1]
    popup = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=popup,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7,
    )
    marker.add_to(map_clusters)

map_clusters

##### What are the most common venues in the 5 clusters ?

In [233]:
# For every cluster, print the 1st and the 4th most common venue category of
# the first five postcodes in that cluster (one line per column).
# Iterating over kclusters keeps this cell in sync with the clustering cell,
# and the columns are named explicitly instead of being picked by position.
summary_columns = ['1st Most Common Venue', '4th Most Common Venue']
for i in range(kclusters):
    print('\033[1;43m'+'Cluster',i,'\033[0m')
    in_cluster = toronto_merged['Cluster Labels'] == i
    for column in summary_columns:
        print(', '.join(toronto_merged.loc[in_cluster, column].head(5)))

[1;43mCluster 0 [0m
Hockey Arena, Coffee Shop, Clothing Store, Coffee Shop, Fast Food Restaurant
Portuguese Restaurant, Park, Coffee Shop, Creperie, Dessert Shop
[1;43mCluster 1 [0m
Garden
Dessert Shop
[1;43mCluster 2 [0m
Gift Shop
Dessert Shop
[1;43mCluster 3 [0m
Cafeteria
Dim Sum Restaurant
[1;43mCluster 4 [0m
Park, Park, Park, Park, Park
Dog Run, Yoga Studio, Dog Run, Donut Shop, Yoga Studio


##### Where would I like to move? I think I would like cluster 4 (the yellow markers on the map)