# Segmenting and Clustering Neighborhoods in Toronto

##### Install and import appropriate libraries

In [1]:
!pip -q install lxml
!pip -q install geopy
!pip -q install folium

In [2]:
import json # library to handle JSON files
import os # read API credentials from the environment
from getpass import getpass # prompt for credentials that are not in the environment

import folium # map rendering library
import lxml
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests # library to handle requests
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
try:
    from pandas import json_normalize # tranform JSON file into a pandas dataframe
except ImportError: # older pandas only exposes it under pandas.io.json
    from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

##### Download Wikipedia data, rename columns appropriately, and drop rows whose borough is "Not assigned"

In [3]:
# Read the first table of a pinned revision of the Wikipedia page.
wiki_url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050'
wiki_table = pd.read_html(wiki_url)[0]

# Keep only rows whose borough text does not contain "Not assigned",
# renumber the rows and use the US spelling for the neighbourhood column.
borough_assigned = wiki_table['Borough'].str.find("Not assigned") == -1
df_wiki = (
    wiki_table[borough_assigned]
    .reset_index(drop=True)
    .rename(columns={'Neighbourhood': 'Neighborhood'})
)
df_wiki.head(1)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods


##### Download geospatial data

In [None]:
# Fetch the coordinate table used further down to attach a latitude and
# longitude to each postcode.
geospatial_csv_url = 'https://cocl.us/Geospatial_data'
gs_data = pd.read_csv(geospatial_csv_url)
gs_data.head(n=1)

##### Combine rows that share a postal code into a single row with comma-separated neighborhoods. If a row's neighborhood is "Not assigned", use the borough name instead. Assign latitude/longitude by postcode.

In [None]:
# Build df_wiki_gs: one row per postcode with the columns
# Postcode, Borough, Neighborhood (comma-joined), Latitude, Longitude.
df_wiki_gs = df_wiki.copy()

# A neighbourhood recorded as "Not assigned" takes the name of its borough.
# This is done for every row before combining, so the placeholder can never
# end up inside a combined neighbourhood string.
unassigned = df_wiki_gs['Neighborhood'].str.contains('Not assigned', regex=False, na=False)
df_wiki_gs.loc[unassigned, 'Neighborhood'] = df_wiki_gs.loc[unassigned, 'Borough']

# Collapse all rows that share a postcode: keep the first borough and join
# the neighbourhood names with ', '. sort=False keeps postcodes in order of
# first appearance. No sentinel value is written into the data, so a
# neighbourhood whose name happens to contain a marker word is not lost.
df_wiki_gs = (
    df_wiki_gs
    .groupby('Postcode', sort=False, as_index=False)
    .agg({'Borough': 'first', 'Neighborhood': ', '.join})
)

# gs_data is addressed by position: column 0 is matched against the
# postcode, columns 1 and 2 supply latitude and longitude.
postcode_coords = gs_data.iloc[:, :3].copy()
postcode_coords.columns = ['Postcode', 'Latitude', 'Longitude']
# If a postcode is listed more than once, the last listing wins.
postcode_coords = postcode_coords.drop_duplicates(subset='Postcode', keep='last')

# Left merge: every postcode row is kept, in order; postcodes without a
# match get NaN coordinates.
df_wiki_gs = df_wiki_gs.merge(postcode_coords, on='Postcode', how='left')

In [None]:
# Report the number of distinct boroughs in df_wiki and the number of rows
# in df_wiki_gs.
borough_count = len(df_wiki['Borough'].unique())
combined_row_count = df_wiki_gs.shape[0]
summary_template = 'The dataframe has {} boroughs and {} neighborhoods.'
print(summary_template.format(borough_count, combined_row_count))

# Deliverable #1

In [None]:
# First 12 rows, restricted to the three descriptive columns.
df_wiki_gs[['Postcode', 'Borough', 'Neighborhood']].head(n=12)

# Deliverable #2

In [None]:
# First 12 rows with every column, coordinates included.
df_wiki_gs.head(n=12)

##### Get Toronto's coordinates

In [None]:
# Look up the coordinates used to centre the maps below.
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError(f'Could not geocode {address!r}: the geocoder returned no result')
latitude = location.latitude
longitude = location.longitude
print(f'The geograpical coordinate of {address} are latitude {latitude}, longitude {longitude}')

##### I want to explore various areas of Toronto by postcode. Let's generate a map of Toronto with markers for the postcodes

In [None]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Appearance shared by every postcode marker.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle per row of df_wiki_gs, with a "<postcode>, <borough>" popup.
marker_fields = ('Latitude', 'Longitude', 'Borough', 'Postcode')
for lat, lng, borough, postcode in zip(*(df_wiki_gs[field] for field in marker_fields)):
    popup = folium.Popup('{}, {}'.format(postcode, borough), parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=popup, **marker_style)
    marker.add_to(map_toronto)

map_toronto

In [None]:
# Foursquare credentials are secrets and must not be stored in the notebook.
# They are taken from the environment when available; otherwise the user is
# prompted (input is not echoed, so the values do not end up in cell output).
# Any key pair that was previously saved in this notebook should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare client ID: ') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare client secret: ') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

##### Create a function to get venues by coordinates and use it to get all venues for the coordinates we have

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100, timeout=10):
    """Query the Foursquare "explore" endpoint once per location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location; they are walked in parallel.
    radius : int
        Value sent as the ``radius`` query parameter.
    LIMIT : int
        Value sent as the ``limit`` query parameter.
    timeout : float
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Postcode', 'Postcode Latitude',
        'Postcode Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude'
        and 'Venue Category'. The frame has these columns even when no
        venue was found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    venue_columns = ['Postcode',
                     'Postcode Latitude',
                     'Postcode Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
    url = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; surface HTTP errors instead of failing later
        # with a KeyError on the missing payload
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category is kept, with a missing category
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema intact when
    # there are no rows at all (assigning .columns afterwards would raise).
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)

    return(nearby_venues)

In [None]:
# Collect the venues around every postcode row of df_wiki_gs; the postcode
# is used as the label of each location.
toronto_venues = getNearbyVenues(
    names=df_wiki_gs['Postcode'],
    latitudes=df_wiki_gs['Latitude'],
    longitudes=df_wiki_gs['Longitude'],
)

In [None]:
# Size of the venue table, followed by its first row.
venue_table_shape = toronto_venues.shape
print(venue_table_shape)
toronto_venues.head(n=1)

##### Analyze Each area defined by a Postcode

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Postcode'] = toronto_venues['Postcode'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.tail()

##### Group rows by postcode and take the mean frequency of occurrence of each category

In [None]:
# Average the indicator columns per postcode; 'Postcode' stays a regular
# column rather than becoming the index.
toronto_grouped = toronto_onehot.groupby('Postcode', as_index=False).mean()
toronto_grouped.head()

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, skipping its first entry.

    The first element of ``row`` is ignored, the remaining values are sorted
    in descending order, and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

##### Let's see the 10 most common venues in each postcode

In [None]:
num_top_venues = 10


def _ordinal(position):
    """Return ``position`` with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    if 10 <= position % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(position % 10, 'th')
    return '{}{}'.format(position, suffix)


# column names: 'Postcode' followed by one column per rank
columns = ['Postcode'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# top categories of every row of toronto_grouped, collected first so the
# frame is built in one step instead of being filled row by row
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in range(toronto_grouped.shape[0])
]

# create the new dataframe, aligned row for row with toronto_grouped
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=toronto_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Postcode', toronto_grouped['Postcode'])

neighborhoods_venues_sorted.head()

##### Cluster postcodes

In [None]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Postcode', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

In [None]:
# add clustering labels

# Put the cluster label of each postcode in the first column. insert() raises
# if the column already exists, so drop a previous copy first; this keeps the
# cell safe to re-run after re-fitting the model.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_.astype(int))

In [None]:
# Attach the cluster label and the ranked venue columns to every postcode row
# of df_wiki_gs (which carries borough, neighbourhood and coordinates).
# Rows with any missing value - e.g. postcodes without a cluster - are dropped.
# The join introduces NaN for unmatched postcodes, which turns the integer
# labels into floats; once those rows are gone the labels are cast back to int.
toronto_merged = (
    df_wiki_gs
    .join(neighborhoods_venues_sorted.set_index('Postcode'), on='Postcode')
    .dropna(axis=0)
    .astype({'Cluster Labels': int})
)
toronto_merged.head(1)

# Deliverable #3

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]

colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Postcode'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
     
map_clusters

##### What are the most common venues in the 5 clusters ?

In [None]:
#pd.set_option('expand_frame_repr', True)
for i in range(0,5):
   print('\033[1;43m'+'Cluster',i,'\033[0m')
   for j in (3,6):
        print (', '.join(toronto_merged.loc[toronto_merged['Cluster Labels'] == i, toronto_merged.columns[[0,1] + list(range(5,10))][j]].head(5)))

##### Where would I like to move ?  I think I would like cluster 4  (on the map the yellow markers)