# Toronto Neighborhood Clustering

In [1]:
!pip install beautifulsoup4
!pip install geocoder
!pip install folium



In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import geocoder
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

## Scraping of data from the web

I now try to use the `read_html` method from **pandas** to obtain data about the neighborhoods in Toronto.

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
tables = pd.read_html(url)[0] #getting only the first table
tables.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [4]:
tbl_munge = tables[tables['Borough'] != 'Not assigned'] \
.groupby(['Postcode', 'Borough']) \
.apply(lambda x: ', '.join(x['Neighbourhood'])) \
.reset_index()

tbl_munge.rename(columns = {'Postcode': 'PostalCode', 0 : 'Neighborhood'}, inplace= True)
tbl_munge.head()



Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Shown below is the dimension of the newly made table:

In [5]:
tbl_munge.shape

(103, 3)

## Determining longitude and latitude using `geocoder` library

Since I am getting mostly a denial of request from the API calls in geocoder, I opted to just use the provided csv file for geospatial data.

In [6]:
geo_dat = pd.read_csv('https://cocl.us/Geospatial_data')
geo_dat.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
tbl_fnl = tbl_munge.join(geo_dat.set_index(['Postal Code']), 
              on = 'PostalCode', how = 'left')

tbl_fnl.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [8]:
tbl_fnl.shape

(103, 5)

## Recreating clustering and map visualization

Similar to what was done in the New York neighborhoods, the same analyses will be done but with Toronto Neighborhoods. This will involve clustering according to establishments in each and ultimately visualizing clusters using a folium map.

### API calls to Foursquare

In [9]:
CLIENT_ID = 'YBKPFMJPAJJHYUKA3I0E5GBQIFPENK2R54Y0YE3D5YBPNCWU' #  Foursquare ID
CLIENT_SECRET = 'SEUXNGKDKKGT2H4EI1LXTRN4X5XQOZSSQMEOJUMLEZV1IHHE' #  Foursquare Secret
VERSION = '20180605' #  API version
LIMIT = 100

In [10]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [12]:
toronto_venues = getNearbyVenues(names=tbl_fnl['Neighborhood'],
                                   latitudes=tbl_fnl['Latitude'],
                                   longitudes=tbl_fnl['Longitude'])

Rouge, Malvern
Highland Creek, Rouge Hill, Port Union
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Clairlea, Golden Mile, Oakridge
Cliffcrest, Cliffside, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Scarborough Town Centre, Wexford Heights
Maryvale, Wexford
Agincourt
Clarks Corners, Sullivan, Tam O'Shanter
Agincourt North, L'Amoreaux East, Milliken, Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
Silver Hills, York Mills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Bathurst Manor, Downsview North, Wilson Heights
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens, Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West, 

In [13]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Chris Effects Painting,43.784343,-79.163742,Construction & Landscaping
2,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Marina Spa,43.766,-79.191,Spa


In [14]:
toronto_venues.shape

(2228, 7)

### Analysis of each Neighborhood

In [15]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix = '', prefix_sep = '')

# append Venue name and neighborhood
toronto_onehot.insert(0, 'Neighborhood_col', toronto_venues['Neighborhood'])
toronto_onehot.head()

Unnamed: 0,Neighborhood_col,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Highland Creek, Rouge Hill, Port Union",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Highland Creek, Rouge Hill, Port Union",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


The one-hot encoded variables will be aggregated by taking the mean of each variable by Neighborhoo|d

In [16]:
toronto_onehot.Neighborhood_col.value_counts()

Adelaide, King, Richmond                                                                                                                  100
Ryerson, Garden District                                                                                                                  100
First Canadian Place, Underground city                                                                                                    100
Commerce Court, Victoria Hotel                                                                                                            100
Design Exchange, Toronto Dominion Centre                                                                                                  100
Harbourfront East, Toronto Islands, Union Station                                                                                         100
St. James Town                                                                                                                            100
Stn A 

In [17]:
toronto_onehot_grouped = toronto_onehot.groupby('Neighborhood_col').mean().reset_index()
toronto_onehot_grouped.shape

(99, 269)

In [18]:
from sklearn.cluster import KMeans

X = toronto_onehot_grouped.drop('Neighborhood_col',axis = 1)
clstr = KMeans(n_clusters = 5)
clstr.fit(X)

toronto_onehot_grouped['Cluster'] = clstr.fit_predict(X)

Determine the top *n* venue categories per Neighborhood

In [19]:
def ordinal(value):

    try:
        value = int(value)
    except ValueError:
        return value

    if value % 100//10 != 1:
        if value % 10 == 1:
            ordval = u"%d%s" % (value, "st")
        elif value % 10 == 2:
            ordval = u"%d%s" % (value, "nd")
        elif value % 10 == 3:
            ordval = u"%d%s" % (value, "rd")
        else:
            ordval = u"%d%s" % (value, "th")
    else:
        ordval = u"%d%s" % (value, "th")

    return ordval

In [20]:
toronto_grp_cnt = toronto_venues.groupby(['Neighborhood', 'Venue Category']) \
.size().reset_index().rename(columns= {0:'count'}) \
.sort_values(by = ['Neighborhood', 'count'], axis = 0, ascending = False) \
.groupby(['Neighborhood', 'count']) \
.apply(lambda x: ', '.join(x['Venue Category'])) \
.reset_index().rename(columns = {0:'Venue'})

toronto_grp_cnt['rank_'] = toronto_grp_cnt.groupby('Neighborhood').cumcount() + 1
toronto_grp_cnt['venue_rank'] = toronto_grp_cnt.rank_.apply(ordinal)


## length sequence 

In [21]:
toronto_grp_cnt.head()

Unnamed: 0,Neighborhood,count,Venue,rank_,venue_rank
0,"Adelaide, King, Richmond",1,"Brazilian Restaurant, Building, Burger Joint, ...",1,1st
1,"Adelaide, King, Richmond",2,"American Restaurant, Asian Restaurant, Bakery,...",2,2nd
2,"Adelaide, King, Richmond",3,"Bar, Steakhouse, Sushi Restaurant",3,3rd
3,"Adelaide, King, Richmond",4,"Café, Thai Restaurant",4,4th
4,"Adelaide, King, Richmond",5,Restaurant,5,5th


A pivot table was made to display the highest ranked venues in terms of frequencies by Neighborhood. This is for easy determination of characteristics within a cluster.

In [22]:
toronto_grp_pivot = toronto_grp_cnt.query('rank_ <= 5') \
.pivot(index = 'Neighborhood', columns = 'venue_rank', values = 'Venue')

In [23]:
toronto_grp_pivot.head()

venue_rank,1st,2nd,3rd,4th,5th
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Adelaide, King, Richmond","Brazilian Restaurant, Building, Burger Joint, ...","American Restaurant, Asian Restaurant, Bakery,...","Bar, Steakhouse, Sushi Restaurant","Café, Thai Restaurant",Restaurant
Agincourt,"Breakfast Spot, Latin American Restaurant, Lou...",,,,
"Agincourt North, L'Amoreaux East, Milliken, Steeles East","Coffee Shop, Park, Playground",,,,
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown","Beer Store, Discount Store, Fast Food Restaura...",Grocery Store,,,
"Alderwood, Long Branch","Coffee Shop, Dance Studio, Gym, Pharmacy, Pool...",Pizza Place,,,


Predicted clusters were then appended to the pivot table containing the 1st to 5th Venues in terms of frequency.

In [24]:
toronto_grp_pivot['Cluster'] = [toronto_onehot_grouped[toronto_onehot_grouped.Neighborhood_col == x].reset_index().at[0,'Cluster'] for x in np.array(toronto_grp_pivot.index)]




In [25]:
toronto_grp_pivot.head()

venue_rank,1st,2nd,3rd,4th,5th,Cluster
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond","Brazilian Restaurant, Building, Burger Joint, ...","American Restaurant, Asian Restaurant, Bakery,...","Bar, Steakhouse, Sushi Restaurant","Café, Thai Restaurant",Restaurant,0
Agincourt,"Breakfast Spot, Latin American Restaurant, Lou...",,,,,0
"Agincourt North, L'Amoreaux East, Milliken, Steeles East","Coffee Shop, Park, Playground",,,,,4
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown","Beer Store, Discount Store, Fast Food Restaura...",Grocery Store,,,,0
"Alderwood, Long Branch","Coffee Shop, Dance Studio, Gym, Pharmacy, Pool...",Pizza Place,,,,0


#### First Cluster

In [26]:
toronto_grp_pivot.query('Cluster == 0')

venue_rank,1st,2nd,3rd,4th,5th,Cluster
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond","Brazilian Restaurant, Building, Burger Joint, ...","American Restaurant, Asian Restaurant, Bakery,...","Bar, Steakhouse, Sushi Restaurant","Café, Thai Restaurant",Restaurant,0
Agincourt,"Breakfast Spot, Latin American Restaurant, Lou...",,,,,0
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown","Beer Store, Discount Store, Fast Food Restaura...",Grocery Store,,,,0
"Alderwood, Long Branch","Coffee Shop, Dance Studio, Gym, Pharmacy, Pool...",Pizza Place,,,,0
"Bathurst Manor, Downsview North, Wilson Heights","Bank, Bridal Shop, Deli / Bodega, Diner, Fried...",Coffee Shop,,,,0
Bayview Village,"Bank, Café, Chinese Restaurant, Japanese Resta...",,,,,0
"Bedford Park, Lawrence Manor East","American Restaurant, Butcher, Café, Comfort Fo...","Coffee Shop, Italian Restaurant, Restaurant, S...",,,,0
Berczy Park,"Art Gallery, BBQ Joint, Bagel Shop, Basketball...","Bakery, Beer Bar, Café, Cheese Shop, Farmers M...",Cocktail Bar,Coffee Shop,,0
"Birch Cliff, Cliffside West","Café, College Stadium, General Entertainment, ...",,,,,0
"Bloordale Gardens, Eringate, Markland Wood, Old Burnhamthorpe","Beer Store, Café, Coffee Shop, Cosmetics Shop,...",,,,,0


#### Second Cluster

In [27]:
toronto_grp_pivot.query('Cluster == 1')

venue_rank,1st,2nd,3rd,4th,5th,Cluster
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Emery, Humberlea",Baseball Field,,,,,1
"Highland Creek, Rouge Hill, Port Union","Bar, Construction & Landscaping",,,,,1
"Humber Bay, King's Mill Park, Kingsway Park South East, Mimico NE, Old Mill South, The Queensway East, Royal York South East, Sunnylea","Baseball Field, Construction & Landscaping",,,,,1


#### Third Cluster

In [28]:
toronto_grp_pivot.query('Cluster == 2')

venue_rank,1st,2nd,3rd,4th,5th,Cluster
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Rouge, Malvern",Fast Food Restaurant,,,,,2


#### Fourth Cluster

This cluster usually has neighborhoods with places of recreation outdoors.

In [29]:
toronto_grp_pivot.query('Cluster == 3')

venue_rank,1st,2nd,3rd,4th,5th,Cluster
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Moore Park, Summerhill East","Playground, Trail",,,,,3
Scarborough Village,Playground,,,,,3


#### Fifth Cluster

Neighborhoods in this cluster usually have places that pertain to food such as restaurants, beer stores, cafes etc.

In [30]:
toronto_grp_pivot.query('Cluster == 4')

venue_rank,1st,2nd,3rd,4th,5th,Cluster
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Agincourt North, L'Amoreaux East, Milliken, Steeles East","Coffee Shop, Park, Playground",,,,,4
"CFB Toronto, Downsview East","Airport, Bus Stop, Park",,,,,4
Caledonia-Fairbanks,"Market, Women's Store",Park,,,,4
East Toronto,"Coffee Shop, Convenience Store",Park,,,,4
Parkwoods,"Food & Drink Shop, Park",,,,,4
Rosedale,"Playground, Trail",Park,,,,4
"Silver Hills, York Mills","Cafeteria, Park",,,,,4
"The Kingsway, Montgomery Road, Old Mill North","Park, Pool, River",,,,,4
Weston,Convenience Store,Park,,,,4
York Mills West,"Bank, Convenience Store",Park,,,,4


### Visualization of neighborhoods

Using `Folium`, these neighborhoods were visualized according to their clusters in Toronto.

In [31]:
toronto_loc = [43.6532, -79.3832]
kclusters = 5

# create map
map_clusters = folium.Map(location=toronto_loc, zoom_start=11)


# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add Cluster labels to tbl_fnl
merged_df = toronto_onehot_grouped \
.set_index('Neighborhood_col') \
.join(
    tbl_fnl[['Longitude', 'Latitude', 'Neighborhood']].set_index('Neighborhood'),
    on = 'Neighborhood_col',
    how = 'left'
)


merged_df.head()

Unnamed: 0_level_0,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Cluster,Longitude,Latitude
Neighborhood_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,...,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0,-79.384568,43.650571
Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-79.262029,43.7942
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,-79.284577,43.815252
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-79.588437,43.739416
"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-79.543484,43.602414


In [32]:

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(merged_df['Latitude'], merged_df['Longitude'], 
                                  merged_df['Neighborhood'], merged_df['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters