In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_excel('toronto.xlsx')
print(df.shape)
df.head()

(180, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


# Loading and Cleaning the Data
1. I have copied the table onto an excel sheet and loaded it into a dataframe
2. I've removed all the data points for which a Borough wasn't assigned
3. For all the Neighbourhoods which weren't assigned, I have assigned them their Borough's name

In [3]:
for i in range(len(df)):
    if df.loc[i,'Borough'] == 'Not assigned':
        df.drop(i,inplace=True)

df.reset_index(inplace=True,drop=True)  

for i in range(len(df)):
    if df.loc[i,'Neighborhood'] == 'Not assigned':
        df.loc[i,'Neighborhood'] = df.loc[i,'Borough']

print(df.shape)
df.head()

(103, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


# Getting the Coordinates
Since geocoder does not give results when Nomatinatim is used with Postal code,
- I used Neighbourhoods to get the Latitudes and Longitudes.
- When there are multiple Neighbourhoods for a postal code, I took the avergage of their co-ordinates as the coordinates as the postal code.
- If the Neighborhood coordinates are not available (5 cases), I am dropping the row

In [4]:
import requests
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize

In [5]:
lat= np.zeros(len(df))
lng= np.zeros(len(df))
n = 0
for i in range(len(df)):
    if ',' not in df.loc[i,'Neighborhood']:
        address = df.loc[i,'Neighborhood'] + ', Toronto, Ontario'
        geoloc = Nominatim()
        loc = geoloc.geocode(address)
        if (loc is None) == False:
            #print(loc.latitude,loc.longitude)
            lat[i] = loc.latitude
            lng[i] = loc.longitude
        else:
            n += 1
        #print(i)
    else:
        neighs = df.loc[i,'Neighborhood'].split(',')
        lat_m = []
        lng_m = []
        for j in neighs:
            #print(j)
            address_m = j + ', Toronto, Ontario'
            geoloc = Nominatim()
            loc = geoloc.geocode(address_m)
            if (loc is None) == False:
                lat_m.append(float(loc.latitude))
                lng_m.append(float(loc.longitude))
            else:
                n+=1
        lat_mean = np.array(lat_m).mean()
        lng_mean = np.array(lng_m).mean()
        lat[i] = lat_mean
        lng[i] = lng_mean

lat[0:5],lng[0:5],n

  import sys
  ret = ret.dtype.type(ret / rcount)


(array([43.7587999 , 43.732658  , 43.65039285, 43.7224286 , 43.659659  ]),
 array([-79.3201966 , -79.3111892 , -79.3703032 , -79.44421995,
        -79.3903399 ]),
 14)

In [6]:
df['Latitude'] = lat
df['Longitude'] = lng
df.dropna(inplace=True)
df = df[df['Latitude']!=0]
df.reset_index(drop=True,inplace=True)
print(df.shape)
df.head()

(97, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7588,-79.320197
1,M4A,North York,Victoria Village,43.732658,-79.311189
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650393,-79.370303
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.722429,-79.44422
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.659659,-79.39034


# Venues in the Neighborhoods
- Used the Fouraquare API to get the most popular 100 venues in each area in Toronto
- Created a Dataframe that shows the different kinds of venues that an area has and the number of them in each type. However this isnt classified and the direct label from the database is used. (Restaurants of different cuisines are not clubbed together)
- Used K-means clustering algorithm to group them into 5 different clusters.
- Each venue is shown with a cluster-based colour coded marker on the map of Toronto

In [7]:
CLIENT_ID = 'V0IOJXOPEJUKA435MGKJRMZGECVQ1W2H2O0ABKF3BORHYKVF' 
CLIENT_SECRET = 'ZLF3WOTLHONE2F1QK0DUDSTF02GYA2LQI5TYSSHJ0S22ERPQ' 
VERSION = '20180604'
LIMIT = 100

In [8]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):            
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID,CLIENT_SECRET,VERSION,lat,lng,radius,LIMIT)          
        results = requests.get(url).json()["response"]['groups'][0]['items']        
        venues_list.append([(name,lat,lng,v['venue']['name'],v['venue']['location']['lat'],v['venue']['location']['lng'],v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood','Neighborhood Latitude','Neighborhood Longitude','Venue','Venue Latitude','Venue Longitude','Venue Category']
    return(nearby_venues)    

In [9]:
toronto_venues = getNearbyVenues(names=df['Neighborhood'],latitudes=df['Latitude'],longitudes=df['Longitude'])

In [10]:
toronto_dummies = pd.get_dummies(toronto_venues['Venue Category'])
toronto_dummies['Neighborhood'] = toronto_venues['Neighborhood']
print(toronto_dummies.shape)
toronto_dummies.head()

(2506, 277)


Unnamed: 0,ATM,Accessories Store,Afghan Restaurant,African Restaurant,American Restaurant,Antique Shop,Argentinian Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
toronto_grouped = toronto_dummies.groupby('Neighborhood').sum()
toronto_grouped

Unnamed: 0_level_0,ATM,Accessories Store,Afghan Restaurant,African Restaurant,American Restaurant,Antique Shop,Argentinian Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Agincourt,0,0,0,0,0,0,0,0,0,2,...,0,0,0,1,0,0,0,0,0,0
"Alderwood, Long Branch",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Bathurst Manor, Wilson Heights, Downsview North",0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bayview Village,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Bedford Park, Lawrence Manor East",0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Willowdale, Willowdale West",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Woburn,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Woodbine Heights,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
York Mills West,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
from sklearn.cluster import KMeans

In [13]:
km = KMeans(n_clusters= 5)
km.fit(toronto_grouped.values)
toronto_grouped['clusters'] = km.labels_

In [14]:
for i,j in zip(toronto_grouped.index,toronto_grouped['clusters']):
    for k in range(len(df)):
        if df.loc[k,'Neighborhood'] == i:
            df.loc[k,'Clusters'] = j
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Clusters
0,M3A,North York,Parkwoods,43.7588,-79.320197,3.0
1,M4A,North York,Victoria Village,43.732658,-79.311189,3.0
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650393,-79.370303,4.0
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.722429,-79.44422,3.0
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.659659,-79.39034,4.0


In [15]:
import folium

In [16]:
toronto = 'Toronto'
geoloc = Nominatim()
loc_t = geoloc.geocode(toronto)
toronto_map = folium.Map(location=[loc_t.latitude,loc_t.longitude],zoom_start=12)
colours = ['red','blue','yellow','green','orange','pink','gray','white','purple']

for i in range(len(df)):
    for j in df['Clusters'].unique():
        if df.loc[i,'Clusters'] == j:
            folium.features.CircleMarker(
                [df.loc[i,'Latitude'], df.loc[i,'Longitude']],
                radius=5,
                poup=df.loc[i,'Borough'],
                fill=True,
                color=colours[int(j)],
                fill_color=colours[int(j)],
                fill_opacity=1.0
            ).add_to(toronto_map)
toronto_map

  
