### Environment setup

In [1]:
import pandas as pd
import numpy as np
import requests
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
from geopy.geocoders import Nominatim

### Data import

I import the data from the csv I saved in my computer into a `df` called tab_lal

In [2]:
tab_lal = pd.read_csv('tab_lal.csv')

In [3]:
tab_lal.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


As suggested, I have limited the boroughs to those that contain the word Toronto

In [4]:
tab_tor_bor = tab_lal[tab_lal['Borough'].str.contains('Toronto')]

In [5]:
tab_tor_bor.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


### Venue information retrieval

#### Define Foursquare Credentials and Version

In [39]:
print('credentials deleted')

credentials deleted


#### Venue information retrieval function
I create a function to retrieve nearby venue information

In [8]:
def getNeaVen(nam, lat, lon, rad=500, lim=500):
    ven_lis=[]
    for nam_, lat_, lon_ in zip(nam, lat, lon):
        #print(nam)
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat_, 
            lon_, 
            rad, 
            lim)
            
        # make the GET request
        res = requests.get(url).json()['response']['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        ven_lis.append([(
            nam_, 
            lat_, 
            lon_, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in res])

    nea_ven = pd.DataFrame([item for ven_lis in ven_lis for item in ven_lis])
    nea_ven.columns = ['nei', 'nei_lat', 'nei_lon', 'ven', 'ven_lat', 'ven_lon', 'ven_cat']
    
    return(nea_ven)

I call the function to create a pandas dataframe with the retrieved information

In [13]:
tor_ven = getNeaVen(nam=tab_tor_bor['Neighborhood'],
                    lat=tab_tor_bor['Latitude'],
                    lon=tab_tor_bor['Longitude'])

In [14]:
print(tor_ven.shape)

(1573, 7)


In [15]:
tor_ven.head()

Unnamed: 0,nei,nei_lat,nei_lon,ven,ven_lat,ven_lon,ven_cat
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


I check how many venues were returned for each neighborhood

In [16]:
tor_ven.groupby('nei_lat').count()

Unnamed: 0_level_0,nei,nei_lon,ven,ven_lat,ven_lon,ven_cat
nei_lat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
43.628947,15,15,15,15,15,15
43.636847,23,23,23,23,23,23
43.640816,100,100,100,100,100,100
43.644771,55,55,55,55,55,55
43.646435,96,96,96,96,96,96
43.647177,100,100,100,100,100,100
43.647927,42,42,42,42,42,42
43.648198,100,100,100,100,100,100
43.648429,100,100,100,100,100,100
43.64896,13,13,13,13,13,13


I check the number of unique categories in all the returned venues

In [17]:
print('There are {} uniques categories.'.format(len(tor_ven['ven_cat'].unique())))

There are 230 uniques categories.


## Neighbourhood analysis

To analyse each neighbourhood I first create a one hot encoding dataframe

In [18]:
tor_ohe = pd.get_dummies(tor_ven[['ven_cat']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
tor_ohe['nei'] = tor_ven['nei'] 

# move neighborhood column to the first column
tor_ohe = tor_ohe[[tor_ohe.columns[-1]] + list(tor_ohe.columns[:-1])]
tor_ohe.head()

Unnamed: 0,nei,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"The Danforth West, Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


I group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [19]:
tor_gro = tor_ohe.groupby('nei').mean().reset_index()

In [20]:
num_top_ven = 10

for nei in tor_gro['nei']:
    print("----"+nei+"----")
    tem = tor_gro[tor_gro['nei'] == nei].T.reset_index()
    tem.columns = ['ven','fre']
    tem = tem.iloc[1:]
    tem['fre'] = tem['fre'].astype(float)
    tem = tem.round({'fre': 2})
    print(tem.sort_values('fre', ascending=False).reset_index(drop=True).head(num_top_ven))
    print('\n')

----Adelaide, King, Richmond----
                   ven   fre
0          Coffee Shop  0.09
1                 Café  0.05
2           Restaurant  0.04
3       Clothing Store  0.03
4                  Gym  0.03
5  American Restaurant  0.03
6        Deli / Bodega  0.03
7      Thai Restaurant  0.03
8                Hotel  0.03
9   Seafood Restaurant  0.02


----Berczy Park----
                  ven   fre
0         Coffee Shop  0.07
1  Seafood Restaurant  0.04
2        Cocktail Bar  0.04
3              Bakery  0.04
4      Farmers Market  0.04
5          Restaurant  0.04
6            Beer Bar  0.04
7  Italian Restaurant  0.04
8                Café  0.04
9         Cheese Shop  0.04


----Brockton, Exhibition Place, Parkdale Village----
                      ven   fre
0                    Café  0.13
1          Breakfast Spot  0.09
2             Coffee Shop  0.09
3               Nightclub  0.09
4               Pet Store  0.04
5      Italian Restaurant  0.04
6                 Stadium  0.04
7  Furn

                        ven   fre
0                      Park  0.25
1                    Lawyer  0.25
2                  Bus Line  0.25
3               Swim School  0.25
4                   Airport  0.00
5       Moroccan Restaurant  0.00
6  Mediterranean Restaurant  0.00
7               Men's Store  0.00
8             Metro Station  0.00
9        Mexican Restaurant  0.00


----Little Portugal, Trinity----
                             ven   fre
0                            Bar  0.10
1                     Restaurant  0.07
2          Vietnamese Restaurant  0.05
3  Vegetarian / Vegan Restaurant  0.05
4                    Men's Store  0.05
5               Asian Restaurant  0.05
6                           Café  0.05
7                    Yoga Studio  0.02
8             Miscellaneous Shop  0.02
9                         Bistro  0.02


----Moore Park, Summerhill East----
                         ven   fre
0                       Park  0.33
1                 Playground  0.33
2                Su

I create a function that sorts the venues in descending order

In [21]:
def retMosComVen(row, num_top_ven):
    row_cat = row.iloc[1:]
    row_cat_sor = row_cat.sort_values(ascending=False)
    
    return row_cat_sor.index.values[0:num_top_ven]

I create a new dataframe with the top 10 venues for each neighbourhood

In [22]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['nei']
for ind in np.arange(num_top_ven):
    try:
        columns.append('{}{} Most common venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most common venue'.format(ind+1))

# create a new dataframe
nei_ven_sor = pd.DataFrame(columns=columns)
nei_ven_sor['nei'] = tor_gro['nei']

for ind in np.arange(tor_gro.shape[0]):
    nei_ven_sor.iloc[ind, 1:] = retMosComVen(tor_gro.iloc[ind, :], num_top_ven)

nei_ven_sor.head()

Unnamed: 0,nei,1st Most common venue,2nd Most common venue,3rd Most common venue,4th Most common venue,5th Most common venue,6th Most common venue,7th Most common venue,8th Most common venue,9th Most common venue,10th Most common venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Restaurant,American Restaurant,Clothing Store,Thai Restaurant,Deli / Bodega,Gym,Hotel,Pizza Place
1,Berczy Park,Coffee Shop,Cheese Shop,Seafood Restaurant,Café,Beer Bar,Bakery,Italian Restaurant,Farmers Market,Restaurant,Cocktail Bar
2,"Brockton, Exhibition Place, Parkdale Village",Café,Nightclub,Coffee Shop,Breakfast Spot,Grocery Store,Bakery,Performing Arts Venue,Pet Store,Climbing Gym,Restaurant
3,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Yoga Studio,Spa,Garden Center,Garden,Fast Food Restaurant,Farmers Market,Comic Shop,Pizza Place,Restaurant
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Service,Airport Lounge,Harbor / Marina,Sculpture Garden,Airport Food Court,Airport Gate,Bar,Boat or Ferry,Boutique,Coffee Shop


## Cluster Neighborhoods

I then run k-means to cluster the neighborhood into 5 clusters

In [23]:
# set number of clusters
kclusters = 5

tor_gro_clu = tor_gro.drop('nei', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tor_gro_clu)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

I create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [24]:
# add clustering labels
nei_ven_sor.insert(0, 'clu_lab', kmeans.labels_)

tor_mer = tab_tor_bor

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
tor_mer = tor_mer.join(nei_ven_sor.set_index('nei'), on='Neighborhood')

tor_mer.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,clu_lab,1st Most common venue,2nd Most common venue,3rd Most common venue,4th Most common venue,5th Most common venue,6th Most common venue,7th Most common venue,8th Most common venue,9th Most common venue,10th Most common venue
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Pub,Trail,Neighborhood,Yoga Studio,Dumpling Restaurant,Dog Run,Doner Restaurant,Donut Shop,Electronics Store
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Greek Restaurant,Italian Restaurant,Coffee Shop,Bookstore,Furniture / Home Store,Ice Cream Shop,Yoga Studio,Restaurant,Spa,Japanese Restaurant
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Fast Food Restaurant,Pizza Place,Sushi Restaurant,Ice Cream Shop,Sandwich Place,Liquor Store,Burrito Place,Fish & Chips Shop,Restaurant,Italian Restaurant
43,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,Bakery,Brewery,American Restaurant,Gastropub,Yoga Studio,Cheese Shop,Italian Restaurant,Bookstore
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,4,Park,Lawyer,Bus Line,Swim School,Yoga Studio,Dog Run,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant


### Toronto map

I visualise the resulting clusters

In [27]:
adr = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
loc = geolocator.geocode(adr)
lat = loc.latitude
lon = loc.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(lat, lon))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [29]:
# create map
map_clusters = folium.Map(location=[lat, lon], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(tor_mer['Latitude'], tor_mer['Longitude'], tor_mer['Neighborhood'], tor_mer['clu_lab']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine Clusters

I examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster.

### Cluster 1

In [35]:
tor_mer.loc[tor_mer['clu_lab'] == 0, tor_mer.columns[[1] + list(range(5, tor_mer.shape[1]))]]

Unnamed: 0,Borough,clu_lab,1st Most common venue,2nd Most common venue,3rd Most common venue,4th Most common venue,5th Most common venue,6th Most common venue,7th Most common venue,8th Most common venue,9th Most common venue,10th Most common venue
37,East Toronto,0,Health Food Store,Pub,Trail,Neighborhood,Yoga Studio,Dumpling Restaurant,Dog Run,Doner Restaurant,Donut Shop,Electronics Store
41,East Toronto,0,Greek Restaurant,Italian Restaurant,Coffee Shop,Bookstore,Furniture / Home Store,Ice Cream Shop,Yoga Studio,Restaurant,Spa,Japanese Restaurant
42,East Toronto,0,Fast Food Restaurant,Pizza Place,Sushi Restaurant,Ice Cream Shop,Sandwich Place,Liquor Store,Burrito Place,Fish & Chips Shop,Restaurant,Italian Restaurant
43,East Toronto,0,Café,Coffee Shop,Bakery,Brewery,American Restaurant,Gastropub,Yoga Studio,Cheese Shop,Italian Restaurant,Bookstore
45,Central Toronto,0,Park,Hotel,Food & Drink Shop,Sandwich Place,Department Store,Breakfast Spot,Gym,Coworking Space,Doner Restaurant,Farmers Market
46,Central Toronto,0,Clothing Store,Coffee Shop,Yoga Studio,Sporting Goods Shop,Health & Beauty Service,Fast Food Restaurant,Mexican Restaurant,Diner,Dessert Shop,Chinese Restaurant
47,Central Toronto,0,Pizza Place,Sandwich Place,Dessert Shop,Gym,Café,Sushi Restaurant,Italian Restaurant,Coffee Shop,Indian Restaurant,Bar
49,Central Toronto,0,Coffee Shop,Pub,Pizza Place,Light Rail Station,Liquor Store,Restaurant,Sports Bar,Bank,Fried Chicken Joint,Supermarket
51,Downtown Toronto,0,Coffee Shop,Park,Café,Chinese Restaurant,Bakery,Pizza Place,Italian Restaurant,Pub,Restaurant,Jewelry Store
52,Downtown Toronto,0,Coffee Shop,Sushi Restaurant,Japanese Restaurant,Restaurant,Yoga Studio,Pub,Burger Joint,Hotel,Gastropub,Mediterranean Restaurant


#### Cluster 2

In [33]:
tor_mer.loc[tor_mer['clu_lab'] == 1, tor_mer.columns[[1] + list(range(5, tor_mer.shape[1]))]]

Unnamed: 0,Borough,clu_lab,1st Most common venue,2nd Most common venue,3rd Most common venue,4th Most common venue,5th Most common venue,6th Most common venue,7th Most common venue,8th Most common venue,9th Most common venue,10th Most common venue
48,Central Toronto,1,Park,Playground,Summer Camp,Discount Store,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
50,Downtown Toronto,1,Park,Playground,Trail,Diner,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant


#### Cluster 3

In [36]:
tor_mer.loc[tor_mer['clu_lab'] == 2, tor_mer.columns[[1] + list(range(5, tor_mer.shape[1]))]]

Unnamed: 0,Borough,clu_lab,1st Most common venue,2nd Most common venue,3rd Most common venue,4th Most common venue,5th Most common venue,6th Most common venue,7th Most common venue,8th Most common venue,9th Most common venue,10th Most common venue
63,Central Toronto,2,Garden,Pool,Diner,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


#### Cluster 4

In [37]:
tor_mer.loc[tor_mer['clu_lab'] == 3, tor_mer.columns[[1] + list(range(5, tor_mer.shape[1]))]]

Unnamed: 0,Borough,clu_lab,1st Most common venue,2nd Most common venue,3rd Most common venue,4th Most common venue,5th Most common venue,6th Most common venue,7th Most common venue,8th Most common venue,9th Most common venue,10th Most common venue
64,Central Toronto,3,Trail,Jewelry Store,Bus Line,Sushi Restaurant,Yoga Studio,Dog Run,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space


#### Cluster 5

In [38]:
tor_mer.loc[tor_mer['clu_lab'] == 4, tor_mer.columns[[1] + list(range(5, tor_mer.shape[1]))]]

Unnamed: 0,Borough,clu_lab,1st Most common venue,2nd Most common venue,3rd Most common venue,4th Most common venue,5th Most common venue,6th Most common venue,7th Most common venue,8th Most common venue,9th Most common venue,10th Most common venue
44,Central Toronto,4,Park,Lawyer,Bus Line,Swim School,Yoga Studio,Dog Run,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant
