### Task 3: Exploring and clustering the neighbourhoods in Toronto

In [27]:
import numpy as np #Library to handle data in vectorised manner
import pandas as pd #Library for data analysis
import json #Library to handle JSON files
from geopy.geocoders import Nominatim
GeoLocator = Nominatim(user_agent='my-application') #Convert an address into latitude and longitude
import requests #handle requests

#Matplotlib plotting modules
import matplotlib.cm as cm 
import matplotlib.colors as colors

#import k means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library

print('Libraries imported')

Libraries imported


In [24]:
# File names of the CSV exports used by this notebook; only the Task 2 file is
# read in the cells that follow (presumably both were written by earlier tasks -- confirm).
toronto_task1_csv = 'Toronto.TASK_1_df.csv'
toronto_task2_csv = 'Toronto.TASK_2_df.csv'

In [28]:
# Load the neighbourhood table from the Task 2 CSV.
toronto_neighbourhoods = pd.read_csv(toronto_task2_csv)
# A bare expression in the middle of a cell is evaluated and discarded, so the
# shape has to be printed explicitly to show up alongside the preview below.
print(toronto_neighbourhoods.shape)
toronto_neighbourhoods.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,Scarborough,"Highland Creek,Port Union,Rouge Hill",43.784535,-79.160497
2,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,Scarborough,Woburn,43.770992,-79.216917
4,Scarborough,Cedarbrae,43.773136,-79.239476


### Use geopy to get the latitude and longitude of Toronto

In [37]:
# Resolve the city centre coordinates that the maps below are centred on.
address = 'Toronto, Ontario Canada'
# Nominatim needs an explicit user agent; use the same one as the import cell.
geolocator = Nominatim(user_agent='my-application')
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on `.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude

print("The geographical coordinate of {}, are {} and {}".format(address, latitude, longitude))

  


The geographical coordinate of Toronto, Ontario Canada, are 43.653963 and -79.387207


### Create a map of Toronto with neighbourhoods superimposed on top

In [62]:
# Build the base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add one circle marker per neighbourhood, with a "neighbourhood,borough" popup.
marker_fields = toronto_neighbourhoods[['Latitude', 'Longitude', 'Borough', 'Neighbourhood']]
for lat, lng, borough, neighbourhood in marker_fields.itertuples(index=False, name=None):
    popup = folium.Popup('{},{}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#87cefa',
        fill_opacity=0.5,
        parse_html=False,
    )
    marker.add_to(map_toronto)

In [66]:
# Keep only the rows whose borough name contains "Toronto", renumbered from 0.
is_toronto_borough = toronto_neighbourhoods['Borough'].str.contains("Toronto")
toronto_data = toronto_neighbourhoods[is_toronto_borough].reset_index(drop=True)
toronto_data.shape

(38, 4)

In [75]:
toronto_data.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,East Toronto,The Beaches,43.676357,-79.293031
1,East Toronto,"Riverdale,The Danforth West",43.679557,-79.352188
2,East Toronto,"India Bazaar,The Beaches West",43.668999,-79.315572
3,East Toronto,Studio District,43.659526,-79.340923
4,Central Toronto,Lawrence Park,43.72802,-79.38879


### Recreate the map with markers for the Toronto neighbourhoods only

In [68]:
# Start from a fresh map: the previous `map_toronto` already carries a marker
# for every neighbourhood, so adding the filtered markers on top of it would
# leave the unfiltered ones visible.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add one circle marker per row of toronto_data, with a "neighbourhood,borough" popup.
for lat, lng, borough, neighbourhood in zip(toronto_data['Latitude'], toronto_data['Longitude'],
                                            toronto_data['Borough'], toronto_data['Neighbourhood']):
    label = '{},{}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#87cefa',
        fill_opacity=0.5,
    ).add_to(map_toronto)

In [71]:
map_toronto

### 1. Exploring the neighbourhoods in Toronto

In [101]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Fetch venues around each neighbourhood from the Foursquare explore endpoint.

    The three iterables are consumed in lockstep, one API request per
    (name, latitude, longitude) triple.

    Parameters
    ----------
    names : iterable
        Neighbourhood names; copied verbatim into the 'Neighbourhood' column.
    latitudes, longitudes : iterable
        Coordinates sent as the ``ll`` query parameter and copied into the
        'Neighbourhood Latitude' / 'Neighbourhood Longitude' columns.
    radius : int, optional
        Value forwarded as the ``radius`` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        is empty (but still has these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Notes
    -----
    CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT are read from the notebook's
    global namespace and must be defined before this function is called.
    """
    column_names = ['Neighbourhood',
                    'Neighbourhood Latitude',
                    'Neighbourhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string.
        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status surfaces HTTP errors instead of a later KeyError.
        response = requests.get(endpoint, params=query, timeout=30)
        response.raise_for_status()
        items = response.json()['response']['groups'][0]['items']

        for item in items:
            venue = item['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # Venues without any category are kept with a missing category
                # instead of aborting the whole run with an IndexError.
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names to the constructor also works for an empty result.
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)
    return nearby_venues

In [158]:
# From here on `toronto_neighbourhoods` refers to the same object as the
# Toronto-only frame `toronto_data` (a rebinding, not a copy).
toronto_neighbourhoods = toronto_data

# Collect the venues around every Toronto neighbourhood (default radius).
toronto_venues = getNearbyVenues(
    names=toronto_neighbourhoods['Neighbourhood'],
    latitudes=toronto_neighbourhoods['Latitude'],
    longitudes=toronto_neighbourhoods['Longitude'],
)

In [159]:
toronto_venues.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
4,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood


In [160]:
# Number of non-null entries per column for each neighbourhood, i.e. how many venues were returned for it.
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",30,30,30,30,30,30
"Bathurst Quay,CN Tower,Harbourfront West,Island airport,King and Spadina,Railway Lands,South Niagara",16,16,16,16,16,16
Berczy Park,30,30,30,30,30,30
"Brockton,Exhibition Place,Parkdale Village",22,22,22,22,22,22
Business Reply Mail Processing Centre 969 Eastern,19,19,19,19,19,19
"Cabbagetown,St. James Town",30,30,30,30,30,30
Central Bay Street,30,30,30,30,30,30
"Chinatown,Grange Park,Kensington Market",30,30,30,30,30,30
Christie,15,15,15,15,15,15
Church and Wellesley,30,30,30,30,30,30


In [161]:
# Report how many distinct values appear in the 'Venue Category' column.
print('There are {} unique categories'.format(len(toronto_venues['Venue Category'].unique())))

There are 194 unique categories


### Analyse each Neighbourhood

In [162]:
# One indicator column per venue category, named after the category itself
# (empty prefix and separator).
toronto_onehot = pd.get_dummies(
    toronto_venues[['Venue Category']],
    prefix='',
    prefix_sep='',
)

# Re-attach the neighbourhood of each venue; the new column is appended last.
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood']

# Rotate the column order so that the last column comes first.
fixed_columns = [*toronto_onehot.columns[-1:], *toronto_onehot.columns[:-1]]
toronto_onehot = toronto_onehot[fixed_columns]
toronto_onehot.shape

(831, 195)

In [163]:
# Mean of every indicator column per neighbourhood, i.e. the share of that
# neighbourhood's venues falling into each category.
category_share = toronto_onehot.groupby('Neighbourhood').mean()
# Turn the group key back into an ordinary 'Neighbourhood' column.
toronto_grouped = category_share.reset_index()
toronto_grouped.head()

Unnamed: 0,Neighbourhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0
1,"Bathurst Quay,CN Tower,Harbourfront West,Islan...",0.0625,0.0625,0.0625,0.125,0.125,0.125,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0
3,"Brockton,Exhibition Place,Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455
4,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632


In [164]:
# Print the most frequent venue categories of every neighbourhood.
num_top_venues = 5
for neigh in toronto_grouped['Neighbourhood']:
    print("-----"+ neigh + "-----")

    # Transpose the neighbourhood's single row into a (venue, freq) table.
    neighbourhood_row = toronto_grouped[toronto_grouped['Neighbourhood'] == neigh]
    freq_table = neighbourhood_row.T.reset_index()
    freq_table.columns = ['venue', 'freq']

    # The first transposed row holds the neighbourhood name itself; drop it
    # before converting the frequencies to rounded floats.
    freq_table = (
        freq_table.iloc[1:]
        .assign(freq=lambda table: table['freq'].astype(float))
        .round({'freq': 2})
    )

    ranked = freq_table.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

-----Adelaide,King,Richmond-----
              venue  freq
0        Steakhouse  0.10
1       Pizza Place  0.07
2              Café  0.07
3  Asian Restaurant  0.07
4             Hotel  0.07


-----Bathurst Quay,CN Tower,Harbourfront West,Island airport,King and Spadina,Railway Lands,South Niagara-----
              venue  freq
0    Airport Lounge  0.12
1   Airport Service  0.12
2  Airport Terminal  0.12
3           Airport  0.06
4               Bar  0.06


-----Berczy Park-----
                venue  freq
0  Seafood Restaurant  0.07
1            Beer Bar  0.07
2      Farmers Market  0.07
3         Coffee Shop  0.07
4        Cocktail Bar  0.07


-----Brockton,Exhibition Place,Parkdale Village-----
                  venue  freq
0           Coffee Shop  0.09
1        Breakfast Spot  0.09
2                  Café  0.09
3           Yoga Studio  0.05
4  Caribbean Restaurant  0.05


-----Business Reply Mail Processing Centre 969 Eastern-----
           venue  freq
0    Yoga Studio  0.05
1    Pi

In [165]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``, ignoring its first element.

    The first element of ``row`` is skipped, the remaining values are sorted
    in descending order, and the index labels of the first ``num_top_venues``
    of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.head(num_top_venues).index.values

In [166]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# Build the column labels: 'Neighbourhood' followed by one ranked column per
# top venue. Ranks covered by `indicators` get their own suffix, the rest 'th'.
# The two label spellings differ in capitalisation; they are kept unchanged so
# that any code selecting these columns by name keeps working.
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # Explicit bounds check instead of a bare `except:`, which would also
    # swallow unrelated errors.
    if ind < len(indicators):
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    else:
        columns.append('{}th most common venue'.format(ind+1))

# Create the result frame with one row per neighbourhood of toronto_grouped.
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

# Fill the ranked columns row by row with the top category labels.
for ind in np.arange(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.shape

(38, 11)

### Clustering the neighbourhoods

In [167]:
# Number of clusters requested from k-means.
kclusters = 10

# Cluster on the category frequencies only. `columns=` replaces the positional
# axis argument (`drop('Neighbourhood', 1)`), which newer pandas no longer accepts.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')
kmeans = KMeans(n_clusters=kclusters, random_state=1).fit(toronto_grouped_clustering)

# labels_ holds one label per row of toronto_grouped, in that frame's row order.
print(kmeans.labels_[0:10])
print(len(kmeans.labels_))

[0 9 2 6 0 2 2 6 6 0]
38


In [168]:
toronto_neighbourhoods.shape

(38, 4)

In [169]:
# kmeans.labels_ is ordered like the rows of toronto_grouped, not like the rows
# of toronto_neighbourhoods, so the labels must be matched by neighbourhood
# name rather than assigned by position.
cluster_labels = pd.Series(
    kmeans.labels_,
    index=toronto_grouped['Neighbourhood'],
    name='Cluster labels',
)

toronto_merged = (
    toronto_neighbourhoods
    # Tolerate a 'Cluster labels' column left over from an earlier run.
    .drop(columns='Cluster labels', errors='ignore')
    # add clustering labels
    .join(cluster_labels, on='Neighbourhood')
    # add the ranked venue categories of each neighbourhood
    .join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')
)

In [173]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set colour scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# kmeans.labels_ follows the row order of toronto_grouped, which differs from
# the row order of toronto_merged, so look each label up by neighbourhood name.
cluster_by_neighbourhood = dict(zip(toronto_grouped['Neighbourhood'], kmeans.labels_))

# add markers to the map
for lat, lon, poi in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood']):
    cluster = cluster_by_neighbourhood.get(poi)
    if cluster is None:
        # Neighbourhood absent from toronto_grouped, so it was never clustered.
        continue
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        # fill_opacity only takes effect when the marker is actually filled
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

In [174]:
map_clusters