# Coursera Capstone Part 3

## The Battle of Neighbourhoods

In [232]:
#importing necessary libraries

import json
import os

import folium
import numpy as np
import pandas as pd
import requests
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

# Step 1: Importing Dataset

In [3]:
# Load the neighbourhood table that part 2 saved to disk
toronto_csv_path = 'toronto_neighbour'
df_toronto = pd.read_csv(toronto_csv_path)

df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Step 2: Extracting the venues of each neighbourhood

In [39]:
# Foursquare request settings. None of these change between neighbourhoods, so
# they are defined once instead of on every loop iteration. Credentials are read
# from the environment when available so real keys never need to be saved in the
# notebook; the original placeholder strings remain as the fallback.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'foursqare_credential')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'foursquare_secret')
V = '20180605'        # sent as the `v` query parameter
RADIUS = 500          # sent as the `radius` query parameter
LIMIT = 20            # sent as the `limit` query parameter
EXPLORE_URL = 'https://api.foursquare.com/v2/venues/explore'
REQUEST_TIMEOUT = 10  # seconds; a stalled connection fails instead of hanging the kernel

venues_list = []  # one entry per row of df_toronto: a list of venue tuples

# Iterate over the column values directly so the loop does not rely on
# df_toronto having a 0..n-1 integer index.
for NEIGHBOURHOOD, LATITUDE, LONGITUDE in zip(df_toronto['Neighbourhood'],
                                              df_toronto['Latitude'],
                                              df_toronto['Longitude']):

    # Let requests build and encode the query string
    response = requests.get(
        EXPLORE_URL,
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': V,
            'll': '{},{}'.format(LATITUDE, LONGITUDE),
            'radius': RADIUS,
            'limit': LIMIT,
        },
        timeout=REQUEST_TIMEOUT,
    )
    # Surface HTTP failures as an HTTPError rather than as a KeyError
    # raised later while indexing into the JSON payload.
    response.raise_for_status()

    groups = response.json().get('response', {}).get('groups', [])
    res = groups[0]['items'] if groups else []

    # A venue with an empty category list is skipped: it has no category to
    # record, and indexing [0] on it would raise IndexError.
    venues_list.append([(NEIGHBOURHOOD,
                         v['venue']['name'],
                         v['venue']['location']['lat'],
                         v['venue']['location']['lng'],
                         v['venue']['categories'][0]['name'])
                        for v in res
                        if v['venue'].get('categories')])


# Flatten the per-neighbourhood lists into one row per venue
nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])


In [45]:
# Label the five fields collected for every venue
venue_columns = ['Neighbourhood', 'Venue Name', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
nearby_venues.columns = venue_columns

nearby_venues.head()

Unnamed: 0,Neighbourhood,Venue Name,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,Portugril,43.725819,-79.312785,Portuguese Restaurant
4,Victoria Village,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [59]:
# How many distinct venue categories were returned in total
nearby_venues.loc[:, ['Venue Category']].nunique()

Venue Category    211
dtype: int64

# Step 3: Data Preprocessing

In [81]:
# One-hot encode the venue category: one indicator column per distinct category,
# named after the category itself (empty prefix and separator)
category_frame = nearby_venues[['Venue Category']]
toronto_n = pd.get_dummies(category_frame, prefix="", prefix_sep="")

toronto_n.head()

Unnamed: 0,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Art Gallery,...,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
# Keep the neighbourhood label of every venue row for re-attachment after encoding
ddf = nearby_venues.loc[:, 'Neighbourhood']

In [82]:
# Put the neighbourhood label in the first column of the one-hot frame.
# DataFrame.insert raises ValueError when the column already exists, so on a
# re-run of this cell the existing column is overwritten in place instead.
if 'Neighbourhood' in toronto_n.columns:
    toronto_n['Neighbourhood'] = ddf
else:
    toronto_n.insert(0, 'Neighbourhood', ddf)

In [83]:
# Preview the first five rows of the labelled one-hot frame
toronto_n.head(n=5)

Unnamed: 0,Neighbourhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
# Average the one-hot rows of each neighbourhood, so every cell becomes the mean
# of that category's indicator over the neighbourhood's venue rows
category_means = toronto_n.groupby('Neighbourhood').mean()
toronto_neigh = category_means.reset_index()  # 'Neighbourhood' back as the first column

toronto_neigh.head()

Unnamed: 0,Neighbourhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
# Work on an independent copy so toronto_neigh itself is left untouched
new_toronto = toronto_neigh.copy(deep=True)

new_toronto.head()

Unnamed: 0,Neighbourhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [227]:
# For every neighbourhood, collect its five largest category frequencies.
# Each entry of `listing` is [neighbourhood name, 5x1 array of frequencies].
# NOTE: the slice below discards the venue-name column, so only the frequency
# values are kept, not which categories they belong to.

listing = []
for hoods in new_toronto['Neighbourhood']:

    # The single row for this neighbourhood, flipped so categories run down the rows
    flipped = new_toronto.loc[new_toronto['Neighbourhood'] == hoods].T.reset_index()
    flipped.columns = ['venue', 'freq']

    # Drop the first row (the neighbourhood name itself) and the 'venue' column
    freq_only = flipped.iloc[1:, 1:].copy()
    freq_only['freq'] = freq_only['freq'].astype('float')

    ranked = freq_only.sort_values(by='freq', ascending=False)
    listing.append([hoods, ranked.head(5).values])



In [228]:
# Turn the collected values into a dataframe with one row per neighbourhood.
# Each listing entry is [name, column-shaped array of frequencies]; the array is
# flattened into a row. The frame is built in a single call rather than grown
# with pd.concat inside a loop, which re-copies every earlier row on each pass
# and leaves `top_five` undefined when `listing` is empty.

top_five = pd.DataFrame([np.ravel(freqs) for _, freqs in listing])
top_five

Unnamed: 0,0,1,2,3,4
0,0.250000,0.250000,0.250000,0.250000,0.000000
1,0.285714,0.142857,0.142857,0.142857,0.142857
2,0.100000,0.100000,0.050000,0.050000,0.050000
3,0.250000,0.250000,0.250000,0.250000,0.000000
4,0.100000,0.100000,0.100000,0.050000,0.050000
...,...,...,...,...,...
91,0.100000,0.100000,0.100000,0.050000,0.050000
92,0.166667,0.166667,0.166667,0.166667,0.166667
93,0.500000,0.250000,0.250000,0.000000,0.000000
94,0.142857,0.142857,0.142857,0.142857,0.142857


In [229]:
# Replace the positional column labels with rank names
rank_labels = ['Highest', '2nd Highest', '3rd Highest', '4th Highest', '5th Highest']
top_five.columns = rank_labels

top_five.head()

Unnamed: 0,Highest,2nd Highest,3rd Highest,4th Highest,5th Highest
0,0.25,0.25,0.25,0.25,0.0
1,0.285714,0.142857,0.142857,0.142857,0.142857
2,0.1,0.1,0.05,0.05,0.05
3,0.25,0.25,0.25,0.25,0.0
4,0.1,0.1,0.1,0.05,0.05


In [230]:
# Attach the neighbourhood names as the first column.
# DataFrame.insert raises ValueError when the column already exists, so on a
# re-run of this cell the existing column is overwritten in place instead.
if 'Neighbourhood' in top_five.columns:
    top_five['Neighbourhood'] = new_toronto['Neighbourhood']
else:
    top_five.insert(0, 'Neighbourhood', new_toronto['Neighbourhood'])

top_five.head()

Unnamed: 0,Neighbourhood,Highest,2nd Highest,3rd Highest,4th Highest,5th Highest
0,Agincourt,0.25,0.25,0.25,0.25,0.0
1,"Alderwood, Long Branch",0.285714,0.142857,0.142857,0.142857,0.142857
2,"Bathurst Manor, Wilson Heights, Downsview North",0.1,0.1,0.05,0.05,0.05
3,Bayview Village,0.25,0.25,0.25,0.25,0.0
4,"Bedford Park, Lawrence Manor East",0.1,0.1,0.1,0.05,0.05


In [235]:
k = 5  # Number of clusters. Chosen by hand; it could instead be tuned by comparing several values.

# Select the feature columns by name rather than by position. Once the
# 'Cluster No' column has been added to top_five, a positional slice would feed
# the previous labels back in as a feature if this cell were run again.
feature_cols = ['Highest', '2nd Highest', '3rd Highest', '4th Highest', '5th Highest']

# n_init is set explicitly so the number of initialisations does not depend on
# the installed scikit-learn version's default.
km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(top_five[feature_cols])  # fitting the KMeans model

In [237]:
# Store each neighbourhood's cluster label right after its name.
# DataFrame.insert raises ValueError when the column already exists, so on a
# re-run of this cell the existing labels are overwritten in place instead.
if 'Cluster No' in top_five.columns:
    top_five['Cluster No'] = km.labels_
else:
    top_five.insert(1, 'Cluster No', km.labels_)

top_five.head()

Unnamed: 0,Neighbourhood,Cluster No,Highest,2nd Highest,3rd Highest,4th Highest,5th Highest
0,Agincourt,0,0.25,0.25,0.25,0.25,0.0
1,"Alderwood, Long Branch",4,0.285714,0.142857,0.142857,0.142857,0.142857
2,"Bathurst Manor, Wilson Heights, Downsview North",2,0.1,0.1,0.05,0.05,0.05
3,Bayview Village,0,0.25,0.25,0.25,0.25,0.0
4,"Bedford Park, Lawrence Manor East",2,0.1,0.1,0.1,0.05,0.05


In [238]:
# Size of each cluster: how many neighbourhoods carry each label
cluster_sizes = top_five['Cluster No'].value_counts()
cluster_sizes

2    45
4    19
0    16
3     9
1     7
Name: Cluster No, dtype: int64

In [239]:
# Latitude and longitude of Toronto, used as the centre of the map
toronto_lat, toronto_long = 43.6532, -79.3832

# Step 4: Visualizing the Result

In [252]:
# Draw every clustered neighbourhood on a map of Toronto, one colour per cluster

toronto_map = folium.Map(location=[toronto_lat, toronto_long], zoom_start=10)  # base map centred on Toronto

# Marker colour for each cluster label; a label without an entry gets no marker
cluster_colours = {0: 'red', 1: 'blue', 2: 'green', 3: 'black', 4: 'yellow'}

for hood in top_five['Neighbourhood']:

    # Coordinates come from the first df_toronto row carrying this neighbourhood name
    hood_rows = df_toronto[df_toronto['Neighbourhood'] == hood]
    hood_lat = hood_rows['Latitude'].iloc[0]
    hood_long = hood_rows['Longitude'].iloc[0]
    cluster = top_five.loc[top_five['Neighbourhood'] == hood, 'Cluster No'].iloc[0]

    marker_colour = cluster_colours.get(cluster)
    if marker_colour is None:
        continue

    folium.CircleMarker(location=[hood_lat, hood_long],
                        radius=5,
                        color=marker_colour,
                        ).add_to(toronto_map)

toronto_map