# Capstone Project

This notebook will be used for my IBM Data Science Capstone Project!

In [1]:
import pandas as pd
import numpy as np
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
from geopy.geocoders import Nominatim
import requests

In [2]:
tor_data=pd.read_csv('toronto_geo_data.csv').drop('Unnamed: 0', axis=1)
tor_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [3]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geographical coordinate of Toronto are 43.653963, -79.387207.


In [4]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(tor_data['Latitude'], tor_data['Longitude'], tor_data['Borough'], tor_data['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [5]:
CLIENT_ID = 'YNC4DJD21CJ0M3BKMNQO5V021W3T1UO5MIAHAUWMFEBQJF0R' # your Foursquare ID
CLIENT_SECRET = 'NK5IXFBQH2WZKPIYCFVP2LLA2RVWTQVFSEIFMDEB3XDEY4Z1' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT=100

In [6]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postcode', 
                  'Postcode Latitude', 
                  'Postcode Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Run the above function and save the forusquare data as a csv for faster performace in later runs of code.

In [7]:
#tor_venues = getNearbyVenues(names=tor_data['Postcode'],
                                   #latitudes=tor_data['Latitude'],
                                   #longitudes=tor_data['Longitude']
                                  #)
#tor_venues.to_csv('tor_venues_all.csv')

In [8]:
tor_venues=pd.read_csv('tor_venues_all.csv').drop('Unnamed: 0', axis=1)
tor_venues.head()

Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,43.806686,-79.194353,Wendy's,43.802008,-79.19808,Fast Food Restaurant
1,M1B,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
2,M1B,43.806686,-79.194353,Staples Morningside,43.800285,-79.196607,Paper / Office Supplies Store
3,M1B,43.806686,-79.194353,Harvey's,43.80002,-79.198307,Restaurant
4,M1B,43.806686,-79.194353,Caribbean Wave,43.798558,-79.195777,Caribbean Restaurant


In [9]:
# one hot encoding
tor_onehot = pd.get_dummies(tor_venues[['Venue Category']], prefix="", prefix_sep="")

# add postcode column back to dataframe
tor_onehot['Postcode']=tor_venues["Postcode"]
# move neighborhood column to the first column
fixed_columns = [tor_onehot.columns[-1]] + list(tor_onehot.columns[:-1])
tor_onehot = tor_onehot[fixed_columns]

print(tor_onehot.shape)
tor_onehot.head()

(4913, 328)


Unnamed: 0,Postcode,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
tor_rest_all=tor_onehot.groupby(by='Postcode').sum().reset_index()

columns=tor_rest_all.columns
rest_col=[]
for col in columns:
    if 'Restaurant' in col:
        rest_col.append(col)
    if 'Postcode' in col:
        rest_col.append(col)
tor_rest_all=tor_rest_all[rest_col]

tor_rest_all['Total Restaurants']=tor_rest_all.sum(axis=1)

tor_data['isin']=tor_data['Postcode'].isin(tor_rest_all['Postcode'])
tor_data_isin=tor_data[tor_data['isin']==True]
tor_rest_all['Postcode Latitude']=tor_data_isin["Latitude"]
tor_rest_all["Postcode Longitude"]=tor_data_isin["Longitude"]

cols = tor_rest_all.columns.tolist()
cols.insert(1, cols.pop(cols.index('Postcode Latitude')))
cols.insert(2, cols.pop(cols.index('Postcode Longitude')))
tor_rest_all=tor_rest_all[cols]

tor_rest_all.head()

Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Afghan Restaurant,American Restaurant,Asian Restaurant,Belgian Restaurant,Brazilian Restaurant,Cajun / Creole Restaurant,Cantonese Restaurant,...,Taiwanese Restaurant,Tapas Restaurant,Thai Restaurant,Theme Restaurant,Tibetan Restaurant,Turkish Restaurant,Udon Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Total Restaurants
0,M1B,43.806686,-79.194353,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
1,M1C,43.784535,-79.160497,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,M1E,43.763573,-79.188711,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
3,M1G,43.770992,-79.216917,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,M1H,43.773136,-79.239476,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,8


In [11]:
tor_mex=tor_onehot.drop(tor_onehot.columns[~tor_onehot.columns.str.contains('Mexican')], axis=1)
# add postcode column back to dataframe
tor_mex['Postcode']=tor_venues["Postcode"]

# move neighborhood column to the first column
fixed_columns = [tor_mex.columns[-1]] + list(tor_mex.columns[:-1])
tor_mex = tor_mex[fixed_columns]
tor_mex=tor_mex.groupby('Postcode').sum().reset_index()
tor_data['isin']=tor_data['Postcode'].isin(tor_mex['Postcode'])
tor_data_isin=tor_data[tor_data['isin']==True]
tor_mex['Postcode Latitude']=tor_data_isin["Latitude"]
tor_mex["Postcode Longitude"]=tor_data_isin["Longitude"]
tor_mex['Total Restaurants']=tor_rest_all['Total Restaurants']
tor_mex['Percent Mexican']=(tor_mex['Mexican Restaurant']/tor_mex['Total Restaurants']).round(2)
tor_mex=tor_mex.fillna(0)
tor_mex['Percent Mexican'].astype(int)

cols = tor_mex.columns.tolist()
cols.insert(1, cols.pop(cols.index('Postcode Latitude')))
cols.insert(2, cols.pop(cols.index('Postcode Longitude')))
tor_mex=tor_mex[cols]

tor_mex.head()

Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Mexican Restaurant,Total Restaurants,Percent Mexican
0,M1B,43.806686,-79.194353,0,6,0.0
1,M1C,43.784535,-79.160497,0,1,0.0
2,M1E,43.763573,-79.188711,0,3,0.0
3,M1G,43.770992,-79.216917,0,3,0.0
4,M1H,43.773136,-79.239476,0,8,0.0


In [12]:
tor_mex_sort_perc=tor_mex.sort_values('Percent Mexican', ascending=False).reset_index(drop=True)
tor_mex_sort_perc.head(10)

Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Mexican Restaurant,Total Restaurants,Percent Mexican
0,M9B,43.667856,-79.532242,1,2,0.5
1,M6E,43.693781,-79.428191,2,6,0.33
2,M1L,43.711112,-79.284577,1,3,0.33
3,M8V,43.662744,-79.321558,1,5,0.2
4,M4R,43.712751,-79.390197,2,11,0.18
5,M6C,43.709577,-79.445073,1,6,0.17
6,M7R,43.662301,-79.389494,2,19,0.11
7,M5S,43.67271,-79.405678,3,27,0.11
8,M5T,43.662696,-79.400049,3,30,0.1
9,M5A,43.66586,-79.38316,2,20,0.1


In [13]:
tor_mex_sort_tot=tor_mex.sort_values('Total Restaurants', ascending=False).reset_index(drop=True)
tor_mex_sort_tot.head(10)

Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Mexican Restaurant,Total Restaurants,Percent Mexican
0,M2N,43.789053,-79.408493,0,38,0.0
1,M4S,43.715383,-79.405678,2,36,0.06
2,M6G,43.689026,-79.453512,3,35,0.09
3,M4Y,43.667967,-79.367675,1,32,0.03
4,M4K,43.685347,-79.338106,0,32,0.0
5,M5R,43.696948,-79.411307,2,31,0.06
6,M6J,43.669005,-79.442259,1,30,0.03
7,M5T,43.662696,-79.400049,3,30,0.1
8,M5X,43.646435,-79.374846,0,30,0.0
9,M7A,43.651571,-79.48445,1,29,0.03


In [14]:
# set number of clusters
kclusters = 3

tor_mex_clust = tor_mex.drop(['Postcode','Postcode Latitude', 'Postcode Longitude','Mexican Restaurant'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tor_mex_clust)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [15]:
tor_mex["Cluster Labels"]=kmeans.labels_

In [16]:
tor_mex.head()

Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Mexican Restaurant,Total Restaurants,Percent Mexican,Cluster Labels
0,M1B,43.806686,-79.194353,0,6,0.0,2
1,M1C,43.784535,-79.160497,0,1,0.0,2
2,M1E,43.763573,-79.188711,0,3,0.0,2
3,M1G,43.770992,-79.216917,0,3,0.0,2
4,M1H,43.773136,-79.239476,0,8,0.0,2


In [17]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(tor_mex['Postcode Latitude'], tor_mex['Postcode Longitude'], tor_mex['Postcode'], tor_mex['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Cluster 0: red
# Description of this cluster
As you can see on the map, this cluster is located mostly in what is likely the suburb areas around Downtown Toronto, but do include some areas in downtown.  
If you look at the data below, you can see that these postcode areas have a medium number of restaurants and small-medium 

In [18]:
clust0=tor_mex.loc[tor_mex['Cluster Labels'] == 0]
print(clust0['Percent Mexican'].mean())
clust0.sort_values('Percent Mexican',ascending=False)

0.030000000000000002


Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Mexican Restaurant,Total Restaurants,Percent Mexican,Cluster Labels
45,M4R,43.712751,-79.390197,2,11,0.18,0
85,M7R,43.662301,-79.389494,2,19,0.11,0
52,M5A,43.66586,-79.38316,2,20,0.1,0
37,M4G,43.676357,-79.293031,1,10,0.1,0
13,M1T,43.781638,-79.304302,1,12,0.08,0
36,M4E,43.695344,-79.318389,1,15,0.07,0
75,M6H,43.669542,-79.422564,1,16,0.06,0
91,M8Z,43.636258,-79.498509,1,16,0.06,0
58,M5J,43.650571,-79.384568,1,19,0.05,0
38,M4H,43.70906,-79.363452,0,12,0.0,0


Cluster 1: Purple

In [19]:
clust1=tor_mex.loc[tor_mex['Cluster Labels'] == 1]
print(clust1['Percent Mexican'].mean())
clust1.sort_values('Percent Mexican',ascending=False)

0.0352


Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Mexican Restaurant,Total Restaurants,Percent Mexican,Cluster Labels
65,M5S,43.67271,-79.405678,3,27,0.11,1
66,M5T,43.662696,-79.400049,3,30,0.1,1
74,M6G,43.689026,-79.453512,3,35,0.09,1
81,M6P,43.673185,-79.487262,2,22,0.09,1
44,M4P,43.72802,-79.38879,2,25,0.08,1
56,M5G,43.644771,-79.373306,2,26,0.08,1
46,M4S,43.715383,-79.405678,2,36,0.06,1
64,M5R,43.696948,-79.411307,2,31,0.06,1
39,M4J,43.705369,-79.349372,1,27,0.04,1
77,M6K,43.647927,-79.41975,1,24,0.04,1


Cluster 2: Teal

In [20]:
clust2=tor_mex.loc[tor_mex['Cluster Labels'] == 2]
print(clust2['Percent Mexican'].mean())
clust2.sort_values('Percent Mexican',ascending=False)

0.030600000000000002


Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Mexican Restaurant,Total Restaurants,Percent Mexican,Cluster Labels
93,M9B,43.667856,-79.532242,1,2,0.5,2
7,M1L,43.711112,-79.284577,1,3,0.33,2
73,M6E,43.693781,-79.428191,2,6,0.33,2
87,M8V,43.662744,-79.321558,1,5,0.2,2
72,M6C,43.709577,-79.445073,1,6,0.17,2
0,M1B,43.806686,-79.194353,0,6,0.0,2
43,M4N,43.659526,-79.340923,0,0,0.0,2
49,M4W,43.686412,-79.400049,0,2,0.0,2
62,M5N,43.733283,-79.41975,0,7,0.0,2
67,M5V,43.653206,-79.400049,0,0,0.0,2
