# Week 3 peer-graded Assignment-Notebook 3

In [5]:
import json
import pandas as pd 
import numpy as np

from geopy.geocoders import Nominatim
import folium
import requests 
from sklearn.cluster import KMeans

import matplotlib.cm as cm
import matplotlib.colors as colors

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
!pip install geopy

Collecting geopy
  Downloading geopy-2.1.0-py3-none-any.whl (112 kB)
Collecting geographiclib<2,>=1.49
  Downloading geographiclib-1.50-py3-none-any.whl (38 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-2.1.0


In [4]:
!pip install folium

Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


In [6]:

page = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
page_data = page[0]
#will keep a copy of the original data in page_data
data = page_data[page_data['Borough'] !='Not assigned']

geospatial_data = pd.read_csv("https://cocl.us/Geospatial_data")
#merge datasets on the Postal Code column
tdata = pd.merge(data,geospatial_data,on=['Postal Code'])

In [7]:

#verify data schema
tdata.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [8]:

city = "Toronto, ON"
geolocator = Nominatim(user_agent="py_toronto_explorer")
location = geolocator.geocode(city)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


In [9]:

mapT = folium.Map(location=[latitude, longitude], zoom_start=10)
# add markers to map
for lat, lng, label in zip(tdata['Latitude'], tdata['Longitude'], tdata['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(mapT)  
    
mapT


In [14]:
CLIENT_ID = 'XXX' # your Foursquare ID
CLIENT_SECRET = 'XXX' # your Foursquare Secret
VERSION = '20180605'
LIMIT = 100
## get venues function

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        try:
            results = requests.get(url).json()["response"]['groups'][0]['items']
        except Exception as e:
            continue
     # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [15]:
print("[*] working...")
t_venues = getNearbyVenues(tdata['Neighbourhood'],tdata['Latitude'],tdata['Longitude'])
print("Done: Collected venue details.")

[*] working...
Done: Collected venue details.


In [16]:
t_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Brookbanks Pool,43.751389,-79.332184,Pool
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [17]:
#onehot
t_onehot = pd.get_dummies(t_venues[['Venue Category']], prefix="", prefix_sep="")
# set neighborhood column values
t_onehot['Neighborhood'] = t_venues['Neighborhood'] 

#group by neighborhood, get average count per venue category
t_oh_grouped = t_onehot.groupby('Neighborhood').mean().reset_index()

k = 5 # number of clusters, k
# drop the neighborhood column then use as inuput to k-means
t_oh_grouped_clust_src = t_oh_grouped.drop('Neighborhood',axis=1)

In [18]:

kmeans = KMeans(n_clusters=k, random_state=0).fit(t_oh_grouped_clust_src)
kmeans.labels_[1:20]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [21]:
t_oh_grouped['n_name'] = t_oh_grouped['Neighborhood'].astype(str) # add neighbourhood name string column
s_nnames = t_oh_grouped['n_name'] #series containing only neighbourhood names
s_cluster_label = pd.Series(kmeans.labels_) #series containing the cluster labels
neighborhood_clusters = pd.DataFrame({"Neighbourhood":s_nnames,"Cluster":s_cluster_label}) #dataset with neighbourhood name and their corresponding custer label
n_with_clust_names = pd.merge(neighborhood_clusters,tdata, on=['Neighbourhood'],how="inner" ) #add geo long and lat data to the neighbourhood cluster dataset

In [22]:
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

mapT = folium.Map(location=[latitude, longitude], zoom_start=10)
# add markers to map
for lat, lng, label, cluster_label in zip(n_with_clust_names['Latitude'], n_with_clust_names['Longitude'], n_with_clust_names['Neighbourhood'],n_with_clust_names['Cluster']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=rainbow[cluster_label-1],
        fill=True,
        fill_color=rainbow[cluster_label-1],
        parse_html=False).add_to(mapT)  
    
mapT

In [23]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [24]:
t_grouped = t_oh_grouped.drop('n_name',axis=1)
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
t_neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
t_neighborhoods_venues_sorted['Neighborhood'] = t_grouped['Neighborhood']

for ind in np.arange(t_oh_grouped.shape[0]):
    t_neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(t_grouped.iloc[ind, :], num_top_venues)

t_neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Agincourt,Lounge,Latin American Restaurant,Breakfast Spot,Yoga Studio,Electronics Store
1,"Alderwood, Long Branch",Pizza Place,Pub,Gym,Sandwich Place,Athletics & Sports
2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Pharmacy,Bridal Shop,Shopping Mall
3,Bayview Village,Café,Chinese Restaurant,Bank,Japanese Restaurant,Event Space
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Sandwich Place,Italian Restaurant,Thai Restaurant,Pharmacy


In [25]:
t_neighborhoods_venues_sorted[t_neighborhoods_venues_sorted['Neighborhood']=='Agincourt']

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Agincourt,Lounge,Latin American Restaurant,Breakfast Spot,Yoga Studio,Electronics Store


In [31]:
#neighborhood_clusters['Neighborhood'] = neighborhood_clusters['Neighbourhood']
neighbourhod_venues_ranked = pd.merge(neighborhood_clusters, t_neighborhoods_venues_sorted, on=['Neighborhood'] )
neighbourhod_venues_ranked = neighbourhod_venues_ranked.drop('Neighborhood',axis=1)

KeyError: 'Neighborhood'

In [None]:
neighbourhod_venues_ranked[neighbourhod_venues_ranked['Cluster']==0]


In [None]:
neighbourhod_venues_ranked[neighbourhod_venues_ranked['Cluster']==1]


In [None]:
neighbourhod_venues_ranked[neighbourhod_venues_ranked['Cluster']==2]


In [27]:
neighbourhod_venues_ranked[neighbourhod_venues_ranked['Cluster']==3]


NameError: name 'neighbourhod_venues_ranked' is not defined

In [28]:
neighbourhod_venues_ranked[neighbourhod_venues_ranked['Cluster']==4]


NameError: name 'neighbourhod_venues_ranked' is not defined