### Capstone Project - The Battle of Neighbourhoods - Week 5

##### Load Libraries

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
import urllib.request
import json

from urllib.request import urlopen
import requests
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

import matplotlib.pyplot as plt

%matplotlib inline
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Libraries loaded.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2020.12.5          |   py36h5fab9bb_1         143 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-2.1.0                |     pyhd3deb0d_0          64 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         240 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-2.1.0-pyhd3deb0d_0

The following packages will be UPDATED:

  certifi                          2020.12.5-py36h5fab9bb_0 --> 202

##### Load Toronto Neighbourhood Data file

In [4]:
# Read the cleaned Toronto neighbourhood table from the CSV next to the notebook.
toronto_csv_path = r'Toronto Neighborhood data  -Cleaned CSV file.csv'
neighborhoods = pd.read_csv(toronto_csv_path)
neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


##### Determine Location data of Toronto, ON

In [5]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() yields None when the service finds no match; fail with a clear
# message instead of an AttributeError on `location.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

# Map centre reused by every folium.Map created later in the notebook.
latitude, longitude = location.latitude, location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


##### Create map of North York, Toronto with Neighborhoods superimposed on top of the map
###### This is required since we are interested only in the North York region of Toronto

In [7]:
# Keep only the rows whose borough is North York and renumber them from 0.
is_north_york = neighborhoods['Borough'].eq('North York')
NorthYork_data = neighborhoods.loc[is_north_york].reset_index(drop=True)
NorthYork_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M2H,North York,Hillcrest Village,43.803762,-79.363452
1,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
2,M2K,North York,Bayview Village,43.786947,-79.385975
3,M2L,North York,"York Mills, Silver Hills",43.75749,-79.374714
4,M2M,North York,"Willowdale, Newtonbrook",43.789053,-79.408493


In [9]:
import folium

# Base map centred on the geocoded Toronto coordinates.
map_NorthYork = folium.Map(location=[latitude, longitude], zoom_start=10)

# Appearance shared by every neighbourhood marker.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle per North York neighbourhood, with a "<neighbourhood>, <borough>" popup.
marker_fields = ('Latitude', 'Longitude', 'Borough', 'Neighborhood')
for lat, lng, borough, neighborhood in zip(*(NorthYork_data[field] for field in marker_fields)):
    popup_text = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(popup_text, parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=label, **marker_style)
    marker.add_to(map_NorthYork)

map_NorthYork

### Foursquare Venues near North York, Toronto

In [11]:
import os
import warnings

LIMIT = 500    # value sent as the `limit` query parameter of each Foursquare search
radius = 5000  # notebook-level radius; getNearbyVenues has its own `radius` parameter

# Credentials must not be stored in the notebook. Export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET in the environment that starts the kernel.
# NOTE(review): the key pair previously committed here should be considered
# leaked and rotated in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20181020'  # value sent as the `v` query parameter

if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare requests will be rejected until they are.'
    )

In [12]:
import urllib
def getNearbyVenues(names, latitudes, longitudes, radius=5000, categoryIds=''):
    """Search Foursquare for venues around each neighbourhood centre.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighbourhood name and the coordinates to search around.
    radius : int, default 5000
        Value sent as the `radius` query parameter.
    categoryIds : str, default ''
        Value sent as the `categoryId` query parameter; when empty the
        parameter is omitted.

    Returns
    -------
    pandas.DataFrame
        One row per venue, with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty, but still carries these columns, when nothing was found.

    Notes
    -----
    Uses the notebook-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    A failed request for one neighbourhood is reported and skipped so the
    remaining neighbourhoods are still processed.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    search_url = 'https://api.foursquare.com/v2/venues/search'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and encode the query string instead of formatting it by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        if categoryIds != '':
            params['categoryId'] = categoryIds

        try:
            # The timeout stops a stalled connection from hanging the notebook.
            response = requests.get(search_url, params=params, timeout=30)
            response.raise_for_status()
            results = response.json()['response']['venues']
        except (requests.RequestException, KeyError, ValueError) as err:
            print('Venue search failed for {}: {}'.format(name, err))
            continue

        for v in results:
            try:
                row = (
                    name,
                    lat,
                    lng,
                    v['name'],
                    v['location']['lat'],
                    v['location']['lng'],
                    v['categories'][0]['name'],
                )
            except (KeyError, IndexError):
                # Venue without a category (or missing a required field): skip it.
                continue
            rows.append(row)

    return pd.DataFrame(rows, columns=columns)

In [None]:
### Indian Restaurants in North York, Toronto area

In [13]:
# Category ids are listed at https://developer.foursquare.com/docs/resources/categories
# IndianRestaurant = 4bf58dd8d48988d10f941735
indian_restaurant_category = '4bf58dd8d48988d10f941735'

# NOTE(review): this replaces `neighborhoods` with its North York subset.
north_york_rows = neighborhoods['Borough'] == 'North York'
neighborhoods = neighborhoods[north_york_rows].reset_index(drop=True)

NorthYork_venues_IndianRestaurant = getNearbyVenues(
    names=neighborhoods['Neighborhood'],
    latitudes=neighborhoods['Latitude'],
    longitudes=neighborhoods['Longitude'],
    radius=1000,
    categoryIds=indian_restaurant_category,
)
NorthYork_venues_IndianRestaurant.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,Good Karma,43.778239,-79.343575,Indian Restaurant
1,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,Vatica,43.775762,-79.330994,Chaat Place
2,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,Iqbal Kebab & Sweets Centre,43.774032,-79.341003,Indian Restaurant
3,"Willowdale, Newtonbrook",43.789053,-79.408493,Earth Indian Restaurant,43.787524,-79.417466,Indian Restaurant
4,"Willowdale, Newtonbrook",43.789053,-79.408493,Tandori Indian Cuisine,43.780807,-79.416185,Indian Restaurant


##### Data dimensions

In [14]:
# (rows, columns) of the venue table returned by getNearbyVenues.
NorthYork_venues_IndianRestaurant.shape

(24, 7)

##### Adding to Map

In [15]:
def addToMap(df, color, existingMap):
    """Draw one circle marker per row of `df` on `existingMap`.

    `df` must provide the columns 'Venue Latitude', 'Venue Longitude',
    'Neighborhood', 'Venue' and 'Venue Category'. Each marker uses `color`
    for both outline and fill, and its popup reads
    "<venue> (<category>) - <neighbourhood>". The map is modified in place.
    """
    fields = ('Venue Latitude', 'Venue Longitude', 'Neighborhood', 'Venue', 'Venue Category')
    for v_lat, v_lng, hood, venue_name, category in zip(*(df[field] for field in fields)):
        popup_text = '{} ({}) - {}'.format(venue_name, category, hood)
        marker = folium.CircleMarker(
            [v_lat, v_lng],
            radius=5,
            popup=folium.Popup(popup_text, parse_html=True),
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.7,
        )
        marker.add_to(existingMap)

In [17]:
# Fresh map centred on the geocoded Toronto coordinates, with every venue drawn in red.
map_NorthYork_IndianRestaurant = folium.Map(zoom_start=10, location=[latitude, longitude])
addToMap(
    df=NorthYork_venues_IndianRestaurant,
    color='red',
    existingMap=map_NorthYork_IndianRestaurant,
)

map_NorthYork_IndianRestaurant

In [18]:
def addColumn(startDf, columnTitle, dataDf):
    """Add a per-neighbourhood venue count column to `startDf`, in place.

    Parameters
    ----------
    startDf : pandas.DataFrame
        Frame with a 'Neighborhood' column; it receives the new column.
    columnTitle : str
        Name of the column to create or overwrite.
    dataDf : pandas.DataFrame
        Venue table with 'Neighborhood' and 'Venue' columns.

    Returns
    -------
    None
        `startDf` is modified in place. Neighbourhoods that do not appear in
        `dataDf` get a count of 0.
    """
    # Non-null 'Venue' entries per neighbourhood, indexed by neighbourhood name.
    venue_counts = dataDf.groupby('Neighborhood')['Venue'].count()
    # A single vectorised lookup replaces the per-row assignment and bare except.
    startDf[columnTitle] = (
        startDf['Neighborhood'].map(venue_counts).fillna(0).astype(int)
    )

In [19]:
# Non-null entries per column for each neighbourhood, i.e. how many venues were returned for it.
venues_by_neighborhood = NorthYork_venues_IndianRestaurant.groupby('Neighborhood')
NorthYork_grouped = venues_by_neighborhood.count()
NorthYork_grouped

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Bedford Park, Lawrence Manor East",4,4,4,4,4,4
Don Mills,4,4,4,4,4,4
"Fairview, Henry Farm, Oriole",3,3,3,3,3,3
Glencairn,2,2,2,2,2,2
"Lawrence Manor, Lawrence Heights",1,1,1,1,1,1
"Northwood Park, York University",1,1,1,1,1,1
Victoria Village,2,2,2,2,2,2
"Willowdale, Newtonbrook",2,2,2,2,2,2
"Willowdale, Willowdale East",3,3,3,3,3,3
York Mills West,2,2,2,2,2,2


### Neighborhood Analysis

In [20]:

# One-hot encode the category of every venue row.
venue_categories = NorthYork_venues_IndianRestaurant[['Venue Category']]
NorthYork_onehot = pd.get_dummies(venue_categories, prefix="", prefix_sep="")

# Re-attach the neighbourhood each venue row belongs to.
NorthYork_onehot['Neighborhood'] = NorthYork_venues_IndianRestaurant['Neighborhood']

# Rotate the last column to the front so that Neighborhood leads the frame.
all_columns = list(NorthYork_onehot.columns)
fixed_columns = all_columns[-1:] + all_columns[:-1]
NorthYork_onehot = NorthYork_onehot[fixed_columns]

NorthYork_onehot.head()

Unnamed: 0,Neighborhood,Chaat Place,Indian Chinese Restaurant,Indian Restaurant
0,"Fairview, Henry Farm, Oriole",0,0,1
1,"Fairview, Henry Farm, Oriole",1,0,0
2,"Fairview, Henry Farm, Oriole",0,0,1
3,"Willowdale, Newtonbrook",0,0,1
4,"Willowdale, Newtonbrook",0,0,1


In [21]:
# The mean of the one-hot columns gives each category's share of a neighbourhood's venues.
category_share = NorthYork_onehot.groupby('Neighborhood').mean()
NorthYork_grouped = category_share.reset_index()
NorthYork_grouped

Unnamed: 0,Neighborhood,Chaat Place,Indian Chinese Restaurant,Indian Restaurant
0,"Bedford Park, Lawrence Manor East",0.0,0.0,1.0
1,Don Mills,0.0,0.0,1.0
2,"Fairview, Henry Farm, Oriole",0.333333,0.0,0.666667
3,Glencairn,0.0,0.0,1.0
4,"Lawrence Manor, Lawrence Heights",0.0,0.0,1.0
5,"Northwood Park, York University",0.0,0.0,1.0
6,Victoria Village,0.0,0.0,1.0
7,"Willowdale, Newtonbrook",0.0,0.0,1.0
8,"Willowdale, Willowdale East",0.0,0.333333,0.666667
9,York Mills West,0.0,0.0,1.0


#### Most Common Venues

In [22]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first element of `row` (the neighbourhood name) is ignored. The rest
    is sorted in descending order and the index labels of the leading entries
    are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [28]:
num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# Column headers: 'Neighborhood' followed by '1st', '2nd', '3rd', '4th', ... labels.
columns = ['Neighborhood']
for ind in range(num_top_venues):
    # Explicit bounds check instead of a bare `except:` around the list lookup.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# One row per neighbourhood; the ranking columns are filled in below.
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = NorthYork_grouped['Neighborhood']

for ind in np.arange(NorthYork_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(NorthYork_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,"Bedford Park, Lawrence Manor East",Indian Restaurant,Indian Chinese Restaurant,Chaat Place
1,Don Mills,Indian Restaurant,Indian Chinese Restaurant,Chaat Place
2,"Fairview, Henry Farm, Oriole",Indian Restaurant,Chaat Place,Indian Chinese Restaurant
3,Glencairn,Indian Restaurant,Indian Chinese Restaurant,Chaat Place
4,"Lawrence Manor, Lawrence Heights",Indian Restaurant,Indian Chinese Restaurant,Chaat Place


### Cluster Neighborhoods

In [53]:
# set number of clusters
kclusters = 4

NorthYork_grouped_clustering = NorthYork_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(NorthYork_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

  return_n_iter=True)


array([0, 0, 2, 0, 0, 0, 0, 0, 1, 0], dtype=int32)

In [None]:
# Attach the k-means label of each neighbourhood as the first column.
# Drop a label column left by an earlier run first, so re-running this cell
# does not fail with "cannot insert Cluster Labels, already exists".
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Join labels and ranked venue columns onto the North York table by neighbourhood.
# Neighbourhoods absent from neighborhoods_venues_sorted get NaN in the joined columns.
NorthYork_merged = NorthYork_data
NorthYork_merged = NorthYork_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

NorthYork_merged.head()

### Create Clusters

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Rows without a cluster label (NaN) cannot be coloured, so they are skipped.
# The remaining labels are cast to int so they can index `rainbow`.
clustered = NorthYork_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(
        clustered['Latitude'],
        clustered['Longitude'],
        clustered['Neighborhood'],
        clustered['Cluster Labels'].astype(int)):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [56]:

# Neighbourhoods in cluster 0: column 1 plus every column from position 5 onward.
cluster_columns = NorthYork_merged.columns[[1] + list(range(5, NorthYork_merged.shape[1]))]
in_cluster = NorthYork_merged['Cluster Labels'] == 0
NorthYork_merged.loc[in_cluster, cluster_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
4,North York,0.0,Indian Restaurant,Indian Chinese Restaurant,Chaat Place
6,North York,0.0,Indian Restaurant,Indian Chinese Restaurant,Chaat Place
9,North York,0.0,Indian Restaurant,Indian Chinese Restaurant,Chaat Place
10,North York,0.0,Indian Restaurant,Indian Chinese Restaurant,Chaat Place
12,North York,0.0,Indian Restaurant,Indian Chinese Restaurant,Chaat Place
17,North York,0.0,Indian Restaurant,Indian Chinese Restaurant,Chaat Place
18,North York,0.0,Indian Restaurant,Indian Chinese Restaurant,Chaat Place
19,North York,0.0,Indian Restaurant,Indian Chinese Restaurant,Chaat Place
20,North York,0.0,Indian Restaurant,Indian Chinese Restaurant,Chaat Place


In [57]:
# Neighbourhoods in cluster 1: column 1 plus every column from position 5 onward.
cluster_columns = NorthYork_merged.columns[[1] + list(range(5, NorthYork_merged.shape[1]))]
in_cluster = NorthYork_merged['Cluster Labels'] == 1
NorthYork_merged.loc[in_cluster, cluster_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
5,North York,1.0,Indian Restaurant,Indian Chinese Restaurant,Chaat Place


In [58]:
# Neighbourhoods in cluster 2: column 1 plus every column from position 5 onward.
cluster_columns = NorthYork_merged.columns[[1] + list(range(5, NorthYork_merged.shape[1]))]
in_cluster = NorthYork_merged['Cluster Labels'] == 2
NorthYork_merged.loc[in_cluster, cluster_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
1,North York,2.0,Indian Restaurant,Chaat Place,Indian Chinese Restaurant


In [59]:
# Neighbourhoods in cluster 3: column 1 plus every column from position 5 onward.
cluster_columns = NorthYork_merged.columns[[1] + list(range(5, NorthYork_merged.shape[1]))]
in_cluster = NorthYork_merged['Cluster Labels'] == 3
NorthYork_merged.loc[in_cluster, cluster_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
