# Capstone Project Week 3 Assignment

Clustering Toronto's neighborhoods

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

## Part 1: Scrape the Wiki with beautiful soup and clean the data

In [187]:
path = "C:\\Users\\schuetzenberger\\Documents\\main\\COURSES\\IBM Data Science\\Capstone\\PostalCodes.html"

In [31]:
# read the html
with open(path, 'rb') as fp:
    soup = BeautifulSoup(fp)

In [55]:
# get the table element
table = soup.find_all("table")[0]

In [188]:
# read all rows in the table, and then all elements in a row into a nested list
rs = []
for row in table.find_all('tr'):
    r = []
    for i in row.find_all('td'):
        r.append(i.text[:-2])
    rs.append(r)

In [189]:
# convert the nested list into a pandas datafram 
df = pd.DataFrame(rs, columns = ['PostalCode', 'Borough', 'Neighborhood'])

In [190]:
# remove all rows with a 'Not assigned' borough
df1 = df[df['Borough'] != 'Not assigned']
# remove the first row which contains only 'None'
df1 = df1.drop(0)

In [191]:
# check whether their are any unassigned neighborhoods
df1[df1['Neighborhood'] == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood


In [192]:
# check whether the neighborhoods for the postcode M5A are listed correctly
df1[df1['PostalCode'] == 'M5A']

Unnamed: 0,PostalCode,Borough,Neighborhood
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [193]:
#sort the table by postal code and reset the index such that the index goes from 0 to n
df1.sort_values('PostalCode', inplace = True)
df1.reset_index(inplace=True, drop = True)

In [195]:
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [196]:
df1.shape

(103, 3)

## Part 2: Get the coordinates

In [197]:
import geocoder as gc

In [203]:
gc.google('Mountain View, CA')

<[REQUEST_DENIED] Google - Geocode [empty]>

In [None]:
# getting the location data from the csv file 

In [207]:
coords = pd.read_csv("C:\\Users\\schuetzenberger\\Documents\\main\\COURSES\\IBM Data Science\\Capstone\\Geospatial_Coordinates.csv")

In [217]:
coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [220]:
df1['Latitude'] = coords['Latitude']
df1['Longitude'] = coords['Longitude']

In [222]:
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part 3: Clustering

### 3.1 Looking at the map + data 

In [334]:
import folium
from html import escape, unescape
import requests # library to handle requests
from pandas.io.json import json_normalize

In [232]:
latitude = df1['Latitude'].mean()
longitude = df1['Longitude'].mean()

In [290]:
i = 0
for borough, neighborhood in zip(df1['Borough'], df1['Neighborhood']):
    i+=1
    label = '{}, {}'.format(neighborhood, borough)
    
    if "'" in label:
        print(label.find("'"))
        label = label.replace("'", '')
    print(i, label)

1 Malvern, Rouge, Scarborough
2 Rouge Hill, Port Union, Highland Creek, Scarborough
3 Guildwood, Morningside, West Hill, Scarborough
4 Woburn, Scarborough
5 Cedarbrae, Scarborough
6 Scarborough Village, Scarborough
7 Kennedy Park, Ionview, East Birchmount Park, Scarborough
8 Golden Mile, Clairlea, Oakridge, Scarborough
9 Cliffside, Cliffcrest, Scarborough Village West, Scarborough
10 Birch Cliff, Cliffside West, Scarborough
11 Dorset Park, Wexford Heights, Scarborough Town Centre, Scarborough
12 Wexford, Maryvale, Scarborough
13 Agincourt, Scarborough
21
14 Clarks Corners, Tam OShanter, Sullivan, Scarborough
42
15 Milliken, Agincourt North, Steeles East, LAmoreaux East, Scarborough
15
16 Steeles West, LAmoreaux West, Scarborough
17 Upper Rouge, Scarborough
18 Hillcrest Village, North York
19 Fairview, Henry Farm, Oriole, North York
20 Bayview Village, North York
21 York Mills, Silver Hills, North York
22 Willowdale, Newtonbrook, North York
23 Willowdale, Willowdale East, North York
24 

In [281]:
label

<folium.map.Popup at 0x15304e5df60>

In [301]:

# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df1['Latitude'], df1['Longitude'], df1['Borough'], df1['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    #print(label)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        #popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### 3.2. Retrieving information from foursquare about the neighborhoods

In [302]:
CLIENT_ID = 'L3WPUO2ZAF3LLZT1LZQ01MKA14AJT2XZSCYIJ2S1LAPNQGPL' # your Foursquare ID
CLIENT_SECRET = 'LQGKSTXFJ5GNDLST4FGHUO3AEMPRZRFEADDVZTFKJSJWVTXG' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: L3WPUO2ZAF3LLZT1LZQ01MKA14AJT2XZSCYIJ2S1LAPNQGPL
CLIENT_SECRET:LQGKSTXFJ5GNDLST4FGHUO3AEMPRZRFEADDVZTFKJSJWVTXG


#### Testing retreiving the venue list from foursquare for the center location in Toronto

In [307]:
radius = 400 
LIMIT = 100

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?client_id=L3WPUO2ZAF3LLZT1LZQ01MKA14AJT2XZSCYIJ2S1LAPNQGPL&client_secret=LQGKSTXFJ5GNDLST4FGHUO3AEMPRZRFEADDVZTFKJSJWVTXG&ll=43.70460773398059,-79.39715291165048&v=20180605&radius=400&limit=100'

In [310]:
#retreiving the venue list
results = requests.get(url)
results = results.json()['response']['groups'][0]['items']

In [343]:
# getting the relevant venue features
nearby = json_normalize(results)
features = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby = nearby[features]

In [346]:
# function to get the category names from the 'venue.categories' columns
def get_cats(row):
    cats = []
    for i in range(len(row['venue.categories'])):
        cat = row['venue.categories'][i]['name']
        cats.append(cat)
    return cats

In [350]:
# applying the category function and renaming the df columns
nearby['categorgies'] = nearby.apply(get_cats, axis = 1)
nearby.drop('venue.categories', axis = 1, inplace = True)
nearby.rename(columns = {'venue.name': 'Name', 'venue.location.lat':'Latitude', 'venue.location.lng':'Longitude'}, inplace = True)
nearby.head()

#### Retrieving the venues for all Toronto neighborhoods 

In [476]:
def get_venues_nbhd(lat, lng, radius, limit):
    
    url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, lat, lng, VERSION, radius, LIMIT)
    results = requests.get(url).json()['response']['groups'][0]['items']

    nearby = json_normalize(results)
    try:
        nearby = nearby[['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']]
        nearby['categorgies'] = nearby.apply(get_cats, axis = 1)
        nearby.drop('venue.categories', axis = 1, inplace = True)
        nearby.rename(columns = {'venue.name': 'Name', 'venue.location.lat':'Latitude', 'venue.location.lng':'Longitude'}, inplace = True)
    
        return nearby
    except:
        return pd.DataFrame(columns = ['Name', 'Latitude', 'Longitude', 'categorgies'])

In [477]:
venues = pd.DataFrame(columns = ['Name', 'Latitude', 'Longitude', 'categorgies', 'Nbhd_lat', 'Nbhd_lng', 'Nbhd', 'Borough'] )
LIMIT = 100
radius = 600
for nbhd in range(df1.shape[0]):
    print(nbhd)
    lat = df1['Latitude'][nbhd]
    lng = df1['Longitude'][nbhd]
    print(lat, lng)
    venuedf = get_venues_nbhd(lat, lng, radius, LIMIT)
    venuedf['Nbhd_lat'] = lat
    venuedf['Nbhd_lng'] = lng
    venuedf['Nbhd'] = df1['Neighborhood'][nbhd]
    venuedf['Borough'] = df1['Borough'][nbhd]
    
    venues = venues.append(venuedf, ignore_index = True)

0
43.806686299999996 -79.19435340000001
1
43.7845351 -79.16049709999999
2
43.7635726 -79.1887115
3
43.7709921 -79.21691740000001
4
43.773136 -79.23947609999999
5
43.7447342 -79.23947609999999
6
43.7279292 -79.26202940000002
7
43.711111700000004 -79.2845772
8
43.716316 -79.23947609999999
9
43.692657000000004 -79.2648481
10
43.7574096 -79.27330400000001
11
43.750071500000004 -79.2958491
12
43.7942003 -79.26202940000002
13
43.7816375 -79.3043021
14
43.8152522 -79.2845772
15
43.799525200000005 -79.3183887
16
43.836124700000006 -79.20563609999999
17
43.8037622 -79.3634517
18
43.7785175 -79.3465557
19
43.7869473 -79.385975
20
43.7574902 -79.37471409999999
21
43.789053 -79.40849279999999
22
43.7701199 -79.40849279999999
23
43.752758299999996 -79.4000493
24
43.7827364 -79.4422593
25
43.7532586 -79.3296565
26
43.745905799999996 -79.352188
27
43.72589970000001 -79.340923
28
43.7543283 -79.4422593
29
43.7679803 -79.48726190000001
30
43.737473200000004 -79.46476329999999
31
43.7390146 -79.5069436


In [478]:
venues.head()

Unnamed: 0,Name,Latitude,Longitude,categorgies,Nbhd_lat,Nbhd_lng,Nbhd,Borough
0,Images Salon & Spa,43.802283,-79.198565,[Spa],43.806686,-79.194353,"Malvern, Rouge",Scarborough
1,Wendy’s,43.807448,-79.199056,[Fast Food Restaurant],43.806686,-79.194353,"Malvern, Rouge",Scarborough
2,Wendy's,43.802008,-79.19808,[Fast Food Restaurant],43.806686,-79.194353,"Malvern, Rouge",Scarborough
3,Lee Valley,43.803161,-79.199681,[Hobby Shop],43.806686,-79.194353,"Malvern, Rouge",Scarborough
4,Royal Canadian Legion,43.782533,-79.163085,[Bar],43.784535,-79.160497,"Rouge Hill, Port Union, Highland Creek",Scarborough


In [479]:
venues.groupby('Nbhd').count()

Unnamed: 0_level_0,Name,Latitude,Longitude,categorgies,Nbhd_lat,Nbhd_lng,Borough
Nbhd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Agincourt,5,5,5,5,5,5,5
"Alderwood, Long Branch",10,10,10,10,10,10,10
"Bathurst Manor, Wilson Heights, Downsview North",22,22,22,22,22,22,22
Bayview Village,4,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",28,28,28,28,28,28,28
Berczy Park,91,91,91,91,91,91,91
"Birch Cliff, Cliffside West",6,6,6,6,6,6,6
"Brockton, Parkdale Village, Exhibition Place",41,41,41,41,41,41,41
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",23,23,23,23,23,23,23
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",15,15,15,15,15,15,15


#### Cleaning the 'categories' column

In [480]:
def list_to_string(entry):
    emptystr = ''
    for i in range(len(entry)):
        emptystr += entry[i]
    return emptystr
venues['cats'] = venues['categorgies'].apply(list_to_string)

In [484]:
venues.drop(columns = 'categorgies',  axis = 1, inplace = True)

In [485]:
venues.head()

Unnamed: 0,Name,Latitude,Longitude,Nbhd_lat,Nbhd_lng,Nbhd,Borough,cats
0,Images Salon & Spa,43.802283,-79.198565,43.806686,-79.194353,"Malvern, Rouge",Scarborough,Spa
1,Wendy’s,43.807448,-79.199056,43.806686,-79.194353,"Malvern, Rouge",Scarborough,Fast Food Restaurant
2,Wendy's,43.802008,-79.19808,43.806686,-79.194353,"Malvern, Rouge",Scarborough,Fast Food Restaurant
3,Lee Valley,43.803161,-79.199681,43.806686,-79.194353,"Malvern, Rouge",Scarborough,Hobby Shop
4,Royal Canadian Legion,43.782533,-79.163085,43.784535,-79.160497,"Rouge Hill, Port Union, Highland Creek",Scarborough,Bar


In [486]:
print('There are {} uniques categories.'.format(len(venues['cats'].unique())))

There are 295 uniques categories.


#### Visualizing all venues and neighborhoods

In [491]:


# create map of one neighborhood and all venues, using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df1['Latitude'], df1['Longitude'], df1['Borough'], df1['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    #print(label)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        #popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
for lat, lng in zip(venues['Latitude'], venues['Longitude']):
    #label = '{}, {}'.format(neighborhood, borough)
    #print(label)
    #label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        #popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)    

map_toronto

### 3.3 One hot encoding of venue categories and preparing the df for clustering

In [492]:
venues.head()

Unnamed: 0,Name,Latitude,Longitude,Nbhd_lat,Nbhd_lng,Nbhd,Borough,cats
0,Images Salon & Spa,43.802283,-79.198565,43.806686,-79.194353,"Malvern, Rouge",Scarborough,Spa
1,Wendy’s,43.807448,-79.199056,43.806686,-79.194353,"Malvern, Rouge",Scarborough,Fast Food Restaurant
2,Wendy's,43.802008,-79.19808,43.806686,-79.194353,"Malvern, Rouge",Scarborough,Fast Food Restaurant
3,Lee Valley,43.803161,-79.199681,43.806686,-79.194353,"Malvern, Rouge",Scarborough,Hobby Shop
4,Royal Canadian Legion,43.782533,-79.163085,43.784535,-79.160497,"Rouge Hill, Port Union, Highland Creek",Scarborough,Bar


In [497]:
# one hot encoding
venues_onehot = pd.get_dummies(venues[['cats']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
venues_onehot['Nbhd'] = venues['Nbhd'] 

# move neighborhood column to the first column
fixed_columns = [venues_onehot.columns[-1]] + list(venues_onehot.columns[:-1])
manhattan_onehot = venues_onehot[fixed_columns]

venues_onehot.head()

Unnamed: 0,ATM,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Amphitheater,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Nbhd
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"Malvern, Rouge"
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"Malvern, Rouge"
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"Malvern, Rouge"
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"Malvern, Rouge"
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"Rouge Hill, Port Union, Highland Creek"


In [502]:
venues_grouped = venues_onehot.groupby('Nbhd').mean().reset_index()
venues_grouped.head()

Unnamed: 0,Nbhd,ATM,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [503]:
venues_grouped.shape

(97, 296)

### 3.4 Clustering the neighborhoods based on the frequency of venue categories

In [505]:
from sklearn.cluster import KMeans

In [508]:
# set number of clusters
kclusters = 5

venues_grouped_clustering = venues_grouped.drop('Nbhd', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(venues_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 4, 4, 4, 4, 4, 4, 4, 0, 4])

In [520]:
labelled_nbhds = pd.concat([venues_grouped['Nbhd'], pd.Series(kmeans.labels_)], axis = 1)

In [532]:
labelled_nbhds_w_coordinates = df1.copy()
labelled_nbhds_w_coordinates = labelled_nbhds_w_coordinates.join(labelled_nbhds.set_index('Nbhd'), on='Neighborhood')
labelled_nbhds_w_coordinates.rename(columns = {0:'Clusters'}, inplace = True)

In [545]:
labelled_nbhds_w_coordinates.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Clusters
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,0.0
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,2.0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0.0
3,M1G,Scarborough,Woburn,43.770992,-79.216917,4.0
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,4.0


In [558]:
labelled_nbhds_w_coordinates = labelled_nbhds_w_coordinates[~np.isnan(labelled_nbhds_w_coordinates['Clusters'])]

#### How many neighborhoods are in each cluster? 

In [564]:
labelled_nbhds_w_coordinates.groupby('Clusters').count()

Unnamed: 0_level_0,PostalCode,Borough,Neighborhood,Latitude,Longitude
Clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,36,36,36,36,36
1.0,1,1,1,1,1
2.0,1,1,1,1,1
3.0,5,5,5,5,5
4.0,58,58,58,58,58


In [547]:
import matplotlib.cm as cm
import matplotlib.colors as colors

#### Visualizing the clusters

In [562]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(labelled_nbhds_w_coordinates['Latitude'], labelled_nbhds_w_coordinates['Longitude'], labelled_nbhds_w_coordinates['Neighborhood'], labelled_nbhds_w_coordinates['Clusters']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        #popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters