## Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

## Section 1. Parse data and make a dataframe

In [1]:
import requests
import urllib.request
from bs4 import BeautifulSoup

In [2]:
import numpy as np
import pandas as pd
from urllib.request import urlopen

### Parse data from the web page

In [3]:
# Download the Wikipedia page that lists the "M" postal codes and parse it.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# The context manager closes the HTTP response once parsing has finished
# (the response object was previously left open).
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')

# Keep only the <tbody> elements: from here on `soup` is the list of table
# bodies found on the page, which the next cell iterates over.
soup = soup.find_all('tbody')


In [4]:
# The page contains more than one table; only the first one is read here,
# so the scan is restricted to the first <tbody> element.
post_codes = []
districts = []
neighbourhoods = []

for table in soup[:1]:
    for row in table.find_all('tr'):
        cells = row.find_all('td')

        # Rows with fewer than three <td> cells are skipped. cells[2] is read
        # below, so the previous `len(cells) > 1` guard would have raised an
        # IndexError on a two-cell row.
        if len(cells) >= 3:
            post_codes.append(cells[0].text.strip())
            districts.append(cells[1].text.strip())
            neighbourhoods.append(cells[2].text.strip())

### Make a dataframe from the parsed data

In [5]:
# Assemble the three scraped lists into one table, one row per parsed table row.
df = pd.DataFrame({
    'Postal Code': post_codes,
    'Borough': districts,
    'Neighbourhood': neighbourhoods,
})

### Filter and reshape dataframe according to the requirements in the assignment

In [6]:
# Drop the rows whose borough is "Not assigned"
df = df[df['Borough'] != 'Not assigned']

# Collapse to one row per (postal code, borough) pair, joining the names of
# the neighbourhoods that share a postal code with commas
df = (
    df.groupby(['Postal Code', 'Borough'])['Neighbourhood']
    .apply(','.join)
    .reset_index()
)

# Display any rows whose neighbourhood is still "Not assigned"
df.loc[df['Neighbourhood'] == 'Not assigned']

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [7]:
# Check the result: preview the first 10 rows of the grouped table
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [8]:
# Dimensions of the cleaned table as (rows, columns)
df.shape

(103, 3)

## Section 2. Add coordinates

In [9]:
# Load the per-postal-code coordinates. The path is relative, so the CSV has
# to be present in the notebook's working directory.
coord = pd.read_csv('Geospatial_Coordinates.csv') # used the file provided with the course materials
coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
# Attach Latitude/Longitude to every postal code. No key is passed, so pandas
# joins on the column(s) the two frames have in common and keeps only the
# rows that match in both.
df = df.merge(coord)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Section 3. Map view of the neighbourhoods

In [11]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [12]:
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium # map rendering library

In [13]:
address = 'Toronto, CA'

# Look up the coordinates of the city; they are used below as the map centre.
geolocator = Nominatim(user_agent="ca_explorer")
location = geolocator.geocode(address)
# geocode() gives back None when the query has no match. Fail with a clear
# message instead of an AttributeError on `location.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [14]:
# Base map centred on the geocoded Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle marker per row of `df`; the popup reads "<neighbourhood>, <borough>"
for lat, lng, borough, neighbourhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    # NOTE(review): parse_html is a Popup option; confirm that CircleMarker
    # actually makes use of it.
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

### We will now proceed with the "Downtown Toronto" area

In [15]:
# Keep only the Downtown Toronto rows. reset_index() is called without
# drop=True, so the previous row labels are kept as a leading "index" column.
toronto_downtown = df[df['Borough'] == 'Downtown Toronto'].reset_index()

In [16]:
# Same base map as before, this time populated only with the downtown rows
map_dt_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle marker per downtown row; the popup reads "<neighbourhood>, <borough>"
for lat, lng, borough, neighbourhood in zip(
    toronto_downtown['Latitude'],
    toronto_downtown['Longitude'],
    toronto_downtown['Borough'],
    toronto_downtown['Neighbourhood'],
):
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    # NOTE(review): parse_html is a Popup option; confirm that CircleMarker
    # actually makes use of it.
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_dt_toronto)

map_dt_toronto

## Section 4. Use API to get places nearby

### I was unable to set up a Foursquare developer account, so I will use the Google Places API instead

In [17]:
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import urllib

In [18]:
import os

# Read the Google Places API key from the GOOGLE_PLACES_API_KEY environment
# variable so the real key never has to be written into the notebook. '___'
# is the redacted placeholder used when the variable is not set.
key = os.environ.get('GOOGLE_PLACES_API_KEY', '___') # The API Key removed before posting to Github

In [19]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, api_key=None):
    """Query the Google Places Nearby Search endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Walked in parallel with zip(); each triple is a label plus the
        coordinates to search around.
    radius : int, default 500
        Forwarded unchanged as the API's `radius` parameter.
    api_key : str, optional
        API key to send. Defaults to the notebook-level `key`.

    Returns
    -------
    list of pandas.DataFrame
        One frame per location: the flattened `results` array of the response
        plus a 'Neighbourhood' column holding the location's name.

    Raises
    ------
    requests.HTTPError
        If the HTTP request itself fails.
    RuntimeError
        If the API reports a status other than OK or ZERO_RESULTS (for
        example a rejected key), instead of silently yielding empty frames.
    """
    if api_key is None:
        api_key = key

    endpoint = 'https://maps.googleapis.com/maps/api/place/nearbysearch/json'

    venues_df = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string; the timeout keeps a
        # stalled connection from hanging the notebook indefinitely.
        response = requests.get(
            endpoint,
            params={
                'location': '{},{}'.format(lat, lng),
                'radius': radius,
                'key': api_key,
            },
            timeout=30,
        )
        response.raise_for_status()
        results = response.json()

        status = results.get('status')
        if status not in ('OK', 'ZERO_RESULTS'):
            raise RuntimeError(
                'Places API request for {!r} failed with status {}: {}'.format(
                    name, status, results.get('error_message', 'no error message')
                )
            )

        nearby_venues = pd.json_normalize(results.get('results', []))
        nearby_venues['Neighbourhood'] = name
        venues_df.append(nearby_venues)

    return venues_df

In [20]:
# Fetch the nearby places for every Downtown Toronto neighbourhood
# (the function's default search radius is used).
toronto_dt_venues = getNearbyVenues(
    names=toronto_downtown['Neighbourhood'],
    latitudes=toronto_downtown['Latitude'],
    longitudes=toronto_downtown['Longitude'],
)

Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Stn A PO Boxes
First Canadian Place, Underground city
Christie
Queen's Park, Ontario Provincial Government


In [22]:
# Stack the per-neighbourhood result frames into a single table. The original
# row labels of each frame are kept, so the combined index is not unique.
places = pd.concat(toronto_dt_venues)

In [23]:
# Keep only the columns used by the analysis and renumber the rows from 0
places_short = places[[
    'Neighbourhood',
    'name',
    'types',
    'geometry.location.lat',
    'geometry.location.lng',
]].reset_index(drop=True)
places_short.head()

Unnamed: 0,Neighbourhood,name,types,geometry.location.lat,geometry.location.lng
0,Rosedale,Toronto,"[locality, political]",43.653226,-79.383184
1,Rosedale,Wide World Properties Ltd,"[real_estate_agency, point_of_interest, establ...",43.679647,-79.382494
2,Rosedale,Beaumont Park,"[park, tourist_attraction, point_of_interest, ...",43.679683,-79.373319
3,Rosedale,Bassett Events,"[point_of_interest, establishment]",43.676995,-79.379213
4,Rosedale,The Rosedale Rental Apartments,"[real_estate_agency, general_contractor, point...",43.678789,-79.377573


In [24]:
# One-hot encode the place types: spread each row's `types` list into columns,
# stack them into one long Series, encode it, then add the indicator rows back
# up per original place (index level 0).
# `.sum(level=0)` was removed in pandas 2.0; `groupby(level=0).sum()` is the
# supported equivalent, and sort=False keeps the rows in their existing order.
dummies = (
    pd.get_dummies(places_short.types.apply(pd.Series).stack())
    .groupby(level=0, sort=False)
    .sum()
)

In [25]:
# Re-attach the neighbourhood label (pandas aligns it on the row index)
dummies['Neighbourhood'] = places_short['Neighbourhood']

# The column just added sits last; rotate it to the first position
fixed_columns = list(dummies.columns[-1:]) + list(dummies.columns[:-1])
dummies = dummies[fixed_columns]

dummies

Unnamed: 0,Neighbourhood,airport,art_gallery,atm,bakery,bank,bar,beauty_salon,book_store,cafe,...,shopping_mall,stadium,store,sublocality,sublocality_level_1,supermarket,tourist_attraction,transit_station,travel_agency,university
0,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,"Queen's Park, Ontario Provincial Government",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
376,"Queen's Park, Ontario Provincial Government",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
377,"Queen's Park, Ontario Provincial Government",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
378,"Queen's Park, Ontario Provincial Government",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Mean of every type indicator per neighbourhood, i.e. the share of that
# neighbourhood's places carrying each type; 'Neighbourhood' stays a column.
toronto_grouped = dummies.groupby('Neighbourhood', as_index=False).mean()

In [27]:
num_top_venues = 5

# Print, for each neighbourhood, its most frequent place types
for hood in toronto_grouped['Neighbourhood']:
    print(f"----{hood}----")

    # Turn the neighbourhood's single row into a two-column (venue, freq) table
    hood_row = toronto_grouped[toronto_grouped['Neighbourhood'] == hood]
    temp = hood_row.T.reset_index()
    temp.columns = ['venue', 'freq']

    # The first row holds the neighbourhood name itself, not a frequency
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})

    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Berczy Park----
               venue  freq
0  point_of_interest  0.90
1      establishment  0.90
2         restaurant  0.55
3               food  0.55
4                bar  0.20


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
               venue  freq
0      establishment   0.9
1  point_of_interest   0.9
2            airport   0.2
3      travel_agency   0.1
4          political   0.1


----Central Bay Street----
               venue  freq
0  point_of_interest  0.90
1      establishment  0.90
2               food  0.25
3              store  0.20
4         restaurant  0.20


----Christie----
               venue  freq
0  point_of_interest  0.90
1      establishment  0.90
2              store  0.30
3             health  0.15
4               food  0.10


----Church and Wellesley----
               venue  freq
0      establishment  0.90
1  point_of_interest  0.90
2              store  0.25
3            lodging  0.25
4

In [28]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of `row`.

    The first entry of `row` is skipped (callers pass rows whose first entry
    is the neighbourhood name). The remaining entries are sorted by value in
    descending order and the index labels of the first `num_top_venues` are
    returned as an array; fewer are returned if the row is shorter.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [29]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']


def _ordinal(rank):
    """Return `rank` with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    # 11, 12 and 13 take 'th' although they end in 1, 2 and 3; every other
    # number takes its suffix from its last digit.
    if rank % 100 in (11, 12, 13) or not 1 <= rank % 10 <= 3:
        return '{}th'.format(rank)
    return '{}{}'.format(rank, indicators[rank % 10 - 1])


# create columns according to number of top venues
# (the suffix is computed explicitly instead of relying on a bare `except:`
# around an out-of-range list lookup, which also mislabelled ranks such as 21)
columns = ['Neighbourhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# create a new dataframe with one row per neighbourhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

# fill the ranked venue-type columns row by row
for ind in range(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,point_of_interest,establishment,food,restaurant,bar,night_club,tourist_attraction,political,lodging,museum
1,"CN Tower, King and Spadina, Railway Lands, Har...",establishment,point_of_interest,airport,food,travel_agency,political,restaurant,cafe,health,locality
2,Central Bay Street,establishment,point_of_interest,food,store,restaurant,lodging,home_goods_store,bank,finance,political
3,Christie,establishment,point_of_interest,store,health,lawyer,grocery_or_supermarket,food,political,car_dealer,hardware_store
4,Church and Wellesley,point_of_interest,establishment,lodging,store,restaurant,food,clothing_store,political,hair_care,bar


## Section 5. Clustering analysis for neighbourhoods

In [30]:
from sklearn.cluster import KMeans

In [31]:
# set number of clusters
kclusters = 5

# Feature matrix: the per-neighbourhood type frequencies without the label
# column. `drop('Neighbourhood', 1)` passed the axis positionally, which
# pandas 2.0 no longer accepts; `columns=` works on old and new versions.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')

# run k-means clustering
# n_init is pinned to 10 so the fit does not change with the scikit-learn
# version (newer releases default to a single initialisation).
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([4, 2, 0, 3, 0, 1, 1, 0, 1, 0])

In [32]:
# Put the k-means label of each neighbourhood in front of its ranked venue types
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Join the labelled venue table onto the downtown rows by neighbourhood name,
# so every row carries coordinates, cluster label and ranked venue types
toronto_merged = toronto_downtown.join(
    neighborhoods_venues_sorted.set_index('Neighbourhood'),
    on='Neighbourhood',
)

toronto_merged.head() # check the last columns!

Unnamed: 0,index,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,50,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,2,establishment,point_of_interest,transit_station,park,real_estate_agency,tourist_attraction,general_contractor,political,health,doctor
1,51,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675,0,point_of_interest,establishment,food,store,restaurant,meal_delivery,lodging,political,pet_store,health
2,52,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,0,point_of_interest,establishment,lodging,store,restaurant,food,clothing_store,political,hair_care,bar
3,53,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,3,establishment,point_of_interest,store,electronics_store,food,car_dealer,home_goods_store,car_repair,political,bar
4,54,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,establishment,point_of_interest,lodging,food,restaurant,store,home_goods_store,clothing_store,political,electronics_store


In [33]:
# create map (folium maps are not displayed when the notebook is viewed on GitHub)
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
# Rows that received no cluster label in the join are skipped, since they
# cannot be coloured.
labelled = toronto_merged.dropna(subset=['Cluster Labels'])
for lat, lon, poi, cluster in zip(labelled['Latitude'], labelled['Longitude'], labelled['Neighbourhood'], labelled['Cluster Labels']):
    # Labels run from 0 to kclusters - 1 and index the palette directly; the
    # previous `cluster-1` only worked because index -1 wraps to the last colour.
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [34]:
# Rows assigned to cluster 0, showing the column at position 1 plus every
# column from position 5 onward
cluster_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, cluster_columns]

Unnamed: 0,Postal Code,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M4X,-79.367675,0,point_of_interest,establishment,food,store,restaurant,meal_delivery,lodging,political,pet_store,health
2,M4Y,-79.38316,0,point_of_interest,establishment,lodging,store,restaurant,food,clothing_store,political,hair_care,bar
4,M5B,-79.378937,0,establishment,point_of_interest,lodging,food,restaurant,store,home_goods_store,clothing_store,political,electronics_store
5,M5C,-79.375418,0,establishment,point_of_interest,lodging,store,political,clothing_store,neighborhood,department_store,movie_theater,locality
7,M5G,-79.387383,0,establishment,point_of_interest,food,store,restaurant,lodging,home_goods_store,bank,finance,political
13,M5T,-79.400049,0,point_of_interest,establishment,lodging,store,health,food,political,home_goods_store,gym,travel_agency


In [35]:
# Rows assigned to cluster 1, showing the column at position 1 plus every
# column from position 5 onward
cluster_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, cluster_columns]

Unnamed: 0,Postal Code,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,M5H,-79.384568,1,establishment,point_of_interest,food,lodging,restaurant,bar,meal_takeaway,store,political,electronics_store
9,M5J,-79.381752,1,establishment,point_of_interest,food,restaurant,lodging,bar,tourist_attraction,political,neighborhood,locality
10,M5K,-79.381576,1,establishment,point_of_interest,lodging,food,restaurant,bar,political,locality,health,finance
11,M5L,-79.379817,1,establishment,point_of_interest,lodging,food,restaurant,bar,store,political,night_club,department_store
15,M5W,-79.374846,1,point_of_interest,establishment,restaurant,lodging,food,store,bar,political,neighborhood,night_club
16,M5X,-79.38228,1,point_of_interest,establishment,lodging,restaurant,food,bar,political,health,finance,department_store


In [36]:
# Rows assigned to cluster 2, showing the column at position 1 plus every
# column from position 5 onward
cluster_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, cluster_columns]

Unnamed: 0,Postal Code,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4W,-79.377529,2,establishment,point_of_interest,transit_station,park,real_estate_agency,tourist_attraction,general_contractor,political,health,doctor
12,M5S,-79.400049,2,point_of_interest,establishment,store,political,school,lodging,university,library,food,health
14,M5V,-79.39442,2,establishment,point_of_interest,airport,food,travel_agency,political,restaurant,cafe,health,locality
18,M7A,-79.389494,2,establishment,point_of_interest,health,lodging,political,food,restaurant,university,hospital,finance


In [37]:
# Rows assigned to cluster 3, showing the column at position 1 plus every
# column from position 5 onward
cluster_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, cluster_columns]

Unnamed: 0,Postal Code,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,M5A,-79.360636,3,establishment,point_of_interest,store,electronics_store,food,car_dealer,home_goods_store,car_repair,political,bar
17,M6G,-79.422564,3,establishment,point_of_interest,store,health,lawyer,grocery_or_supermarket,food,political,car_dealer,hardware_store


In [38]:
# Rows assigned to cluster 4, showing the column at position 1 plus every
# column from position 5 onward
cluster_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, cluster_columns]

Unnamed: 0,Postal Code,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,M5E,-79.373306,4,point_of_interest,establishment,food,restaurant,bar,night_club,tourist_attraction,political,lodging,museum


There is little difference between the clusters in this case: in every cluster the two most common venue types are "point_of_interest" and "establishment", generic tags that Google attaches to almost every place. We could drop these two types and repeat the analysis with the remaining, more meaningful types. We could also estimate the optimal number of clusters by comparing the results obtained with 3, 5 and 7 clusters.