# <center> _Clustering neighbourhoods in TO_

## Webscraping

In [1]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [2]:
from bs4 import BeautifulSoup as BS
import requests

In [3]:
data  = requests.get(url).text 
scrape = BS(data, 'html5lib')

## Cleaning data and putting it into list of dictionaries

In [4]:
neighbourhoods=[]
table=scrape.find('table')
for row in table.find_all('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass 
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighbourhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        neighbourhoods.append(cell)

## Putting data into dataframe

In [5]:
import pandas as pd
import numpy as np

In [6]:
df=pd.DataFrame(neighbourhoods)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                    'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                    'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                    'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

In [7]:
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


## Finding empty cells

In [8]:
df = df.replace('', np.nan)
df.loc[df.isna().any(axis=1)]

Unnamed: 0,PostalCode,Borough,Neighbourhood


In [9]:
print(df.shape)

(103, 3)


### There doesn't seem to be any other missing data as all the cells are full.

***

## Finding the latitudes and longitudes of the neighbourhoods

In [10]:
pip install geocoder

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Note: you may need to restart the kernel to use updated packages.


In [11]:
import geocoder

In [12]:
lat_lng_coords = None

while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format('M5A'))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

KeyboardInterrupt: 

### I wasn't able to get any coordinates using the geocoder app so I used the csv file in the end, as follows.

In [13]:
df2 = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv')

In [14]:
df2.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
df2 = df2.rename(columns={'Postal Code': 'PostalCode'})

In [16]:
df3 = pd.merge(df, df2)

In [17]:
df3.head(12)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [18]:
print(df3.shape)

(103, 5)


***

## Exploration of downtown Toronto

### I first created a dataframe with just the neighbourhoods in Downtown Toronto.

In [21]:
dt_TO = df3[df3['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
dt_TO.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
4,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


In [22]:
print(dt_TO.shape)

(17, 5)


### Then I will use Foursquare to explore and cluster the neighbourhoods.

In [23]:
CLIENT_ID = 'D2JOFLEXDHMNPZGNMNIJR53L0ILI0UELERUPUB3XL4WREDS1' 
CLIENT_SECRET = '0WFKC0ECIIPQ1C1M5OOCHONJBH1NZBTACNHDKCZT5KOUH0ES' 
VERSION = '20180605'
LIMIT = 100

### I will use the get-nearby-venues function created in the lab to explore downtown TO.

In [25]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            'D2JOFLEXDHMNPZGNMNIJR53L0ILI0UELERUPUB3XL4WREDS1', 
            '0WFKC0ECIIPQ1C1M5OOCHONJBH1NZBTACNHDKCZT5KOUH0ES', 
            '20180605', 
            lat, 
            lng, 
            radius, 
            100)
            
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Applying this function will show us the types, number, and coordinates of the different venues in Downtown TO.

In [26]:
dt_TO_venues = getNearbyVenues(names = dt_TO['Neighbourhood'],
                               latitudes = dt_TO['Latitude'],
                               longitudes = dt_TO['Longitude']
                                )
dt_TO_venues.head()

Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
St. James Town, Cabbagetown
First Canadian Place, Underground city
Church and Wellesley


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
1,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


### I also checked the total number of venues that were found.

In [27]:
print(dt_TO_venues.shape)

(1065, 7)


In [29]:
dt_TO_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,47,47,47,47,47,47
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",17,17,17,17,17,17
Central Bay Street,63,63,63,63,63,63
Christie,14,14,14,14,14,14
Church and Wellesley,65,65,65,65,65,65
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
"First Canadian Place, Underground city",100,100,100,100,100,100
"Garden District, Ryerson",100,100,100,100,100,100
"Harbourfront East, Union Station, Toronto Islands",100,100,100,100,100,100
"Kensington Market, Chinatown, Grange Park",57,57,57,57,57,57


### Now I will make a new dataframe that shows the number of each venue in each neighbourhood.

In [30]:
dt_TO_onehot = pd.get_dummies(dt_TO_venues[['Venue Category']], prefix="", prefix_sep="")
dt_TO_onehot['Neighbourhood'] = dt_TO_venues['Neighbourhood'] 
fixed_columns = [dt_TO_onehot.columns[-1]] + list(dt_TO_onehot.columns[:-1])
dt_TO_onehot = dt_TO_onehot[fixed_columns]
dt_TO_onehot.head()

Unnamed: 0,Neighbourhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Theme Restaurant,Thrift / Vintage Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
dt_TO_onehot.shape

(1065, 187)

### Then the dataframe is arranged so that the frequency of each type of venue is shown for each neighbourhood.

In [32]:
dt_TO_grouped = dt_TO_onehot.groupby('Neighbourhood').mean().reset_index()
dt_TO_grouped.shape

(17, 187)

### Then I will show each neighbourhood with the top 10 venues.

In [33]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [44]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = dt_TO_grouped['Neighbourhood']

for ind in np.arange(dt_TO_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(dt_TO_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Sandwich Place,Bakery,Beer Bar,Seafood Restaurant,Farmers Market,Vegetarian / Vegan Restaurant,Museum,Bistro
1,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport Terminal,Harbor / Marina,Coffee Shop,Plane,Boutique,Rental Car Location,Sculpture Garden,Bar
2,Central Bay Street,Coffee Shop,Sandwich Place,Sushi Restaurant,Italian Restaurant,Café,Japanese Restaurant,Salad Place,Restaurant,Bank,Pizza Place
3,Christie,Grocery Store,Café,Park,Coffee Shop,Italian Restaurant,Nightclub,Restaurant,Baby Store,Distribution Center,Discount Store
4,Church and Wellesley,Sushi Restaurant,Japanese Restaurant,Restaurant,Coffee Shop,Fast Food Restaurant,Mediterranean Restaurant,Gym,Indian Restaurant,Gay Bar,Burrito Place


### I will use k-means to cluster the neighbourhoods into 3 clusters depending on the venues.

In [45]:
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

In [46]:
kclusters = 3
dt_TO_grouped_clustering = dt_TO_grouped.drop('Neighbourhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(dt_TO_grouped_clustering)
kmeans.labels_[0:10]

array([1, 1, 1, 2, 1, 1, 1, 1, 1, 1], dtype=int32)

## New dataframe showing Downtown TO neighbourhoods, top 10 venues, and their clusters.

In [49]:
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
dt_TO_merged = dt_TO
dt_TO_merged = dt_TO_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')
dt_TO_merged.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1,Coffee Shop,Park,Bakery,Café,Pub,Bank,Breakfast Spot,Discount Store,Distribution Center,Restaurant
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,1,Coffee Shop,Clothing Store,Sandwich Place,Café,Hotel,Bank,Cosmetics Shop,Japanese Restaurant,Pizza Place,Theater
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1,Coffee Shop,Café,Cocktail Bar,Gastropub,Clothing Store,Restaurant,Italian Restaurant,Farmers Market,Gym,Department Store
3,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,1,Coffee Shop,Cocktail Bar,Sandwich Place,Bakery,Beer Bar,Seafood Restaurant,Farmers Market,Vegetarian / Vegan Restaurant,Museum,Bistro
4,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,1,Coffee Shop,Sandwich Place,Sushi Restaurant,Italian Restaurant,Café,Japanese Restaurant,Salad Place,Restaurant,Bank,Pizza Place


### Closer analysis of the 3 clusters

In [50]:
dt_TO_merged.tail()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442,1,Airport Lounge,Airport Service,Airport Terminal,Harbor / Marina,Coffee Shop,Plane,Boutique,Rental Car Location,Sculpture Garden,Bar
13,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,0,Park,Playground,Trail,Cosmetics Shop,Doner Restaurant,Distribution Center,Discount Store,Diner,Dessert Shop,Department Store
14,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675,1,Pizza Place,Coffee Shop,Café,Restaurant,Pub,Pharmacy,Chinese Restaurant,Bakery,Italian Restaurant,Indian Restaurant
15,M5X,Downtown Toronto,"First Canadian Place, Underground city",43.648429,-79.38228,1,Coffee Shop,Café,Sandwich Place,Hotel,Asian Restaurant,Bank,Gym,Japanese Restaurant,Sushi Restaurant,Deli / Bodega
16,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,1,Sushi Restaurant,Japanese Restaurant,Restaurant,Coffee Shop,Fast Food Restaurant,Mediterranean Restaurant,Gym,Indian Restaurant,Gay Bar,Burrito Place


In [51]:
dt_TO_merged.shape

(17, 16)

In [52]:
dt_TO_merged.loc[dt_TO_merged['Cluster Labels'] == 0, dt_TO_merged.columns[[1] + list(range(5, dt_TO_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
13,Downtown Toronto,0,Park,Playground,Trail,Cosmetics Shop,Doner Restaurant,Distribution Center,Discount Store,Diner,Dessert Shop,Department Store


In [53]:
dt_TO_merged.loc[dt_TO_merged['Cluster Labels'] == 1, dt_TO_merged.columns[[1] + list(range(5, dt_TO_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,1,Coffee Shop,Park,Bakery,Café,Pub,Bank,Breakfast Spot,Discount Store,Distribution Center,Restaurant
1,Downtown Toronto,1,Coffee Shop,Clothing Store,Sandwich Place,Café,Hotel,Bank,Cosmetics Shop,Japanese Restaurant,Pizza Place,Theater
2,Downtown Toronto,1,Coffee Shop,Café,Cocktail Bar,Gastropub,Clothing Store,Restaurant,Italian Restaurant,Farmers Market,Gym,Department Store
3,Downtown Toronto,1,Coffee Shop,Cocktail Bar,Sandwich Place,Bakery,Beer Bar,Seafood Restaurant,Farmers Market,Vegetarian / Vegan Restaurant,Museum,Bistro
4,Downtown Toronto,1,Coffee Shop,Sandwich Place,Sushi Restaurant,Italian Restaurant,Café,Japanese Restaurant,Salad Place,Restaurant,Bank,Pizza Place
6,Downtown Toronto,1,Coffee Shop,Café,Sandwich Place,Gym,Clothing Store,Sushi Restaurant,Restaurant,Lounge,Bank,Burrito Place
7,Downtown Toronto,1,Coffee Shop,Café,Hotel,Pizza Place,Scenic Lookout,Aquarium,Park,Sporting Goods Shop,Deli / Bodega,Italian Restaurant
8,Downtown Toronto,1,Coffee Shop,Hotel,Café,Sandwich Place,Asian Restaurant,Pharmacy,Restaurant,Salad Place,Deli / Bodega,Japanese Restaurant
9,Downtown Toronto,1,Coffee Shop,Sandwich Place,Café,Hotel,Restaurant,Gym,Japanese Restaurant,Cocktail Bar,Bank,Asian Restaurant
10,Downtown Toronto,1,Café,Coffee Shop,Sandwich Place,Pub,Bar,Japanese Restaurant,Bakery,Italian Restaurant,Beer Store,Beer Bar


In [54]:
dt_TO_merged.loc[dt_TO_merged['Cluster Labels'] == 2, dt_TO_merged.columns[[1] + list(range(5, dt_TO_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Downtown Toronto,2,Grocery Store,Café,Park,Coffee Shop,Italian Restaurant,Nightclub,Restaurant,Baby Store,Distribution Center,Discount Store
