# Segmenting and Clustering Neighborhoods in Toronto

## 1. Initial Dataframe using Toronto postal code wiki

> Install BeautifulSoup and import Pandas

In [1]:
!pip install bs4
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup



> Pull data from Wiki and enter into pandas dataframe called 'borough'

In [2]:
# Download the Wikipedia page; fail loudly on an HTTP error instead of trying
# to parse an error page, and never hang forever on a stalled connection.
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
source.raise_for_status()

# The first <table> on the page is taken as the postal-code table.
soup = BeautifulSoup(source.content, 'lxml')
table = soup.find_all('table')[0]

# read_html expects a path/URL/file-like object; wrapping the markup in
# StringIO avoids the deprecated "literal HTML string" code path.
df = pd.read_html(StringIO(str(table)))

# read_html returns a list of DataFrames; the single parsed table is the first.
borough = df[0]

> Remove rows with Borough not assigned, reset index, and check

In [3]:
# Keep only rows whose borough is assigned, then renumber the index from zero.
borough = borough.loc[borough['Borough'].ne('Not assigned')].reset_index(drop=True)
borough.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


> Confirm no neighborhoods with value 'Not assigned'

In [4]:
# Any row returned here still has an unassigned neighborhood; an empty frame is the expected result.
NeighNA = borough.loc[borough['Neighborhood'].eq('Not assigned')]
NeighNA

Unnamed: 0,Postal Code,Borough,Neighborhood


> Display Number of Rows and Columns

In [5]:
# (rows, columns) of the filtered borough table.
borough.shape

(103, 3)

## 2. Add postal code longitude and latitude values to dataframe

> Read from .csv to create new df with columns Postal Code, Longitude, Latitude

In [6]:
# Load the postal-code coordinate table from the hosted CSV into a DataFrame.
long_lat = pd.read_csv('https://cocl.us/Geospatial_data')

> Merge the longitude/latitude dataframe into the boroughs dataframe on 'Postal Code' using .merge

In [7]:
# Join on the postal code explicitly: without `on`, merge keys on every column
# name the two frames happen to share, so an unrelated shared column would
# silently change the join. The inner join keeps only postal codes that have
# coordinates. reset_index is chained (not inplace) so the cell builds
# borough11 in a single expression.
borough11 = (
    pd.merge(borough, long_lat, how='inner', on='Postal Code')
    .reset_index(drop=True)
)
borough11.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## 3. Analyze and Cluster Data

> Import and install necessary libraries and tools for visual analysis and clustering

In [8]:
import numpy as np

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

!pip -q install geopy
from geopy.geocoders import Nominatim

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

!pip -q install folium
import folium


> Use geopy library to get the latitude and longitude values of Toronto

In [9]:
address = 'Toronto, ON'

# Look the address up through the Nominatim geocoder.
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)

# geocode() yields None when nothing matches; stop here with a clear message
# rather than failing on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


> Create map of Toronto with postal codes superimposed, labeled with postal codes, neighborhoods, and borough

In [10]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per postal code. The loop variable is called `borough_name`
# (not `borough`) so it does not overwrite the `borough` DataFrame built in the
# earlier cells.
for postal, lat, lng, borough_name, neighborhood in zip(
        borough11['Postal Code'],
        borough11['Latitude'],
        borough11['Longitude'],
        borough11['Borough'],
        borough11['Neighborhood']):
    label = '{}, {}, {}'.format(postal, neighborhood, borough_name)
    # parse_html belongs to the Popup (it escapes the label text); it is not a
    # CircleMarker option, so it is only passed here.
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

> Pull in venue data from Foursquare for each postal code

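> Define function that pulls venue data from Foursquare for each postal code (the Foursquare credentials `CLIENT_ID`, `CLIENT_SECRET`, and `VERSION` must already be defined before it is called)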

In [12]:
def getNearbyVenues(postal_codes, latitudes, longitudes, neighborhoods, boroughs, radius=500, limit=100):
    """Query the Foursquare ``venues/explore`` endpoint once per postal code.

    The five leading arguments are walked in lockstep with ``zip``, so they
    should be equal-length iterables describing the same postal codes.

    Args:
        postal_codes: iterable of postal codes.
        latitudes: iterable of latitudes sent as the search centre.
        longitudes: iterable of longitudes sent as the search centre.
        neighborhoods: iterable of neighborhood labels copied onto each row.
        boroughs: iterable of borough labels copied onto each row.
        radius: value sent as the ``radius`` query parameter.
        limit: value sent as the ``limit`` query parameter.

    Returns:
        A DataFrame with one row per returned venue and the columns
        'Postal Code', 'Postal Code Latitude', 'Postal Code Longitude',
        'Neighborhood', 'Borough', 'Venue', 'Venue Latitude',
        'Venue Longitude', 'Venue Category'. When no venues are found
        (including empty inputs) the frame is empty but keeps these columns.

    Raises:
        requests.HTTPError: if a request comes back with an error status.

    Relies on the module-level names ``CLIENT_ID``, ``CLIENT_SECRET`` and
    ``VERSION``, which must be defined before the function is called.
    """
    column_names = ['Postal Code',
                    'Postal Code Latitude',
                    'Postal Code Longitude',
                    'Neighborhood',
                    'Borough',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for postal, lat, lng, neighborhood, borough in zip(postal_codes, latitudes, longitudes, neighborhoods, boroughs):

        # let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # make the GET request; surface HTTP failures instead of a later KeyError
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()
        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # a venue can come back without any category; record None then
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((
                postal,
                lat,
                lng,
                neighborhood,
                borough,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # passing columns= keeps the schema even when there are no rows at all
    nearby_venues = pd.DataFrame(rows, columns=column_names)

    return nearby_venues

> Run the function for each postal code and store the result in a new dataframe

In [13]:
# Fetch nearby venues for every postal code in borough11.
toronto_venues = getNearbyVenues(
    postal_codes=borough11['Postal Code'],
    latitudes=borough11['Latitude'],
    longitudes=borough11['Longitude'],
    neighborhoods=borough11['Neighborhood'],
    boroughs=borough11['Borough'],
)

> View dataframe toronto_venues

In [14]:
# Print the (rows, columns) shape, then preview the first rows of the venue table.
print(toronto_venues.shape)
toronto_venues.head()

(2097, 9)


Unnamed: 0,Postal Code,Postal Code Latitude,Postal Code Longitude,Neighborhood,Borough,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,43.753259,-79.329656,Parkwoods,North York,Brookbanks Park,43.751976,-79.33214,Park
1,M3A,43.753259,-79.329656,Parkwoods,North York,Sun Life,43.75476,-79.332783,Construction & Landscaping
2,M3A,43.753259,-79.329656,Parkwoods,North York,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,M4A,43.725882,-79.315572,Victoria Village,North York,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,M4A,43.725882,-79.315572,Victoria Village,North York,Tim Hortons,43.725517,-79.313103,Coffee Shop


> Create dataframe showing top 5 categories of venue for each postal code

In [15]:
# one-hot encode the venue categories (one indicator column per category)
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add postal code column back to dataframe
toronto_onehot['Postal Code'] = toronto_venues['Postal Code']

# move postal code column to the first column
# (the result is assigned back to toronto_onehot so the reorder actually takes effect)
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

# group by postal code and mean of frequency of venue category occurrence
toronto_grouped = toronto_onehot.groupby('Postal Code').mean().reset_index()

# define function that will sort venues in descending order
def return_most_common_venues(row, num_top_venues):
    """Return the index labels of the largest values in ``row``.

    The first entry of ``row`` is skipped (callers pass a row whose first
    entry is the postal code); the remaining entries are sorted in descending
    order and the labels of the first ``num_top_venues`` of them are returned
    as an array.
    """
    # Skip the leading entry, then rank what is left from highest to lowest.
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

# create dataframe showing top 5 venues for each postal code
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postal Code']
for ind in range(num_top_venues):
    # ranks covered by `indicators` use their own suffix, the rest use 'th';
    # an explicit bounds check replaces the bare `except:` so that unrelated
    # errors are no longer swallowed
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe keyed by postal code
postal_code_venues_sorted = pd.DataFrame(columns=columns)
postal_code_venues_sorted['Postal Code'] = toronto_grouped['Postal Code']

# fill each row with that postal code's most frequent venue categories
for ind in range(toronto_grouped.shape[0]):
    postal_code_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

postal_code_venues_sorted.head()

Unnamed: 0,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M1B,Fast Food Restaurant,Dessert Shop,Farmers Market,Falafel Restaurant,Event Space
1,M1C,Moving Target,Bar,Yoga Studio,Donut Shop,Discount Store
2,M1E,Mexican Restaurant,Rental Car Location,Breakfast Spot,Electronics Store,Medical Center
3,M1G,Coffee Shop,Korean Restaurant,Eastern European Restaurant,Discount Store,Distribution Center
4,M1H,Hakka Restaurant,Bakery,Lounge,Caribbean Restaurant,Athletics & Sports


> Cluster each postal code: Run k-means to create 5 clusters and create new dataframe with cluster value

In [16]:
# set number of clusters
kclusters = 5

# feature matrix: everything except the postal code key
# (keyword form; the positional axis argument is rejected by pandas 2.x)
toronto_grouped_clustering = toronto_grouped.drop(columns='Postal Code')

# run k-means clustering
# n_init is pinned so results do not shift with scikit-learn's changing default
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# add clustering labels; drop labels left by a previous run first so the cell
# can be re-executed without insert() raising on the existing column
if 'Cluster Labels' in postal_code_venues_sorted.columns:
    postal_code_venues_sorted = postal_code_venues_sorted.drop(columns='Cluster Labels')
postal_code_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# join the labelled top-venue table onto borough11 (which carries
# latitude/longitude for each postal code); rows with missing values after the
# join, e.g. postal codes absent from the venue table, are dropped
toronto_merged = borough11.join(postal_code_venues_sorted.set_index('Postal Code'), on='Postal Code').dropna()

toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,4.0,Food & Drink Shop,Park,Construction & Landscaping,Yoga Studio,Diner
1,M4A,North York,Victoria Village,43.725882,-79.315572,0.0,Hockey Arena,Portuguese Restaurant,Pizza Place,Coffee Shop,Dim Sum Restaurant
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0.0,Coffee Shop,Bakery,Pub,Park,Breakfast Spot
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0.0,Clothing Store,Vietnamese Restaurant,Miscellaneous Shop,Arts & Crafts Store,Coffee Shop
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0.0,Coffee Shop,Sushi Restaurant,Yoga Studio,Bar,Beer Bar


> Create map to visualize clusters

In [17]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Postal Code'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # labels run 0..kclusters-1, so index the palette directly; the previous
    # `- 1` made label 0 wrap around to the last color
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

> Cluster 1 (cluster label 0)

In [18]:
# Rows with cluster label 0: keep column 1 plus every column from position 5 onward.
toronto_merged[toronto_merged['Cluster Labels'].eq(0)].iloc[:, [1, *range(5, toronto_merged.shape[1])]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,North York,0.0,Hockey Arena,Portuguese Restaurant,Pizza Place,Coffee Shop,Dim Sum Restaurant
2,Downtown Toronto,0.0,Coffee Shop,Bakery,Pub,Park,Breakfast Spot
3,North York,0.0,Clothing Store,Vietnamese Restaurant,Miscellaneous Shop,Arts & Crafts Store,Coffee Shop
4,Downtown Toronto,0.0,Coffee Shop,Sushi Restaurant,Yoga Studio,Bar,Beer Bar
6,Scarborough,0.0,Fast Food Restaurant,Dessert Shop,Farmers Market,Falafel Restaurant,Event Space
7,North York,0.0,Café,Gym / Fitness Center,Caribbean Restaurant,Japanese Restaurant,Baseball Field
8,East York,0.0,Pizza Place,Gym / Fitness Center,Fast Food Restaurant,Breakfast Spot,Pet Store
9,Downtown Toronto,0.0,Clothing Store,Coffee Shop,Middle Eastern Restaurant,Japanese Restaurant,Café
10,North York,0.0,Park,Pizza Place,Pub,Japanese Restaurant,Dog Run
12,Scarborough,0.0,Moving Target,Bar,Yoga Studio,Donut Shop,Discount Store


> Cluster 2 (cluster label 1)

In [19]:
# Rows with cluster label 1: keep column 1 plus every column from position 5 onward.
toronto_merged[toronto_merged['Cluster Labels'].eq(1)].iloc[:, [1, *range(5, toronto_merged.shape[1])]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
57,North York,1.0,Food Service,Baseball Field,Drugstore,Discount Store,Distribution Center
101,Etobicoke,1.0,Baseball Field,Yoga Studio,Drugstore,Discount Store,Distribution Center


> Cluster 3 (cluster label 2)

In [20]:
# Rows with cluster label 2: keep column 1 plus every column from position 5 onward.
toronto_merged[toronto_merged['Cluster Labels'].eq(2)].iloc[:, [1, *range(5, toronto_merged.shape[1])]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
62,Central Toronto,2.0,Garden,Donut Shop,Dim Sum Restaurant,Diner,Discount Store


> Cluster 4 (cluster label 3)

In [21]:
# Rows with cluster label 3: keep column 1 plus every column from position 5 onward.
toronto_merged[toronto_merged['Cluster Labels'].eq(3)].iloc[:, [1, *range(5, toronto_merged.shape[1])]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
52,North York,3.0,Home Service,Yoga Studio,Diner,Discount Store,Distribution Center


> Cluster 5 (cluster label 4)

In [22]:
# Rows with cluster label 4: keep column 1 plus every column from position 5 onward.
toronto_merged[toronto_merged['Cluster Labels'].eq(4)].iloc[:, [1, *range(5, toronto_merged.shape[1])]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,North York,4.0,Food & Drink Shop,Park,Construction & Landscaping,Yoga Studio,Diner
21,York,4.0,Park,Pool,Women's Store,Greek Restaurant,Falafel Restaurant
35,East York,4.0,Park,Coffee Shop,Convenience Store,Donut Shop,Diner
61,Central Toronto,4.0,Park,Bus Line,Swim School,Donut Shop,Discount Store
64,York,4.0,Park,Convenience Store,Yoga Studio,Donut Shop,Diner
66,North York,4.0,Park,Bank,Convenience Store,Drugstore,Discount Store
83,Central Toronto,4.0,Park,Yoga Studio,Donut Shop,Dim Sum Restaurant,Diner
85,Scarborough,4.0,Park,Playground,Donut Shop,Dim Sum Restaurant,Diner
91,Downtown Toronto,4.0,Park,Trail,Playground,Dim Sum Restaurant,Diner
98,Etobicoke,4.0,River,Smoke Shop,Park,Dog Run,Dessert Shop
