# Capstone Project - Week 3

### Importing all the necessary libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

!pip install geocoder

import geocoder

!pip install geopy

from geopy.geocoders import Nominatim 
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors



In [2]:
# send the GET request
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

In [3]:
table_contents=[]
table=soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)


In [4]:
df=pd.DataFrame(table_contents)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [5]:
df_grouped = df.groupby(["PostalCode", "Borough"], as_index=False).agg(lambda x: ", ".join(x))
df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Making Neighborhood == 'Not Assigned' to same Borough name


In [6]:
for index, row in df_grouped.iterrows():
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]
        
df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Creating the dataset we need with the given conditions for the assignment

In [7]:
column_names = ["PostalCode", "Borough", "Neighborhood"]
test_df = pd.DataFrame(columns=column_names)

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

for postcode in test_list:
    test_df = test_df.append(df_grouped[df_grouped["PostalCode"]==postcode], ignore_index=True)
    
test_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


In [8]:
print('The shape of the dataset is: {}'.format(df_grouped.shape))

The shape of the dataset is: (103, 3)


## Question 2

### A function that retrieves lat/long coordinates

In [9]:
def get_coords(postcode):
    return geocoder.arcgis(f'{postcode}, Toronto, Ontario').latlng

### Get coordinates of all postal codes

In [11]:
postcodes = df['PostalCode'].tolist()
coords = [get_coords(code) for code in postcodes]

In [12]:
# a new dataframe to store coordinates
coords_arcgis = df.copy()

# add columns for latitudes and longitudes
coords_arcgis['Latitude'] = [coord[0] for coord in coords]
coords_arcgis['Longitude'] = [coord[1] for coord in coords]
coords_arcgis.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Queen's Park,Ontario Provincial Government,43.66253,-79.39188


In [14]:
# load the coordinates file
coords_google = pd.read_csv('Geospatial_Coordinates.csv',
                            header=0,
                            names=['PostalCode', 'Latitude', 'Longitude']
                           )

# merge with the original dataframe
toronto = df.merge(coords_google, on='PostalCode', how='inner')
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


# Question 3

## Visualization

### Get the coordinates of Toronto using Nominatim geocoder

In [15]:
geolocator = Nominatim(user_agent='tor_explorer')
location = geolocator.geocode('Toronto, Ontario')
latitude = location.latitude
longitude = location.longitude
print(f"The geograpical coordinates of Toronto are {latitude}, {longitude}.")

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


In [17]:
map_tor = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, nbhd in zip(toronto['Latitude'],
                                   toronto['Longitude'],
                                   toronto['Borough'],
                                   toronto['Neighborhood']):
    label = f"{nbhd}, {borough}"
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lng],
                        radius=5,
                        popup=label,
                        color='purple',
                        fill=True,
                        fill_color='pink',
                        fill_opacity=0.7,
                        parse_html=False
                       ).add_to(map_tor)

map_tor

In [18]:
tor_boi = toronto[toronto['Borough'].str.contains('Toronto')]
tor_boi.reset_index(drop=True, inplace=True)
print(tor_boi['Borough'].unique().tolist())
tor_boi.head()

['Downtown Toronto', 'East Toronto', 'West Toronto', 'East York/East Toronto', 'Central Toronto', 'Downtown Toronto Stn A', 'East Toronto Business']


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [19]:
map_tor_boi = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, nbhd in zip(tor_boi['Latitude'],
                                   tor_boi['Longitude'],
                                   tor_boi['Borough'],
                                   tor_boi['Neighborhood']):
    label = folium.Popup(f"{nbhd}, {borough}", parse_html=True)
    folium.CircleMarker([lat, lng],
                        radius=5,
                        popup=label,
                        color='purple',
                        fill=True,
                        fill_color='pink',
                        fill_opacity=0.7,
                        parse_html=False
                       ).add_to(map_tor_boi)
map_tor_boi

## Explore a particular neighborhood

### Select a neighborhood and its corresponding coordinates

In [20]:
noi = 'Studio District'
noi_idx = tor_boi[tor_boi['Neighborhood'] == noi].index[0]
noi_lat = tor_boi.loc[noi_idx, 'Latitude']
noi_lng = tor_boi.loc[noi_idx, 'Longitude']
print(f"Latitude and longitude values of {noi} are {noi_lat}, {noi_lng}.")

Latitude and longitude values of Studio District are 43.6595255, -79.340923.


#### Define Foursquare credentials and parameters

In [21]:
client_id = 'AIEYQ5SYM1CHR3FSCTDEB1U2AXZXSZOA4KOZLNA0PUDTO1K4'
client_secret = 'YIEFGP5KVJHIZD2ZEIAVCM2FDJCJTQ4OBHSGNE0UAA1MQH0Z'
version = '20180605'
limit = 100
radius = 500

#### Specify URL

In [22]:
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    client_id, 
    client_secret, 
    version, 
    noi_lat, 
    noi_lng, 
    radius, 
    limit)

# make an HTTP request and
# store the response in a variable 'results'
results = requests.get(url).json()
print("Request successful.")

Request successful.


#### A function that extracts the category of the venue

In [23]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [24]:
# get all the venues, flatten JSON into a dataframe
venues_json = results['response']['groups'][0]['items'] 
venues = pd.json_normalize(venues_json)

# filter relevant columns
cols_filtered = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
venues = venues.loc[:, cols_filtered]

# apply categorization of venues
venues['venue.categories'] = venues.apply(get_category_type, axis=1)

# rename columns
venues.columns = ['Name', 'Category', 'Latitude', 'Longitude']

print(f"{venues.shape[0]} venues were returned by Foursquare.")
venues.head(10)

36 venues were returned by Foursquare.


Unnamed: 0,Name,Category,Latitude,Longitude
0,Ed's Real Scoop,Ice Cream Shop,43.660656,-79.342019
1,Te Aro,Coffee Shop,43.661373,-79.338577
2,Queen Books,Bookstore,43.660651,-79.342267
3,The Bone House,Pet Store,43.660894,-79.341097
4,Mercury Espresso Bar,Coffee Shop,43.660806,-79.341241
5,Hooked,Fish Market,43.660407,-79.343257
6,Purple Penguin Cafe,Café,43.660501,-79.342565
7,WAYLABAR,Gay Bar,43.661234,-79.339597
8,Leslieville,Neighborhood,43.66207,-79.337856
9,Brick Street Breads,Bakery,43.660685,-79.342501


In [25]:
noi_top_venues = venues['Category'].value_counts()
print(f"The top venues in {noi} are:\n")
noi_top_venues.loc[noi_top_venues > 1]

The top venues in Studio District are:



Coffee Shop            3
Gastropub              2
American Restaurant    2
Brewery                2
Bakery                 2
Café                   2
Name: Category, dtype: int64

# Explore venues in all neighborhoods

#### Create a function to get nearby venues from all neighborhoods in the boroughs of interest (East/West/Central/Downtown Toronto).

In [26]:
def get_venues(names, lats, lngs, radius=500, limit=100):
    venues_list = []
    for name, lat, lng in zip(names, lats, lngs):
        # specify the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            client_id,
            client_secret,
            version,
            lat,
            lng,
            radius,
            limit)
        # make the request, store the response
        results = requests.get(url).json()['response']['groups'][0]['items']
        # extract relevant information from each venue
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
    # populate the dataframe with venues list
    venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    venues.columns = ['Neighborhood',
                      'Nbhd Latitude',
                      'Nbhd Longitude',
                      'Venue',
                      'Venue Latitude',
                      'Venue Longitude',
                      'Venue Category']
    return(venues)

In [27]:
# run the above function on each neighborhood,
# store the result as a new dataframe
tor_venues = get_venues(
    tor_boi['Neighborhood'],
    tor_boi['Latitude'],
    tor_boi['Longitude']
)
tor_venues.head()

Unnamed: 0,Neighborhood,Nbhd Latitude,Nbhd Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant
4,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


In [28]:
print(f"In all {tor_venues['Neighborhood'].nunique()} Toronto neighborhoods,",
      f"there's a total of {tor_venues.shape[0]} venues",
      f"across {tor_venues['Venue Category'].nunique()} different categories."
     )
tor_venues[['Neighborhood', 'Venue']].groupby('Neighborhood').count().reset_index()

In all 39 Toronto neighborhoods, there's a total of 1592 venues across 235 different categories.


Unnamed: 0,Neighborhood,Venue
0,Berczy Park,59
1,"Brockton, Parkdale Village, Exhibition Place",23
2,"CN Tower, King and Spadina, Railway Lands, Har...",16
3,Central Bay Street,68
4,Christie,16
5,Church and Wellesley,77
6,"Commerce Court, Victoria Hotel",100
7,Davisville,37
8,Davisville North,10
9,"Dufferin, Dovercourt Village",14


## Analyze each neighborhood

#### Venue categories are converted into numerical variables through one-hot encoding. Rows are grouped by neighborhood, and by taking the mean of the frequency of occurence of each venue category.

In [29]:
# one-hot encoding for each venue category
tor_onehot = pd.get_dummies(tor_venues['Venue Category'], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
tor_onehot['Neighborhood'] = tor_venues['Neighborhood'] 
tor_onehot = tor_onehot.groupby('Neighborhood').mean().reset_index()

print(f"Dataframe size: {tor_onehot.shape[0]} rows, {tor_onehot.shape[1]} columns")
tor_onehot.head()

Dataframe size: 39 rows, 235 columns


Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.016949,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower, King and Spadina, Railway Lands, Har...",0.0625,0.0625,0.0625,0.125,0.125,0.0625,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.014706,0.0,0.0,0.0,0.0,0.014706
4,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### A function to sort venues in a descending order of frequency

In [30]:
def top_venues(row, num_venues):
    row_cats = row.iloc[1:]
    row_cats_sorted = row_cats.sort_values(ascending=False)
    return row_cats_sorted.index.values[0:num_venues]

In [31]:
num_venues = 10 # number of top venues
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
cols = ['Neighborhood']
for i in np.arange(num_venues):
    try:
        cols.append(f"{i+1}{indicators[i]} Most Common Venue")
    except:
        cols.append(f"{i+1}th Most Common Venue")

# create a dataframe of 10 most common venues by neighborhood
tor_common = pd.DataFrame(columns=cols)
tor_common['Neighborhood'] = tor_onehot['Neighborhood']

for i in np.arange(tor_onehot.shape[0]):
    tor_common.iloc[i, 1:] = top_venues(tor_onehot.iloc[i, :], num_venues)

tor_common.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Cocktail Bar,Bakery,Coffee Shop,Pub,Beer Bar,Restaurant,Cheese Shop,Pharmacy,Farmers Market,Seafood Restaurant
1,"Brockton, Parkdale Village, Exhibition Place",Café,Bakery,Coffee Shop,Breakfast Spot,Grocery Store,Performing Arts Venue,Pet Store,Nightclub,Climbing Gym,Restaurant
2,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Harbor / Marina,Sculpture Garden,Airport Food Court,Airport Gate,Airport Terminal,Bar,Boat or Ferry,Boutique
3,Central Bay Street,Coffee Shop,Café,Sandwich Place,Italian Restaurant,Salad Place,Bubble Tea Shop,Restaurant,Burger Joint,Spa,Japanese Restaurant
4,Christie,Grocery Store,Café,Park,Nightclub,Italian Restaurant,Restaurant,Baby Store,Athletics & Sports,Candy Store,Coffee Shop


## Cluster neighborhoods using K-means algorithm

#### Apply k-means clustering algorithm to segment and cluster Toronto neighborhoods based on a set of features (venue categories).

In [32]:
Ks = 5 # number of clusters
X = tor_onehot.drop('Neighborhood', 1) # select features

model = KMeans(n_clusters=Ks, random_state=0).fit(X)
model.labels_ # cluster labels generated for each row

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 4,
       2, 2, 2, 2, 3, 1, 2, 2, 2, 2, 2, 2, 0, 3, 2, 2, 2])

In [33]:
tor_common.insert(0, 'Cluster Label', model.labels_)
tor_merged = tor_boi.copy()
tor_merged = tor_merged.join(tor_common.set_index('Neighborhood'), on='Neighborhood')
tor_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,2,Coffee Shop,Bakery,Pub,Park,Café,Breakfast Spot,Theater,Chocolate Shop,Electronics Store,Dessert Shop
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,2,Coffee Shop,Clothing Store,Bubble Tea Shop,Café,Japanese Restaurant,Cosmetics Shop,Diner,Electronics Store,Hotel,Fast Food Restaurant
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,2,Coffee Shop,Café,Restaurant,Cosmetics Shop,Hotel,Clothing Store,Cocktail Bar,Bakery,Park,Seafood Restaurant
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Trail,Pub,Yoga Studio,Donut Shop,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Eastern European Restaurant
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,2,Cocktail Bar,Bakery,Coffee Shop,Pub,Beer Bar,Restaurant,Cheese Shop,Pharmacy,Farmers Market,Seafood Restaurant


In [34]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(Ks)
ys = [i + x + (i*x)**2 for i in range(Ks)]
colors_array = cm.gist_rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lng, nbhd, cluster in zip(tor_merged['Latitude'],
                                   tor_merged['Longitude'],
                                   tor_merged['Neighborhood'],
                                   tor_merged['Cluster Label']):
    label = folium.Popup(f"Cluster {cluster}: {nbhd}", parse_html=True)
    folium.CircleMarker([lat, lng],
                        radius=5,
                        popup=label,
                        color=rainbow[cluster-1],
                        fill=True,
                        fill_color=rainbow[cluster-1],
                        fill_opacity=0.5
                       ).add_to(map_clusters)
map_clusters

## Examine Clusters

### Determine the discriminating venue categories that distinguish each cluster.

#### Cluster 0

In [35]:
cluster0 = tor_merged.loc[tor_merged['Cluster Label'] == 0,
                          tor_merged.columns[[2] + list(range(6, tor_merged.shape[1]))]
                         ].reset_index(drop=True)
cluster0

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,The Beaches,Health Food Store,Trail,Pub,Yoga Studio,Donut Shop,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Eastern European Restaurant
1,Forest Hill North & West,Jewelry Store,Trail,Mexican Restaurant,Sushi Restaurant,Yoga Studio,Discount Store,Event Space,Ethiopian Restaurant,Escape Room,Electronics Store


In [36]:
# get all the venue categories in the cluster
cl0_venues_lists = cluster0.iloc[:, 1:].values.tolist()

# flatten the resulting nested list
cl0_venues = [venue for sublist in cl0_venues_lists for venue in sublist]

# count unique values
pd.Series(cl0_venues).value_counts()

Trail                          2
Discount Store                 2
Yoga Studio                    2
Ethiopian Restaurant           1
Electronics Store              1
Mexican Restaurant             1
Sushi Restaurant               1
Eastern European Restaurant    1
Health Food Store              1
Jewelry Store                  1
Event Space                    1
Distribution Center            1
Escape Room                    1
Pub                            1
Dog Run                        1
Donut Shop                     1
Doner Restaurant               1
dtype: int64

#### Cluster 1

In [37]:
cluster1 = tor_merged.loc[tor_merged['Cluster Label'] == 1,
                          tor_merged.columns[[2] + list(range(6, tor_merged.shape[1]))]
                         ].reset_index(drop=True)
cluster1

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Roselawn,Garden,Ice Cream Shop,Home Service,Dumpling Restaurant,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Yoga Studio


In [38]:
# get all the venue categories in the cluster
cl1_venues_lists = cluster1.iloc[:, 1:].values.tolist()

# flatten the resulting nested list
cl1_venues = [venue for sublist in cl1_venues_lists for venue in sublist]

# count unique values
pd.Series(cl1_venues).value_counts()

Distribution Center    1
Garden                 1
Yoga Studio            1
Donut Shop             1
Dog Run                1
Dumpling Restaurant    1
Ice Cream Shop         1
Home Service           1
Discount Store         1
Doner Restaurant       1
dtype: int64

#### Cluster 2

In [39]:
cluster2 = tor_merged.loc[tor_merged['Cluster Label'] == 2,
                          tor_merged.columns[[2] + list(range(6, tor_merged.shape[1]))]
                         ].reset_index(drop=True)

cl2_venues_lists = cluster2.iloc[:, 1:].values.tolist()
cl2_venues = [venue for sublist in cl2_venues_lists for venue in sublist]
pd.Series(cl2_venues).value_counts()

Café                             24
Coffee Shop                      23
Restaurant                       20
Bakery                           14
Hotel                            10
                                 ..
Vegetarian / Vegan Restaurant     1
Eastern European Restaurant       1
Gym / Fitness Center              1
Candy Store                       1
Climbing Gym                      1
Length: 112, dtype: int64

#### Cluster 3

In [40]:
cluster3 = tor_merged.loc[tor_merged['Cluster Label'] == 3,
                          tor_merged.columns[[2] + list(range(6, tor_merged.shape[1]))]
                         ].reset_index(drop=True)

cl3_venues_lists = cluster3.iloc[:, 1:].values.tolist()
cl3_venues = [venue for sublist in cl3_venues_lists for venue in sublist]
pd.Series(cl3_venues).value_counts()

Park                           2
Escape Room                    2
Ethiopian Restaurant           2
Yoga Studio                    2
Electronics Store              2
Playground                     1
Trail                          1
Event Space                    1
Dessert Shop                   1
Falafel Restaurant             1
Discount Store                 1
Convenience Store              1
Intersection                   1
Dumpling Restaurant            1
Eastern European Restaurant    1
dtype: int64

#### Cluster 4

In [41]:
cluster4 = tor_merged.loc[tor_merged['Cluster Label'] == 4,
                          tor_merged.columns[[2] + list(range(6, tor_merged.shape[1]))]
                         ].reset_index(drop=True)

cl4_venues_lists = cluster4.iloc[:, 1:].values.tolist()
cl4_venues = [venue for sublist in cl4_venues_lists for venue in sublist]
pd.Series(cl4_venues).value_counts()

Distribution Center    1
Trail                  1
Yoga Studio            1
Tennis Court           1
Restaurant             1
Dog Run                1
Doner Restaurant       1
Donut Shop             1
Discount Store         1
Diner                  1
dtype: int64