In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
# Matplotlib and associated plotting modules
import matplotlib
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
import sklearn
from sklearn.cluster import KMeans
import folium # map rendering library

# Record the version of every key dependency so the run can be reproduced.
print("\n".join(
    f"{label} version: {module.__version__}"
    for label, module in (
        ("Matplotlib", matplotlib),
        ("Numpy", np),
        ("Pandas", pd),
        ("Requests", requests),
        ("sklearn", sklearn),
        ("folium", folium),
    )
))

Matplotlib version: 3.4.2
Numpy version: 1.21.0
Pandas version: 1.2.5
Requests version: 2.15.1
sklearn version: 0.24.2
folium version: 0.12.1


## Toronto, Ontario neighborhood data

In [2]:
# Load the postal-code table (postal code, borough, neighborhood and
# coordinates per row) and print its schema and non-null counts.
# NOTE(review): the file name is spelled "toranto" - presumably it matches the
# file on disk; confirm before renaming.
data_file = "./toranto_pc_latlon.csv"
df = pd.read_csv(data_file)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   postal_code   103 non-null    object 
 1   borough       103 non-null    object 
 2   neighborhood  103 non-null    object 
 3   Latitude      103 non-null    float64
 4   Longitude     103 non-null    float64
dtypes: float64(2), object(3)
memory usage: 4.1+ KB


In [3]:
# Sanity check: preview the first rows of df to confirm the columns parsed as expected.
df.head()

Unnamed: 0,postal_code,borough,neighborhood,Latitude,Longitude
0,M1B,North York,Parkwoods,43.806686,-79.194353
1,M1C,North York,Victoria Village,43.784535,-79.160497
2,M1E,Downtown Toronto,"Regent Park , Harbourfront",43.763573,-79.188711
3,M1G,North York,"Lawrence Manor , Lawrence Heights",43.770992,-79.216917
4,M1H,Queen,Ontario Provincial Government,43.773136,-79.239476


### Counts of boroughs and neighborhoods

In [4]:
# Distinct boroughs, followed by the number of rows in df (one row per postal
# code, so this is a row count rather than a count of unique neighborhood names).
print(f"Total Borough: {df['borough'].nunique()}")
print(f"Total Neighborhood: {len(df)}")

Total Borough: 15
Total Neighborhood: 103


### Map of Toronto with its neighborhoods

In [5]:
address = 'Toronto, Ontario'

# Resolve the city name to coordinates; they centre both maps drawn below.
geolocator = Nominatim(user_agent="toronto_exp")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail here with a
# clear message instead of an AttributeError on `None.latitude`.
if location is None:
    raise ValueError(f"Could not geocode address: {address!r}")

latitude, longitude = location.latitude, location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [6]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of df, with a "neighborhood, borough" popup.
for lat, lng, borough, neighborhood in zip(df['Latitude'],
                                           df['Longitude'],
                                           df['borough'],
                                           df['neighborhood']):
    # parse_html is an option of folium.Popup; CircleMarker has no such
    # option, so it is passed to the popup only.
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

# Last expression of the cell: renders the interactive map.
map_toronto

### Using Foursquare API to explore the neighborhoods and segment them.

In [7]:
from configparser import ConfigParser

# Foursquare credentials are read from ./keys.ini ([foursquare] section)
# rather than being hard-coded in the notebook.
config = ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the file is missing.
if not config.read("./keys.ini"):
    raise FileNotFoundError(
        "Could not read ./keys.ini; it must define ApiKey and ApiSecret "
        "in a [foursquare] section."
    )

client_id = config.get("foursquare", 'ApiKey')
client_secret = config.get("foursquare", 'ApiSecret')
version = '20210628'  # sent as the API's `v` parameter
limit = 100 # A default Foursquare API limit value

#### Before we proceed, let's borrow the **getNearbyVenues** function from the Foursquare lab to fetch the venues near each location, together with their categories

In [8]:
def getNearbyVenues(pcs, names, latitudes, longitudes, radius=500, timeout=30):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    Parameters
    ----------
    pcs, names, latitudes, longitudes : iterable
        Postal code, neighborhood name, latitude and longitude of each
        location. They are iterated in lockstep with ``zip``, so iteration
        stops at the shortest one.
    radius : int, default 500
        Passed through unchanged as the API's ``radius`` parameter.
    timeout : float, default 30
        Seconds to wait for each HTTP response; without it a stalled
        connection would hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns ``postal_code``,
        ``neighborhood``, ``neighborhood_lat``, ``neighborhood_lon``,
        ``venue``, ``venue_lat``, ``venue_lon`` and ``venue_cat``. The frame
        is empty (but still has these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.

    Notes
    -----
    Reads the module-level ``client_id``, ``client_secret``, ``version`` and
    ``limit``. Prints every postal code as it is queried, 15 per line.
    """
    columns = ['postal_code',
               'neighborhood',
               'neighborhood_lat',
               'neighborhood_lon',
               'venue',
               'venue_lat',
               'venue_lon',
               'venue_cat']
    url = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for counter, (pc, name, lat, lng) in enumerate(
            zip(pcs, names, latitudes, longitudes), start=1):
        # Progress output: break the line after every 15th postal code.
        print(pc, end="\n" if counter % 15 == 0 else " ")

        params = dict(
            client_id=client_id,
            client_secret=client_secret,
            v=version,
            ll=f"{lat}, {lng}",
            radius=radius,
            limit=limit
        )

        response = requests.get(url=url, params=params, timeout=timeout)
        # Surface HTTP errors (bad credentials, quota exceeded, ...) here
        # rather than as a KeyError while digging through the JSON below.
        response.raise_for_status()
        items = response.json()["response"]['groups'][0]['items']

        for item in items:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                pc,
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue with an empty category list gets a missing category
                # instead of aborting the whole run with an IndexError.
                categories[0]['name'] if categories else None))

    # Naming the columns in the constructor also works when `rows` is empty;
    # assigning `.columns` afterwards fails on a zero-column frame.
    return pd.DataFrame(rows, columns=columns)

In [9]:
# Fetch the venues around every postal code in df with the function's default
# radius; each postal code is printed as it is queried.
toronto_nghd = getNearbyVenues(
    pcs = df['postal_code'],
    names=df['neighborhood'],
    latitudes=df['Latitude'],
    longitudes=df['Longitude'])

M1B M1C M1E M1G M1H M1J M1K M1L M1M M1N M1P M1R M1S M1T M1V
M1W M1X M2H M2J M2K M2L M2M M2N M2P M2R M3A M3B M3C M3H M3J
M3K M3L M3M M3N M4A M4B M4C M4E M4G M4H M4J M4K M4L M4M M4N
M4P M4R M4S M4T M4V M4W M4X M4Y M5A M5B M5C M5E M5G M5H M5J
M5K M5L M5M M5N M5P M5R M5S M5T M5V M5W M5X M6A M6B M6C M6E
M6G M6H M6J M6K M6L M6M M6N M6P M6R M6S M7A M7R M7Y M8V M8W
M8X M8Y M8Z M9A M9B M9C M9L M9M M9N M9P M9R M9V M9W 

In [10]:
# Sanity check: schema and non-null counts of the fetched venue table.
toronto_nghd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2010 entries, 0 to 2009
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   postal_code       2010 non-null   object 
 1   neighborhood      2010 non-null   object 
 2   neighborhood_lat  2010 non-null   float64
 3   neighborhood_lon  2010 non-null   float64
 4   venue             2010 non-null   object 
 5   venue_lat         2010 non-null   float64
 6   venue_lon         2010 non-null   float64
 7   venue_cat         2010 non-null   object 
dtypes: float64(4), object(4)
memory usage: 125.8+ KB


#### count of venues for each neighborhood

In [11]:
# Non-null count of every column per postal code; with no missing values each
# count equals the number of venues returned for that postal code.
toronto_nghd.groupby('postal_code').count()

Unnamed: 0_level_0,neighborhood,neighborhood_lat,neighborhood_lon,venue,venue_lat,venue_lon,venue_cat
postal_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
M1B,1,1,1,1,1,1,1
M1C,1,1,1,1,1,1,1
M1E,9,9,9,9,9,9,9
M1G,3,3,3,3,3,3,3
M1H,7,7,7,7,7,7,7
...,...,...,...,...,...,...,...
M9N,4,4,4,4,4,4,4
M9P,8,8,8,8,8,8,8
M9R,4,4,4,4,4,4,4
M9V,10,10,10,10,10,10,10


#### Unique venue categories

In [12]:
# Number of distinct venue categories across all fetched venues.
print("There are {} uniques categories.".format(len(toronto_nghd['venue_cat'].unique())))

There are 255 uniques categories.


### Analyze each neighborhood

In [13]:
# one hot encoding: one indicator column per venue category, one row per venue
toronto_oh = pd.get_dummies(toronto_nghd[['venue_cat']], prefix="", prefix_sep="")

# add the postal_code column back to the dataframe (it is appended as the last column)
toronto_oh['postal_code'] = toronto_nghd['postal_code'] 

# move the postal_code column (currently last) to the first position
fixed_columns = [toronto_oh.columns[-1]] + list(toronto_oh.columns[:-1])
toronto_oh = toronto_oh[fixed_columns]

toronto_oh.head()

Unnamed: 0,postal_code,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1C,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Shape check: one row per venue; columns are postal_code plus one per category.
toronto_oh.shape

(2010, 256)

#### Group rows by postal code, taking the mean of the frequency of occurrence of each category

In [15]:
# Average the one-hot rows within each postal code: every value becomes the
# share of that postal code's venues that fall in the category.
# reset_index() turns postal_code back into the first column.
toronto_grpd = toronto_oh.groupby('postal_code').mean().reset_index()
toronto_grpd

Unnamed: 0,postal_code,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,M9N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,M9P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,M9R,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99,M9V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Sanity check: one row per postal code that has at least one venue row in
# toronto_oh; postal codes with no venues do not appear.
toronto_grpd.shape

(101, 256)

#### Pandas dataframe with neighborhood and 10 most common venues

In [17]:
# function to sort the venues in descending order

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    Parameters
    ----------
    row : pandas.Series
        The element at position 0 is skipped (in this notebook it holds the
        postal code); the remaining entries are ranked by value.
    num_top_venues : int
        How many labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the ranked entries, largest value first. Entries are
        not filtered by value, so zero-valued categories are included once
        the non-zero ones run out.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.head(num_top_venues).index.values

In [18]:
# Build a table with the top venue categories of every postal code.

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Column headers: "1st", "2nd" and "3rd" take their own suffix, every later
# rank takes "th". An explicit bounds check replaces the former bare `except:`,
# which would also have hidden unrelated errors.
columns = ['postal_code']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# Rank the categories of each row once, then build the frame in a single step
# instead of filling it row by row with .iloc.
top_venues = [
    return_most_common_venues(toronto_grpd.iloc[row_pos, :], num_top_venues)
    for row_pos in range(toronto_grpd.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(
    top_venues, columns=columns[1:], index=toronto_grpd.index)
neighborhoods_venues_sorted.insert(0, 'postal_code', toronto_grpd['postal_code'])

neighborhoods_venues_sorted.head()

Unnamed: 0,postal_code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,Lounge,Luggage Store,Malay Restaurant,Market,Martial Arts School,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station
1,M1C,Bar,Accessories Store,Lounge,Malay Restaurant,Market,Martial Arts School,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station
2,M1E,Donut Shop,Medical Center,Bank,Breakfast Spot,Electronics Store,Intersection,Mexican Restaurant,Restaurant,Rental Car Location,Miscellaneous Shop
3,M1G,Coffee Shop,Korean BBQ Restaurant,Accessories Store,Moroccan Restaurant,Market,Martial Arts School,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station
4,M1H,Hakka Restaurant,Athletics & Sports,Bank,Bakery,Gas Station,Caribbean Restaurant,Thai Restaurant,Office,Persian Restaurant,Market


In [19]:
# Shape check: one row per postal code in toronto_grpd; postal_code plus one column per top-venue rank.
neighborhoods_venues_sorted.shape

(101, 11)

### Cluster Neighborhoods

Run k-means to group the neighborhoods into 5 clusters.

In [20]:
n_clust = 5

# k-means needs a purely numeric matrix, so the identifier column is dropped.
# Use the `columns=` keyword: the positional axis argument of DataFrame.drop
# was deprecated in pandas 1.3 and removed in pandas 2.0.
toronto_grp_clst = toronto_grpd.drop(columns='postal_code')

# run k-means clustering
# random_state fixes the initialisation; n_init=10 pins the number of restarts
# so results do not change with newer scikit-learn defaults.
kmeans = KMeans(n_clusters=n_clust, random_state=0, n_init=10).fit(toronto_grp_clst)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]

array([4, 1, 1, 1, 1, 4, 1, 1, 4, 1], dtype=int32)

####  Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [21]:
# Attach the cluster label of every postal code. DataFrame.insert raises if the
# column already exists, so a label column left by a previous run of this cell
# is dropped first; this keeps the cell re-runnable.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Join the labelled top-venue table onto df by postal code. join() returns a
# new frame, so df itself is unchanged. The inner join drops postal codes
# that have no row in neighborhoods_venues_sorted.
toronto = df.join(neighborhoods_venues_sorted.set_index('postal_code'), on='postal_code', how='inner')

toronto.head() # check the last columns!

Unnamed: 0,postal_code,borough,neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,North York,Parkwoods,43.806686,-79.194353,4,Fast Food Restaurant,Lounge,Luggage Store,Malay Restaurant,Market,Martial Arts School,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station
1,M1C,North York,Victoria Village,43.784535,-79.160497,1,Bar,Accessories Store,Lounge,Malay Restaurant,Market,Martial Arts School,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station
2,M1E,Downtown Toronto,"Regent Park , Harbourfront",43.763573,-79.188711,1,Donut Shop,Medical Center,Bank,Breakfast Spot,Electronics Store,Intersection,Mexican Restaurant,Restaurant,Rental Car Location,Miscellaneous Shop
3,M1G,North York,"Lawrence Manor , Lawrence Heights",43.770992,-79.216917,1,Coffee Shop,Korean BBQ Restaurant,Accessories Store,Moroccan Restaurant,Market,Martial Arts School,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station
4,M1H,Queen,Ontario Provincial Government,43.773136,-79.239476,1,Hakka Restaurant,Athletics & Sports,Bank,Bakery,Gas Station,Caribbean Restaurant,Thai Restaurant,Office,Persian Restaurant,Market


In [22]:
# Shape check after the inner join: rows of df without a match were dropped.
toronto.shape

(101, 16)

In [23]:
# Confirm the cluster labels are integers after the join; the map cell below
# uses them to index a list of colours.
toronto['Cluster Labels'].dtype

dtype('int32')

In [24]:
# Summary of the label column. Labels are categorical identifiers, so count,
# min and max are the informative figures; mean and std carry little meaning.
toronto['Cluster Labels'].describe()

count    101.000000
mean       1.376238
std        1.173469
min        0.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        4.000000
Name: Cluster Labels, dtype: float64

#### Visualize the resulting clusters

In [25]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per label
colors_array = cm.rainbow(np.linspace(0, 1, n_clust))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto['Latitude'], toronto['Longitude'], toronto['neighborhood'], toronto['Cluster Labels']):
    # Labels run from 0 to n_clust - 1, so they index the palette directly.
    # The former `cluster - 1` only worked for label 0 because a negative
    # index wraps around to the end of the list.
    cluster_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

# Last expression of the cell: renders the interactive map.
map_clusters

### Examine Clusters

#### Cluster 1

In [26]:
# Rows with k-means label 0 (the first cluster). Columns shown: borough
# (position 1) plus everything from position 5 on, i.e. the cluster label and
# the ranked venue columns.
toronto.loc[toronto['Cluster Labels'] == 0, toronto.columns[[1] + list(range(5, toronto.shape[1]))]]

Unnamed: 0,borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,East York,0,Park,Intersection,Playground,Mexican Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Miscellaneous Shop,Middle Eastern Restaurant,Men's Store
21,York,0,Park,Accessories Store,Moroccan Restaurant,Malay Restaurant,Market,Martial Arts School,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station
23,East York,0,Convenience Store,Park,Men's Store,Molecular Gastronomy Restaurant,Modern European Restaurant,Miscellaneous Shop,Middle Eastern Restaurant,Mexican Restaurant,Metro Station,Accessories Store
25,Downtown Toronto,0,Food & Drink Shop,Fast Food Restaurant,Park,Men's Store,Modern European Restaurant,Miscellaneous Shop,Middle Eastern Restaurant,Mexican Restaurant,Metro Station,Accessories Store
30,Downtown Toronto,0,Airport,Park,Accessories Store,Metro Station,Molecular Gastronomy Restaurant,Modern European Restaurant,Miscellaneous Shop,Middle Eastern Restaurant,Mexican Restaurant,Mediterranean Restaurant
40,North York,0,Pizza Place,Park,Convenience Store,Market,Malay Restaurant,Martial Arts School,Medical Center,Mediterranean Restaurant,Moroccan Restaurant,Men's Store
44,Scarborough,0,Bus Line,Park,Swim School,Accessories Store,Metro Station,Modern European Restaurant,Miscellaneous Shop,Middle Eastern Restaurant,Mexican Restaurant,Men's Store
50,North York,0,Park,Playground,Trail,Accessories Store,Metro Station,Modern European Restaurant,Miscellaneous Shop,Middle Eastern Restaurant,Mexican Restaurant,Mediterranean Restaurant
72,North York,0,Asian Restaurant,Park,Accessories Store,Metro Station,Molecular Gastronomy Restaurant,Modern European Restaurant,Miscellaneous Shop,Middle Eastern Restaurant,Mexican Restaurant,Men's Store
74,Central Toronto,0,Park,Women's Store,Pool,Accessories Store,Men's Store,Modern European Restaurant,Miscellaneous Shop,Middle Eastern Restaurant,Mexican Restaurant,Metro Station


cluster 1 is dominated by parks, landmarks / monuments

#### Cluster 2

In [27]:
# Rows with k-means label 1 (the second cluster). Columns shown: borough
# (position 1) plus everything from position 5 on, i.e. the cluster label and
# the ranked venue columns.
toronto.loc[toronto['Cluster Labels'] == 1, toronto.columns[[1] + list(range(5, toronto.shape[1]))]]

Unnamed: 0,borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,North York,1,Bar,Accessories Store,Lounge,Malay Restaurant,Market,Martial Arts School,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station
2,Downtown Toronto,1,Donut Shop,Medical Center,Bank,Breakfast Spot,Electronics Store,Intersection,Mexican Restaurant,Restaurant,Rental Car Location,Miscellaneous Shop
3,North York,1,Coffee Shop,Korean BBQ Restaurant,Accessories Store,Moroccan Restaurant,Market,Martial Arts School,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station
4,Queen,1,Hakka Restaurant,Athletics & Sports,Bank,Bakery,Gas Station,Caribbean Restaurant,Thai Restaurant,Office,Persian Restaurant,Market
6,Scarborough,1,Train Station,Discount Store,Hobby Shop,Coffee Shop,Chinese Restaurant,Accessories Store,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Miscellaneous Shop
...,...,...,...,...,...,...,...,...,...,...,...,...
90,Scarborough,1,River,Accessories Store,Moroccan Restaurant,Malay Restaurant,Market,Martial Arts School,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station
92,Downtown TorontoStn A PO Boxes,1,Supplement Shop,Hardware Store,Wings Joint,Discount Store,Convenience Store,Bakery,Burger Joint,Gym,Grocery Store,Tanning Salon
94,EtobicokeNorthwest,1,Middle Eastern Restaurant,Bakery,Accessories Store,Lounge,Malay Restaurant,Market,Martial Arts School,Medical Center,Mediterranean Restaurant,Men's Store
95,Scarborough,1,Pharmacy,Coffee Shop,Beer Store,Liquor Store,Park,Café,Pet Store,Performing Arts Venue,Organic Grocery,Martial Arts School


cluster 2 seems to be food heaven...

#### Cluster 3

In [28]:
# Rows with k-means label 2 (the third cluster). Columns shown: borough
# (position 1) plus everything from position 5 on, i.e. the cluster label and
# the ranked venue columns.
toronto.loc[toronto['Cluster Labels'] == 2, toronto.columns[[1] + list(range(5, toronto.shape[1]))]]

Unnamed: 0,borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
91,Downtown Toronto,2,Baseball Field,Accessories Store,Lounge,Malay Restaurant,Market,Martial Arts School,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station
97,Downtown Toronto,2,Food Service,Baseball Field,Accessories Store,Monument / Landmark,Malay Restaurant,Market,Martial Arts School,Medical Center,Mediterranean Restaurant,Men's Store


Cluster 3 seems to have at least one baseball field...