# Attention: I used one notebook for all three parts

# Segmenting and Clustering Neighborhoods in Toronto
### AN - MARCH 26, 2021


# PART I

### improting Libraries

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import geocoder

import folium

import matplotlib.cm as cm
import matplotlib.colors as colors


from sklearn.cluster import KMeans

### Scraping the data form Wikipedia

As some of the rows are not assigned, I first put an "if" to avoid using "Not assigned" data. Then, as the first three characters are postal code, I used that advantage to divide the fist column. 

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wp_db = requests.get(url).text
wp_soup = BeautifulSoup(wp_db,'html5lib')

table_contents = []
table = wp_soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned': # to remove Not assigned part
        pass
    else:
        cell['PostalCode'] = row.p.text[:3] 
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

In [3]:
df = pd.DataFrame(table_contents)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


#### cleaning and updating data

In [4]:
df['Borough'] = df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

In [5]:
df.shape[0]

103

In [6]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


# PART I
### Get the latitude and the longitude coordinates of each neighborhood, from .csv file

In [7]:
coordinates_df = pd.read_csv('Geospatial_Coordinates.csv')
coordinates_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merging both DataFrames, based on the first one

In [8]:
df = pd.merge(df, coordinates_df, left_on='PostalCode', right_on='Postal Code', how='left').drop('Postal Code', axis=1)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


#  <font color='red'>PART III</font> 

#### expoling the Borough to find out 

In [9]:
a = df.groupby(['Borough']).mean()
a

Unnamed: 0_level_0,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1
Central Toronto,43.70198,-79.398954
Downtown Toronto,43.654624,-79.384184
Downtown Toronto Stn A,43.646435,-79.374846
East Toronto,43.67111,-79.325428
East Toronto Business,43.662744,-79.321558
East York,43.704043,-79.335287
East York/East Toronto,43.685347,-79.338106
Etobicoke,43.655797,-79.537348
Etobicoke Northwest,43.706748,-79.594054
Mississauga,43.636966,-79.615819


As Toronto appears in most 'Borough', I decided to focus on all Btoiught that contain the word Toronto

In [10]:
index_names = df[~df['Borough'].str.contains('Toronto')].index
df = df.drop(index_names)
df = df.reset_index(drop=True)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


#### create map of Manhattan using latitude and longitude values

In [11]:
latitude = df['Latitude'][0]
longitude = df['Longitude'][0]
map_toronto = folium.Map(location=[latitude,longitude],zoom_start=11)

# add markers to map
for lat,lng,label in zip(df['Latitude'],df['Longitude'],df['Neighborhood']):
    label = folium.Popup(label,parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [12]:
# Unfortiunety Foursquare  is not working for me, I had no choice other than using the workaround provided by the staff members in the discussion forum.

nearby_venues1 = pd.read_json("https://raw.githubusercontent.com/ibm-developer-skills-network/yczvh-DataFilesForIBMProjects/master/segmenting_neighborhoods.json")    
nearby_venues1.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                 'Venue', 
                 'Venue Latitude', 
                 'Venue Longitude', 
                 'Venue Category']
toronto_venues = nearby_venues1
print(toronto_venues)

                                           Neighborhood  \
0                                        Malvern, Rouge   
1                Rouge Hill, Port Union, Highland Creek   
2                     Guildwood, Morningside, West Hill   
3                     Guildwood, Morningside, West Hill   
4                     Guildwood, Morningside, West Hill   
...                                                 ...   
1332  South Steeles, Silverstone, Humbergate, Jamest...   
1333  Clairville, Humberwood, Woodbine Downs, West H...   
1334  Clairville, Humberwood, Woodbine Downs, West H...   
1335  Clairville, Humberwood, Woodbine Downs, West H...   
1336  Clairville, Humberwood, Woodbine Downs, West H...   

      Neighborhood Latitude  Neighborhood Longitude                   Venue  \
0                 43.806686              -79.194353                 Wendy’s   
1                 43.784535              -79.160497   Royal Canadian Legion   
2                 43.763573              -79.188711   

In [13]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [14]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

#### Let's print each neighborhood along with the top 5 most common venues

In [15]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                       venue  freq
0                     Lounge  0.25
1  Latin American Restaurant  0.25
2             Breakfast Spot  0.25
3               Skating Rink  0.25
4        Monument / Landmark  0.00


----Alderwood, Long Branch----
          venue  freq
0   Pizza Place  0.25
1      Pharmacy  0.12
2   Coffee Shop  0.12
3  Skating Rink  0.12
4           Pub  0.12


----Bathurst Manor, Wilson Heights, Downsview North----
                       venue  freq
0                       Bank  0.09
1                Coffee Shop  0.09
2                Pizza Place  0.04
3                 Restaurant  0.04
4  Middle Eastern Restaurant  0.04


----Bayview Village----
                 venue  freq
0                 Bank  0.25
1                 Café  0.25
2   Chinese Restaurant  0.25
3  Japanese Restaurant  0.25
4          Yoga Studio  0.00


----Bedford Park, Lawrence Manor East----
                     venue  freq
0       Italian Restaurant  0.09
1              Coffee Shop  0

4  Rental Car Location  0.14


----Harbourfront East, Union Station, Toronto Islands----
                 venue  freq
0                 Park  0.07
1                Plaza  0.07
2                 Café  0.07
3                Hotel  0.07
4  Sporting Goods Shop  0.03


----High Park, The Junction South----
                venue  freq
0  Mexican Restaurant  0.08
1                Café  0.08
2     Thai Restaurant  0.08
3       Grocery Store  0.04
4               Diner  0.04


----Hillcrest Village----
                      venue  freq
0      Fast Food Restaurant   0.2
1               Golf Course   0.2
2                      Pool   0.2
3  Mediterranean Restaurant   0.2
4                   Dog Run   0.2


----Humber Summit----
                    venue  freq
0             Pizza Place  0.25
1  Furniture / Home Store  0.25
2    Caribbean Restaurant  0.25
3            Intersection  0.25
4             Yoga Studio  0.00


----Humberlea, Emery----
                 venue  freq
0       Baseball Field   

4                 Café  0.07


----The Annex, North Midtown, Yorkville----
            venue  freq
0            Café  0.14
1  Sandwich Place  0.14
2     Coffee Shop  0.10
3       BBQ Joint  0.05
4     Flower Shop  0.05


----The Beaches----
                        venue  freq
0                         Pub  0.25
1           Health Food Store  0.25
2                       Trail  0.25
3               Metro Station  0.00
4  Modern European Restaurant  0.00


----The Danforth  East----
               venue  freq
0  Convenience Store  0.25
1               Park  0.25
2       Intersection  0.25
3        Pizza Place  0.25
4     Massage Studio  0.00


----The Danforth West, Riverdale----
                venue  freq
0    Greek Restaurant  0.27
1      Ice Cream Shop  0.07
2  Italian Restaurant  0.07
3             Brewery  0.03
4     Bubble Tea Shop  0.03


----The Kingsway, Montgomery Road, Old Mill North----
                 venue  freq
0                 Pool  0.33
1           Smoke Shop  0.33
2 

#### Let's put that into a pandas dataframe

In [16]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [17]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Latin American Restaurant,Lounge,Skating Rink,Breakfast Spot,Women's Store,Deli / Bodega,Drugstore,Donut Shop,Dog Run,Distribution Center
1,"Alderwood, Long Branch",Pizza Place,Skating Rink,Pharmacy,Pub,Sandwich Place,Coffee Shop,Gym,Gas Station,Coworking Space,Diner
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Diner,Bridal Shop,Supermarket,Restaurant,Sushi Restaurant,Ice Cream Shop,Middle Eastern Restaurant,Mobile Phone Shop
3,Bayview Village,Bank,Chinese Restaurant,Japanese Restaurant,Café,Women's Store,Deli / Bodega,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Sandwich Place,Italian Restaurant,Greek Restaurant,Thai Restaurant,Liquor Store,Juice Bar,Indian Restaurant,Restaurant,Sushi Restaurant


### Cluster Neighborhoods: Run k-means to cluster the neighborhood into 5 clusters.

In [18]:
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [19]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df

# merge dataframes to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'),on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1,Coffee Shop,Park,Bakery,Breakfast Spot,Café,Greek Restaurant,Gym / Fitness Center,Pub,Performing Arts Venue,Mexican Restaurant
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,1,Café,Theater,Clothing Store,Sporting Goods Shop,Hotel,Fast Food Restaurant,Steakhouse,Bakery,Ramen Restaurant,Music Venue
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1,Gastropub,Café,Farmers Market,Coffee Shop,Thai Restaurant,Diner,Jazz Club,Japanese Restaurant,Italian Restaurant,Restaurant
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Pub,Trail,Health Food Store,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Women's Store,Cupcake Shop
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,1,Cocktail Bar,Coffee Shop,Beer Bar,Farmers Market,Seafood Restaurant,Café,Breakfast Spot,Liquor Store,Bistro,Comfort Food Restaurant


### Visualization of clusters

In [20]:
# create map
map_clusters = folium.Map(location=[latitude,longitude],zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat,lon,poi,cluster in zip(toronto_merged['Latitude'],toronto_merged['Longitude'],toronto_merged['Neighborhood'],toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi)+' Cluster '+str(cluster),parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius = 5,
        popup = label,
        color = rainbow[cluster-1],
        fill = True,
        fill_color = rainbow[cluster-1],
        fill_opacity = 0.7).add_to(map_clusters)
       
map_clusters

### Examine Clusters

#### Cluster 1

In [21]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0,toronto_merged.columns[[1]+list(range(5,toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,East York/East Toronto,0,Pizza Place,Park,Convenience Store,Intersection,Dance Studio,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center


#### Cluster 2

In [22]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1,toronto_merged.columns[[1]+list(range(5,toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,1,Coffee Shop,Park,Bakery,Breakfast Spot,Café,Greek Restaurant,Gym / Fitness Center,Pub,Performing Arts Venue,Mexican Restaurant
1,Downtown Toronto,1,Café,Theater,Clothing Store,Sporting Goods Shop,Hotel,Fast Food Restaurant,Steakhouse,Bakery,Ramen Restaurant,Music Venue
2,Downtown Toronto,1,Gastropub,Café,Farmers Market,Coffee Shop,Thai Restaurant,Diner,Jazz Club,Japanese Restaurant,Italian Restaurant,Restaurant
3,East Toronto,1,Pub,Trail,Health Food Store,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Women's Store,Cupcake Shop
4,Downtown Toronto,1,Cocktail Bar,Coffee Shop,Beer Bar,Farmers Market,Seafood Restaurant,Café,Breakfast Spot,Liquor Store,Bistro,Comfort Food Restaurant
5,Downtown Toronto,1,Coffee Shop,Italian Restaurant,Café,Yoga Studio,Thai Restaurant,Department Store,Sandwich Place,Spa,Japanese Restaurant,Bubble Tea Shop
6,Downtown Toronto,1,Grocery Store,Café,Park,Baby Store,Candy Store,Coffee Shop,Italian Restaurant,Nightclub,Restaurant,Deli / Bodega
7,Downtown Toronto,1,Coffee Shop,Café,Seafood Restaurant,Thai Restaurant,Smoke Shop,Lounge,Bakery,Steakhouse,Hotel,Fast Food Restaurant
8,West Toronto,1,Pharmacy,Bakery,Park,Pool,Brewery,Bar,Bank,Supermarket,Café,Middle Eastern Restaurant
10,Downtown Toronto,1,Park,Plaza,Café,Hotel,Performing Arts Venue,Skating Rink,IT Services,Ice Cream Shop,Sporting Goods Shop,Basketball Stadium


#### Cluster 3

In [23]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2,toronto_merged.columns[[1]+list(range(5,toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,Central Toronto,2,Bus Line,Park,Swim School,Women's Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store
21,Central Toronto,2,Park,Sushi Restaurant,Jewelry Store,Trail,Women's Store,Diner,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
29,Central Toronto,2,Lawyer,Park,Trail,Summer Camp,Drugstore,Donut Shop,Dog Run,Distribution Center,Curling Ice,Eastern European Restaurant
33,Downtown Toronto,2,Park,Playground,Trail,Women's Store,Diner,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant


#### Cluster 4

In [24]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3,toronto_merged.columns[[1]+list(range(5,toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


#### Cluster 5

In [25]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4,toronto_merged.columns[[1]+list(range(5,toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
