# Segmenting and Clustering Neighborhoods in Toronto

In [391]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bs

### Scrape our dataset from wikipedia

In [423]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response= requests.get(url)
soup= bs(response.text,'html.parser')
table= soup.find('table',{'class':"wikitable"})
df= pd.read_html(str(table))
df= pd.DataFrame(df[0])
print(df.shape)
df.head()

(180, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Preprocessing

Eliminate the 'Not assinged' Boroughs

In [156]:
#eliminate Not assigned Boroughs
df= df[(df['Borough']!='Not assigned')]

set 'Not assigned' Neighbourhood equal its Borough

In [157]:
df.loc[(df['Neighbourhood'] == 'Not assigned'), 'Neighbourhood']= df['Borough']

Combine Duplicate rows

In [173]:
df=df.groupby(['Postal Code','Borough'])['Neighbourhood'].apply(', '.join).reset_index()

In [175]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [243]:
print('We have {} Boroughs and {} Neighbourhoods in Toronto'.format(len(df['Borough'].unique()),len(df['Neighbourhood'].unique())))

We have 10 Boroughs and 99 Neighbourhoods in Toronto


In [177]:
df.shape

(103, 3)

## Geographical Coordinates (Latitude and Lonigtude) 

In [430]:
path='Geospatial_Coordinates.csv'
geo_coor=pd.read_csv(path)
toronto_df= df.merge(geo_coor)
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### Map of Toronto with its Neighbourhoods

In [198]:
from geopy.geocoders import Nominatim
import folium

get Toronto geographic coordinates

In [196]:
address = 'Toronto, Canada'
geolocator= Nominatim(user_agent='foursquare_agent')
location = geolocator.geocode(address)

latitude=location.latitude
longitude=location.longitude

print(latitude,longitude)

43.6534817 -79.3839347


Toronto Map

In [201]:
map_toronto = folium.Map(location=[latitude,longitude],zoom_start=10)

for lat, lng, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighbourhood']):    
    label= '{},{}'.format(neighborhood,borough)
    label= folium.Popup(label,parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False
    ).add_to(map_toronto)
map_toronto

## Explore

Foursquare credintials

In [203]:
CLIENT_ID ='~'
CLIENT_SECRET= '~'
VERSION= '20180605'

Create a function to go through toronto data and explore the neighborhood venues one by one

In [267]:
# this function will go through tornto data, and explore the neighborhoods venues one by one. 
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
#         print(name)
        
        #create the API request
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        # GET request
        results= requests.get(url).json()
        venues= results['response']['groups'][0]['items']
        
        # return only relevant info. for each nearby venue
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) 
            for v in venues])
        
    nearby_venues = pd.DataFrame(
        [item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood',
                            'Neighborhood Latitude',
                            'Neighborhood Longitude',
                            'Venue',
                            'Venue Latitude',
                            'Venue Longitude',
                            'Venue Category'
                            ]
    return(nearby_venues)

In [268]:
toronto_venues= getNearbyVenues(names=toronto_df['Neighbourhood'],
                                latitudes=toronto_df['Latitude'],
                                longitudes=toronto_df['Longitude'])

In [269]:
toronto_venues.shape

(2141, 7)

In [270]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,SEBS Engineering Inc. (Sustainable Energy and ...,43.782371,-79.15682,Construction & Landscaping
3,"Guildwood, Morningside, West Hill",43.763573,-79.188711,RBC Royal Bank,43.76679,-79.191151,Bank
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


In [271]:
print('We have {} Neighborhoods in Toronto'.format(len(toronto_venues['Neighborhood'].unique())))
print('And {} Categories'.format(len(toronto_venues['Venue Category'].unique())))

We have 96 Neighborhoods in Toronto
And 273 Categories


In [308]:
toronto_venues.groupby('Neighborhood')['Venue'].count().sort_values(ascending=False).reset_index().head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
307,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
438,Studio District,43.659526,-79.340923,Leslieville,43.66207,-79.337856,Neighborhood
1033,"Richmond, Adelaide, King",43.650571,-79.384568,Downtown Toronto,43.653232,-79.385296,Neighborhood
1125,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752,Harbourfront,43.639526,-79.380688,Neighborhood


## Analyze each Neighborhood

In [318]:
# One hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']],prefix='',prefix_sep='')

# We have category called Neighborhood. So we need to rename it.
toronto_onehot.rename(columns={'Neighborhood':'Category_Neighborhood'},inplace=True)

# add Neighborhood column to the begining of our df
toronto_onehot.insert(0,'Neighborhood',toronto_venues['Neighborhood'])

print(toronto_onehot.shape)
toronto_onehot.head()

(2141, 274)


Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Rouge Hill, Port Union, Highland Creek",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Rouge Hill, Port Union, Highland Creek",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Group rows by Neighborhood with the mean of the frequency of eahc category

In [344]:
toronto_grouped= toronto_onehot.groupby('Neighborhood').mean().reset_index()

Now let's get the top 5 most frequent venues for each Neighborhood

In [388]:
for hood in toronto_grouped['Neighborhood']:
    # to confine the results to 3 hoods
    if hood in (['Agincourt','Bayview Village','Berczy Park']):
        print('----'+hood+'----')
        temp=toronto_grouped[toronto_grouped['Neighborhood']== hood]
        temp= temp.T # transpose 
        temp= temp.reset_index() 
        temp.columns=['venue','freq'] # rename the columns
        temp=temp.iloc[1:] # eliminate the Neighborhood row
        temp= temp.sort_values(by='freq',ascending=False).reset_index(drop=True)

        print(temp.head(5))
        print('\n')

----Agincourt----
                       venue freq
0             Clothing Store  0.2
1             Breakfast Spot  0.2
2                     Lounge  0.2
3               Skating Rink  0.2
4  Latin American Restaurant  0.2


----Bayview Village----
                 venue  freq
0  Japanese Restaurant  0.25
1   Chinese Restaurant  0.25
2                 Bank  0.25
3                 Café  0.25
4    Accessories Store     0


----Berczy Park----
            venue       freq
0     Coffee Shop  0.0909091
1        Beer Bar  0.0363636
2  Farmers Market  0.0363636
3     Cheese Shop  0.0363636
4    Cocktail Bar  0.0363636




##### Create a new DF with the top 10 venues for each Neighborhood

1st we will create a function that will sort the venues in descending order

In [396]:
def return_most_common_venues(row, num_top_venues): 
    row_categories= row.iloc[1:]
    row_categories_sorted= row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]

In [397]:
num_top_venues = 10

# create the DF's columns
indicators = ['st','nd','rd']
columns= ['Neighborhood']
for ind in np.arange(num_top_venues):
    try: 
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create the DF using the above columns
neighborhoods_venues_sorted=pd.DataFrame(columns=columns)

# populate the Neighborhood column with toronto neighborhoods
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]): # 274 
    neighborhoods_venues_sorted.iloc[ind,1:]=return_most_common_venues(toronto_grouped.iloc[ind,:],num_top_venues)
    
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Skating Rink,Latin American Restaurant,Clothing Store,Breakfast Spot,Falafel Restaurant,Event Space,Ethiopian Restaurant,Escape Room,Discount Store
1,"Alderwood, Long Branch",Pizza Place,Pharmacy,Gym,Sandwich Place,Coffee Shop,Pub,Dog Run,Dim Sum Restaurant,Diner,Discount Store
2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Fried Chicken Joint,Chinese Restaurant,Bridal Shop,Sandwich Place,Diner,Restaurant,Middle Eastern Restaurant,Supermarket
3,Bayview Village,Japanese Restaurant,Café,Chinese Restaurant,Bank,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Yoga Studio
4,"Bedford Park, Lawrence Manor East",Sandwich Place,Italian Restaurant,Coffee Shop,Greek Restaurant,Thai Restaurant,Locksmith,Liquor Store,Comfort Food Restaurant,Juice Bar,Butcher


## Cluster Neighborhoods

In [410]:
from sklearn.cluster import KMeans

k= 5

features= toronto_grouped.drop('Neighborhood',1)

# createa the K-Means model and fit with the features 
kmeans= KMeans(k,random_state=4).fit(features)
kmeans.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0,
       0, 0, 4, 0, 0, 2, 4, 0, 0, 0, 4, 0, 0, 0, 3, 0, 0, 4, 0, 0, 0, 4,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       4, 0, 0, 0, 0, 0, 4, 1])

In [415]:
# insert new column for cluster labels to the DF
# neighborhoods_venues_sorted.insert(0,'Cluster Labels',kmeans.labels_)

# let's merge the original toronto df with the neighborhoods venues
toronto_merged= toronto_df
toronto_merged= toronto_merged.merge(neighborhoods_venues_sorted.set_index('Neighborhood'),left_on='Neighbourhood',right_on='Neighborhood')

toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,2,Fast Food Restaurant,Dumpling Restaurant,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant,Health & Beauty Service
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,0,Construction & Landscaping,Bar,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Electronics Store
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0,Breakfast Spot,Restaurant,Rental Car Location,Electronics Store,Medical Center,Intersection,Bank,Mexican Restaurant,Yoga Studio,Doner Restaurant
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0,Coffee Shop,Korean BBQ Restaurant,Mexican Restaurant,Yoga Studio,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0,Gas Station,Fried Chicken Joint,Bakery,Bank,Athletics & Sports,Thai Restaurant,Caribbean Restaurant,Hakka Restaurant,Electronics Store,Eastern European Restaurant


## Visualize the Clusters

In [422]:
import matplotlib.cm as cm 
import matplotlib.colors as colors 

map_clusters= folium.Map(location=[latitude,longitude],zoom_start=10)

# coloring the clusters
x= np.arange(k)
ys= [i+x+(i*x)**2 for i in range(k)]
colors_array= cm.rainbow(np.linspace(0,1,len(ys)))
rainbow= [colors.rgb2hex(i) for i in colors_array]

markers_colors= []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'],toronto_merged['Longitude'],toronto_merged['Neighbourhood'],toronto_merged['Cluster Labels']):
    label= folium.Popup(str(poi)+'Cluster'+str(cluster),parse_html=True)
    folium.CircleMarker(
        [lat,lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7
    ).add_to(map_clusters)
    
map_clusters    