In [1]:
# Optional one-time environment setup: uncomment the lines below to install dependencies.
# NOTE(review): folium, geopy and scikit-learn are imported later in this notebook but are
# not listed here, while plotly is listed but not imported in the visible cells - confirm.
#!pip install pandas
#!pip install requests
#!pip install bs4
#!pip install plotly

## Importing required libraries

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Web scraping
We start by scraping the Wikipedia page and preparing the soup.

In [3]:
# Wikipedia page that lists the Canadian postal codes starting with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [4]:
# Download the page. The timeout stops a stalled connection from hanging the notebook,
# and raise_for_status() fails loudly on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_data = response.text

In [5]:
# Parse the downloaded HTML with Python's built-in 'html.parser' backend
soup = BeautifulSoup(html_data, 'html.parser')

By navigating the parse tree of the page, we find that the postal code, borough and neighbourhoods all sit at the same level of the tree.

In [6]:
# Preview every <p> element inside the first <tbody> of the page.
# (Alternative that was tried: read the <td> cells of the first table.)
#table = soup.find_all("table")[0].find_all("td")
soup.find_all('tbody')[0].find_all("p")

[<p><b>M1A</b><br/><span style="font-size:85%;"><i>Not assigned</i></span>
 </p>,
 <p><b>M2A</b><br/><span style="font-size:85%;"><i>Not assigned</i></span>
 </p>,
 <p><b>M3A</b><br/><span style="font-size:85%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>)</span>
 </p>,
 <p><b>M4A</b><br/><span style="font-size:85%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>)</span>
 </p>,
 <p><b>M5A</b><br/><span style="font-size:85%;"><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a><br/>(<a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a> / <a href="/wiki/Harbourfront,_Toronto" title="Harbourfront, Toronto">Harbourfront</a>)</span>
 </p>,
 <p><b>M6A</b><br/><span style="font-size:85%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Lawrence_Manor

So we begin by creating a list that we use to create the data frame

In [7]:
# Collect the visible text of every <p> in the first <tbody>.
# The tree is searched once; the earlier version re-ran find_all() on every
# iteration and indexed into the result with a manual counter.
ls = [cell.text for cell in soup.find_all('tbody')[0].find_all("p")]
print(ls)
    

['M1ANot assigned\n', 'M2ANot assigned\n', 'M3ANorth York(Parkwoods)\n', 'M4ANorth York(Victoria Village)\n', 'M5ADowntown Toronto(Regent Park / Harbourfront)\n', 'M6ANorth York(Lawrence Manor / Lawrence Heights)\n', "M7AQueen's Park(Ontario Provincial Government)\n", 'M8ANot assigned\n', 'M9AEtobicoke(Islington Avenue)\n', 'M1BScarborough(Malvern / Rouge)\n', 'M2BNot assigned\n', 'M3BNorth York(Don Mills)North\n', 'M4BEast York(Parkview Hill / Woodbine Gardens)\n', 'M5BDowntown Toronto(Garden District, Ryerson)\n', 'M6BNorth York(Glencairn)\n', 'M7BNot assigned\n', 'M8BNot assigned\n', 'M9BEtobicoke(West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale)\n', 'M1CScarborough(Rouge Hill / Port Union / Highland Creek)\n', 'M2CNot assigned\n', 'M3CNorth York(Don Mills)South(Flemingdon Park)\n', 'M4CEast York(Woodbine Heights)\n', 'M5CDowntown Toronto(St. James Town)\n', 'M6CYork(Humewood-Cedarvale)\n', 'M7CNot assigned\n', 'M8CNot assigned\n', 'M9CEtobicoke(Eringate / 

The list is converted into a Pandas dataframe

In [8]:
# One row per scraped cell; the single column (labelled 0) holds the raw cell text
df = pd.DataFrame(ls)

In [9]:
# Peek at the first rows of the raw, unsplit text
df.head()

Unnamed: 0,0
0,M1ANot assigned\n
1,M2ANot assigned\n
2,M3ANorth York(Parkwoods)\n
3,M4ANorth York(Victoria Village)\n
4,M5ADowntown Toronto(Regent Park / Harbourfront)\n


---
## Cleaning the Dataframe
We create a dataframe with the required columns and perform the first data split.

In [10]:
# Raw cell text: the first three characters are the postal code and the remainder
# still holds the borough and neighbourhood glued together.
raw_text = df.iloc[:, 0]

toronto_df = pd.DataFrame(columns=["PostalCode", "Borough", "Neighborhood"])
toronto_df["PostalCode"] = raw_text.str.slice(0, 3)
toronto_df["Borough"] = raw_text.str.slice(3)

toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned\n,
1,M2A,Not assigned\n,
2,M3A,North York(Parkwoods)\n,
3,M4A,North York(Victoria Village)\n,
4,M5A,Downtown Toronto(Regent Park / Harbourfront)\n,


Removing <code>\n</code> from cells

In [11]:
# Remove the newline left over from the scraped cell text.
# regex=False makes this a literal replacement regardless of the pandas version's default.
toronto_df["Borough"] = toronto_df["Borough"].str.replace("\n", "", regex=False)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York(Parkwoods),
3,M4A,North York(Victoria Village),
4,M5A,Downtown Toronto(Regent Park / Harbourfront),


Removing cells with a borough that is <code>Not assigned</code>

In [12]:
# Keep only postal codes that have a borough.
# .copy() gives an independent frame, so the column assignments in the following
# cells do not write into a slice of the old one (SettingWithCopyWarning).
toronto_df = toronto_df[toronto_df.Borough != "Not assigned"].copy()
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York(Parkwoods),
3,M4A,North York(Victoria Village),
4,M5A,Downtown Toronto(Regent Park / Harbourfront),
5,M6A,North York(Lawrence Manor / Lawrence Heights),
6,M7A,Queen's Park(Ontario Provincial Government),


Split boroughs and neighborhoods

In [13]:
# Split on the first "(" only: the text before it is the borough, the rest is the
# neighbourhood list (its closing ")" is removed in a later cell).
# NOTE(review): cells such as "North York(Don Mills)North" carry text after the ")",
# so they end up as "Don MillsNorth" after cleaning - confirm whether that is acceptable.
borough_and_hoods = toronto_df["Borough"].str.split('(', n=1, expand=True)
toronto_df[['Borough','Neighborhood']] = borough_and_hoods
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods)
3,M4A,North York,Victoria Village)
4,M5A,Downtown Toronto,Regent Park / Harbourfront)
5,M6A,North York,Lawrence Manor / Lawrence Heights)
6,M7A,Queen's Park,Ontario Provincial Government)


Cleaning <code>Neighborhood</code>

In [14]:
# Both patterns are plain text, not regular expressions. Passing regex=False avoids the
# FutureWarning about the changing default and treats ")" as a literal character.
toronto_df["Neighborhood"] = toronto_df["Neighborhood"].str.replace(")", "", regex=False)
# Turn the " / " separators into a comma-separated list
toronto_df["Neighborhood"] = toronto_df["Neighborhood"].str.replace(" / ", ", ", regex=False)

  toronto_df["Neighborhood"] = toronto_df["Neighborhood"].str.replace(")","")


Reset the index

In [15]:
# Renumber the rows 0..n-1 after filtering; the old index is discarded
toronto_df = toronto_df.reset_index(drop=True)

In [16]:
# Row and column count after removing unassigned postal codes
toronto_df.shape

(103, 3)

Replacing neighborhoods that are not assigned with the name of the Borough

In [17]:
# Show the borough text and row index of the two postal codes whose cells contain
# no "(" and therefore were not split into borough / neighbourhood.
for code in ("M5W", "M7Y"):
    is_code = toronto_df.PostalCode == code
    print(toronto_df.Borough[is_code])
    print(toronto_df[is_code].index)



92    Downtown TorontoStn A PO Boxes25 The Esplanade
Name: Borough, dtype: object
Int64Index([92], dtype='int64')
100    East TorontoBusiness reply mail Processing Cen...
Name: Borough, dtype: object
Int64Index([100], dtype='int64')


In [18]:
# Overwrite borough and neighbourhood for the two postal codes whose scraped text could
# not be split. Rows are selected by postal code rather than by hard-coded positions
# (92 / 100), so the fix still hits the right rows if the scraped table changes order.
manual_boroughs = {"M5W": "Downtown Toronto", "M7Y": "East Toronto"}
for code, borough in manual_boroughs.items():
    toronto_df.loc[toronto_df.PostalCode == code, ["Borough", "Neighborhood"]] = borough

In [19]:
# Re-check the two manually corrected postal codes
for code in ("M5W", "M7Y"):
    is_code = toronto_df.PostalCode == code
    print(toronto_df.Borough[is_code])
    print(toronto_df[is_code].index)

92    Downtown Toronto
Name: Borough, dtype: object
Int64Index([92], dtype='int64')
100    East Toronto
Name: Borough, dtype: object
Int64Index([100], dtype='int64')


Exploring the data to ensure it is ready for analysis

In [20]:
# First rows of the cleaned table
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [21]:
# Last rows of the cleaned table
toronto_df.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,East Toronto
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
102,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [22]:
# Row and column count of the cleaned table
toronto_df.shape

(103, 3)

In [23]:
# Load per-postal-code coordinates from a CSV path relative to the notebook's working directory
geo_tag = pd.read_csv("data/Geospatial_Coordinates.csv")

In [24]:
# Give the key column the same name as in toronto_df so the two frames can be merged on it
geo_tag = geo_tag.rename(columns={'Postal Code': 'PostalCode'})
geo_tag.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [25]:
# Attach latitude/longitude to each postal code (default inner join on the shared key)
coordinates_df = toronto_df.merge(geo_tag, on="PostalCode")
coordinates_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


In [26]:
# Row and column count after the merge
coordinates_df.shape

(103, 5)

---
# Data Analysis: Exploring and clustering the neighborhoods 

Importing libraries

In [27]:
# Matplotlib
import matplotlib.cm as cm
import matplotlib.colors as colors

# k-means from clustering
from sklearn.cluster import KMeans

# Folium
import folium

# Geopy
from geopy.geocoders import Nominatim

# Numpy
import numpy as np


### Use the geopy library to get the coordinates of Toronto

In [28]:
address = 'Toronto, Ontario'

# Look up the city centre with the Nominatim geocoder
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message instead of
# an AttributeError on the attribute access below
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Create a map of Toronto

In [29]:
# create map of Toronto centred on the geocoded latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
# alias only - no copy is made, so `neighborhoods` and `coordinates_df` are the same frame
neighborhoods = coordinates_df
# add one circle marker per postal code, with a "neighbourhood, borough" popup
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # NOTE(review): parse_html=False is passed to CircleMarker rather than to Popup -
    # confirm this keyword is intended here.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
# display the map as the cell output
map_toronto

## Foursquare API Credentials

Removed

## Explore neighborhoods in Toronto

In [31]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location and tabulate the venues.

    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT values, which
    must be defined before calling.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location to explore; iterated in parallel.
    radius : int, default 500
        Value sent as the request's ``radius`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue, with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. The frame is empty (but still has these
        columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and URL-encode the query string, and bound the wait time
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        response.raise_for_status()

        # a location with no recommendations may come back without any group
        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            # venues without a category get None instead of raising IndexError
            category = categories[0]['name'] if categories else None
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # passing the columns here also works when venue_rows is empty
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

We create a dataframe with Toronto's venues

In [32]:
# Fetch nearby venues for every row of coordinates_df (default search radius)
toronto_venues = getNearbyVenues(
    coordinates_df['Neighborhood'],
    coordinates_df['Latitude'],
    coordinates_df['Longitude'],
)

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don MillsNorth
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don MillsSouth(Flemingdon Park
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
The Danforth  East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
DownsviewEast

Exploring the dataframe

In [33]:
# Size of the venue table, followed by its first rows
print(toronto_venues.shape)
toronto_venues.head()

(2104, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,TTC stop #8380,43.752672,-79.326351,Bus Stop
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


Exploring the number of returned venues

In [34]:
# Number of non-null entries per column for each neighbourhood, i.e. venues returned per neighbourhood
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,5,5,5,5,5,5
"Alderwood, Long Branch",9,9,9,9,9,9
"Bathurst Manor, Wilson Heights, Downsview North",21,21,21,21,21,21
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",24,24,24,24,24,24
...,...,...,...,...,...,...
WillowdaleWest,5,5,5,5,5,5
Woburn,3,3,3,3,3,3
Woodbine Heights,5,5,5,5,5,5
York Mills West,3,3,3,3,3,3


In [35]:
# Count the distinct venue categories across all neighbourhoods
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 271 uniques categories.


### Explore Individual Neighborhoods

In [36]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
#fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
#toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Accessories Store,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Train Station,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Check that the neighbourhood label is present for every venue row
toronto_onehot['Neighborhood'] 

0                                               Parkwoods
1                                               Parkwoods
2                                               Parkwoods
3                                        Victoria Village
4                                        Victoria Village
                              ...                        
2099    Mimico NW, The Queensway West, South of Bloor,...
2100    Mimico NW, The Queensway West, South of Bloor,...
2101    Mimico NW, The Queensway West, South of Bloor,...
2102    Mimico NW, The Queensway West, South of Bloor,...
2103    Mimico NW, The Queensway West, South of Bloor,...
Name: Neighborhood, Length: 2104, dtype: object

In [38]:
# Rows = venues, columns = category indicators plus the neighbourhood label
toronto_onehot.shape

(2104, 271)

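Group the rows by neighbourhood and take the mean of each venue-category indicator, i.e. the frequency of each category within the neighbourhood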

In [39]:
# Mean of each indicator column within a neighbourhood = share of its venues in that category
neighborhood_means = toronto_onehot.groupby('Neighborhood').mean()
# Move the neighbourhood name from the index back into the first column
toronto_grouped = neighborhood_means.reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Accessories Store,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Train Station,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,WillowdaleWest,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,York Mills West,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


new shape

In [40]:
# One row per neighbourhood that returned at least one venue
toronto_grouped.shape

(99, 271)

Most common venues

In [41]:
num_top_venues = 5

# Print the most frequent venue categories of every neighbourhood
for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    # Transpose this neighbourhood's single row into (venue, freq) pairs
    hood_freqs = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    hood_freqs.columns = ['venue','freq']
    top_venues = (
        hood_freqs.iloc[1:]  # skip the first pair, which holds the neighbourhood name
        .assign(freq=lambda frame: frame['freq'].astype(float))
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
        .head(num_top_venues)
    )
    print(top_venues)
    print('\n')

----Agincourt ----
                       venue  freq
0                     Lounge   0.2
1  Latin American Restaurant   0.2
2               Skating Rink   0.2
3             Clothing Store   0.2
4             Breakfast Spot   0.2


----Alderwood, Long Branch----
                venue  freq
0         Pizza Place  0.22
1        Skating Rink  0.11
2      Sandwich Place  0.11
3                 Pub  0.11
4  Athletics & Sports  0.11


----Bathurst Manor, Wilson Heights, Downsview North----
         venue  freq
0  Coffee Shop  0.10
1         Bank  0.10
2  Pizza Place  0.05
3  Gas Station  0.05
4     Pharmacy  0.05


----Bayview Village----
                 venue  freq
0                 Café  0.25
1                 Bank  0.25
2  Japanese Restaurant  0.25
3   Chinese Restaurant  0.25
4  Moroccan Restaurant  0.00


----Bedford Park, Lawrence Manor East----
                venue  freq
0  Italian Restaurant  0.08
1      Sandwich Place  0.08
2         Coffee Shop  0.08
3      Breakfast Spot  0.04
4 

4       Modern European Restaurant   0.0


----Humberlea, Emery----
                             venue  freq
0                   Baseball Field   1.0
1                Accessories Store   0.0
2               Mexican Restaurant   0.0
3  Molecular Gastronomy Restaurant   0.0
4       Modern European Restaurant   0.0


----Humewood-Cedarvale----
          venue  freq
0  Tennis Court  0.25
1  Hockey Arena  0.25
2         Trail  0.25
3         Field  0.25
4  Optical Shop  0.00


----India Bazaar, The Beaches West----
                  venue  freq
0  Fast Food Restaurant  0.10
1                  Park  0.10
2         Movie Theater  0.05
3               Brewery  0.05
4          Liquor Store  0.05


----Kennedy Park, Ionview, East Birchmount Park----
                venue  freq
0    Department Store  0.25
1          Hobby Shop  0.25
2  Chinese Restaurant  0.25
3         Coffee Shop  0.25
4  Miscellaneous Shop  0.00


----Kensington Market, Chinatown, Grange Park----
                           ven

                venue  freq
0         Pizza Place  0.29
1         Coffee Shop  0.14
2      Sandwich Place  0.14
3      Discount Store  0.14
4  Chinese Restaurant  0.14


----Wexford, Maryvale----
                       venue  freq
0              Shopping Mall  0.17
1             Sandwich Place  0.17
2                Auto Garage  0.17
3                 Smoke Shop  0.17
4  Middle Eastern Restaurant  0.17


----Willowdale, Newtonbrook----
                        venue  freq
0                        Park   1.0
1           Accessories Store   0.0
2                 Men's Store   0.0
3  Modern European Restaurant   0.0
4           Mobile Phone Shop   0.0


----WillowdaleSouth----
              venue  freq
0  Ramen Restaurant  0.09
1       Coffee Shop  0.06
2              Café  0.06
3    Sandwich Place  0.06
4       Pizza Place  0.06


----WillowdaleWest----
           venue  freq
0    Pizza Place   0.2
1  Grocery Store   0.2
2        Butcher   0.2
3    Coffee Shop   0.2
4       Pharmacy   0.2

In [42]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``, ignoring its first entry.

    Parameters
    ----------
    row : pandas.Series
        The first element is skipped; the remaining elements are ranked by value.
    num_top_venues : int
        Number of labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the ranked entries, in descending order of value.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.head(num_top_venues).index.values

top 10 venues for each neighborhood

In [43]:
num_top_venues = 10

# ordinal suffixes for the first three ranks; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in range(num_top_venues):
    # explicit bounds check instead of catching the IndexError with a bare except
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill each row with that neighbourhood's most common venue categories
for ind in range(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Latin American Restaurant,Skating Rink,Clothing Store,Breakfast Spot,Massage Studio,Medical Center,Mediterranean Restaurant,Martial Arts School,Monument / Landmark
1,"Alderwood, Long Branch",Pizza Place,Skating Rink,Sandwich Place,Pub,Athletics & Sports,Coffee Shop,Gym,Pharmacy,Mediterranean Restaurant,Medical Center
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Pizza Place,Gas Station,Pharmacy,Deli / Bodega,Diner,Restaurant,Mobile Phone Shop,Sandwich Place
3,Bayview Village,Café,Bank,Japanese Restaurant,Chinese Restaurant,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Mobile Phone Shop,Accessories Store
4,"Bedford Park, Lawrence Manor East",Italian Restaurant,Sandwich Place,Coffee Shop,Breakfast Spot,Toy / Game Store,Juice Bar,Sushi Restaurant,Restaurant,Pub,Thai Restaurant


---
# Clustering Neighborhoods

k-means clustering

In [44]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=1).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 2], dtype=int32)

In [45]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = coordinates_df

toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,1.0,Park,Bus Stop,Food & Drink Shop,Accessories Store,Metro Station,Mobile Phone Shop,Miscellaneous Shop,Middle Eastern Restaurant,Mexican Restaurant,Mediterranean Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,0.0,Pizza Place,Coffee Shop,Hockey Arena,Portuguese Restaurant,Intersection,Medical Center,Mediterranean Restaurant,Massage Studio,Martial Arts School,Men's Store
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1.0,Coffee Shop,Bakery,Park,Pub,Breakfast Spot,Theater,Café,Yoga Studio,Shoe Store,French Restaurant
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,1.0,Accessories Store,Clothing Store,Vietnamese Restaurant,Furniture / Home Store,Coffee Shop,Carpet Store,Boutique,Performing Arts Venue,Martial Arts School,Massage Studio
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494,1.0,Coffee Shop,Gym,Sushi Restaurant,Diner,Mexican Restaurant,Beer Bar,Fried Chicken Joint,Smoothie Shop,Burger Joint,Burrito Place


Convert <code>Cluster Labels</code> to <code>int</code>

In [46]:
# Rows with no cluster label are neighbourhoods for which no venues were returned.
# Drop them rather than filling with 0, which would present them as members of cluster 0.
toronto_merged = toronto_merged.dropna(subset=['Cluster Labels']).copy()
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)
toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,1,Park,Bus Stop,Food & Drink Shop,Accessories Store,Metro Station,Mobile Phone Shop,Miscellaneous Shop,Middle Eastern Restaurant,Mexican Restaurant,Mediterranean Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,Pizza Place,Coffee Shop,Hockey Arena,Portuguese Restaurant,Intersection,Medical Center,Mediterranean Restaurant,Massage Studio,Martial Arts School,Men's Store
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1,Coffee Shop,Bakery,Park,Pub,Breakfast Spot,Theater,Café,Yoga Studio,Shoe Store,French Restaurant
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,1,Accessories Store,Clothing Store,Vietnamese Restaurant,Furniture / Home Store,Coffee Shop,Carpet Store,Boutique,Performing Arts Venue,Martial Arts School,Massage Studio
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494,1,Coffee Shop,Gym,Sushi Restaurant,Diner,Mexican Restaurant,Beer Bar,Fried Chicken Joint,Smoothie Shop,Burger Joint,Burrito Place


Create the map to visualise the clusters

In [47]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters