In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import lxml
from pandas.io.json import json_normalize


In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url).text


In [3]:
soup = BeautifulSoup(page, 'lxml')
type(soup)

bs4.BeautifulSoup

In [4]:
#table = soup.find('table')
table = soup.find('table', class_='wikitable sortable')
#print(table.prettify())

In [5]:
# Get Table head

table_header = table.find_all('tr')

table_head = []
for tr in table_header:
    th = tr.find_all('th')
    row1 = [tr.text.strip() for tr in th if tr.text.strip()]
    if row1:
        table_head.append(row1)


df1 = pd.DataFrame(table_head)
df1



Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood


In [6]:
# Get Table rows

table_rows = table.find_all('tr')

table_body = []
for tr in table_rows:
    td = tr.find_all('td')
    row2 = [tr.text.strip() for tr in td if tr.text.strip()]
    if row2:
        table_body.append(row2)

df2 = pd.DataFrame(table_body) 
df2.head()

Unnamed: 0,0,1,2
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [7]:
#Combine Header and data rows and set column headers

df3 = pd.concat([df1,df2])
df3.columns = df3.iloc[0]
df3.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village


In [8]:
#Remove unwanted row from top
df4 = df3[1:]
df4.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [9]:
#Drop the rows with Borough as Not assigned
df5 = df4.drop(df4[df4['Borough'] == 'Not assigned'].index)
df5

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [10]:
#Count the number of Neighbourhoods as Not assigned
df5[df5['Neighbourhood'] == 'Not assigned'].count()

0
Postcode         1
Borough          1
Neighbourhood    1
dtype: int64

In [11]:
#Copy Borough to Neighbourhood where Neighbourhood is Not assigned.
df5.replace('Not assigned', np.nan, inplace=True)
df5.ffill(axis=1)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [12]:
#Group the data by 'Postcode', 'Borough' & 'Neighbourhood'.
df5['Postcode'] = df5['Postcode'].astype(str)
df5['Borough'] = df5['Borough'].astype(str)
df5['Neighbourhood'] = df5['Neighbourhood'].astype(str)

df5.set_index(['Postcode','Borough'],inplace=True)
df5 = df5.groupby(level=['Postcode','Borough'], sort=False).agg( ','.join)


In [13]:

df5 = df5.reset_index()
type(df5)


pandas.core.frame.DataFrame

In [14]:
df5.shape

(103, 3)

# Part 2

In [16]:
# Add Geo data
geodf = pd.read_csv('https://cocl.us/Geospatial_data')
geodf.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
df5 = df5.join(geodf)
df5

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M1B,43.806686,-79.194353
1,M4A,North York,Victoria Village,M1C,43.784535,-79.160497
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",M1E,43.763573,-79.188711
3,M6A,North York,"Lawrence Heights,Lawrence Manor",M1G,43.770992,-79.216917
4,M7A,Queen's Park,,M1H,43.773136,-79.239476
5,M9A,Etobicoke,Islington Avenue,M1J,43.744734,-79.239476
6,M1B,Scarborough,"Rouge,Malvern",M1K,43.727929,-79.262029
7,M3B,North York,Don Mills North,M1L,43.711112,-79.284577
8,M4B,East York,"Woodbine Gardens,Parkview Hill",M1M,43.716316,-79.239476
9,M5B,Downtown Toronto,"Ryerson,Garden District",M1N,43.692657,-79.264848


# Part 3

In [18]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

In [19]:
address = 'TORONTO'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The lati & long coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The lati & long coordinate of Toronto are 43.653963, -79.387207.


In [20]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(df5['Latitude'], df5['Longitude'], df5['Borough'], df5['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

In [21]:
#Inspect Downtown Toronto
dtt_data = df5[df5['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
dtt_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront,Regent Park",M1E,43.763573,-79.188711
1,M5B,Downtown Toronto,"Ryerson,Garden District",M1N,43.692657,-79.264848
2,M5C,Downtown Toronto,St. James Town,M1W,43.799525,-79.318389
3,M5E,Downtown Toronto,Berczy Park,M2L,43.75749,-79.374714
4,M5G,Downtown Toronto,Central Bay Street,M2R,43.782736,-79.442259


In [22]:
# Get geo coordinates of Downtown Toronto
address = 'Downtown TORONTO'

geolocator = Nominatim(user_agent="dtt_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The lati & long coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The lati & long coordinate of Downtown Toronto are 43.6541737, -79.3808116451341.


In [23]:
# Create Downtown Toronto Map
# create map of North York using latitude and longitude values
map_dtt = folium.Map(location=[latitude, longitude], zoom_start=14)

# add markers to map
for lat, lng, label in zip(dtt_data['Latitude'], dtt_data['Longitude'], dtt_data['Neighbourhood']):
    label = folium.Popup(label)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_dtt)
    
map_dtt

In [24]:
#Define Foursquare Credentials and Version
CLIENT_ID = '5IGRTVSEE1JRUGYEX24C0NDOMWCBA25EVXUITRZ0D3FCLTQO' 
CLIENT_SECRET = 'VHUHGVBSYGCV1BBUPVRNYSORFNANGTLVBQQ44TSN35IJNFNT' 
VERSION = '20190211' 
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 5IGRTVSEE1JRUGYEX24C0NDOMWCBA25EVXUITRZ0D3FCLTQO
CLIENT_SECRET:VHUHGVBSYGCV1BBUPVRNYSORFNANGTLVBQQ44TSN35IJNFNT


In [25]:
dtt_data.loc[0, 'Neighbourhood']

'Harbourfront,Regent Park'

In [26]:
neighbourhood_latitude = dtt_data.loc[0, 'Latitude'] # neighborhood latitude value
neighbourhood_longitude = dtt_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = dtt_data.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of Harbourfront,Regent Park are 43.7635726, -79.1887115.


In [27]:
#Create the GET request URL

LIMIT = 100
radius = 500 

#Create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=5IGRTVSEE1JRUGYEX24C0NDOMWCBA25EVXUITRZ0D3FCLTQO&client_secret=VHUHGVBSYGCV1BBUPVRNYSORFNANGTLVBQQ44TSN35IJNFNT&v=20190211&ll=43.7635726,-79.1887115&radius=500&limit=100'

In [28]:
#Send the GET request and examine the resutls
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5c626577dd57977f8e140e4f'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-4b6074e3f964a5200fe729e3-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/pizza_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d1ca941735',
         'name': 'Pizza Place',
         'pluralName': 'Pizza Places',
         'primary': True,
         'shortName': 'Pizza'}],
       'id': '4b6074e3f964a5200fe729e3',
       'location': {'address': '4410 Kingston Rd',
        'cc': 'CA',
        'city': 'Scarborough',
        'country': 'Canada',
        'distance': 469,
        'formattedAddress': ['4410 Kingston Rd',
         'Scarborough ON M1E 2N5',
         'Canada'],
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.7676970829

In [29]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [31]:
#Clean the json and structure it into a pandas dataframe

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON


# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Swiss Chalet Rotisserie & Grill,Pizza Place,43.767697,-79.189914
1,G & G Electronics,Electronics Store,43.765309,-79.191537
2,Big Bite Burrito,Mexican Restaurant,43.766299,-79.19072
3,Enterprise Rent-A-Car,Rental Car Location,43.764076,-79.193406
4,Woburn Medical Centre,Medical Center,43.766631,-79.192286


In [32]:
#Number of venues returned by foursqaure:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

6 venues were returned by Foursquare.


Exploring Downtown Toronto

In [33]:
#create a function to repeat the same process to all the neighborhoods in Downtown Toronto
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [34]:
#run the above function on each neighborhood and create a new dataframe called dtt_venues
dtt_venues = getNearbyVenues(names=dtt_data['Neighbourhood'],
                                   latitudes=dtt_data['Latitude'],
                                   longitudes=dtt_data['Longitude']
                                  )

Harbourfront,Regent Park


Ryerson,Garden District


St. James Town


Berczy Park


Central Bay Street


Christie


Adelaide,King,Richmond


Harbourfront East,Toronto Islands,Union Station


Design Exchange,Toronto Dominion Centre


Commerce Court,Victoria Hotel


Harbord,University of Toronto


Chinatown,Grange Park,Kensington Market


CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara


Rosedale


Stn A PO Boxes 25 The Esplanade


Cabbagetown,St. James Town


First Canadian Place,Underground city


Church and Wellesley


In [35]:
# Check the size of resulting dataframe
print(dtt_venues.shape)
dtt_venues.head()

(152, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Harbourfront,Regent Park",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
1,"Harbourfront,Regent Park",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
2,"Harbourfront,Regent Park",43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant
3,"Harbourfront,Regent Park",43.763573,-79.188711,Enterprise Rent-A-Car,43.764076,-79.193406,Rental Car Location
4,"Harbourfront,Regent Park",43.763573,-79.188711,Woburn Medical Centre,43.766631,-79.192286,Medical Center


In [36]:
#check how many venues were returned for each neighborhood
dtt_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",3,3,3,3,3,3
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",18,18,18,18,18,18
"Cabbagetown,St. James Town",2,2,2,2,2,2
Central Bay Street,5,5,5,5,5,5
"Chinatown,Grange Park,Kensington Market",37,37,37,37,37,37
Christie,3,3,3,3,3,3
Church and Wellesley,7,7,7,7,7,7
"Commerce Court,Victoria Hotel",2,2,2,2,2,2
"Design Exchange,Toronto Dominion Centre",19,19,19,19,19,19
"First Canadian Place,Underground city",2,2,2,2,2,2


In [37]:
#find out how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(dtt_venues['Venue Category'].unique())))

There are 84 uniques categories.


Analyze each Neighbourhood

In [39]:
# one hot encoding
dtt_onehot = pd.get_dummies(dtt_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
dtt_onehot['Neighbourhood'] = dtt_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [dtt_onehot.columns[-1]] + list(dtt_onehot.columns[:-1])
dtt_onehot = dtt_onehot[fixed_columns]
dtt_onehot.head()

Unnamed: 0,Neighbourhood,Airport,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Workshop,Bakery,Bar,Baseball Field,Beer Store,...,Steakhouse,Supplement Shop,Sushi Restaurant,Tea Room,Thrift / Vintage Store,Trail,Vegetarian / Vegan Restaurant,Video Store,Wings Joint,Yoga Studio
0,"Harbourfront,Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Harbourfront,Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Harbourfront,Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Harbourfront,Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Harbourfront,Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
#examine the new dataframe size
dtt_onehot.shape

(152, 85)

In [41]:
#group rows by neighbourhood and by taking the mean of the frequency of occurrence of each category
dtt_grouped = dtt_onehot.groupby('Neighbourhood').mean().reset_index()
dtt_grouped

Unnamed: 0,Neighbourhood,Airport,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Workshop,Bakery,Bar,Baseball Field,Beer Store,...,Steakhouse,Supplement Shop,Sushi Restaurant,Tea Room,Thrift / Vintage Store,Trail,Vegetarian / Vegan Restaurant,Video Store,Wings Joint,Yoga Studio
0,"Adelaide,King,Richmond",0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556
2,"Cabbagetown,St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Chinatown,Grange Park,Kensington Market",0.0,0.027027,0.0,0.0,0.0,0.0,0.027027,0.0,0.0,...,0.0,0.0,0.054054,0.027027,0.0,0.0,0.027027,0.0,0.0,0.0
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Commerce Court,Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0
8,"Design Exchange,Toronto Dominion Centre",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.052632,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"First Canadian Place,Underground city",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
#confirm the new size
dtt_grouped.shape

(17, 85)

In [43]:
#print each neighborhood along with the top 5 most common venues
num_top_venues = 5

for hood in dtt_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = dtt_grouped[dtt_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
       venue  freq
0    Airport  0.33
1       Park  0.33
2   Bus Stop  0.33
3  Locksmith  0.00
4   Pharmacy  0.00


----CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara----
                venue  freq
0  Light Rail Station  0.11
1         Yoga Studio  0.06
2       Burrito Place  0.06
3                Park  0.06
4    Recording Studio  0.06


----Cabbagetown,St. James Town----
                 venue  freq
0          Pizza Place   0.5
1  Empanada Restaurant   0.5
2            Locksmith   0.0
3             Pharmacy   0.0
4            Pet Store   0.0


----Central Bay Street----
           venue  freq
0    Pizza Place   0.2
1  Grocery Store   0.2
2       Pharmacy   0.2
3    Coffee Shop   0.2
4        Butcher   0.2


----Chinatown,Grange Park,Kensington Market----
                venue  freq
0         Coffee Shop  0.11
1         Pizza Place  0.08
2                Café  0.08
3  Italian Restaurant  0.05
4    Sus

                venue  freq
0         Pizza Place  0.29
1        Intersection  0.14
2         Coffee Shop  0.14
3  Chinese Restaurant  0.14
4      Sandwich Place  0.14


----Commerce Court,Victoria Hotel----
                venue  freq
0          Playground   0.5
1               Trail   0.5
2  Light Rail Station   0.0
3           Pet Store   0.0
4                Park   0.0


----Design Exchange,Toronto Dominion Centre----
                venue  freq
0      Sandwich Place  0.11
1                 Pub  0.05
2       Burrito Place  0.05
3                 Gym  0.05
4  Italian Restaurant  0.05


----First Canadian Place,Underground city----
                    venue  freq
0  Furniture / Home Store   0.5
1          Baseball Field   0.5
2                 Airport   0.0
3            Liquor Store   0.0
4                Pharmacy   0.0


----Harbord,University of Toronto----
               venue  freq
0  Convenience Store  0.25
1     Discount Store  0.25
2         Restaurant  0.25
3     Sandwich Pla

                venue  freq
0        Skating Rink  0.17
1      Cosmetics Shop  0.08
2    Asian Restaurant  0.08
3  Athletics & Sports  0.08
4        Dance Studio  0.08


----Harbourfront,Regent Park----
                 venue  freq
0  Rental Car Location  0.17
1    Electronics Store  0.17
2   Mexican Restaurant  0.17
3       Medical Center  0.17
4       Breakfast Spot  0.17


----Rosedale----
            venue  freq
0  Baseball Field   0.5
1       Locksmith   0.5
2         Airport   0.0
3    Liquor Store   0.0
4        Pharmacy   0.0


----Ryerson,Garden District----
                   venue  freq
0  General Entertainment  0.25
1        College Stadium  0.25
2                   Café  0.25
3           Skating Rink  0.25
4                Airport  0.00


----St. James Town----
                  venue  freq
0           Coffee Shop  0.14
1    Chinese Restaurant  0.14
2  Fast Food Restaurant  0.14
3     Indian Restaurant  0.07
4        Breakfast Spot  0.07


----Stn A PO Boxes 25 The Esplana

                  venue  freq
0                   Gym  0.08
1       Supplement Shop  0.08
2     Convenience Store  0.08
3  Fast Food Restaurant  0.08
4        Sandwich Place  0.08




In [44]:
#write a function to sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [45]:
#create the new dataframe and display the top 10 venues for each neighbourhood
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = dtt_grouped['Neighbourhood']

for ind in np.arange(dtt_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(dtt_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Airport,Park,Bus Stop,Empanada Restaurant,Curling Ice,Dance Studio,Dessert Shop,Diner,Discount Store,Electronics Store
1,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Light Rail Station,Yoga Studio,Skate Park,Garden Center,Fast Food Restaurant,Farmers Market,Comic Shop,Park,Pizza Place,Butcher
2,"Cabbagetown,St. James Town",Pizza Place,Empanada Restaurant,Convenience Store,Cosmetics Shop,Curling Ice,Dance Studio,Dessert Shop,Diner,Discount Store,Electronics Store
3,Central Bay Street,Pizza Place,Butcher,Coffee Shop,Pharmacy,Grocery Store,French Restaurant,Diner,Convenience Store,Cosmetics Shop,Furniture / Home Store
4,"Chinatown,Grange Park,Kensington Market",Coffee Shop,Café,Pizza Place,Sushi Restaurant,Italian Restaurant,Fish & Chips Shop,Restaurant,Latin American Restaurant,Indie Movie Theater,Dessert Shop


In [46]:
# set number of clusters
kclusters = 5

dtt_grouped_clustering = dtt_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(dtt_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([2, 0, 3, 0, 0, 0, 0, 4, 0, 1])

In [49]:
# add clustering labels
#neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

dtt_merged = dtt_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
dtt_merged = dtt_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

dtt_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Harbourfront,Regent Park",M1E,43.763573,-79.188711,0.0,Breakfast Spot,Electronics Store,Pizza Place,Mexican Restaurant,Medical Center,Rental Car Location,Curling Ice,Dance Studio,Dessert Shop,Diner
1,M5B,Downtown Toronto,"Ryerson,Garden District",M1N,43.692657,-79.264848,0.0,General Entertainment,Skating Rink,Café,College Stadium,Athletics & Sports,Falafel Restaurant,Curling Ice,Dance Studio,Dessert Shop,Diner
2,M5C,Downtown Toronto,St. James Town,M1W,43.799525,-79.318389,0.0,Coffee Shop,Fast Food Restaurant,Chinese Restaurant,Pharmacy,Pizza Place,Grocery Store,Indian Restaurant,Breakfast Spot,Japanese Restaurant,Electronics Store
3,M5E,Downtown Toronto,Berczy Park,M2L,43.75749,-79.374714,,,,,,,,,,,
4,M5G,Downtown Toronto,Central Bay Street,M2R,43.782736,-79.442259,0.0,Pizza Place,Butcher,Coffee Shop,Pharmacy,Grocery Store,French Restaurant,Diner,Convenience Store,Cosmetics Shop,Furniture / Home Store


In [53]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(dtt_merged['Latitude'], dtt_merged['Longitude'], dtt_merged['Neighbourhood'], dtt_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow,
        fill=True,
        fill_color=rainbow,
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [54]:
# Examine each cluster

#Cluster 1
dtt_merged.loc[dtt_merged['Cluster Labels'] == 0, dtt_merged.columns[[1] + list(range(5, dtt_merged.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,-79.188711,0.0,Breakfast Spot,Electronics Store,Pizza Place,Mexican Restaurant,Medical Center,Rental Car Location,Curling Ice,Dance Studio,Dessert Shop,Diner
1,Downtown Toronto,-79.264848,0.0,General Entertainment,Skating Rink,Café,College Stadium,Athletics & Sports,Falafel Restaurant,Curling Ice,Dance Studio,Dessert Shop,Diner
2,Downtown Toronto,-79.318389,0.0,Coffee Shop,Fast Food Restaurant,Chinese Restaurant,Pharmacy,Pizza Place,Grocery Store,Indian Restaurant,Breakfast Spot,Japanese Restaurant,Electronics Store
4,Downtown Toronto,-79.442259,0.0,Pizza Place,Butcher,Coffee Shop,Pharmacy,Grocery Store,French Restaurant,Diner,Convenience Store,Cosmetics Shop,Furniture / Home Store
5,Downtown Toronto,-79.329656,0.0,Park,Food & Drink Shop,Fast Food Restaurant,Yoga Studio,Electronics Store,Cosmetics Shop,Curling Ice,Dance Studio,Dessert Shop,Diner
7,Downtown Toronto,-79.318389,0.0,Skating Rink,Pharmacy,Beer Store,Cosmetics Shop,Curling Ice,Dance Studio,Bus Stop,Park,Athletics & Sports,Video Store
8,Downtown Toronto,-79.315572,0.0,Sandwich Place,Park,Ice Cream Shop,Movie Theater,Food & Drink Shop,Pet Store,Italian Restaurant,Pizza Place,Pub,Burrito Place
10,Downtown Toronto,-79.476013,0.0,Convenience Store,Restaurant,Sandwich Place,Discount Store,Comic Shop,Cosmetics Shop,Curling Ice,Dance Studio,Dessert Shop,Diner
11,Downtown Toronto,-79.48445,0.0,Coffee Shop,Café,Pizza Place,Sushi Restaurant,Italian Restaurant,Fish & Chips Shop,Restaurant,Latin American Restaurant,Indie Movie Theater,Dessert Shop
12,Downtown Toronto,-79.321558,0.0,Light Rail Station,Yoga Studio,Skate Park,Garden Center,Fast Food Restaurant,Farmers Market,Comic Shop,Park,Pizza Place,Butcher


In [55]:
#Cluster 2
dtt_merged.loc[dtt_merged['Cluster Labels'] == 1, dtt_merged.columns[[1] + list(range(5, dtt_merged.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
13,Downtown Toronto,-79.498509,1.0,Locksmith,Baseball Field,Yoga Studio,Falafel Restaurant,Curling Ice,Dance Studio,Dessert Shop,Diner,Discount Store,Electronics Store
16,Downtown Toronto,-79.532242,1.0,Furniture / Home Store,Baseball Field,Yoga Studio,Falafel Restaurant,Curling Ice,Dance Studio,Dessert Shop,Diner,Discount Store,Electronics Store


In [56]:
#Cluster 3
dtt_merged.loc[dtt_merged['Cluster Labels'] == 2, dtt_merged.columns[[1] + list(range(5, dtt_merged.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Downtown Toronto,-79.464763,2.0,Airport,Park,Bus Stop,Empanada Restaurant,Curling Ice,Dance Studio,Dessert Shop,Diner,Discount Store,Electronics Store
