In [1]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd

#### First we fetch the Wikipedia HTML page with requests.get(), and then parse it into a BeautifulSoup object

In [2]:
# Download the Wikipedia page. A timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops us from parsing an HTTP error page.
wiki_response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
wiki_response.raise_for_status()
website_page = wiki_response.text
# Parse the HTML with the lxml parser.
soup = BeautifulSoup(website_page,'lxml')

#### we use the function find() to extract the table, selecting it by its attribute class='wikitable sortable'

In [3]:
# Locate the first <table> whose class attribute is exactly "wikitable sortable".
FSAs_table = soup.find('table', class_='wikitable sortable')

#### what we are interested in is in the 'td' fields:

In [4]:
# Collect every <td> cell of the table and preview the first nine.
lines = FSAs_table.find_all('td')
lines[:9]

[<td>M1A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M2A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M3A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td>]

#### we extract the text from each td field line by line into an array:

In [5]:
# Pull the text out of every <td> cell, keeping the cells in document order.
FSAs_list = [cell.text for cell in lines]

FSAs_list[:9]

['M1A',
 'Not assigned',
 'Not assigned\n',
 'M2A',
 'Not assigned',
 'Not assigned\n',
 'M3A',
 'North York',
 'Parkwoods\n']

#### we rearrange Postcodes and Boroughs into a data frame:

In [6]:
# The cell texts come in triples (postcode, borough, neighborhood):
# every third item from offset 0 is a postcode, from offset 1 a borough.
df = pd.DataFrame({
    'Postcode': FSAs_list[0::3],
    'Borough': FSAs_list[1::3],
})
df.head()

Unnamed: 0,Postcode,Borough
0,M1A,Not assigned
1,M2A,Not assigned
2,M3A,North York
3,M4A,North York
4,M5A,Downtown Toronto


In [7]:
FSAs_list[2::3][1:5]

['Not assigned\n', 'Parkwoods\n', 'Victoria Village\n', 'Harbourfront\n']

#### the Neighborhood column needs to be processed before it's added, namely the "\n" needs to be removed:

In [8]:
# Every third cell from offset 2 is a neighborhood name.
# strip() removes the trailing "\n" (and any other surrounding whitespace) only
# when it is present; slicing with [:-1] would cut off the last real character
# of any cell that does not end in a newline.
Neighborhoods = [Neighborhood.strip() for Neighborhood in FSAs_list[2::3]]
Neighborhoods[0:5]

['Not assigned',
 'Not assigned',
 'Parkwoods',
 'Victoria Village',
 'Harbourfront']

and now we attach it:

In [9]:
df['Neighborhood'] = Neighborhoods
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [10]:
df.shape

(288, 3)

#### now we drop the lines where the Borough is Not assigned:

In [11]:
# Keep only the rows whose Borough has actually been assigned.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough]
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [12]:
df.shape

(211, 3)

#### now we group by Postcode and concatenate the Neighborhood corresponding to the same Postcode

In [13]:
# Group on (Borough, Postcode) in a single pass and join the neighborhoods that
# share a postcode into one comma-separated string. Groups come out sorted by
# Borough first and by Postcode within each Borough.
joined_neighborhoods = (
    df.groupby(['Borough', 'Postcode'])['Neighborhood']
      .apply(lambda names: ', '.join(names))
)

# Unpack the result into the three parallel lists that make up the final dataframe.
Postcodes_list = []
Neighborhoods_list = []
Boroughs_list = []
for (Borough, Postcode), Neighborhood in joined_neighborhoods.items():
    Postcodes_list.append(Postcode)
    Boroughs_list.append(Borough)
    Neighborhoods_list.append(Neighborhood)


In [14]:
# Assemble the grouped result from the three parallel lists built above.
df_groupped = pd.DataFrame({
    'Postcode': Postcodes_list,
    'Borough': Boroughs_list,
    'Neighborhood': Neighborhoods_list,
})
df_groupped.head(20)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M4N,Central Toronto,Lawrence Park
1,M4P,Central Toronto,Davisville North
2,M4R,Central Toronto,North Toronto West
3,M4S,Central Toronto,Davisville
4,M4T,Central Toronto,"Moore Park, Summerhill East"
5,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi..."
6,M5N,Central Toronto,Roselawn
7,M5P,Central Toronto,"Forest Hill North, Forest Hill West"
8,M5R,Central Toronto,"The Annex, North Midtown, Yorkville"
9,M4W,Downtown Toronto,Rosedale


In [15]:
df_groupped.shape

(103, 3)

#### to make sure no two Boroughs share the same Postcode, we group the whole dataframe by Postcode again and check that the number of rows is unchanged:

In [16]:
test = df_groupped.groupby('Postcode').count()
test.shape

(103, 2)

#### same number of lines, all good!

In [17]:
postal_codes = df_groupped['Postcode'].values
postal_codes

array(['M4N', 'M4P', 'M4R', 'M4S', 'M4T', 'M4V', 'M5N', 'M5P', 'M5R',
       'M4W', 'M4X', 'M4Y', 'M5A', 'M5B', 'M5C', 'M5E', 'M5G', 'M5H',
       'M5J', 'M5K', 'M5L', 'M5S', 'M5T', 'M5V', 'M5W', 'M5X', 'M6G',
       'M4E', 'M4K', 'M4L', 'M4M', 'M7Y', 'M4B', 'M4C', 'M4G', 'M4H',
       'M4J', 'M8V', 'M8W', 'M8X', 'M8Y', 'M8Z', 'M9A', 'M9B', 'M9C',
       'M9P', 'M9R', 'M9V', 'M9W', 'M7R', 'M2H', 'M2J', 'M2K', 'M2L',
       'M2M', 'M2N', 'M2P', 'M2R', 'M3A', 'M3B', 'M3C', 'M3H', 'M3J',
       'M3K', 'M3L', 'M3M', 'M3N', 'M4A', 'M5M', 'M6A', 'M6B', 'M6L',
       'M9L', 'M9M', 'M7A', 'M1B', 'M1C', 'M1E', 'M1G', 'M1H', 'M1J',
       'M1K', 'M1L', 'M1M', 'M1N', 'M1P', 'M1R', 'M1S', 'M1T', 'M1V',
       'M1W', 'M1X', 'M6H', 'M6J', 'M6K', 'M6P', 'M6R', 'M6S', 'M6C',
       'M6E', 'M6M', 'M6N', 'M9N'], dtype=object)

#### now let's check if we can get some coordinates

In [18]:
import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

# Try to geocode every postal code and count how many lookups return coordinates.
success = 0
for postal_code in postal_codes:
    g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
    lat_lng_coords = g.latlng
    # Identity test: `!= None` can be overridden by a custom __ne__, `is not None` cannot.
    if lat_lng_coords is not None:
        success += 1
print('Successfuly aquired coordinates: {}'.format(success))

Successfuly aquired coordinates: 0


In [19]:
# install geocoder
!pip install geocoder



#### no luck using geocoder so we resort to the csv file

In [20]:
df_coordinates = pd.read_csv('http://cocl.us/Geospatial_data')
df_coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### we change the column name 'Postal Code' to 'Postcode' in order to merge the 2 dataframes on it:

In [21]:
# Align the key column name with the scraped dataframe so the two can be merged on it.
column_renames = {"Postal Code": "Postcode"}
df_coordinates = df_coordinates.rename(index=str, columns=column_renames)
df_coordinates.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [22]:
df_coordinates.shape

(103, 3)

In [23]:
# Attach latitude/longitude to every postcode (default inner join on 'Postcode').
df_merged = df_groupped.merge(df_coordinates, on='Postcode')
df_merged.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
2,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
3,M4S,Central Toronto,Davisville,43.704324,-79.38879
4,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316


In [24]:
Borough_count =  df_merged.groupby('Borough').count()
Borough_count 

Unnamed: 0_level_0,Postcode,Neighborhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,18,18,18,18
East Toronto,5,5,5,5
East York,5,5,5,5
Etobicoke,12,12,12,12
Mississauga,1,1,1,1
North York,24,24,24,24
Queen's Park,1,1,1,1
Scarborough,17,17,17,17
West Toronto,6,6,6,6


In [25]:
df_merged.shape

(103, 5)

#### we take the first 3 Boroughs

In [26]:
# Restrict the analysis to three boroughs and renumber the rows from 0.
selected_boroughs = ['Central Toronto', 'Downtown Toronto', 'East Toronto']
in_selected_borough = df_merged['Borough'].isin(selected_boroughs)
df_Toronto = df_merged[in_selected_borough].reset_index(drop=True)
df_Toronto

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
2,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
3,M4S,Central Toronto,Davisville,43.704324,-79.38879
4,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
5,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049
6,M5N,Central Toronto,Roselawn,43.711695,-79.416936
7,M5P,Central Toronto,"Forest Hill North, Forest Hill West",43.696948,-79.411307
8,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678
9,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529


In [27]:
from geopy.geocoders import Nominatim 
import folium

#### we get the coordinates for Toronto and mark the corresponding postal areas we chose on the map:

In [28]:
address = 'Toronto, Ontario'

# Look up the coordinates of the city centre, used below to centre the maps.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on `location.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, Ontario are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, Ontario are 43.653963, -79.387207.


In [29]:
# Base map centred on the coordinates geocoded above.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per selected postal code
for Postcode, lat, lng, borough, neighborhood in zip(df_Toronto['Postcode'], df_Toronto['Latitude'], df_Toronto['Longitude'], df_Toronto['Borough'], df_Toronto['Neighborhood']):
    label = '{}, {}, {}'.format(Postcode,neighborhood, borough)
    # parse_html is a Popup option; it was previously also passed to
    # CircleMarker, which has no such parameter.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

In [30]:
# Read the API credentials from a local text file: the first line is the
# client id, the second the client secret. Surrounding whitespace is stripped.
fname = 'credentials.txt'
with open(fname) as f:
    content = [raw_line.strip() for raw_line in f]

CLIENT_ID = content[0]
CLIENT_SECRET = content[1]
# Value sent as the `v` query parameter of the API request.
VERSION = '20180605'

In [31]:
def getNearbyVenues(Postcodes, names, latitudes, longitudes, radius=500,LIMIT=100):
    """Query the Foursquare "explore" endpoint around each location and tabulate the venues.

    Parameters
    ----------
    Postcodes, names, latitudes, longitudes : iterable
        Parallel sequences describing the locations; they are consumed together
        with zip(), so iteration stops at the shortest one.
    radius : int, optional
        Sent as the `radius` query parameter of every request.
    LIMIT : int, optional
        Sent as the `limit` query parameter of every request.

    Returns
    -------
    pandas.DataFrame
        One row per venue returned, with the columns 'Postcode', 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but still has these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    column_names = ['Postcode',
                    'Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for Postcode, name, lat, lng in zip(Postcodes, names, latitudes, longitudes):
        print(Postcode +" "+name)

        # Let requests build and escape the query string instead of formatting
        # the credentials and coordinates into the URL by hand.
        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; fail loudly on HTTP errors and never hang forever
        response = requests.get(endpoint, params=query, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            # A venue may come back with an empty category list; record it with
            # no category instead of failing on categories[0].
            category_name = categories[0]['name'] if categories else None
            venue_rows.append((
                Postcode,
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category_name))

    # Passing the column names to the constructor also works for zero rows;
    # assigning `.columns` afterwards fails on an empty frame.
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

#### we define the getNearbyVenues function to get the venues info with foursquare and run it:

In [32]:
toronto_venues = getNearbyVenues(Postcodes=df_Toronto['Postcode'],
                                   names=df_Toronto['Neighborhood'],
                                   latitudes=df_Toronto['Latitude'],
                                   longitudes=df_Toronto['Longitude']
                                  )

M4N Lawrence Park
M4P Davisville North
M4R North Toronto West
M4S Davisville
M4T Moore Park, Summerhill East
M4V Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
M5N Roselawn
M5P Forest Hill North, Forest Hill West
M5R The Annex, North Midtown, Yorkville
M4W Rosedale
M4X Cabbagetown, St. James Town
M4Y Church and Wellesley
M5A Harbourfront, Regent Park
M5B Ryerson, Garden District
M5C St. James Town
M5E Berczy Park
M5G Central Bay Street
M5H Adelaide, King, Richmond
M5J Harbourfront East, Toronto Islands, Union Station
M5K Design Exchange, Toronto Dominion Centre
M5L Commerce Court, Victoria Hotel
M5S Harbord, University of Toronto
M5T Chinatown, Grange Park, Kensington Market
M5V CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
M5W Stn A PO Boxes 25 The Esplanade
M5X First Canadian Place, Underground city
M6G Christie
M4E The Beaches
M4K The Danforth West, Riverdale
M4L The Beaches West, India Bazaar
M4M Studio

In [33]:
print(toronto_venues.shape)
toronto_venues.head()

(1521, 8)


Unnamed: 0,Postcode,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4N,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,M4N,Lawrence Park,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
2,M4N,Lawrence Park,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
3,M4P,Davisville North,43.712751,-79.390197,Sherwood Park,43.716551,-79.387776,Park
4,M4P,Davisville North,43.712751,-79.390197,Summerhill Market North,43.715499,-79.392881,Food & Drink Shop


In [34]:
# One-hot encode the venue category: one indicator column per category, named
# after the category itself (empty prefix and separator).
venue_categories = toronto_venues[['Venue Category']]
toronto_onehot = pd.get_dummies(venue_categories, prefix="", prefix_sep="")
toronto_onehot.head()

Unnamed: 0,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Put the Postcode and Neighborhood labels in front of the indicator columns.
# Any existing column with one of these names is dropped first (presumably a
# venue category that is itself called 'Neighborhood' - verify against the data).
# errors='ignore' keeps the cell from raising KeyError when no such column
# exists, and makes it safe to re-run after the labels have been inserted.
toronto_onehot = toronto_onehot.drop(columns=['Postcode', 'Neighborhood'], errors='ignore')
toronto_onehot.insert(loc=0, column='Postcode', value=toronto_venues['Postcode'])
toronto_onehot.insert(loc=1, column='Neighborhood', value=toronto_venues['Neighborhood'])
toronto_onehot.head()

Unnamed: 0,Postcode,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M4N,Lawrence Park,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M4N,Lawrence Park,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4N,Lawrence Park,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4P,Davisville North,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4P,Davisville North,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### now to add Neighborhood and Postcode columns:

In [36]:
# Average the indicator columns per Postcode, giving the relative frequency of
# each venue category. The string 'Neighborhood' column is dropped first:
# pandas >= 2.0 raises a TypeError when asked for the mean of a text column.
toronto_grouped = (
    toronto_onehot
    .drop(columns=['Neighborhood'], errors='ignore')
    .groupby('Postcode')
    .mean()
    .reset_index()
)
toronto_grouped.head()

Unnamed: 0,Postcode,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.02381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02381
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### we group by Postcode since we have the corresponding coordinates for the postal codes, not for the neighborhoods:

In [37]:
toronto_grouped.shape

(32, 225)

In [38]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of `row`, in descending order.

    The first element of `row` is skipped (the caller passes rows whose first
    field is the Postcode). The remaining values are sorted from largest to
    smallest and the index labels of the first `num_top_venues` are returned
    as a NumPy array.
    """
    frequencies = row.iloc[1:]
    ranked_labels = frequencies.sort_values(ascending=False).index.values
    return ranked_labels[:num_top_venues]

#### we define a function that returns most common venues and use it to get top 10 for each Postcode:

In [39]:
import numpy as np 

In [40]:
num_top_venues = 10

# ordinal suffixes for ranks 1-3; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']

# create one column per rank: '1st Most Common Venue', '2nd ...', ..., '10th ...'
columns = ['Postcode']
for ind in range(num_top_venues):
    # Explicit bounds check instead of a bare `except:`, which would also hide
    # unrelated errors (even KeyboardInterrupt).
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per Postcode
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Postcode'] = toronto_grouped['Postcode']

# fill each row with that Postcode's most frequent venue categories
for ind in range(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Health Food Store,Pub,Other Great Outdoors,Trail,Dessert Shop,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
1,M4K,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Furniture / Home Store,Bookstore,Brewery,Bubble Tea Shop,Restaurant,Café
2,M4L,Park,Ice Cream Shop,Pet Store,Pizza Place,Pub,Movie Theater,Burrito Place,Burger Joint,Sandwich Place,Brewery
3,M4M,Café,Coffee Shop,Gastropub,Italian Restaurant,Bakery,American Restaurant,Yoga Studio,Comfort Food Restaurant,Brewery,Seafood Restaurant
4,M4N,Bus Line,Park,Swim School,Diner,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant


In [41]:
from sklearn.cluster import KMeans

#### and run k-means clustering

In [42]:
# set number of clusters
kclusters = 3
# Cluster on the venue-frequency columns only; the Postcode label is not a feature.
# `columns=` replaces the positional axis argument (`drop('Postcode', 1)`),
# which pandas 2.0 no longer accepts.
toronto_grouped_clustering = toronto_grouped.drop(columns='Postcode')
# n_init is set explicitly because its default differs between scikit-learn
# versions; 10 is the long-standing previous default.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

In [43]:
# add clustering labels; drop any labels left over from an earlier run of this
# cell first, because insert() raises ValueError on a duplicate column name
neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# merge neighborhoods_venues_sorted with df_Toronto to add Postcode/Borough/Neighborhood/latitude/longitude for each Postcode
# (join keeps every row of df_Toronto, matching on the 'Postcode' value)
toronto_merged = df_Toronto.join(neighborhoods_venues_sorted.set_index('Postcode'), on='Postcode')

toronto_merged # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Bus Line,Park,Swim School,Diner,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197,0,Hotel,Park,Food & Drink Shop,Clothing Store,Breakfast Spot,Sandwich Place,Playground,Grocery Store,Gym,Eastern European Restaurant
2,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,0,Coffee Shop,Bagel Shop,Fast Food Restaurant,Mexican Restaurant,Diner,Dessert Shop,Park,Gym / Fitness Center,Clothing Store,Chinese Restaurant
3,M4S,Central Toronto,Davisville,43.704324,-79.38879,0,Pizza Place,Sandwich Place,Dessert Shop,Sushi Restaurant,Restaurant,Italian Restaurant,Café,Coffee Shop,Thai Restaurant,Park
4,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,1,Playground,Tennis Court,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
5,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049,0,Coffee Shop,Pub,Pizza Place,American Restaurant,Light Rail Station,Sports Bar,Supermarket,Sushi Restaurant,Bagel Shop,Fried Chicken Joint
6,M5N,Central Toronto,Roselawn,43.711695,-79.416936,2,Ice Cream Shop,Garden,Yoga Studio,Diner,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
7,M5P,Central Toronto,"Forest Hill North, Forest Hill West",43.696948,-79.411307,0,Park,Trail,Sushi Restaurant,Jewelry Store,Yoga Studio,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
8,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678,0,Coffee Shop,Sandwich Place,Café,Pizza Place,BBQ Joint,Indian Restaurant,Jewish Restaurant,Pub,Burger Joint,American Restaurant
9,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,0,Park,Playground,Trail,Building,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant


In [44]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

#### pin down everything on the map

In [45]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# one distinct colour per cluster, sampled evenly from the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    # The join that built toronto_merged keeps postcodes that have no cluster
    # label; they carry NaN, which int() cannot convert, so they are skipped.
    if pd.isna(cluster):
        continue
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels start at 0, so index the palette directly; the previous
    # `cluster - 1` only worked because index -1 wraps to the last colour.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [46]:
# Rows of cluster 0, showing the Borough plus everything from 'Cluster Labels' onwards.
cluster_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, cluster_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,0,Bus Line,Park,Swim School,Diner,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
1,Central Toronto,0,Hotel,Park,Food & Drink Shop,Clothing Store,Breakfast Spot,Sandwich Place,Playground,Grocery Store,Gym,Eastern European Restaurant
2,Central Toronto,0,Coffee Shop,Bagel Shop,Fast Food Restaurant,Mexican Restaurant,Diner,Dessert Shop,Park,Gym / Fitness Center,Clothing Store,Chinese Restaurant
3,Central Toronto,0,Pizza Place,Sandwich Place,Dessert Shop,Sushi Restaurant,Restaurant,Italian Restaurant,Café,Coffee Shop,Thai Restaurant,Park
5,Central Toronto,0,Coffee Shop,Pub,Pizza Place,American Restaurant,Light Rail Station,Sports Bar,Supermarket,Sushi Restaurant,Bagel Shop,Fried Chicken Joint
7,Central Toronto,0,Park,Trail,Sushi Restaurant,Jewelry Store,Yoga Studio,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
8,Central Toronto,0,Coffee Shop,Sandwich Place,Café,Pizza Place,BBQ Joint,Indian Restaurant,Jewish Restaurant,Pub,Burger Joint,American Restaurant
9,Downtown Toronto,0,Park,Playground,Trail,Building,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
10,Downtown Toronto,0,Coffee Shop,Restaurant,Pub,Italian Restaurant,Pizza Place,Bakery,Park,Café,Caribbean Restaurant,Farmers Market
11,Downtown Toronto,0,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant,Gay Bar,Mediterranean Restaurant,Café,Hotel,Gym,Pub


#### analyse the clusters:

In [47]:
# Rows of cluster 1, showing the Borough plus everything from 'Cluster Labels' onwards.
cluster_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, cluster_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Central Toronto,1,Playground,Tennis Court,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


In [48]:
# Rows of cluster 2, showing the Borough plus everything from 'Cluster Labels' onwards.
cluster_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, cluster_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Central Toronto,2,Ice Cream Shop,Garden,Yoga Studio,Diner,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant


#### we can conclude that there is 1 main cluster, made up of the areas where Coffee Shops and Parks are the most popular venues, and 2 other outliers