In [4]:
#import libraries
import numpy as np
import pandas as pd
import json

from pandas.io.json import json_normalize
from sklearn.cluster import KMeans 

! conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 

! conda install -c conda-forge folium=0.5.0 --yes
import folium

%matplotlib inline
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.colors as colors

!conda install -c conda beautifulsoup4 --yes
from bs4 import BeautifulSoup
import requests

print("Libraries imported!") 

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          92 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-1.21.0-py_0



Downloading and Extracting Packages
geopy-1.21.0         | 58 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB     | ##################################### |

In [5]:
# retrieve data
!wget -q -O 'toronto_data.json' https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M  
print("Dataset downloaded!")

Dataset downloaded!


In [124]:
#scrape and transform data using BeautifulSoup
website_url = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text

soup = BeautifulSoup(website_url, 'html')
wikitable = soup.find("table")

wikitable.prettify()
  
A= []
B= []
C= []

for row in wikitable.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

In [132]:
#create dataframe
df= pd.DataFrame(A, columns=['PostalCode'])
df['Borough']= B
df['Neighborhood']= C
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n


In [157]:
#cleanup dataframe per instructions

    #remove "\n" from each cell 
df.replace(to_replace="\n", value="", inplace=True, regex=True)

    #remove rows with unassigned boroughs
Notassigned = df[df['Borough'] == 'Not assigned'].index
df.drop(Notassigned, inplace=True)

    #reset index after removing rows
df.reset_index(drop=True, inplace=True)

    #replacing the slashes with commas
df["Neighborhood"] = df["Neighborhood"].str.replace(" /", ",")

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [158]:
#ensuring that there are no duplicates in the PostalCode column
print("*THIS IS THE RESULT FOR QUESTION 1*")
print("Unique Postal Codes:", df['PostalCode'].nunique())

#printing dataframe shape per instructions 
print("Dataframe Shape:", df.shape)

*THIS IS THE RESULT FOR QUESTION 1*
Unique Postal Codes: 103
Dataframe Shape: (103, 3)


In [159]:
#retrive coordinate data (because geocords is not working)
!wget -q -O 'geocords.csv' http://cocl.us/Geospatial_data
print("Website downloaded!")

df_geocords= pd.read_csv('geocords.csv')
df_geocords.head()

Website downloaded!


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [160]:
#reorder rows of coordinate data by the order of Postal Codes in intital dataframe
df_geocords = df_geocords.set_index('Postal Code')
df_geocords = df_geocords.reindex(index=df['PostalCode'])
df_geocords = df_geocords.reset_index()
df_geocords.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M3A,43.753259,-79.329656
1,M4A,43.725882,-79.315572
2,M5A,43.65426,-79.360636
3,M6A,43.718518,-79.464763
4,M7A,43.662301,-79.389494


In [161]:
#drop PostalCode column and merge both dataframes to reflect instructions 
df_geocords.drop(columns='PostalCode')
df= pd.merge(df, df_geocords, how='left')

print("*THIS IS THE RESULT FOR QUESTION 2*")
df.head()

*THIS IS THE RESULT FOR QUESTION 2*


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [162]:
#list of boroughs
print("Number of Boroughs:", df['Borough'].nunique())
print("Borough Names:", df['Borough'].unique())

Number of Boroughs: 10
Borough Names: ['North York' 'Downtown Toronto' 'Etobicoke' 'Scarborough' 'East York'
 'York' 'East Toronto' 'West Toronto' 'Central Toronto' 'Mississauga']


### As suggested by the instructions, this submission explores neighborhoods that are catalogued within Postal Codes which are located in boroughs containing "Toronto". 
#### This includes "Downtown Toronto", "East Toronto", "West Toronto", and "Central Toronto" 

In [163]:
#filter dataframe for toronto neighborhoods
toronto_df= df[df['Borough'].str.contains('Toronto')].reset_index(drop=True)

print("Number of location points:", toronto_df['PostalCode'].count())
print(toronto_df['Borough'].value_counts())
toronto_df.head()

Number of location points: 39
Downtown Toronto    19
Central Toronto      9
West Toronto         6
East Toronto         5
Name: Borough, dtype: int64


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [164]:
#visualize the 39 location points on a map

address = 'Toronto, ON'
geolocator = Nominatim(user_agent="toronto_explorer")

location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

print("Toronto coordinates:", latitude, longitude)

Toronto coordinates: 43.6534817 -79.3839347


In [165]:
# create map of toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  

map_toronto

In [187]:
#foursquare API info
CLIENT_ID = 'G2IDKV2T43SF5MNWBLGYK3QQWF4ZDPZ2MD1V03OMSGFP4PNS' 
CLIENT_SECRET = '3VWLALVLRI5GFX4CQ5HBNO42MOGVVCGQZERC1QOV44G5M4QB' 
VERSION = '20180605'

#let's explore 50 venues within 500 meters of Downtown Toronto's "St. James Town" neighborhood (loc. 3 in toronto_df)
LIMIT= 50
radius = 500
neighborhood_lat = toronto_df.loc[3, 'Latitude'] 
neighborhood_lng = toronto_df.loc[3, 'Longitude'] 

url1= 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    neighborhood_lat, 
    neighborhood_lng, 
    VERSION, 
    radius, 
    LIMIT)

results1 = requests.get(url1).json()

In [167]:
#using lab function for extracting category type
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

#set up dataframe    
stjames_venues = results1['response']['groups'][0]['items']
stjames_venues = pd.json_normalize(stjames_venues)

#filter dataframe columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
stjames_venues= stjames_venues.loc[:, filtered_columns]

# filter the category for each row using the above lab function
stjames_venues['venue.categories'] = stjames_venues.apply(get_category_type, axis=1)

# cleanup column names
stjames_venues.columns = [col.split(".")[-1] for col in stjames_venues.columns]

#start index at 1 to quickly understand the 50 venues (optional)
stjames_venues.index+=1

stjames_venues.head()

Unnamed: 0,name,categories,lat,lng
1,Gyu-Kaku Japanese BBQ,Japanese Restaurant,43.651422,-79.375047
2,GoodLife Fitness Toronto 137 Yonge Street,Gym,43.651242,-79.378068
3,GEORGE Restaurant,Restaurant,43.653346,-79.374445
4,Fahrenheit Coffee,Coffee Shop,43.652384,-79.372719
5,Aveda Institute Toronto,Cosmetics Shop,43.650096,-79.37363


In [168]:
#instead of showing the whole list, let's see the aggregated category types in our results 
stjames_venues['categories'].value_counts()

Café                             5
Coffee Shop                      3
Beer Bar                         2
Farmers Market                   2
Japanese Restaurant              2
Cocktail Bar                     2
Park                             2
Hotel                            2
Gastropub                        2
Restaurant                       2
Cosmetics Shop                   2
Art Gallery                      1
Tailor Shop                      1
Vegetarian / Vegan Restaurant    1
Creperie                         1
Fountain                         1
American Restaurant              1
Bookstore                        1
Middle Eastern Restaurant        1
Thai Restaurant                  1
Hostel                           1
Diner                            1
Bakery                           1
Seafood Restaurant               1
Ice Cream Shop                   1
Department Store                 1
Cheese Shop                      1
Gym                              1
Breakfast Spot      

In [169]:
#now let's do the same for all 39 location points
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        url2 = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        results = requests.get(url2).json()["response"]['groups'][0]['items']
        
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues2 = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues2.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude',  
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues2)

toronto_venues = getNearbyVenues(names=toronto_df['Neighborhood'],
                                   latitudes=toronto_df['Latitude'],
                                   longitudes=toronto_df['Longitude']
                                  )

print("Number of venues:", toronto_venues.shape)
toronto_venues.head()

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
High Park, The Junction South
North Toronto West
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst
Rosedale
Stn A PO Boxes
St. James Town, Cabbagetown
First Canadian Place, U

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


In [171]:
print("Number of unique venue categories:", toronto_venues['Venue Category'].nunique())
toronto_venues.groupby('Neighborhood').count()

Number of unique venue categories: 209


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,50,50,50,50,50,50
"Brockton, Parkdale Village, Exhibition Place",23,23,23,23,23,23
Business reply mail Processing CentrE,16,16,16,16,16,16
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst",14,14,14,14,14,14
Central Bay Street,50,50,50,50,50,50
Christie,18,18,18,18,18,18
Church and Wellesley,50,50,50,50,50,50
"Commerce Court, Victoria Hotel",50,50,50,50,50,50
Davisville,34,34,34,34,34,34
Davisville North,7,7,7,7,7,7


In [172]:
# let's analyze each neighborhood by one-hot encoding by the 209 unique venue categories
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe and to first column
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [173]:
#grouped by neighborhood with mean values for each category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.columns = [col.split("Venue Category_")[-1] for col in toronto_grouped.columns]

print(toronto_grouped.shape)
toronto_grouped.head()

(39, 209)


Unnamed: 0,Neighborhood,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Business reply mail Processing CentrE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.071429,0.071429,0.071429,0.142857,0.142857,0.142857,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.0


In [174]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print(hood)
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

Berczy Park
                venue  freq
0         Coffee Shop  0.06
1              Bakery  0.04
2  Seafood Restaurant  0.04
3         Cheese Shop  0.04
4                Café  0.04


Brockton, Parkdale Village, Exhibition Place
                venue  freq
0                Café  0.13
1         Coffee Shop  0.09
2      Breakfast Spot  0.09
3                 Gym  0.04
4  Italian Restaurant  0.04


Business reply mail Processing CentrE
           venue  freq
0    Pizza Place  0.06
1  Auto Workshop  0.06
2     Restaurant  0.06
3        Butcher  0.06
4  Burrito Place  0.06


CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst
                venue  freq
0      Airport Lounge  0.14
1     Airport Service  0.14
2    Airport Terminal  0.14
3            Boutique  0.07
4  Airport Food Court  0.07


Central Bay Street
                venue  freq
0         Coffee Shop  0.18
1  Italian Restaurant  0.06
2                 Spa  0.04
3        Burger Joint  0.04
4                Café  0.

In [175]:
#transform above into dataframe
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.index+=1
neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Berczy Park,Coffee Shop,Bakery,Cheese Shop,Café,Farmers Market,Beer Bar,Seafood Restaurant,Cocktail Bar,Restaurant,Breakfast Spot
2,"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Coffee Shop,Grocery Store,Stadium,Bar,Restaurant,Bakery,Burrito Place,Climbing Gym
3,Business reply mail Processing CentrE,Park,Auto Workshop,Fast Food Restaurant,Farmers Market,Light Rail Station,Comic Shop,Garden Center,Pizza Place,Butcher,Restaurant
4,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Terminal,Airport Lounge,Coffee Shop,Boutique,Boat or Ferry,Harbor / Marina,Airport Gate,Airport Food Court,Airport
5,Central Bay Street,Coffee Shop,Italian Restaurant,Bubble Tea Shop,Burger Joint,Spa,Café,Sandwich Place,Yoga Studio,Department Store,Chinese Restaurant
6,Christie,Grocery Store,Café,Park,Athletics & Sports,Nightclub,Italian Restaurant,Diner,Restaurant,Baby Store,Candy Store
7,Church and Wellesley,Gay Bar,Coffee Shop,Burger Joint,Gastropub,Yoga Studio,Men's Store,Restaurant,Salon / Barbershop,Sake Bar,Dance Studio
8,"Commerce Court, Victoria Hotel",Café,Coffee Shop,Restaurant,Seafood Restaurant,Hotel,Gastropub,Beer Bar,Japanese Restaurant,Gym,American Restaurant
9,Davisville,Sandwich Place,Dessert Shop,Pizza Place,Gym,Café,Italian Restaurant,Coffee Shop,Sushi Restaurant,Gas Station,Diner
10,Davisville North,Park,Gym,Breakfast Spot,Sandwich Place,Hotel,Department Store,Food & Drink Shop,Concert Hall,Comfort Food Restaurant,Dog Run


In [176]:
# set number of clusters
ks= 6

toronto_grouped_clusters= toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=ks, random_state=0).fit(toronto_grouped_clusters)

# check cluster labels generated for each row in the dataframe
kmeans.labels_


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 3, 0,
       0, 0, 0, 0, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [177]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# merge neighborhood_venues_sorted with toronto_df (that's the original df) to add latitude/longitude for each neighborhood
toronto_merged = toronto_df
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

#(you may need to run the below becuase I made a small error when adding clustering labels if the code is run more than once)
#toronto_merged.drop(columns='Cluster Label', inplace=True)

toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Pub,Bakery,Park,Restaurant,Breakfast Spot,Theater,Café,Mexican Restaurant,Farmers Market
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Diner,Mexican Restaurant,Beer Bar,Sandwich Place,Burger Joint,Burrito Place,Café,Park,College Auditorium
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Coffee Shop,Café,Ramen Restaurant,Cosmetics Shop,Italian Restaurant,Restaurant,Bookstore,Theater,Clothing Store,Japanese Restaurant
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Café,Coffee Shop,Japanese Restaurant,Cosmetics Shop,Restaurant,Park,Farmers Market,Gastropub,Hotel,Cocktail Bar
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Trail,Pub,Cuban Restaurant,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop


In [178]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters (color scheme and markers follows lab)
x = np.arange(ks)
ys = [i + x + (i*x)**2 for i in range(ks)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## The map and cluster dataframes below are the responses to question 3
### Because Cluster 0 contains 34 out of the 39, this result seems skewed upon further inspection. This is because the algorithm does not recognize the different food outlets as subsets of "restaurants", etc. 
#### In the least, a cursory review of the five clusters reveals that clusters 1-5 are all mostly located in Central Toronto (the only one that is located in Downtown Toronto is right on the border with Central Toronto) and do not contain any food outlets in the TOP 3 VENUES (wine shop not counting as food). These neighborhoods are more visited for outside attractions such as parks, trails, gardens, and playgrounds, rather than food outlets. More analysis should be done to re-classify the foursquare venue categories, depending on what we optimizing for. However, that's "outside of the scope of this assignment!"

In [179]:
ck= toronto_merged.groupby('Cluster Labels')

### Cluster 1

In [180]:
ck.get_group(1)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
21,M5P,Central Toronto,Forest Hill North & West,43.696948,-79.411307,1,Park,Bus Line,Jewelry Store,Trail,Sushi Restaurant,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner


### Cluster 2

In [181]:
ck.get_group(2)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,M5N,Central Toronto,Roselawn,43.711695,-79.416936,2,Garden,Home Service,Wine Shop,Cupcake Shop,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner


### Cluster 3

In [182]:
ck.get_group(3)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
29,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,3,Trail,Playground,Tennis Court,Wine Shop,Cuban Restaurant,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner


### Cluster 4

In [183]:
ck.get_group(4)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,4,Park,Swim School,Bus Line,Cupcake Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop


### Cluster 5

In [185]:
ck.get_group(5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
33,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,5,Park,Trail,Playground,Creperie,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop,Department Store


### Cluster 0

In [186]:
ck.get_group(0)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Pub,Bakery,Park,Restaurant,Breakfast Spot,Theater,Café,Mexican Restaurant,Farmers Market
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Diner,Mexican Restaurant,Beer Bar,Sandwich Place,Burger Joint,Burrito Place,Café,Park,College Auditorium
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Coffee Shop,Café,Ramen Restaurant,Cosmetics Shop,Italian Restaurant,Restaurant,Bookstore,Theater,Clothing Store,Japanese Restaurant
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Café,Coffee Shop,Japanese Restaurant,Cosmetics Shop,Restaurant,Park,Farmers Market,Gastropub,Hotel,Cocktail Bar
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Trail,Pub,Cuban Restaurant,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0,Coffee Shop,Bakery,Cheese Shop,Café,Farmers Market,Beer Bar,Seafood Restaurant,Cocktail Bar,Restaurant,Breakfast Spot
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,0,Coffee Shop,Italian Restaurant,Bubble Tea Shop,Burger Joint,Spa,Café,Sandwich Place,Yoga Studio,Department Store,Chinese Restaurant
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564,0,Grocery Store,Café,Park,Athletics & Sports,Nightclub,Italian Restaurant,Diner,Restaurant,Baby Store,Candy Store
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568,0,Coffee Shop,Café,Pizza Place,Bar,Steakhouse,Restaurant,American Restaurant,Asian Restaurant,New American Restaurant,Seafood Restaurant
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259,0,Bakery,Pharmacy,Supermarket,Recording Studio,Bank,Bar,Café,Gym / Fitness Center,Grocery Store,Middle Eastern Restaurant
