**Explore and cluster the neighborhoods in Toronto. I only work with boroughs that contain the word Toronto**

In [21]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np

To obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe

In [22]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html = requests.get(url).content
soup = BeautifulSoup(html,'lxml')
table = soup.find_all('table')[0] 
df = pd.read_html(str(table), header=0)[0]
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [23]:
df.rename(columns={'Neighbourhood':'Neighborhood'}, inplace = True)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Ignore cells with a borough that is Not assigned

In [24]:
df.replace("Not assigned", np.nan, inplace=True)
df.dropna(subset=['Borough'], axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


Rows will be combined into one row with the neighborhoods separated with a comma

In [25]:
df1 = df.groupby(['Postcode','Borough'])['Neighborhood'].apply(lambda x: ','.join(x.astype(str))).reset_index()
df1.head(20)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


Not assigned neighborhood

In [26]:
df1.tail(20)

Unnamed: 0,Postcode,Borough,Neighborhood
83,M6R,West Toronto,"Parkdale,Roncesvalles"
84,M6S,West Toronto,"Runnymede,Swansea"
85,M7A,Queen's Park,
86,M7R,Mississauga,Canada Post Gateway Processing Centre
87,M7Y,East Toronto,Business reply mail Processing Centre969 Eastern
88,M8V,Etobicoke,"Humber Bay Shores,Mimico South,New Toronto"
89,M8W,Etobicoke,"Alderwood,Long Branch"
90,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
91,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout..."
92,M8Z,Etobicoke,"Kingsway Park South West,Mimico NW,The Queensw..."


Neighborhood will be the same as the borough

In [27]:
df1['Neighborhood'].replace("nan", df1['Borough'], inplace=True)
df1.tail(20)

Unnamed: 0,Postcode,Borough,Neighborhood
83,M6R,West Toronto,"Parkdale,Roncesvalles"
84,M6S,West Toronto,"Runnymede,Swansea"
85,M7A,Queen's Park,Queen's Park
86,M7R,Mississauga,Canada Post Gateway Processing Centre
87,M7Y,East Toronto,Business reply mail Processing Centre969 Eastern
88,M8V,Etobicoke,"Humber Bay Shores,Mimico South,New Toronto"
89,M8W,Etobicoke,"Alderwood,Long Branch"
90,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
91,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout..."
92,M8Z,Etobicoke,"Kingsway Park South West,Mimico NW,The Queensw..."


We add the geographical coordinates of each postal code

In [28]:
url1 = 'http://cocl.us/Geospatial_data'
df2 = pd.read_csv(url1)
df2

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [29]:
df1[['Latitude', 'Longitude']] = df2[['Latitude', 'Longitude']]
df1.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


**We select Toronto**

In [30]:
df2 = df1[df1['Borough'].str.contains("Toronto")].reset_index(drop=True)
df2.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049


In [31]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done

# All requested packages already installed.



In [32]:
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

**Create a map of Toronto with neighborhoods superimposed on top**

I take the mean of Latitude and the mean of Longitude to create the map

In [33]:
latitude = df2['Latitude'].mean()
longitude = df2['Longitude'].mean()
map = folium.Map(location=[latitude, longitude], zoom_start=10)
map

Neighborhoods superimposed on top

In [34]:
for lat, lon, borough, neighborhood in zip(df2['Latitude'], df2['Longitude'], df2['Borough'], df2['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='yellow',
        fill_opacity=0.5,
        parse_html=False).add_to(map)  
    
map


Let's slice Toronto dataframe and create a new dataframe of the Central Toronto.

In [42]:
df3 = df2[df2['Borough'] == 'Central Toronto'].reset_index(drop=True)
df3

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
2,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
3,M4S,Central Toronto,Davisville,43.704324,-79.38879
4,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316
5,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049
6,M5N,Central Toronto,Roselawn,43.711695,-79.416936
7,M5P,Central Toronto,"Forest Hill North,Forest Hill West",43.696948,-79.411307
8,M5R,Central Toronto,"The Annex,North Midtown,Yorkville",43.67271,-79.405678


I take the mean of Latitude and the mean of Longitude to create the map

In [43]:
latitude1 = df3['Latitude'].mean()
longitude1 = df3['Longitude'].mean()
map1 = folium.Map(location=[latitude1, longitude1], zoom_start=10)
map1

In [44]:
# add markers to map
for lat, lon, label in zip(df3['Latitude'], df3['Longitude'], df3['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='red',
        fill_opacity=0.5,
        parse_html=False).add_to(map1)  
    
map1


Define Foursquare Credentials and Version

In [20]:
CLIENT_ID = '2SRL1KRLKMDOPB53MA30O0BCKNEZFBPHNTAAKKYJD1RCI0IH' # your Foursquare ID
CLIENT_SECRET = '4XFUL1KQVXEFRO0V04XKXYUKKYBCAGIWWJQ1G4Y5C30EGNMH' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

neighborhood_latitude = df3.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df3.loc[0, 'Longitude'] # neighborhood longitude value
neighborhood_name = df3.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Lawrence Park are 43.7280205, -79.3887901.


In [None]:
Now, let's get the top venues that are in Lawrence Park within a radius of 500 meters.

In [45]:
radius = 500
LIMIT = 100

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=2SRL1KRLKMDOPB53MA30O0BCKNEZFBPHNTAAKKYJD1RCI0IH&client_secret=4XFUL1KQVXEFRO0V04XKXYUKKYBCAGIWWJQ1G4Y5C30EGNMH&v=20180605&ll=43.7280205,-79.3887901&radius=500&limit=100'

In [46]:
results = requests.get(url).json()

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']


Now we are ready to clean the json and structure it into a pandas dataframe.

In [48]:
from pandas.io.json import json_normalize

In [52]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()


Unnamed: 0,name,categories,lat,lng
0,Lawrence Park Ravine,Park,43.726963,-79.394382
1,Rue Pigalle HC,Jewelry Store,43.727183,-79.387874
2,Dim Sum Deluxe,Dim Sum Restaurant,43.726953,-79.39426
3,Zodiac Swim School,Swim School,43.728532,-79.38286
4,TTC Bus #162 - Lawrence-Donway,Bus Line,43.728026,-79.382805


In [50]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

5 venues were returned by Foursquare.


Explore Neighborhoods in Central Toronto

In [53]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)


Now write the code to run the above function on each neighborhood and create a new dataframe. First, let's separate the Neighborhood

In [61]:
def splitDataFrameList(df,target_column,separator):
    def splitListToRows(row,row_accumulator,target_column,separator):
        split_row = row[target_column].split(separator)
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows,axis=1,args = (new_rows,target_column,separator))
    new_df = pd.DataFrame(new_rows)
    return new_df


In [62]:
df4=splitDataFrameList(df3,'Neighborhood',',')
df4

Unnamed: 0,Borough,Latitude,Longitude,Neighborhood,Postcode
0,Central Toronto,43.72802,-79.38879,Lawrence Park,M4N
1,Central Toronto,43.712751,-79.390197,Davisville North,M4P
2,Central Toronto,43.715383,-79.405678,North Toronto West,M4R
3,Central Toronto,43.704324,-79.38879,Davisville,M4S
4,Central Toronto,43.689574,-79.38316,Moore Park,M4T
5,Central Toronto,43.689574,-79.38316,Summerhill East,M4T
6,Central Toronto,43.686412,-79.400049,Deer Park,M4V
7,Central Toronto,43.686412,-79.400049,Forest Hill SE,M4V
8,Central Toronto,43.686412,-79.400049,Rathnelly,M4V
9,Central Toronto,43.686412,-79.400049,South Hill,M4V


In [63]:
CentralToronto_venues = getNearbyVenues(names=df4['Neighborhood'],
                                   latitudes=df4['Latitude'],
                                   longitudes=df4['Longitude']
                                  )

Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park
Summerhill East
Deer Park
Forest Hill SE
Rathnelly
South Hill
Summerhill West
Roselawn
Forest Hill North
Forest Hill West
The Annex
North Midtown
Yorkville


In [64]:
CentralToronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Davisville,38,38,38,38,38,38
Davisville North,7,7,7,7,7,7
Deer Park,14,14,14,14,14,14
Forest Hill North,4,4,4,4,4,4
Forest Hill SE,14,14,14,14,14,14
Forest Hill West,4,4,4,4,4,4
Lawrence Park,5,5,5,5,5,5
Moore Park,3,3,3,3,3,3
North Midtown,27,27,27,27,27,27
North Toronto West,20,20,20,20,20,20


In [65]:
print('There are {} uniques categories.'.format(len(CentralToronto_venues['Venue Category'].unique())))

There are 64 uniques categories.


Analyze Each Neighborhood

In [66]:
# one hot encoding
CentralToronto_onehot = pd.get_dummies(CentralToronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
CentralToronto_onehot['Neighborhood'] = CentralToronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [CentralToronto_onehot.columns[-1]] + list(CentralToronto_onehot.columns[:-1])
CentralToronto_onehot = CentralToronto_onehot[fixed_columns]

CentralToronto_onehot.head()


Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Bagel Shop,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,...,Sports Bar,Supermarket,Sushi Restaurant,Swim School,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,Lawrence Park,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [69]:
CentralToronto_grouped = CentralToronto_onehot.groupby('Neighborhood').mean().reset_index()
CentralToronto_grouped

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Bagel Shop,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,...,Sports Bar,Supermarket,Sushi Restaurant,Swim School,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Davisville,0.0,0.0,0.0,0.0,0.026316,0.026316,0.0,0.052632,0.0,...,0.0,0.0,0.052632,0.0,0.052632,0.026316,0.0,0.0,0.0,0.0
1,Davisville North,0.0,0.0,0.0,0.142857,0.0,0.142857,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Deer Park,0.071429,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,...,0.071429,0.071429,0.071429,0.0,0.0,0.0,0.0,0.0,0.071429,0.0
3,Forest Hill North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0
4,Forest Hill SE,0.071429,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,...,0.071429,0.071429,0.071429,0.0,0.0,0.0,0.0,0.0,0.071429,0.0
5,Forest Hill West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0
6,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,...,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0
7,Moore Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,North Midtown,0.037037,0.037037,0.0,0.0,0.0,0.037037,0.0,0.111111,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037037,0.0,0.0
9,North Toronto West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05


Let's print each neighborhood along with the top 5 most common venues

In [70]:
num_top_venues = 5

for hood in CentralToronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = CentralToronto_grouped[CentralToronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')



----Davisville----
              venue  freq
0      Dessert Shop  0.08
1    Sandwich Place  0.08
2   Thai Restaurant  0.05
3              Café  0.05
4  Sushi Restaurant  0.05


----Davisville North----
               venue  freq
0     Clothing Store  0.14
1       Burger Joint  0.14
2  Food & Drink Shop  0.14
3               Park  0.14
4              Hotel  0.14


----Deer Park----
                 venue  freq
0          Coffee Shop  0.14
1                  Pub  0.14
2           Bagel Shop  0.07
3     Sushi Restaurant  0.07
4  Fried Chicken Joint  0.07


----Forest Hill North----
                venue  freq
0    Sushi Restaurant  0.25
1               Trail  0.25
2       Jewelry Store  0.25
3                Park  0.25
4  Mexican Restaurant  0.00


----Forest Hill SE----
                 venue  freq
0          Coffee Shop  0.14
1                  Pub  0.14
2           Bagel Shop  0.07
3     Sushi Restaurant  0.07
4  Fried Chicken Joint  0.07


----Forest Hill West----
                venu

Let's put that into a pandas dataframe

In [72]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]


In [73]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = CentralToronto_grouped['Neighborhood']

for ind in np.arange(CentralToronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(CentralToronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Sandwich Place,Dessert Shop,Pizza Place,Seafood Restaurant,Coffee Shop,Thai Restaurant,Restaurant,Sushi Restaurant,Italian Restaurant,Café
1,Davisville North,Hotel,Breakfast Spot,Park,Burger Joint,Sandwich Place,Food & Drink Shop,Clothing Store,Yoga Studio,French Restaurant,Fast Food Restaurant
2,Deer Park,Pub,Coffee Shop,American Restaurant,Supermarket,Light Rail Station,Convenience Store,Sports Bar,Pizza Place,Sushi Restaurant,Fried Chicken Joint
3,Forest Hill North,Jewelry Store,Trail,Park,Sushi Restaurant,French Restaurant,Diner,Farmers Market,Fast Food Restaurant,Flower Shop,Food & Drink Shop
4,Forest Hill SE,Pub,Coffee Shop,American Restaurant,Supermarket,Light Rail Station,Convenience Store,Sports Bar,Pizza Place,Sushi Restaurant,Fried Chicken Joint
5,Forest Hill West,Jewelry Store,Trail,Park,Sushi Restaurant,French Restaurant,Diner,Farmers Market,Fast Food Restaurant,Flower Shop,Food & Drink Shop
6,Lawrence Park,Bus Line,Park,Swim School,Jewelry Store,Dim Sum Restaurant,French Restaurant,Farmers Market,Fast Food Restaurant,Flower Shop,Food & Drink Shop
7,Moore Park,Intersection,Park,Playground,Yoga Studio,Dim Sum Restaurant,Gym,Greek Restaurant,Gourmet Shop,Garden,Fried Chicken Joint
8,North Midtown,Sandwich Place,Café,Coffee Shop,Pizza Place,History Museum,Pharmacy,Park,Liquor Store,Jewish Restaurant,Indian Restaurant
9,North Toronto West,Clothing Store,Sporting Goods Shop,Coffee Shop,Yoga Studio,Fast Food Restaurant,Diner,Mexican Restaurant,Park,Dessert Shop,Rental Car Location


Cluster Neighborhoods  
4 clusters: East, West, Central, Downtown

In [76]:
kclusters = 4
CentralToronto_grouped_clustering = CentralToronto_grouped.drop('Neighborhood', 1)
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(CentralToronto_grouped_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:80]

array([1, 1, 1, 2, 1, 2, 2, 3, 1, 1, 1, 0, 1, 3, 1, 1, 1], dtype=int32)

In [79]:
CentralToronto_merged = df4
# add clustering labels
CentralToronto_merged['Cluster Labels'] = kmeans.labels_
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
CentralToronto_merged = CentralToronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
CentralToronto_merged.head() 

Unnamed: 0,Borough,Latitude,Longitude,Neighborhood,Postcode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,43.72802,-79.38879,Lawrence Park,M4N,1,Bus Line,Park,Swim School,Jewelry Store,Dim Sum Restaurant,French Restaurant,Farmers Market,Fast Food Restaurant,Flower Shop,Food & Drink Shop
1,Central Toronto,43.712751,-79.390197,Davisville North,M4P,1,Hotel,Breakfast Spot,Park,Burger Joint,Sandwich Place,Food & Drink Shop,Clothing Store,Yoga Studio,French Restaurant,Fast Food Restaurant
2,Central Toronto,43.715383,-79.405678,North Toronto West,M4R,1,Clothing Store,Sporting Goods Shop,Coffee Shop,Yoga Studio,Fast Food Restaurant,Diner,Mexican Restaurant,Park,Dessert Shop,Rental Car Location
3,Central Toronto,43.704324,-79.38879,Davisville,M4S,2,Sandwich Place,Dessert Shop,Pizza Place,Seafood Restaurant,Coffee Shop,Thai Restaurant,Restaurant,Sushi Restaurant,Italian Restaurant,Café
4,Central Toronto,43.689574,-79.38316,Moore Park,M4T,1,Intersection,Park,Playground,Yoga Studio,Dim Sum Restaurant,Gym,Greek Restaurant,Gourmet Shop,Garden,Fried Chicken Joint


In [80]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

In [81]:
# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(CentralToronto_merged['Latitude'], CentralToronto_merged['Longitude'], CentralToronto_merged['Neighborhood'], CentralToronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters
