# Segmenting and Clustering Neighbourhoods in Toronto

# Part 1

## 1.0 Obtaining neighbourhoods data from Wikipedia 

In [1]:
#Library Import
import pandas as pd # library for data analsysis 
import numpy as np

In [2]:
# These packages need to be installed in order to read the HTML page loaded below
!pip install lxml
!pip install html5lib
!pip3 install lxml
import lxml




In [3]:
# Read every table on the Wikipedia postal-code page and keep the first one
import requests
postal_code_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
df = postal_code_tables[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Data Cleaning

### 1.1 Ignore cells with a borough that is Not assigned

In [4]:
# Keep only the rows whose borough is assigned, then renumber the index from 0
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough].reset_index(drop=True)
df.head(10)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### 1.2 Merge neighbourhoods that share a postal code into a single row

In [5]:
# One row per postal code: keep the first borough of each group and join the
# group's neighbourhood names with ", "
rows_by_postal_code = df.groupby('Postal Code')
df_borough = rows_by_postal_code.agg({'Borough': 'first', 'Neighbourhood': ', '.join}).reset_index()
df_borough.sort_values(by=['Postal Code']).head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### 1.3 Assign the borough name to neighbourhoods that are "Not assigned"

In [6]:
# Neighbourhoods recorded as 'Not assigned' take the name of their borough

# Boolean mask of the rows whose neighbourhood is 'Not assigned'
NA_Neighborhood = df_borough['Neighbourhood'].eq('Not assigned')

# Copy the borough name into the neighbourhood column for exactly those rows
borough_names = df_borough.loc[NA_Neighborhood, 'Borough']
df_borough.loc[NA_Neighborhood, 'Neighbourhood'] = borough_names
df_borough.head()



Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### 1.4 Print the number of rows of the DataFrame

In [7]:
# (rows, columns) of the cleaned table, which has one row per postal code
df_borough.shape

(103, 3)

# Part 2

## 2.0 Get the latitude and the longitude coordinates of each neighborhood

In [8]:
# Install geopy and import its Nominatim geocoder
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [9]:
# Install and import geocoder
# NOTE(review): geocoder is not referenced by any cell visible in this notebook - confirm it is still needed
!pip install geocoder
import geocoder # import geocoder




In [10]:
# Install and import BeautifulSoup
# NOTE(review): BeautifulSoup is not referenced by any cell visible in this notebook - confirm it is still needed
!pip install bs4
from bs4 import BeautifulSoup



In [14]:
# Read the latitude/longitude of each postal code from the Geospatial_data CSV
geospatial_data_url = 'https://cocl.us/Geospatial_data'
df_coordinate = pd.read_csv(geospatial_data_url)
df_coordinate.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [16]:
# Attach the coordinates to the neighbourhood table, matching rows on the postal code
df_merged = df_borough.merge(df_coordinate, on="Postal Code")
df_merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Part 3 - Explore and cluster the neighborhoods in Toronto

## 3.1 Create a Map of Toronto with Boroughs and select one

In [45]:
# Report the number of distinct boroughs and the number of rows in df_merged
borough_count = len(df_merged['Borough'].unique())
row_count = df_merged.shape[0]
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))

The dataframe has 10 boroughs and 103 neighborhoods.


### Use geopy library to get the latitude and longitude values of Toronto

In [24]:
# Geocode the city so that the maps can be centred on it.
address = 'Toronto, TO'

# The geocoder has to be created in this cell: it used to be commented out,
# so on a fresh kernel `geolocator` did not exist yet and the lookup failed
# with a NameError.
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() found nothing; fail with a clear message instead of an
    # AttributeError on the next line
    raise ValueError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.65238435, -79.38356765.


### Create a map of Toronto with neighborhoods superimposed on top

In [26]:
import folium # map rendering library

In [34]:
# Base map centred on the geocoded Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Draw one blue circle per row of df_merged; its popup reads "<neighbourhood>, <borough>"
marker_rows = zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Borough'], df_merged['Neighbourhood'])
for lat, lng, borough, neighborhood in marker_rows:
    popup_text = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(popup_text, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

### To simplify the visualisation we focus on Central Toronto 

In [44]:
# Restrict the table to the Central Toronto borough and renumber the rows from 0
is_central_toronto = df_merged['Borough'] == 'Central Toronto'
central_to_data = df_merged[is_central_toronto].reset_index(drop=True)
central_to_data.head(20)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
2,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
3,M4S,Central Toronto,Davisville,43.704324,-79.38879
4,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
5,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049
6,M5N,Central Toronto,Roselawn,43.711695,-79.416936
7,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307
8,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678


### Let's get the coordinates of Central Toronto

In [38]:
# Geocode the borough so that its map can be centred on it.
address_central_to = 'Central Toronto'

geolocator = Nominatim(user_agent="central_toronto_explorer")
location_central_to = geolocator.geocode(address_central_to)
if location_central_to is None:
    # geocode() found nothing; fail with a clear message instead of an
    # AttributeError on the next line
    raise ValueError('Could not geocode {!r}'.format(address_central_to))
# Read the coordinates from the Central Toronto result. The previous code read
# them from `location`, i.e. the result of the city-wide Toronto lookup.
latitude_central_to = location_central_to.latitude
longitude_central_to = location_central_to.longitude
print('The geograpical coordinate of Central Toronto are {}, {}.'.format(latitude_central_to, longitude_central_to))

The geograpical coordinate of Central Toronto are 43.6534817, -79.3839347.


### Map of Central Toronto

In [42]:
# Base map centred on the Central Toronto coordinates
map_central_to = folium.Map(location=[latitude_central_to, longitude_central_to], zoom_start=12)

# Draw one blue circle per Central Toronto row; its popup shows the neighbourhood name
marker_rows = zip(central_to_data['Latitude'], central_to_data['Longitude'], central_to_data['Neighbourhood'])
for lat, lng, label in marker_rows:
    label = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_central_to)

map_central_to

## 3.2 Explore the neighborhoods and segment them with the Foursquare API

### Define Foursquare credentials

### Let's explore the first neighborhood in our dataframe.

In [48]:
# Name of the neighbourhood in the first row (index label 0) of central_to_data
central_to_data.loc[0, 'Neighbourhood']

'Lawrence Park'

#### Get the neighborhood's latitude and longitude values.

In [49]:
# Coordinates and name of the first Central Toronto row (index label 0)
neighborhood_latitude = central_to_data.at[0, 'Latitude']
neighborhood_longitude = central_to_data.at[0, 'Longitude']
neighborhood_name = central_to_data.at[0, 'Neighbourhood']

summary_template = 'Latitude and longitude values of {} are {}, {}.'
print(summary_template.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

Latitude and longitude values of Lawrence Park are 43.7280205, -79.3887901.


#### Now, let's get the top 50 venues that are in Lawrence Park within a radius of 500 meters.

In [110]:
LIMIT = 50 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

# Build the explore request for the first neighbourhood. The next cell sends
# `url`, which was not defined anywhere in the notebook. The URL has the same
# form as the one built inside getNearbyVenues.
# CLIENT_ID, CLIENT_SECRET and VERSION must already be defined (getNearbyVenues
# reads the same three names); they are deliberately not hard-coded here.
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT)




#### Send the GET request and examine the results

In [51]:
# Send the request and decode the JSON body of the reply
explore_response = requests.get(url)
results = explore_response.json()
results

{'meta': {'code': 200, 'requestId': '5fa432d3960f157f9119000f'},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 3,
  'suggestedBounds': {'ne': {'lat': 43.7325205045, 'lng': -79.3825744605273},
   'sw': {'lat': 43.7235204955, 'lng': -79.3950057394727}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '50e6da19e4b0d8a78a0e9794',
       'name': 'Lawrence Park Ravine',
       'location': {'address': '3055 Yonge Street',
        'crossStreet': 'Lawrence Avenue East',
        'lat': 43.72696303913755,
        'lng': -79.39438246708775,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.72696303913755,
          'lng': -79.39438246708775}],
        'distance': 465,
        'cc': 'CA',
  

#### Let's borrow the get_category_type function from the Foursquare lab

In [52]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue record.

    Parameters
    ----------
    row : mapping-like
        Object indexed by key. The ``'categories'`` entry is used when it
        exists, otherwise the ``'venue.categories'`` entry. The entry must be
        a sequence of dicts that each have a ``'name'`` key.

    Returns
    -------
    The ``'name'`` of the first category, or ``None`` when the category
    sequence is empty.

    Raises
    ------
    KeyError
        If neither ``'categories'`` nor ``'venue.categories'`` is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback. The previous bare `except`
        # also hid unrelated errors (and KeyboardInterrupt).
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

#### Convert the venues found into a pandas DataFrame

In [58]:
import json # library to handle JSON files
import requests # library to handle requests
# json_normalize moved to the top-level pandas namespace in pandas 1.0 and the
# pandas.io.json location was deprecated; try the current location first and
# fall back to the old one for older pandas releases.
try:
    from pandas import json_normalize # tranform JSON file into a pandas dataframe
except ImportError:
    from pandas.io.json import json_normalize

In [61]:
# Venue items of the first group in the response, flattened into columns
venues = results['response']['groups'][0]['items']
flattened_venues = json_normalize(venues)

# Keep only the name, category list and coordinates of each venue
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = flattened_venues.loc[:, filtered_columns]

# Replace each category list by the name of its first category
category_names = nearby_venues.apply(get_category_type, axis=1)
nearby_venues['venue.categories'] = category_names

# Drop the dotted prefixes from the column names (keep the text after the last ".")
nearby_venues.columns = [col.rpartition(".")[2] for col in nearby_venues.columns]

nearby_venues.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Lawrence Park Ravine,Park,43.726963,-79.394382
1,Zodiac Swim School,Swim School,43.728532,-79.38286
2,TTC Bus #162 - Lawrence-Donway,Bus Line,43.728026,-79.382805


In [62]:
# Number of venue rows obtained for the first neighbourhood
venue_count = len(nearby_venues)
print('{} venues were returned by Foursquare.'.format(venue_count))

3 venues were returned by Foursquare.


## Explore all Neighborhoods in Central Toronto

In [63]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare explore endpoint around each location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel iterables, consumed together with ``zip``: the label of each
        location and its coordinates.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns ``Neighborhood``,
        ``Neighborhood Latitude``, ``Neighborhood Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude`` and ``Venue Category``.
        The frame is empty, but still has these columns, when no venue is
        returned at all.

    Raises
    ------
    requests.HTTPError
        If a request is answered with an HTTP error status.

    Notes
    -----
    Reads the module-level names ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION``
    and ``LIMIT``, and prints each location name as it is processed.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; the timeout keeps a stalled connection from
        # hanging the notebook, and an HTTP error is reported as such rather
        # than as a KeyError on the payload
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without categories used to raise IndexError
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows;
    # assigning them afterwards raised a length-mismatch ValueError.
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

### Run the above function on each neighborhood to create a new dataframe called central_toronto_venues

In [65]:
# Collect the nearby venues of every Central Toronto neighbourhood
central_toronto_venues = getNearbyVenues(
    names=central_to_data['Neighbourhood'],
    latitudes=central_to_data['Latitude'],
    longitudes=central_to_data['Longitude'],
)

Lawrence Park
Davisville North
North Toronto West, Lawrence Park
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Roselawn
Forest Hill North & West, Forest Hill Road Park
The Annex, North Midtown, Yorkville


In [66]:
# Print the (rows, columns) of the venue table, then display its first rows
print(central_toronto_venues.shape)
central_toronto_venues.head()

(104, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Lawrence Park,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
2,Lawrence Park,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
3,Davisville North,43.712751,-79.390197,Homeway Restaurant & Brunch,43.712641,-79.391557,Breakfast Spot
4,Davisville North,43.712751,-79.390197,Sherwood Park,43.716551,-79.387776,Park


In [73]:
# Number of non-null entries per column for each neighbourhood, i.e. how many venue rows each one has
central_toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Davisville,33,33,33,33,33,33
Davisville North,9,9,9,9,9,9
"Forest Hill North & West, Forest Hill Road Park",4,4,4,4,4,4
Lawrence Park,3,3,3,3,3,3
"Moore Park, Summerhill East",2,2,2,2,2,2
"North Toronto West, Lawrence Park",18,18,18,18,18,18
Roselawn,2,2,2,2,2,2
"Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park",14,14,14,14,14,14
"The Annex, North Midtown, Yorkville",19,19,19,19,19,19


In [69]:
# Count the distinct venue categories across all Central Toronto venues

print('There are {} uniques categories.'.format(len(central_toronto_venues['Venue Category'].unique())))

There are 60 uniques categories.


## 3.3 Analyze Each Neighborhood

In [74]:
# One-hot encode the venue categories, using the bare category names as column names
category_frame = central_toronto_venues[['Venue Category']]
central_toronto_onehot = pd.get_dummies(category_frame, prefix="", prefix_sep="")

# Append the neighborhood of each venue as the last column ...
central_toronto_onehot['Neighborhood'] = central_toronto_venues['Neighborhood']

# ... then rotate that last column to the front
all_columns = list(central_toronto_onehot.columns)
fixed_columns = all_columns[-1:] + all_columns[:-1]
central_toronto_onehot = central_toronto_onehot[fixed_columns]

central_toronto_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Bagel Shop,Bank,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,...,Spa,Sporting Goods Shop,Supermarket,Sushi Restaurant,Swim School,Thai Restaurant,Toy / Game Store,Trail,Vietnamese Restaurant,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,Lawrence Park,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Davisville North,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Davisville North,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
# (rows, columns) of the one-hot encoded venue table
central_toronto_onehot.shape

(104, 61)

### Next, let's group the rows by neighborhood, taking the mean of the frequency of occurrence of each category

In [76]:
# Average each one-hot column per neighbourhood; 'Neighborhood' becomes a regular column again via reset_index
central_toronto_grouped = central_toronto_onehot.groupby('Neighborhood').mean().reset_index()
central_toronto_grouped

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Bagel Shop,Bank,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,...,Spa,Sporting Goods Shop,Supermarket,Sushi Restaurant,Swim School,Thai Restaurant,Toy / Game Store,Trail,Vietnamese Restaurant,Yoga Studio
0,Davisville,0.0,0.0,0.0,0.0,0.0,0.030303,0.0,0.0,0.060606,...,0.0,0.0,0.0,0.060606,0.0,0.030303,0.030303,0.0,0.0,0.0
1,Davisville North,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Forest Hill North & West, Forest Hill Road Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0
3,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,...,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0
4,"Moore Park, Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0
5,"North Toronto West, Lawrence Park",0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.055556,...,0.055556,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556
6,Roselawn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Summerhill West, Rathnelly, South Hill, Forest...",0.071429,0.0,0.071429,0.071429,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.071429,0.071429,0.0,0.0,0.0,0.0,0.071429,0.0
8,"The Annex, North Midtown, Yorkville",0.0,0.052632,0.0,0.0,0.0,0.0,0.052632,0.0,0.157895,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
# (rows, columns) after grouping: one row per neighbourhood
central_toronto_grouped.shape

(9, 61)

### Let's print each neighborhood along with the top 5 most common venues

In [78]:
num_top_venues = 5

# For every neighbourhood, print its most frequent venue categories
for hood in central_toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    hood_mask = central_toronto_grouped['Neighborhood'] == hood
    # transpose the single matching row into (venue, freq) pairs
    temp = central_toronto_grouped[hood_mask].T.reset_index()
    temp.columns = ['venue','freq']
    # the first pair holds the neighbourhood name itself, so skip it
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Davisville----
                venue  freq
0         Pizza Place  0.09
1        Dessert Shop  0.09
2      Sandwich Place  0.09
3  Italian Restaurant  0.06
4                 Gym  0.06


----Davisville North----
                  venue  freq
0                 Hotel  0.11
1          Dance Studio  0.11
2  Gym / Fitness Center  0.11
3                  Park  0.11
4     Food & Drink Shop  0.11


----Forest Hill North & West, Forest Hill Road Park----
                 venue  freq
0                Trail  0.25
1        Jewelry Store  0.25
2     Sushi Restaurant  0.25
3                 Park  0.25
4  American Restaurant  0.00


----Lawrence Park----
                 venue  freq
0          Swim School  0.33
1             Bus Line  0.33
2                 Park  0.33
3  American Restaurant  0.00
4                  Pub  0.00


----Moore Park, Summerhill East----
                 venue  freq
0           Playground   0.5
1                Trail   0.5
2  American Restaurant   0.0
3    Indian Restaurant

#### Convert the previous information into a pandas DataFrame

In [79]:
# Rank the venue categories of one row from most to least frequent
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, skipping its first entry.

    The first entry of ``row`` is ignored, the remaining values are sorted in
    descending order, and the index labels of the first ``num_top_venues`` of
    them are returned as an array.
    """
    return row.iloc[1:].sort_values(ascending=False).index.values[:num_top_venues]

#### Dataframe with the top 10 venues per neighborhood

In [81]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # 1st, 2nd and 3rd take their suffix from `indicators`, every later rank
    # gets 'th'. An explicit bounds check replaces the former bare `except`,
    # which would also have hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighbourhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = central_toronto_grouped['Neighborhood']

# fill the ranked venue categories of each neighbourhood, row by row
for ind in np.arange(central_toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(central_toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Dessert Shop,Pizza Place,Sandwich Place,Italian Restaurant,Coffee Shop,Sushi Restaurant,Café,Gym,Farmers Market,Indian Restaurant
1,Davisville North,Gym / Fitness Center,Park,Department Store,Dance Studio,Sandwich Place,Food & Drink Shop,Dog Run,Breakfast Spot,Hotel,Garden
2,"Forest Hill North & West, Forest Hill Road Park",Park,Jewelry Store,Trail,Sushi Restaurant,Diner,Dog Run,Donut Shop,Farmers Market,Fast Food Restaurant,Yoga Studio
3,Lawrence Park,Swim School,Bus Line,Park,Yoga Studio,Diner,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Furniture / Home Store
4,"Moore Park, Summerhill East",Playground,Trail,Department Store,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Furniture / Home Store,Fried Chicken Joint,Food & Drink Shop


## 3.4 Clustering 

In [83]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

In [84]:
# set number of clusters
kclusters = 5

# Feature matrix: every column except the neighbourhood name. The column is
# dropped by keyword because the positional form drop('Neighborhood', 1) was
# deprecated in pandas 1.3 and is rejected by pandas 2.
central_toronto_grouped_clustering = central_toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(central_toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 3, 4, 2, 0, 1, 0, 0], dtype=int32)

#### Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [94]:
# Add the clustering labels to the top-venue table. The insert used to be
# commented out, so on a fresh run 'Cluster Labels' never existed although
# later cells read it. Work on a copy and discard any label column left by
# an earlier run, which keeps this cell re-runnable (insert() raises if the
# column already exists).
venues_with_clusters = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
venues_with_clusters.insert(0, 'Cluster Labels', kmeans.labels_)

# Join the labels and top venues onto the Central Toronto rows, which carry
# the latitude/longitude of each neighbourhood
central_toronto_merged = central_to_data.join(venues_with_clusters.set_index('Neighborhood'), on='Neighbourhood')

central_toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,4,Swim School,Bus Line,Park,Yoga Studio,Diner,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Furniture / Home Store
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197,0,Gym / Fitness Center,Park,Department Store,Dance Studio,Sandwich Place,Food & Drink Shop,Dog Run,Breakfast Spot,Hotel,Garden
2,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678,0,Coffee Shop,Clothing Store,Salon / Barbershop,Furniture / Home Store,Ice Cream Shop,Fast Food Restaurant,Diner,Mexican Restaurant,Park,Chinese Restaurant
3,M4S,Central Toronto,Davisville,43.704324,-79.38879,0,Dessert Shop,Pizza Place,Sandwich Place,Italian Restaurant,Coffee Shop,Sushi Restaurant,Café,Gym,Farmers Market,Indian Restaurant
4,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,2,Playground,Trail,Department Store,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Furniture / Home Store,Fried Chicken Joint,Food & Drink Shop


### Visualise the clusters

In [96]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [98]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one colour per cluster, evenly spaced
# over the rainbow colormap (the unused x / ys / markers_colors helpers were
# removed; only the number of clusters was ever needed)
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# A neighbourhood without any venue gets NaN as its label after the join,
# which turns the whole column into floats that cannot index `rainbow`.
# Skip such rows and restore integer labels.
clustered = central_toronto_merged.dropna(subset=['Cluster Labels'])
cluster_ids = clustered['Cluster Labels'].astype(int)

# add markers to the map
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Neighbourhood'], cluster_ids):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## 3.5 Examine the Clusters

### Cluster 0: Hang-out Area

In [99]:
# Rows labelled cluster 0, showing column 1 plus every column from position 5 onwards
in_cluster = central_toronto_merged['Cluster Labels'] == 0
shown_columns = central_toronto_merged.columns[[1] + list(range(5, central_toronto_merged.shape[1]))]
central_toronto_merged.loc[in_cluster, shown_columns]




Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Central Toronto,0,Gym / Fitness Center,Park,Department Store,Dance Studio,Sandwich Place,Food & Drink Shop,Dog Run,Breakfast Spot,Hotel,Garden
2,Central Toronto,0,Coffee Shop,Clothing Store,Salon / Barbershop,Furniture / Home Store,Ice Cream Shop,Fast Food Restaurant,Diner,Mexican Restaurant,Park,Chinese Restaurant
3,Central Toronto,0,Dessert Shop,Pizza Place,Sandwich Place,Italian Restaurant,Coffee Shop,Sushi Restaurant,Café,Gym,Farmers Market,Indian Restaurant
5,Central Toronto,0,Coffee Shop,Restaurant,Vietnamese Restaurant,Fried Chicken Joint,Light Rail Station,Liquor Store,Pizza Place,Pub,American Restaurant,Bank
8,Central Toronto,0,Sandwich Place,Café,Coffee Shop,History Museum,Indian Restaurant,Donut Shop,Liquor Store,Middle Eastern Restaurant,Park,Pizza Place


### Cluster 1: Quiet Area

In [101]:
# Rows labelled cluster 1, showing column 1 plus every column from position 5 onwards
in_cluster = central_toronto_merged['Cluster Labels'] == 1
shown_columns = central_toronto_merged.columns[[1] + list(range(5, central_toronto_merged.shape[1]))]
central_toronto_merged.loc[in_cluster, shown_columns]




Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Central Toronto,1,Garden,Music Venue,Yoga Studio,Gym / Fitness Center,Greek Restaurant,Gourmet Shop,Gas Station,Furniture / Home Store,Fried Chicken Joint,Food & Drink Shop


### Cluster 2: Residential Area

In [104]:
# Rows labelled cluster 2, showing column 1 plus every column from position 5 onwards
in_cluster = central_toronto_merged['Cluster Labels'] == 2
shown_columns = central_toronto_merged.columns[[1] + list(range(5, central_toronto_merged.shape[1]))]
central_toronto_merged.loc[in_cluster, shown_columns]




Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Central Toronto,2,Playground,Trail,Department Store,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Furniture / Home Store,Fried Chicken Joint,Food & Drink Shop


### Cluster 3: Quiet Area

In [105]:
# Rows labelled cluster 3, showing column 1 plus every column from position 5 onwards
in_cluster = central_toronto_merged['Cluster Labels'] == 3
shown_columns = central_toronto_merged.columns[[1] + list(range(5, central_toronto_merged.shape[1]))]
central_toronto_merged.loc[in_cluster, shown_columns]




Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,Central Toronto,3,Park,Jewelry Store,Trail,Sushi Restaurant,Diner,Dog Run,Donut Shop,Farmers Market,Fast Food Restaurant,Yoga Studio


### Cluster 4: Residential Area

In [107]:
# Rows labelled cluster 4, showing column 1 plus every column from position 5 onwards
in_cluster = central_toronto_merged['Cluster Labels'] == 4
shown_columns = central_toronto_merged.columns[[1] + list(range(5, central_toronto_merged.shape[1]))]
central_toronto_merged.loc[in_cluster, shown_columns]




Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,4,Swim School,Bus Line,Park,Yoga Studio,Diner,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Furniture / Home Store
