**SEGMENTING AND CLUSTERING OF NEIGHBOURHOODS FOR THE CITY OF TORONTO**

**Importing required libraries**

In [1]:
!pip install beautifulsoup4
!pip install lxml
!pip install html5lib
!pip install requests
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np



**Reading the data from the Url**

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url).text
soup = BeautifulSoup(page, 'lxml') # get the page html

**Finding the tabel and getting required entries**

In [3]:
table_contents=[]
table=soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighbourhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

print(table_contents)
df=pd.DataFrame(table_contents)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})


df.head()

[{'PostalCode': 'M3A', 'Borough': 'North York', 'Neighbourhood': 'Parkwoods'}, {'PostalCode': 'M4A', 'Borough': 'North York', 'Neighbourhood': 'Victoria Village'}, {'PostalCode': 'M5A', 'Borough': 'Downtown Toronto', 'Neighbourhood': 'Regent Park, Harbourfront'}, {'PostalCode': 'M6A', 'Borough': 'North York', 'Neighbourhood': 'Lawrence Manor, Lawrence Heights'}, {'PostalCode': 'M7A', 'Borough': "Queen's Park", 'Neighbourhood': 'Ontario Provincial Government'}, {'PostalCode': 'M9A', 'Borough': 'Etobicoke', 'Neighbourhood': 'Islington Avenue'}, {'PostalCode': 'M1B', 'Borough': 'Scarborough', 'Neighbourhood': 'Malvern, Rouge'}, {'PostalCode': 'M3B', 'Borough': 'North York', 'Neighbourhood': 'Don Mills North'}, {'PostalCode': 'M4B', 'Borough': 'East York', 'Neighbourhood': 'Parkview Hill, Woodbine Gardens'}, {'PostalCode': 'M5B', 'Borough': 'Downtown Toronto', 'Neighbourhood': 'Garden District, Ryerson'}, {'PostalCode': 'M6B', 'Borough': 'North York', 'Neighbourhood': 'Glencairn'}, {'Posta

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


**Dropping rows where boroug in not assigned and resetting the index**

In [4]:

df.drop(df.index[df['Borough'] == 'Not assigned'], inplace = True)

# Reset Index
df = df.reset_index(drop=True)

df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


**Combining neihbourhoods with same postal code**

In [5]:
df = df.groupby(['PostalCode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()

df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


**Changing Neighbourhood value to Borough value if Neighbourhood is "Not Assigned"**

In [6]:
df.loc[df['Neighbourhood'] == 'Not assigned', 'Neighbourhood'] = df['Borough']

df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
# getting the shape of data
df.shape

(103, 3)

**Transfering data to csv**

In [8]:
df.to_csv(r'df_wiki.csv')

**Adding and loading Geospatial Data**

In [9]:
geo = pd.read_csv('https://cocl.us/Geospatial_data')
df = df.join(geo)
df

Unnamed: 0,PostalCode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476
...,...,...,...,...,...,...
98,M9N,York,Weston,M9N,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,M9P,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",M9R,43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",M9V,43.739416,-79.588437


**Clustering Neighbourhoods**

**Importing libraries**

In [10]:
import json 
!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering librar



**Getting Coordinates of Toronto**

In [11]:
address = 'TORONTO'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The lati & long coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The lati & long coordinate of Toronto are 43.6534817, -79.3839347.


**Mapping Toronto**

In [12]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    map_toronto

**Getting code for downtown Toronto**

In [13]:
dtt_data = df[df['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
dtt_data.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,M4W,43.679563,-79.377529
1,M4X,Downtown Toronto,"St. James Town, Cabbagetown",M4X,43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,M4Y,43.66586,-79.38316
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",M5A,43.65426,-79.360636
4,M5B,Downtown Toronto,"Garden District, Ryerson",M5B,43.657162,-79.378937


**Getting Latitude and longitude of downtown Toronto**

In [14]:
address = 'Downtown TORONTO'

geolocator = Nominatim(user_agent="dtt_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The lati & long coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The lati & long coordinate of Downtown Toronto are 43.6541737, -79.38081162653639.


**Setting map for toronto and adding markers and Visulizing the map**

In [15]:
map_dtt = folium.Map(location=[latitude, longitude], zoom_start=14)

# add markers to map
for lat, lng, label in zip(dtt_data['Latitude'], dtt_data['Longitude'], dtt_data['Neighbourhood']):
    label = folium.Popup(label)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_dtt)
    
map_dtt

**Setting up the API**

In [16]:
CLIENT_ID = 'OBRLEJ4AYCPBM3I35VGXMDOKI3OEGZAL0M0HIKK1FYUARJZK' # your Foursquare ID
CLIENT_SECRET = 'ZIKV32D2HQZN3IJM5STB0CX55FFZW2ZF0AUD3DDBUAQNMSDK' # your Foursquare Secret
ACCESS_TOKEN = 'H51LTDOHQ543LUSQSIAM124N0S1D10Y2DDX40KGZG1EIQ1DJ' # your FourSquare Access Token
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: OBRLEJ4AYCPBM3I35VGXMDOKI3OEGZAL0M0HIKK1FYUARJZK
CLIENT_SECRET:ZIKV32D2HQZN3IJM5STB0CX55FFZW2ZF0AUD3DDBUAQNMSDK


In [17]:
dtt_data.loc[1, 'Neighbourhood']

'St. James Town, Cabbagetown'

**Getting Latitude and Longitude**

In [18]:
neighbourhood_latitude = dtt_data.loc[1, 'Latitude'] # neighborhood latitude value
neighbourhood_longitude = dtt_data.loc[1, 'Longitude'] # neighborhood longitude value

neighbourhood_name = dtt_data.loc[1, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of St. James Town, Cabbagetown are 43.667967, -79.3676753.


**Creating URL for API**

In [19]:
LIMIT = 100
radius = 500 

#Create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=OBRLEJ4AYCPBM3I35VGXMDOKI3OEGZAL0M0HIKK1FYUARJZK&client_secret=ZIKV32D2HQZN3IJM5STB0CX55FFZW2ZF0AUD3DDBUAQNMSDK&v=20180604&ll=43.667967,-79.3676753&radius=500&limit=100'

**Sending GET requests for the Json file**

In [20]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '60c8a046f6a1442834c7c257'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Cabbagetown',
  'headerFullLocation': 'Cabbagetown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 36,
  'suggestedBounds': {'ne': {'lat': 43.6724670045, 'lng': -79.3614658826597},
   'sw': {'lat': 43.663466995499995, 'lng': -79.3738847173403}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '5093f826e4b08bdedeedcc16',
       'name': 'Cabbagetown Brew',
       'location': {'address': '552 Parliament St.',
        'lat': 43.66692279890784,
        'lng': -79.36928929560437,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.666

**Creating function for getnearby venues**

In [21]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

**Creating Function that exteracts the categories of venues**

In [22]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

**Getting the near venues**

In [24]:
dtt_venues = getNearbyVenues(names=dtt_data['Neighbourhood'],
                                   latitudes=dtt_data['Latitude'],
                                   longitudes=dtt_data['Longitude']
                                  )

Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
First Canadian Place, Underground city
Christie


**Getting the data values**

In [26]:
print(dtt_venues.shape)
dtt_venues.head()

(1068, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
1,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
2,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
3,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
4,"St. James Town, Cabbagetown",43.667967,-79.367675,Cabbagetown Brew,43.666923,-79.369289,Café


**Grouping by neighbourhoods**

In [27]:
dtt_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,46,46,46,46,46,46
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",16,16,16,16,16,16
Central Bay Street,62,62,62,62,62,62
Christie,15,15,15,15,15,15
Church and Wellesley,69,69,69,69,69,69
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
"First Canadian Place, Underground city",100,100,100,100,100,100
"Garden District, Ryerson",100,100,100,100,100,100
"Harbourfront East, Union Station, Toronto Islands",100,100,100,100,100,100
"Kensington Market, Chinatown, Grange Park",59,59,59,59,59,59


In [28]:
print('There are {} uniques categories.'.format(len(dtt_venues['Venue Category'].unique())))

There are 189 uniques categories.


**One-Hot Coding**

In [29]:
dtt_onehot = pd.get_dummies(dtt_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
dtt_onehot['Neighbourhood'] = dtt_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [dtt_onehot.columns[-1]] + list(dtt_onehot.columns[:-1])
dtt_onehot = dtt_onehot[fixed_columns]
dtt_onehot.head()

Unnamed: 0,Neighbourhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Thrift / Vintage Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Rosedale,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,"St. James Town, Cabbagetown",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Groupby and resetting the index**

In [30]:
dtt_grouped = dtt_onehot.groupby('Neighbourhood').mean().reset_index()
dtt_grouped

Unnamed: 0,Neighbourhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Thrift / Vintage Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0
1,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0625,0.0625,0.0625,0.125,0.1875,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.016129,0.0,0.016129,0.0,0.016129,0.0,0.016129
3,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.014493,0.014493,0.0,0.0,0.0,0.0,0.0,0.0,0.014493,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014493,0.014493
5,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.01,0.0,0.0
6,"First Canadian Place, Underground city",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0
7,"Garden District, Ryerson",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.0,0.0
8,"Harbourfront East, Union Station, Toronto Islands",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0
9,"Kensington Market, Chinatown, Grange Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.016949,0.0,0.0,0.050847,0.0,0.0,0.033898,0.016949,0.0,0.0


**Each Neighbourhood along with most common Venues**

In [31]:
num_top_venues = 5

for hood in dtt_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = dtt_grouped[dtt_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
            venue  freq
0    Cocktail Bar  0.09
1     Coffee Shop  0.09
2  Sandwich Place  0.07
3          Bakery  0.07
4        Beer Bar  0.04


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
              venue  freq
0   Airport Service  0.19
1    Airport Lounge  0.12
2  Airport Terminal  0.12
3   Harbor / Marina  0.06
4     Boat or Ferry  0.06


----Central Bay Street----
                 venue  freq
0          Coffee Shop  0.16
1       Sandwich Place  0.08
2     Sushi Restaurant  0.06
3   Italian Restaurant  0.05
4  Japanese Restaurant  0.05


----Christie----
                venue  freq
0       Grocery Store  0.27
1                Café  0.20
2                Park  0.13
3           Nightclub  0.07
4  Italian Restaurant  0.07


----Church and Wellesley----
                 venue  freq
0     Sushi Restaurant  0.09
1  Japanese Restaurant  0.07
2              Gay Bar  0.04
3           Restaurant  0.

**Function to sort venues in descending order**

In [32]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

**Displaying top 10 most common venues of each neighbourhood**

In [33]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = dtt_grouped['Neighbourhood']

for ind in np.arange(dtt_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(dtt_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Cocktail Bar,Coffee Shop,Sandwich Place,Bakery,Beer Bar,Seafood Restaurant,Vegetarian / Vegan Restaurant,Farmers Market,Park,Museum
1,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport Terminal,Harbor / Marina,Boat or Ferry,Rental Car Location,Coffee Shop,Sculpture Garden,Bar,Airport Gate
2,Central Bay Street,Coffee Shop,Sandwich Place,Sushi Restaurant,Italian Restaurant,Japanese Restaurant,Café,Burger Joint,Salad Place,Restaurant,Pizza Place
3,Christie,Grocery Store,Café,Park,Nightclub,Italian Restaurant,Baby Store,Athletics & Sports,Restaurant,Coffee Shop,Modern European Restaurant
4,Church and Wellesley,Sushi Restaurant,Japanese Restaurant,Gay Bar,Restaurant,Coffee Shop,Burrito Place,Indian Restaurant,Mediterranean Restaurant,Gym,Pizza Place


**Clustering Neighbourhoods**

In [34]:
kclusters = 5

dtt_grouped_clustering = dtt_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(dtt_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 2, 4, 3, 0, 4, 4, 4, 4, 0])

In [35]:
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

dtt_merged = dtt_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
dtt_merged = dtt_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

dtt_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighbourhood,Postal Code,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4W,Downtown Toronto,Rosedale,M4W,43.679563,-79.377529,1,Park,Playground,Trail,Moroccan Restaurant,Massage Studio,Mediterranean Restaurant,Men's Store,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop
1,M4X,Downtown Toronto,"St. James Town, Cabbagetown",M4X,43.667967,-79.367675,0,Coffee Shop,Italian Restaurant,Café,Restaurant,Bakery,Pub,Pizza Place,Yoga Studio,Convenience Store,Sandwich Place
2,M4Y,Downtown Toronto,Church and Wellesley,M4Y,43.66586,-79.38316,0,Sushi Restaurant,Japanese Restaurant,Gay Bar,Restaurant,Coffee Shop,Burrito Place,Indian Restaurant,Mediterranean Restaurant,Gym,Pizza Place
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",M5A,43.65426,-79.360636,4,Coffee Shop,Bakery,Park,Pub,Restaurant,Café,Dessert Shop,French Restaurant,Breakfast Spot,Mexican Restaurant
4,M5B,Downtown Toronto,"Garden District, Ryerson",M5B,43.657162,-79.378937,4,Coffee Shop,Sandwich Place,Clothing Store,Café,Pizza Place,Bank,Cosmetics Shop,Hotel,Japanese Restaurant,Theater


**Creating Map clusters and Visulaizing the map**

In [36]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(dtt_merged['Latitude'], dtt_merged['Longitude'], dtt_merged['Neighbourhood'], dtt_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow,
        fill=True,
        fill_color=rainbow,
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

**Examing Cluster**

In [37]:
dtt_merged.loc[dtt_merged['Cluster Labels'] == 0, dtt_merged.columns[[1] + list(range(5, dtt_merged.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Downtown Toronto,-79.367675,0,Coffee Shop,Italian Restaurant,Café,Restaurant,Bakery,Pub,Pizza Place,Yoga Studio,Convenience Store,Sandwich Place
2,Downtown Toronto,-79.38316,0,Sushi Restaurant,Japanese Restaurant,Gay Bar,Restaurant,Coffee Shop,Burrito Place,Indian Restaurant,Mediterranean Restaurant,Gym,Pizza Place
5,Downtown Toronto,-79.375418,0,Coffee Shop,Café,Cocktail Bar,Italian Restaurant,Clothing Store,Gastropub,Restaurant,American Restaurant,Diner,Department Store
6,Downtown Toronto,-79.373306,0,Cocktail Bar,Coffee Shop,Sandwich Place,Bakery,Beer Bar,Seafood Restaurant,Vegetarian / Vegan Restaurant,Farmers Market,Park,Museum
13,Downtown Toronto,-79.400049,0,Café,Coffee Shop,Vegetarian / Vegan Restaurant,Burger Joint,Bar,Mexican Restaurant,Vietnamese Restaurant,Thai Restaurant,Gaming Cafe,Comfort Food Restaurant
