# Segmenting and Clustering Neighborhoods in Toronto

### Scraping data from Wikipedia

In [225]:
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page
import pandas as pd

In [226]:
#Due to changes in the wikipedia page I have to use another URL containing the original information needed for this practice
url = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1011037969"

In [227]:
data  = requests.get(url).text

In [228]:
soup = BeautifulSoup(data,"html5lib")

In [229]:
tables = soup.find_all('table')
len(tables)

3

In [230]:
#print(tables[0].prettify())

In [231]:
Nei_data = pd.DataFrame(columns=["PostalCode", "Borough", "Neighborhood"])

for row in tables[0].tbody.find_all("tr"):
    col = row.find_all("td")
    if (col != []):
        PCode = col[0].text.rstrip('\n')
        Borough = col[1].text.rstrip('\n')
        Neighborhood = col[2].text.rstrip('\n')
        Nei_data = Nei_data.append({"PostalCode":PCode, "Borough":Borough, "Neighborhood":Neighborhood}, ignore_index=True)

Nei_data

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [232]:
#Nei_data[Nei_data['Borough'] == 'Not assigned']

In [233]:
#Nei_data[Nei_data['Neighborhood']== 'Not assigned']

### Pre - procesing Borough Data
#### - Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
#### - More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma.
#### - If a cell has a borough but a Not assigned  neighborhood, then the neighborhood will be the same as the borough.

In [234]:
# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
print("Before: ",Nei_data.shape)

Nei_data.drop(Nei_data[Nei_data['Borough'] == 'Not assigned'].index, inplace = True)

print("After: ",Nei_data.shape)

Before:  (180, 3)
After:  (103, 3)


In [235]:
# More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma

postal_code = Nei_data[['PostalCode','Neighborhood']]
duplicates = postal_code['PostalCode'].value_counts().to_frame()
duplicates[duplicates['PostalCode'] > 1]

# There is no duplicates per Postal Code, maybe someone has changed the source table in wikipedia. It has no sense to combine rows to get all the neighborhoods linked to one postal code in one row and separated by commas because that's the way it is in Wikipedia.

Unnamed: 0,PostalCode


In [236]:
# despite there is no duplicates postal codes, next you can find the code to combine several Neighborhoods under the same postal code.

# We get a list of all postal codes without duplicates.
postal_code_list = list(Nei_data['PostalCode'].drop_duplicates())

# and we create a dicionary getting all the Neighborhoods (separated by commas) linked to each postal code.
postal_code_dic = dict({'PostalCode':postal_code_list,'Neighborhood':[','.join(list(postal_code[postal_code['PostalCode']==item]['Neighborhood'])) for item in postal_code_list]})

#transform the dictionary to a dataframe
postal_code_datafr = pd.DataFrame(postal_code_dic)

postal_code_datafr.head()


Unnamed: 0,PostalCode,Neighborhood
0,M3A,Parkwoods
1,M4A,Victoria Village
2,M5A,"Regent Park, Harbourfront"
3,M6A,"Lawrence Manor, Lawrence Heights"
4,M7A,"Queen's Park, Ontario Provincial Government"


In [237]:
# We drop the original Neighborhood column
Nei_data.drop(columns = ['Neighborhood'], inplace = True)

# delete duplicates
Nei_data.drop_duplicates(inplace = True)

# and place the new column Neighborhood with the values separted by commas 
Nei_data = Nei_data.merge(postal_code_datafr, on = 'PostalCode')

Nei_data.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [238]:
#  If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

Nei_data['Neighborhood'].replace('Not assigned',Nei_data['Borough'], inplace=True)
Nei_data.head(12)
# Please notice that the order of the elements differ from the exam screenshot example due to changes in the wikipedia changes.

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [239]:
# In case you want to check the same records given by the author next the dataframe filtered getting the exam example postal codes same order.

Order = {'M5G':0,'M2H':1,'M4B':2,'M1J':3,'M4G':4,'M4M':5,'M1R':6,'M9V':7,'M9L':8,'M5V':9,'M1B':10,'M5A':11}

Nei_data[Nei_data['PostalCode'].isin(['M5G','M2H','M4B','M1J','M4G','M4M','M1R','M9V','M9L','M5V','M1B','M5A'])]

Nei_data_exam = Nei_data[Nei_data['PostalCode'].isin(['M5G','M2H','M4B','M1J','M4G','M4M','M1R','M9V','M9L','M5V','M1B','M5A'])]
Nei_data_exam['Order'] = [Order[item] for item in Nei_data_exam['PostalCode']]
Nei_data_exam.set_index('Order', inplace = True)
Nei_data_exam.sort_values(by=['Order'], inplace = True)
Nei_data_exam.reset_index(inplace = True)
Nei_data_exam.drop(columns = ['Order'], inplace = True)
Nei_data_exam

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


In [240]:
Nei_data.shape

(103, 3)

### Getting Longitude and Latitude

In [241]:
pip install geocoder

Note: you may need to restart the kernel to use updated packages.



import geocoder # import geocoder

def lat_long (postal_code):
    # initialize your variable to None
    print(postal_code)
    for item in postal_code:
        lat_lng_coords = None
        print('item',str(item))
        # loop until you get the coordinates
        print('starting loop',lat_lng_coords)
        while(lat_lng_coords is None):
          g = geocoder.google('{}, Toronto, Ontario'.format(str(item)))
          lat_lng_coords = g.latlng
        print('lat_long',lat_lng_coords)

        latitude.extend(lat_lng_coords[0])
        longitude.extend(lat_lng_coords[1])
    
    return latitude, longitude

Nei_data_test = Nei_data.loc[0:2,:]

Nei_data_test['Latitude'], Nei_data_test['Longitude'] = lat_long(list(Nei_data_test['PostalCode']))

#Nei_data['Latitude'], Nei_data['Longitude'] = lat_long(Nei_data['PostalCode'])

Nei_data_test
    

In [242]:
# As I am not getting anything from geocoderI am going to try with th csv file.
# The URL given by the author is not working.

#lat_long_pd = pd.read_csv('http://cocl.us/Geospatial_data')
#lat_long_pd.head()

# Once I have manually downloaded in my PC the csv file and once I have uploaded it to my IBM Cloud Project:

import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
client_bb27e20917e047638a8ac4e4bb04c59d = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='83_0prJpPAE4r_6XaRZ4AbaluezDEXajJo8F1eF-fiRH',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

body = client_bb27e20917e047638a8ac4e4bb04c59d.get_object(Bucket='datasciencecapstone-donotdelete-pr-nifdfkgyttirop',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

lat_long_pd = pd.read_csv(body)
lat_long_pd.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [243]:
Nei_data = Nei_data.merge(lat_long_pd, left_on = 'PostalCode', right_on = 'Postal Code')
Nei_data.drop(columns='Postal Code', inplace = True)
Nei_data.head(12)
# Please notice that the order of the elements differ from the exam screenshot example due to changes in the wikipedia changes.

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [244]:
# In case you want to check the same records given by the author next the dataframe filtered getting the exam example postal codes same order.

Order = {'M5G':0,'M2H':1,'M4B':2,'M1J':3,'M4G':4,'M4M':5,'M1R':6,'M9V':7,'M9L':8,'M5V':9,'M1B':10,'M5A':11}

Nei_data[Nei_data['PostalCode'].isin(['M5G','M2H','M4B','M1J','M4G','M4M','M1R','M9V','M9L','M5V','M1B','M5A'])]

Nei_data_exam = Nei_data[Nei_data['PostalCode'].isin(['M5G','M2H','M4B','M1J','M4G','M4M','M1R','M9V','M9L','M5V','M1B','M5A'])]
Nei_data_exam['Order'] = [Order[item] for item in Nei_data_exam['PostalCode']]
Nei_data_exam.set_index('Order', inplace = True)
Nei_data_exam.sort_values(by=['Order'], inplace = True)
Nei_data_exam.reset_index(inplace = True)
Nei_data_exam.drop(columns = ['Order'], inplace = True)
Nei_data_exam

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442


In [245]:
Nei_data.shape

(103, 5)

### Explore and Cluster the Neighborhoods in Toronto

In [246]:
# filtering only those Boroughs that contains 'Toronto'
toronto_data = Nei_data[Nei_data['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [247]:
toronto_data.shape

(40, 5)

In [248]:
# Installing folium to print the results
!pip install folium



In [249]:
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab

In [250]:
import folium # map rendering library
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

Getting Latitude and Longitud for Toronto to center the map.

In [251]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


Printing the map and marking all the Neighborhood

In [252]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Getting connect to FourSquare

In [253]:
CLIENT_ID = 'V5WZ3224L3CJMBORRTRGZWIA5WHQKTXQKWO2H3AIN1X4JXQM' # your Foursquare ID
CLIENT_SECRET = 'ZXCRS32S04XLSSKBJMME3XE5D4XADXKRFCLIWK1THX4O12NL' # your Foursquare Secret
VERSION = '20180604' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: V5WZ3224L3CJMBORRTRGZWIA5WHQKTXQKWO2H3AIN1X4JXQM
CLIENT_SECRET:ZXCRS32S04XLSSKBJMME3XE5D4XADXKRFCLIWK1THX4O12NL


Getting all the venues linked to each Neighborhood

In [254]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [255]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],latitudes=toronto_data['Latitude'],longitudes=toronto_data['Longitude'])
toronto_venues.head()

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Runnymede, The Junction, Weston-Pellam Park, Carlton Village
Davisville North
Forest Hill North & West, Forest Hill Road Park
High Park, The Junction South
North Toronto West,  Lawrence Park
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Har

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


Checking that we are not loosing any Neighborhood

In [256]:
toronto_venues['Neighborhood'].value_counts().shape

(40,)

In [257]:
print(toronto_venues.shape)
toronto_venues.head()

(1596, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


We get the number of items per Neighborhood

In [258]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,59,59,59,59,59,59
"Brockton, Parkdale Village, Exhibition Place",23,23,23,23,23,23
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",13,13,13,13,13,13
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",17,17,17,17,17,17
Central Bay Street,60,60,60,60,60,60
Christie,15,15,15,15,15,15
Church and Wellesley,78,78,78,78,78,78
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,34,34,34,34,34,34
Davisville North,8,8,8,8,8,8


In [259]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 233 uniques categories.


Next step is to get the dummy values for each Venue Category and adding the column Neighborhood from the toronto_venues dataframe

In [260]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 


In [261]:
# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[list(toronto_onehot.columns).index('Neighborhood')]] + list(toronto_onehot.columns[:list(toronto_onehot.columns).index('Neighborhood')]) + list(toronto_onehot.columns[list(toronto_onehot.columns).index('Neighborhood')+1:-1])

toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Calculate the mean for every Venue Category within each Neighborhood. This dataframe will be used by Kmeans algorithm.

In [262]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.016949,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.058824,0.058824,0.058824,0.117647,0.117647,0.117647,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.016667,0.0,0.0,0.016667,0.0
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.012821,0.0,0.0,0.0,0.0,0.0,0.0,0.012821,0.0,...,0.012821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.029412,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [263]:
toronto_grouped.shape

(40, 232)

We create another dataset containg N columns showing the N most common venues for each Neighborhood.

In [264]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [265]:
import numpy as np # library to handle data in a vectorized manner

#places_neig = ['Neighborhood','Restaurant','Park','Bakery','Gym','Bar','Grocery Store','Beer Bar','Steakhouse','Bank','Pharmacy']

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind,:], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Pharmacy,Cheese Shop,Restaurant,Farmers Market,Beer Bar,Seafood Restaurant,Greek Restaurant
1,"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Coffee Shop,Gym / Fitness Center,Bakery,Stadium,Burrito Place,Restaurant,Climbing Gym,Performing Arts Venue
2,"Business reply mail Processing Centre, South C...",Gym / Fitness Center,Garden Center,Brewery,Park,Farmers Market,Light Rail Station,Fast Food Restaurant,Burrito Place,Restaurant,Garden
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport Terminal,Boutique,Plane,Sculpture Garden,Boat or Ferry,Rental Car Location,Bar,Coffee Shop
4,Central Bay Street,Coffee Shop,Italian Restaurant,Sandwich Place,Café,Bubble Tea Shop,Salad Place,Burger Joint,Japanese Restaurant,Thai Restaurant,Comic Shop


## Cluster Neighborhoods

#### We are going to set the number of cluster to seven as we think with this number the cluster are usefull.

In [266]:
from sklearn.cluster import KMeans
# set number of clusters

kclusters = 7

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)


# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 6, 1, 1, 1, 6, 1, 1, 1, 1], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top N venues for each neighborhood.

In [267]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')



In [268]:
toronto_merged.tail(10) # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
30,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,4,Gym,Trail,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
31,M5T,Downtown Toronto,"Kensington Market, Chinatown, Grange Park",43.653206,-79.400049,6,Café,Coffee Shop,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Mexican Restaurant,Park,Grocery Store,Burger Joint,Bar,Caribbean Restaurant
32,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049,1,Coffee Shop,Sushi Restaurant,Fried Chicken Joint,Liquor Store,Restaurant,Bank,Supermarket,Bagel Shop,Pub,Light Rail Station
33,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442,1,Airport Lounge,Airport Service,Airport Terminal,Boutique,Plane,Sculpture Garden,Boat or Ferry,Rental Car Location,Bar,Coffee Shop
34,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,3,Park,Playground,Trail,Wine Shop,Diner,Event Space,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant
35,M5W,Downtown Toronto,Stn A PO Boxes,43.646435,-79.374846,1,Coffee Shop,Seafood Restaurant,Restaurant,Beer Bar,Café,Italian Restaurant,Japanese Restaurant,Bakery,Cocktail Bar,Cheese Shop
36,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675,1,Bakery,Pizza Place,Coffee Shop,Café,Restaurant,Italian Restaurant,Pub,Chinese Restaurant,Outdoor Sculpture,Pet Store
37,M5X,Downtown Toronto,"First Canadian Place, Underground city",43.648429,-79.38228,1,Coffee Shop,Café,Hotel,Restaurant,Japanese Restaurant,Gym,Steakhouse,Deli / Bodega,American Restaurant,Asian Restaurant
38,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,1,Coffee Shop,Sushi Restaurant,Japanese Restaurant,Gay Bar,Restaurant,Mediterranean Restaurant,Fast Food Restaurant,Hotel,Men's Store,Café
39,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,1,Gym / Fitness Center,Garden Center,Brewery,Park,Farmers Market,Light Rail Station,Fast Food Restaurant,Burrito Place,Restaurant,Garden


Finally, let's visualize the resulting clusters

In [269]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [270]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Let´s check the two most common clusters:

In [271]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Regent Park, Harbourfront",1,Coffee Shop,Park,Bakery,Restaurant,Café,Theater,Breakfast Spot,Pub,Hotel,Event Space
1,"Queen's Park, Ontario Provincial Government",1,Coffee Shop,Diner,Sushi Restaurant,Art Gallery,Italian Restaurant,Distribution Center,Bar,Bank,Japanese Restaurant,Fried Chicken Joint
2,"Garden District, Ryerson",1,Clothing Store,Coffee Shop,Italian Restaurant,Café,Bubble Tea Shop,Middle Eastern Restaurant,Japanese Restaurant,Hotel,Cosmetics Shop,Ramen Restaurant
3,St. James Town,1,Café,Coffee Shop,Cocktail Bar,Gastropub,Park,Farmers Market,Seafood Restaurant,Restaurant,Italian Restaurant,Gym
5,Berczy Park,1,Coffee Shop,Cocktail Bar,Bakery,Pharmacy,Cheese Shop,Restaurant,Farmers Market,Beer Bar,Seafood Restaurant,Greek Restaurant
6,Central Bay Street,1,Coffee Shop,Italian Restaurant,Sandwich Place,Café,Bubble Tea Shop,Salad Place,Burger Joint,Japanese Restaurant,Thai Restaurant,Comic Shop
8,"Richmond, Adelaide, King",1,Coffee Shop,Café,Clothing Store,Restaurant,Bakery,Deli / Bodega,Thai Restaurant,Hotel,Gym,Asian Restaurant
10,"Harbourfront East, Union Station, Toronto Islands",1,Coffee Shop,Aquarium,Hotel,Café,Fried Chicken Joint,Restaurant,Brewery,Italian Restaurant,Scenic Lookout,Pizza Place
12,"The Danforth West, Riverdale",1,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Bubble Tea Shop,Indian Restaurant,Spa,Japanese Restaurant,Juice Bar
13,"Toronto Dominion Centre, Design Exchange",1,Coffee Shop,Hotel,Café,Restaurant,Italian Restaurant,Salad Place,Seafood Restaurant,Bakery,Japanese Restaurant,Concert Hall


In [272]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 6, toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,Christie,6,Grocery Store,Café,Park,Coffee Shop,Italian Restaurant,Candy Store,Restaurant,Nightclub,Baby Store,Cupcake Shop
9,"Dufferin, Dovercourt Village",6,Pharmacy,Bakery,Music Venue,Café,Middle Eastern Restaurant,Bar,Supermarket,Bank,Pool,Brewery
11,"Little Portugal, Trinity",6,Bar,Men's Store,Restaurant,Vegetarian / Vegan Restaurant,Asian Restaurant,Café,Beer Store,Japanese Restaurant,Brewery,Juice Bar
14,"Brockton, Parkdale Village, Exhibition Place",6,Café,Breakfast Spot,Coffee Shop,Gym / Fitness Center,Bakery,Stadium,Burrito Place,Restaurant,Climbing Gym,Performing Arts Venue
23,"High Park, The Junction South",6,Café,Mexican Restaurant,Thai Restaurant,Bakery,Arts & Crafts Store,Discount Store,Diner,Bar,Flea Market,Fried Chicken Joint
25,"The Annex, North Midtown, Yorkville",6,Café,Sandwich Place,Coffee Shop,Liquor Store,Indian Restaurant,Flower Shop,Middle Eastern Restaurant,Pub,BBQ Joint,History Museum
28,"University of Toronto, Harbord",6,Café,Japanese Restaurant,Bar,Bookstore,Bakery,Dessert Shop,Poutine Place,Pub,Restaurant,Sushi Restaurant
31,"Kensington Market, Chinatown, Grange Park",6,Café,Coffee Shop,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Mexican Restaurant,Park,Grocery Store,Burger Joint,Bar,Caribbean Restaurant
