# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
#Importing various required libraries
import numpy as np
import pandas as pd
import json
import requests

In [2]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [3]:
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup

In [4]:
from geopy.geocoders import Nominatim

In [5]:
!pip install folium #installing folium

Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 5.2 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


In [6]:
import folium 

In [7]:
from pandas import json_normalize #normalizing and flattens JSON data

In [8]:
#Getting data from the Wikipedia page and parsing data using BeautifulSoup (transforms the data in the Wiki page into a more readable format)
Data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
Parse = BeautifulSoup(Data, 'html.parser')

In [9]:
#Creating necessary headers for a dataframe and retrieving information needed
PostalCode = []
Borough = []
Neighborhood = []

for row in Parse.find('table').find_all('tr'):
    unit = row.find_all('td')
    if(len(unit)>0):
        PostalCode.append(unit[0].text)
        Borough.append(unit[1].text)
        Neighborhood.append(unit[2].text)

In [10]:
#Creating the dataframe with the necessary columns: PostalCode, Borough and Neighborhood
df_Toronto = pd.DataFrame({"PostalCode": PostalCode,
                          "Borough":Borough,
                          "Neighborhood": Neighborhood})
df_Toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


### Removing 'Not Assigned' values from the needed table

In [11]:
#Removing the \n and ignoring the 'Not assigned' Boroughs
df_Toronto['PostalCode'] = df_Toronto['PostalCode'].str.replace("\n","")
df_Toronto['Borough'] = df_Toronto['Borough'].str.replace("\n","")
df_Toronto['Neighborhood'] = df_Toronto['Neighborhood'].str.replace("\n","")

df_Toronto_update = df_Toronto[df_Toronto.Borough !="Not assigned"].reset_index(drop=True)
df_Toronto_update.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Combining the Neighborhoods by grouping PostalCode and Borough

In [12]:
#Combinging various neighborhood into one row with a comma
df_Toronto_Group = df_Toronto_update.groupby(["PostalCode","Borough"], as_index = False).agg(lambda x:",".join(x))
for index, row in df_Toronto_Group.iterrows():
    if row["Neighborhood"] =="Not assigned":
        row["Neighborhood"] = row["Borough"]
df_Toronto_Group.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Displaying the required table with PostalCode, Borough and Neighborhood

In [13]:
#Creating a new df for Question 2
Headers = ["PostalCode","Borough","Neighborhood"]
df_Tor = pd.DataFrame(columns=Headers)
List = ["M5G","M2H","M4B","M1J","M4G","M4M",'M1R',"M9V","M9L","M5V","M1B","M5A"]
for PostCode in List:
    df_Tor = df_Tor.append(df_Toronto_Group[df_Toronto_Group["PostalCode"]==PostCode], ignore_index = True)

df_Tor

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


### Displaying the number of rows in the dataframe

In [14]:
df_Toronto_Group.shape #using .shape to display the number of values in the matrix
print('Shape of the dataframe is: {}'.format(df_Toronto_Group))

Shape of the dataframe is:     PostalCode      Borough                                       Neighborhood
0          M1B  Scarborough                                     Malvern, Rouge
1          M1C  Scarborough             Rouge Hill, Port Union, Highland Creek
2          M1E  Scarborough                  Guildwood, Morningside, West Hill
3          M1G  Scarborough                                             Woburn
4          M1H  Scarborough                                          Cedarbrae
..         ...          ...                                                ...
98         M9N         York                                             Weston
99         M9P    Etobicoke                                          Westmount
100        M9R    Etobicoke  Kingsview Village, St. Phillips, Martin Grove ...
101        M9V    Etobicoke  South Steeles, Silverstone, Humbergate, Jamest...
102        M9W    Etobicoke                Northwest, West Humber - Clairville

[103 rows x 3 columns]


In [15]:
!ls

In [16]:
!pwd

/home/wsuser/work


In [17]:
!wget http://cocl.us/Geospatial_data

--2021-02-10 00:41:12--  http://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 169.63.96.176, 169.63.96.194
Connecting to cocl.us (cocl.us)|169.63.96.176|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cocl.us/Geospatial_data [following]
--2021-02-10 00:41:12--  https://cocl.us/Geospatial_data
Connecting to cocl.us (cocl.us)|169.63.96.176|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2021-02-10 00:41:13--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.29.197
Connecting to ibm.box.com (ibm.box.com)|107.152.29.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2021-02-10 00:41:13--  https://ibm.box.com/public/static/9afzr83p

In [18]:
#Using linux method to upload the Geospatial CSV
!ls 

Geospatial_data


In [19]:
!ls Geospatial_data

Geospatial_data


In [20]:
!cat Geospatial_data

Postal Code,Latitude,Longitude
M1B,43.8066863,-79.1943534
M1C,43.7845351,-79.1604971
M1E,43.7635726,-79.1887115
M1G,43.7709921,-79.2169174
M1H,43.773136,-79.2394761
M1J,43.7447342,-79.2394761
M1K,43.7279292,-79.2620294
M1L,43.7111117,-79.2845772
M1M,43.716316,-79.2394761
M1N,43.692657,-79.2648481
M1P,43.7574096,-79.273304
M1R,43.7500715,-79.2958491
M1S,43.7942003,-79.2620294
M1T,43.7816375,-79.3043021
M1V,43.8152522,-79.2845772
M1W,43.7995252,-79.3183887
M1X,43.8361247,-79.2056361
M2H,43.8037622,-79.3634517
M2J,43.7785175,-79.3465557
M2K,43.7869473,-79.385975
M2L,43.7574902,-79.3747141
M2M,43.789053,-79.4084928
M2N,43.7701199,-79.4084928
M2P,43.7527583,-79.4000493
M2R,43.7827364,-79.4422593
M3A,43.7532586,-79.3296565
M3B,43.7459058,-79.352188
M3C,43.7258997,-79.340923
M3H,43.7543283,-79.4422593
M3J,43.7679803,-79.4872619
M3K,43.7374732,-79.4647633
M3L,43.7390146,-79.5069436
M3M,43.7284964,-79.4956974
M3N,43.7616313,-79

In [21]:
!mv Geospatial_data Geospatial_Coordinates.csv #Changing Geospatial_data to Geospatial_Coordinates.csv

In [22]:
!ls

Geospatial_Coordinates.csv


In [23]:
# Read the the csv uploaded to the virtual machine
df_Geo_data = pd.read_csv("Geospatial_Coordinates.csv") 

In [24]:
df_Geo_data.head() #displaying the top 5 rows

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [25]:
#Merging the two dataframes using Left join method and the "Postal Code" column
df_Tor_PC = pd.merge(df_Toronto_Group,df_Geo_data, left_on = "PostalCode", right_on = "Postal Code", how="left")
df_Tor_PC.drop('Postal Code', axis=1, inplace = True) #removing the redundant column
df_Tor_PC.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Displaying the required table with PostalCode, Borough, Neighborhood, Latitude and Longitude

In [26]:
#Creating a new dataframe with the latitude and longitude
Headers2 = ["PostalCode","Borough","Neighborhood","Latitude","Longitude"] #required headers for the needed table
df_Tor_1 = pd.DataFrame(columns=Headers2)

for postcode in List:
    df_Tor_1 = df_Tor_1.append(df_Tor_PC[df_Tor_PC["PostalCode"]==postcode])

df_Tor_2 = df_Tor_1.reset_index()
df_Tor_2 = df_Tor_2.drop("index", axis = 1)
df_Tor_2

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442


In [27]:
#Using geocoder and Nominatim to get the exact coordinates of Toronto, ON
address = 'Toronto, ON'
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent='Toronto_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Toronto are {}, {}'.format(latitude, longitude))

The geographical coordinate of Toronto are 43.6534817, -79.3839347


## Map of the Neighborhood of Toronto

In [28]:
map_Toronto = folium.Map(location = [latitude, longitude], zoom_start =10)

for lat, lng, borough, neighborhood in zip(df_Tor_PC['Latitude'],
                                          df_Tor_PC['Longitude'],
                                          df_Tor_PC['Borough'],
                                          df_Tor_PC['Neighborhood']):
    label = '{},{}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat, lng],
    radius =5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(map_Toronto)
    
map_Toronto

In [29]:
#Creating a dataframe with Boroughs that consist the word Toronto
df_Tor_data = df_Tor_PC[df_Tor_PC['Borough'].str.contains('Toronto')].reset_index(drop=True) #have to include.str.contains() as the Boroughs have East, Downtown Toronto etc.
df_Tor_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


## Map of the Neighborhood of Toronto with Boroughs involving the word 'Toronto'

In [30]:
map_Tor_data = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, lng, borough, neighborhood in zip(df_Tor_data['Latitude'],
                           df_Tor_data['Longitude'],
                           df_Tor_data['Borough'],
                           df_Tor_data['Neighborhood']):
    label = '{},{}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat, lng],
    radius =5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(map_Tor_data)

map_Tor_data

In [31]:
#Using Foursquare API and Credentials to retrieve various venues
CLIENT_ID = 'VQSPI1IZ52EK30N3P41WN1ZKLKMXWWVAARP00PEFD1YV3TIC'
CLIENT_SECRET = 'LGSC53KN0SLJUFFCWLRS0MANOZ3PH2N1YZKG40E1W2FURE5X'
VERSION = '20201212'

### Picking a random neighborhood to explore

In [32]:
# Picking a random Neighborhood to explore the venues around
df_Ngbh = df_Tor_data.loc[0, 'Neighborhood']
df_Ngbh

'The Beaches'

In [33]:
#Getting the specific Neighborhood location
df_Ngbh_lat = df_Tor_data.loc[0, 'Latitude']
df_Ngbh_lng = df_Tor_data.loc[0, 'Longitude']

print('Latitude and Longitude of {} are {}, {}'.format(df_Ngbh, df_Ngbh_lat, df_Ngbh_lng))

Latitude and Longitude of The Beaches are 43.67635739999999, -79.2930312


In [34]:
#Utilizing Foursquare to get data for various locations
LIMIT = 50
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, 
                                                                                                                            CLIENT_SECRET,
                                                                                                                            VERSION,
                                                                                                                            df_Ngbh_lat,
                                                                                                                            df_Ngbh_lng,
                                                                                                                            radius,
                                                                                                                            LIMIT)
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '60232bb1dba4b9223d677f2f'},
 'response': {'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.67682094413784,
          'lng': -79.29394208780985}],
        'distanc

In [35]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list)==0:
        return None
    else:
        return categories_list[0]['name']

In [36]:
#Transforming the data extracted into a pandas dataframe
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)

#Getting specific columns and categories for the dataframe
filtered_columns = ['venue.name','venue.categories','venue.location.lat','venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

nearby_venues['venue.categories']=nearby_venues.apply(get_category_type, axis=1)

nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Upper Beaches,Neighborhood,43.680563,-79.292869


In [37]:
# To get nearby venues
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        #create API request URL
        url ='https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, 
                                                                                                                            CLIENT_SECRET,
                                                                                                                            VERSION,
                                                                                                                            lat,
                                                                                                                            lng,
                                                                                                                            radius,
                                                                                                                            LIMIT)
        #make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        #return only relevant info for each nearby venue
        venues_list.append([(
            name,
        lat,
        lng,
        v['venue']['name'],
        v['venue']['location']['lat'],
        v['venue']['location']['lng'],
        v['venue']['categories'][0]['name']) for v in results])
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood',
                            'Neighborhood Latitude',
                            'Neighborhood Longitude',
                            'Venue',
                            'Venue Latitude',
                            'Venue Longitude',
                            'Venue Category']
    return(nearby_venues)

### Venues nearby for each neighborhood

In [38]:
Toronto_venues = getNearbyVenues(names=df_Tor_data['Neighborhood'],
                                latitudes=df_Tor_data['Latitude'],
                                longitudes=df_Tor_data['Longitude'])

Toronto_venues.head() #displaying the top 5 venues

The Beaches
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West,  Lawrence Park
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North & West, Forest Hill Road Park
The Annex, North Midtown, Yorkville
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Stn A PO Boxes
First Canadian Place, Underground city
Christie
Dufferin, Dovercourt Village
Little Portugal, Trinity
Brockton, Parkdale Village, Exhibition Place
High

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,MenEssentials,43.67782,-79.351265,Cosmetics Shop


In [39]:
Toronto_venues.groupby('Neighborhood').count() #grouping the venues by the neighborhood

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,50,50,50,50,50,50
"Brockton, Parkdale Village, Exhibition Place",23,23,23,23,23,23
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",18,18,18,18,18,18
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",16,16,16,16,16,16
Central Bay Street,50,50,50,50,50,50
Christie,15,15,15,15,15,15
Church and Wellesley,50,50,50,50,50,50
"Commerce Court, Victoria Hotel",50,50,50,50,50,50
Davisville,35,35,35,35,35,35
Davisville North,8,8,8,8,8,8


In [40]:
#Number of unique categories
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 211 uniques categories.


### Analyzing each neighborhood

In [41]:
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = Toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Unnamed: 0,Yoga Studio,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
Toronto_onehot.shape

(1166, 211)

### Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category


In [43]:
Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0625,0.0625,0.0625,0.125,0.1875,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.0
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.04,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Each neighborhood along with the top 10 most common venues


In [44]:
num_top_venues = 5

for hood in Toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = Toronto_grouped[Toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')
    
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#Top 10 venues for each neighborhood
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

----Berczy Park----
                venue  freq
0         Coffee Shop  0.08
1        Cocktail Bar  0.06
2         Cheese Shop  0.04
3  Seafood Restaurant  0.04
4              Bakery  0.04


----Brockton, Parkdale Village, Exhibition Place----
            venue  freq
0            Café  0.13
1     Coffee Shop  0.09
2  Breakfast Spot  0.09
3          Bakery  0.09
4             Gym  0.04


----Business reply mail Processing Centre, South Central Letter Processing Plant Toronto----
                venue  freq
0  Light Rail Station  0.11
1         Yoga Studio  0.06
2                 Spa  0.06
3       Garden Center  0.06
4              Garden  0.06


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
              venue  freq
0   Airport Service  0.19
1    Airport Lounge  0.12
2  Airport Terminal  0.12
3     Boat or Ferry  0.06
4   Harbor / Marina  0.06


----Central Bay Street----
             venue  freq
0      Coffee Shop  0.1

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Cheese Shop,Beer Bar,Restaurant,Bakery,Farmers Market,Department Store,Liquor Store
1,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Bakery,Breakfast Spot,Furniture / Home Store,Performing Arts Venue,Stadium,Nightclub,Intersection,Italian Restaurant
2,"Business reply mail Processing Centre, South C...",Light Rail Station,Yoga Studio,Gym / Fitness Center,Fast Food Restaurant,Burrito Place,Restaurant,Auto Workshop,Spa,Smoke Shop,Brewery
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport Terminal,Boat or Ferry,Harbor / Marina,Boutique,Airport,Airport Food Court,Airport Gate,Rental Car Location
4,Central Bay Street,Coffee Shop,Café,Sandwich Place,Burger Joint,Bubble Tea Shop,Italian Restaurant,Yoga Studio,Indian Restaurant,Spa,Donut Shop


### Clustering Neighborhoods via K-Means

In [51]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

#Using K-Cluster
kclusters = 5

Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [52]:
# Now creating a dataframe to encompass the clusters
Toronto_merged = df_Tor_data

# merge Toronto_grouped with Toronto_data to add latitude/longitude for each neighborhood
Toronto_merged = Toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Trail,Health Food Store,Pub,Wine Shop,Cupcake Shop,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,1,Greek Restaurant,Coffee Shop,Italian Restaurant,Furniture / Home Store,Restaurant,Ice Cream Shop,Yoga Studio,Café,Dessert Shop,Spa
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,1,Sandwich Place,Fast Food Restaurant,Gym,Liquor Store,Brewery,Restaurant,Italian Restaurant,Intersection,Ice Cream Shop,Pub
3,M4M,East Toronto,Studio District,43.659526,-79.340923,1,Coffee Shop,Gastropub,Bakery,Brewery,Café,American Restaurant,Yoga Studio,Convenience Store,Bookstore,Cheese Shop
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,4,Bus Line,Park,Swim School,Wine Shop,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner


### Map of Toronto with 5 clusters

In [53]:
# create map for visualization
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], 
                                  Toronto_merged['Longitude'], 
                                  Toronto_merged['Neighborhood'],
                                  Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine each of the 5 clusters

### 1st Cluster

In [54]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 0, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,0,Trail,Health Food Store,Pub,Wine Shop,Cupcake Shop,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store


### 2nd Cluster

In [55]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 1, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,East Toronto,1,Greek Restaurant,Coffee Shop,Italian Restaurant,Furniture / Home Store,Restaurant,Ice Cream Shop,Yoga Studio,Café,Dessert Shop,Spa
2,East Toronto,1,Sandwich Place,Fast Food Restaurant,Gym,Liquor Store,Brewery,Restaurant,Italian Restaurant,Intersection,Ice Cream Shop,Pub
3,East Toronto,1,Coffee Shop,Gastropub,Bakery,Brewery,Café,American Restaurant,Yoga Studio,Convenience Store,Bookstore,Cheese Shop
5,Central Toronto,1,Hotel,Dance Studio,Park,Breakfast Spot,Gym / Fitness Center,Sandwich Place,Department Store,Food & Drink Shop,Doner Restaurant,Dog Run
6,Central Toronto,1,Clothing Store,Coffee Shop,Gift Shop,Sporting Goods Shop,Café,Chinese Restaurant,Diner,Fast Food Restaurant,Ice Cream Shop,Mexican Restaurant
7,Central Toronto,1,Pizza Place,Sandwich Place,Dessert Shop,Gym,Coffee Shop,Café,Italian Restaurant,Sushi Restaurant,Dance Studio,Farmers Market
9,Central Toronto,1,Coffee Shop,American Restaurant,Fried Chicken Joint,Supermarket,Sushi Restaurant,Bagel Shop,Bank,Restaurant,Pub,Pizza Place
11,Downtown Toronto,1,Coffee Shop,Café,Restaurant,Pub,Bakery,Italian Restaurant,Pizza Place,Gift Shop,Butcher,Caribbean Restaurant
12,Downtown Toronto,1,Sushi Restaurant,Coffee Shop,Japanese Restaurant,Yoga Studio,Men's Store,Café,Restaurant,Gay Bar,Ice Cream Shop,Diner
13,Downtown Toronto,1,Coffee Shop,Pub,Park,Bakery,Breakfast Spot,Café,Theater,Wine Shop,Electronics Store,Performing Arts Venue


### 3rd Cluster

In [56]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 2, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Central Toronto,2,Park,Trail,Wine Shop,Dance Studio,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store
10,Downtown Toronto,2,Park,Playground,Trail,Wine Shop,Cuban Restaurant,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner
23,Central Toronto,2,Park,Jewelry Store,Trail,Sushi Restaurant,Wine Shop,Dance Studio,Donut Shop,Doner Restaurant,Dog Run,Distribution Center


### 4th Cluster

In [57]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 3, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,Central Toronto,3,Fast Food Restaurant,Garden,Home Service,Wine Shop,Dance Studio,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store


### 5th Cluster

In [58]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 4, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Central Toronto,4,Bus Line,Park,Swim School,Wine Shop,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner
