# Week 3 - Capstone Project:
## Segmenting and Clustering Neighborhoods in Toronto

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import csv
import json
import requests
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

### Step 1: Scraping Data (table) from Wikipedia:

In [2]:
#fetch html from website WIKI:

origin = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text
H = BeautifulSoup(origin, 'lxml')

#print (H) -  To check on the HTML

#fetch table from HTML:

table = H.find('table', {'class':'wikitable sortable'})
X0_table = H.find_all('tr')

#print (table)
#print (X0_table)

#Add data from table in the HTML to a vector
raw_data = []

for X in X0_table:
    raw_data.append([Y.text.strip() for Y in X.find_all('td')])

print (raw_data[181:])

[['', ''], ['NL\n\nNS\n\nPE\n\nNB\n\nQC\n\nON\n\nMB\n\nSK\n\nAB\n\nBC\n\nNU/NT\n\nYT\n\n\nA\n\nB\n\nC\n\nE\n\nG\n\nH\n\nJ\n\nK\n\nL\n\nM\n\nN\n\nP\n\nR\n\nS\n\nT\n\nV\n\nX\n\nY', 'NL', 'NS', 'PE', 'NB', 'QC', 'ON', 'MB', 'SK', 'AB', 'BC', 'NU/NT', 'YT', 'A', 'B', 'C', 'E', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'R', 'S', 'T', 'V', 'X', 'Y'], ['NL', 'NS', 'PE', 'NB', 'QC', 'ON', 'MB', 'SK', 'AB', 'BC', 'NU/NT', 'YT'], ['A', 'B', 'C', 'E', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'R', 'S', 'T', 'V', 'X', 'Y']]


In [3]:
print (raw_data[1])

['M1A', 'Not assigned', 'Not assigned']


### Step 2: Cleansing raw_data from bad data with Pandas:

In [4]:
del raw_data[0]
del raw_data[180:]
len(raw_data)

180

### Step 3: Creating Data Frame from List (raw_data) with Pandas:

In [5]:
tordf = pd.DataFrame(raw_data, columns = {'Postal Code','Borough', 'Neighborhood'})
tordf.head()

Unnamed: 0,Borough,Postal Code,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Columns have mixed due to a dictionary order functionality, so I reorganized the columns

In [6]:
tordf.rename(columns = {'Borough':'Postal Code', 'Neighborhood':'Borough', 'Postal Code':'Neighborhood'}, inplace = True)
tordf.head()

Unnamed: 0,Postal Code,Neighborhood,Borough
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Step 4 - More Data Cleansing: Not Assigned elimination, and same Postal Code

In [7]:
newdf = tordf.set_index('Borough').drop(['Not assigned'])

In [8]:
newdf.shape

(103, 2)

In [9]:
newdf.reset_index(level=0, inplace=True)

In [10]:
newdf.shape

(103, 3)

In [11]:
newdf.head()

Unnamed: 0,Borough,Postal Code,Neighborhood
0,Parkwoods,M3A,North York
1,Victoria Village,M4A,North York
2,"Regent Park, Harbourfront",M5A,Downtown Toronto
3,"Lawrence Manor, Lawrence Heights",M6A,North York
4,"Queen's Park, Ontario Provincial Government",M7A,Downtown Toronto


## Instruction: Build a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name with its proper geospatial data.

### Step 1: Load geospatial data from csv.

In [12]:
geotor = pd.read_csv(r'C:\Users\Andres\Documents\GEODATA\Geospatial_Coordinates_TOR-CA.csv')

In [13]:
geotor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Step 2: Create two Datasets that will be indexed by Postal Code

In [14]:
pcdf = newdf.set_index('Postal Code')

In [15]:
pcdf.head()

Unnamed: 0_level_0,Borough,Neighborhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,Parkwoods,North York
M4A,Victoria Village,North York
M5A,"Regent Park, Harbourfront",Downtown Toronto
M6A,"Lawrence Manor, Lawrence Heights",North York
M7A,"Queen's Park, Ontario Provincial Government",Downtown Toronto


In [16]:
pcgeodf = geotor.set_index('Postal Code')

In [17]:
pcgeodf.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


### Step 3: Merge the two new Datasets with an Inner Join

In [18]:
thedf = pd.concat([pcdf, pcgeodf], axis=1, join='inner')

In [19]:
thedf.reset_index(level=0, inplace=True)

In [20]:
thedf.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,Parkwoods,North York,43.753259,-79.329656
1,M4A,Victoria Village,North York,43.725882,-79.315572
2,M5A,"Regent Park, Harbourfront",Downtown Toronto,43.65426,-79.360636
3,M6A,"Lawrence Manor, Lawrence Heights",North York,43.718518,-79.464763
4,M7A,"Queen's Park, Ontario Provincial Government",Downtown Toronto,43.662301,-79.389494


In [21]:
thedf.shape

(103, 5)

## Instruction: Explore and cluster the neighborhoods in Toronto. You can decide to work with only boroughs that contain the word Toronto and then replicate the same analysis we did to the New York City data. It is up to you.

### Step 1: We use geolocator to get exact coordinates from Toronto

#### (The following code come from straight from the lab, I adjusted them for the Toronto area)

In [22]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [23]:
# Create map of Toronto with Folium
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# With this for we add the markers with the respected colors.
for lat, lng, borough, neighborhood in zip(thedf['Latitude'], thedf['Longitude'], thedf['Borough'], thedf['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Step 2: Test Connection to the API, make sure the version is the right day.

In [24]:
CLIENT_ID = 'CBR5UZU5JDYU2FAXPU1OQQWPYB4YVKBLJOTMIQTC3SDYO2CZ' # your Foursquare ID
CLIENT_SECRET = 'B3GM2R54H3S0X5QGLVDDO0EFOSQKPZ5G1TM4JEWTFJWUG2HV' # your Foursquare Secret
VERSION = '20200724' # Foursquare API version


print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: CBR5UZU5JDYU2FAXPU1OQQWPYB4YVKBLJOTMIQTC3SDYO2CZ
CLIENT_SECRET:B3GM2R54H3S0X5QGLVDDO0EFOSQKPZ5G1TM4JEWTFJWUG2HV


In [25]:
limit = 100
radius = 500
ll = 43.6534817,-79.3839347

# URL Creation, you can also test the API url directly from your explorer.
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    latitude, 
    longitude,
    VERSION,
    radius,
    limit)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=CBR5UZU5JDYU2FAXPU1OQQWPYB4YVKBLJOTMIQTC3SDYO2CZ&client_secret=B3GM2R54H3S0X5QGLVDDO0EFOSQKPZ5G1TM4JEWTFJWUG2HV&ll=43.6534817,-79.3839347&v=20200724&radius=500&limit=100'

### Step 3: JSON-Foursquare venues: Extraction, and "Pandification" to a DF.

In [26]:
#get the json structure with the url
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5f1c35a6f753817df2fd855c'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Bay Street Corridor',
  'headerFullLocation': 'Bay Street Corridor, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 82,
  'suggestedBounds': {'ne': {'lat': 43.6579817045, 'lng': -79.37772678059432},
   'sw': {'lat': 43.6489816955, 'lng': -79.39014261940568}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '5227bb01498e17bf485e6202',
       'name': 'Downtown Toronto',
       'location': {'lat': 43.65323167517444,
        'lng': -79.38529600606677,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.65323167517444,
          'lng'

In [27]:
#Lets extact categories of the venues in Toronto
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [28]:
#"Pandification" = converting the results into a Panda's Dataframe

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON
#Columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]
#Categories
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

#Data cleansing - Columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Downtown Toronto,Neighborhood,43.653232,-79.385296
1,Nathan Phillips Square,Plaza,43.65227,-79.383516
2,Poke Guys,Poke Place,43.654895,-79.385052
3,Japango,Sushi Restaurant,43.655268,-79.385165
4,Indigo,Bookstore,43.653515,-79.380696


In [29]:
#understanding the quantity of venues
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

82 venues were returned by Foursquare.


### Step 4: Lets explore Toronto, create a function to that will do the same call on all neighborhoods in Manhattan

In [30]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        #Get requests:
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        #return venue location
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Step 5 - Let's create a new Data frame with the venues per latitude and longitude that return Neighborhood

In [32]:
radius = 500
LIMIT = 100

toronto_venues = getNearbyVenues(names=thedf['Neighborhood'],
                                   latitudes=thedf['Latitude'],
                                   longitudes=thedf['Longitude'],
                                  )

North York
North York
Downtown Toronto
North York
Downtown Toronto
Etobicoke
Scarborough
North York
East York
Downtown Toronto
North York
Etobicoke
Scarborough
North York
East York
Downtown Toronto
York
Etobicoke
Scarborough
East Toronto
Downtown Toronto
York
Scarborough
East York
Downtown Toronto
Downtown Toronto
Scarborough
North York
North York
East York
Downtown Toronto
West Toronto
Scarborough
North York
North York
East York
Downtown Toronto
West Toronto
Scarborough
North York
North York
East Toronto
Downtown Toronto
West Toronto
Scarborough
North York
North York
East Toronto
Downtown Toronto
North York
North York
Scarborough
North York
North York
East Toronto
North York
York
North York
Scarborough
North York
North York
Central Toronto
Central Toronto
York
York
Scarborough
North York
Central Toronto
Central Toronto
West Toronto
Etobicoke
Scarborough
North York
Central Toronto
Central Toronto
West Toronto
Mississauga
Etobicoke
Scarborough
Central Toronto
Downtown Toronto
West Toron

In [33]:
print(toronto_venues.shape)
toronto_venues.head()

(2141, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,North York,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,North York,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,North York,43.753259,-79.329656,Corrosion Service Company Limited,43.752432,-79.334661,Construction & Landscaping
3,North York,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,North York,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [34]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Central Toronto,112,112,112,112,112,112
Downtown Toronto,1245,1245,1245,1245,1245,1245
East Toronto,122,122,122,122,122,122
East York,73,73,73,73,73,73
Etobicoke,70,70,70,70,70,70
Mississauga,13,13,13,13,13,13
North York,240,240,240,240,240,240
Scarborough,90,90,90,90,90,90
West Toronto,158,158,158,158,158,158
York,18,18,18,18,18,18


In [35]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 269 uniques categories.


### Step 6: Neighborhood Analysis

In [36]:
# one hot encoding in order to tranform into binary numeric values
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# create column neigborhood and insert into dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
toronto_onehot.shape

(2141, 269)

In [38]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,Central Toronto,0.008929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.008929,0.0,0.0,0.008929,0.0,0.008929,0.0,0.0,0.0,0.0
1,Downtown Toronto,0.005622,0.0,0.000803,0.000803,0.000803,0.000803,0.001606,0.00241,0.001606,...,0.000803,0.00241,0.0,0.011245,0.001606,0.003213,0.0,0.006426,0.0,0.000803
2,East Toronto,0.016393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.016393,0.0,0.0,0.0,0.0,0.0,0.0,0.008197,0.0,0.0
3,East York,0.013699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.013699,0.0,0.0,0.0
4,Etobicoke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014286,0.0
5,Mississauga,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,North York,0.0,0.004167,0.0,0.004167,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.004167,0.008333,0.0,0.0,0.0,0.004167
7,Scarborough,0.0,0.011111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.011111,0.0,0.0,0.0,0.0
8,West Toronto,0.012658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.018987,0.0,0.012658,0.0,0.006329,0.0,0.0
9,York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.055556,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.055556


In [39]:
toronto_grouped.shape

(10, 269)

In [40]:
#TOP 5 Most common venues
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Central Toronto----
            venue  freq
0     Coffee Shop  0.07
1     Pizza Place  0.06
2  Sandwich Place  0.06
3            Café  0.05
4             Gym  0.04


----Downtown Toronto----
         venue  freq
0  Coffee Shop  0.11
1         Café  0.05
2   Restaurant  0.03
3        Hotel  0.03
4          Gym  0.02


----East Toronto----
                venue  freq
0    Greek Restaurant  0.07
1         Coffee Shop  0.05
2                Café  0.04
3             Brewery  0.04
4  Italian Restaurant  0.04


----East York----
                 venue  freq
0          Coffee Shop  0.05
1                 Bank  0.05
2                 Park  0.04
3         Burger Joint  0.04
4  Sporting Goods Shop  0.04


----Etobicoke----
            venue  freq
0     Pizza Place  0.10
1  Sandwich Place  0.07
2        Pharmacy  0.06
3     Coffee Shop  0.06
4             Gym  0.04


----Mississauga----
                 venue  freq
0          Coffee Shop  0.15
1                Hotel  0.15
2                  Gy

In [41]:
#"Pandification"
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [42]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,Coffee Shop,Pizza Place,Sandwich Place,Café,Park,Gym,Sushi Restaurant,Restaurant,Dessert Shop,Pub
1,Downtown Toronto,Coffee Shop,Café,Restaurant,Hotel,Japanese Restaurant,Italian Restaurant,Gym,Park,Bakery,Clothing Store
2,East Toronto,Greek Restaurant,Coffee Shop,Café,Italian Restaurant,Brewery,Restaurant,Ice Cream Shop,American Restaurant,Park,Pub
3,East York,Bank,Coffee Shop,Pizza Place,Pharmacy,Park,Sporting Goods Shop,Burger Joint,Grocery Store,Fast Food Restaurant,Restaurant
4,Etobicoke,Pizza Place,Sandwich Place,Pharmacy,Coffee Shop,Grocery Store,Gym,Fast Food Restaurant,Discount Store,Café,Pet Store


### Step 7: Use kMeans to cluster venues

In [43]:

kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)


kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)


kmeans.labels_[0:10] 

array([2, 2, 2, 4, 0, 1, 2, 4, 2, 3])

### Step 8: Cluster merging and indexed by Neighborhood

In [44]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = thedf

toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,Parkwoods,North York,43.753259,-79.329656,2,Coffee Shop,Clothing Store,Restaurant,Grocery Store,Japanese Restaurant,Pizza Place,Sandwich Place,Bank,Fast Food Restaurant,Park
1,M4A,Victoria Village,North York,43.725882,-79.315572,2,Coffee Shop,Clothing Store,Restaurant,Grocery Store,Japanese Restaurant,Pizza Place,Sandwich Place,Bank,Fast Food Restaurant,Park
2,M5A,"Regent Park, Harbourfront",Downtown Toronto,43.65426,-79.360636,2,Coffee Shop,Café,Restaurant,Hotel,Japanese Restaurant,Italian Restaurant,Gym,Park,Bakery,Clothing Store
3,M6A,"Lawrence Manor, Lawrence Heights",North York,43.718518,-79.464763,2,Coffee Shop,Clothing Store,Restaurant,Grocery Store,Japanese Restaurant,Pizza Place,Sandwich Place,Bank,Fast Food Restaurant,Park
4,M7A,"Queen's Park, Ontario Provincial Government",Downtown Toronto,43.662301,-79.389494,2,Coffee Shop,Café,Restaurant,Hotel,Japanese Restaurant,Italian Restaurant,Gym,Park,Bakery,Clothing Store


### Step 8: Draw the map integrated to clusters

In [45]:
# Create Map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Color Scheme for clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Step 9: Examine all Clusters

In [46]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,"Islington Avenue, Humber Valley Village",0,Pizza Place,Sandwich Place,Pharmacy,Coffee Shop,Grocery Store,Gym,Fast Food Restaurant,Discount Store,Café,Pet Store
11,"West Deane Park, Princess Gardens, Martin Grov...",0,Pizza Place,Sandwich Place,Pharmacy,Coffee Shop,Grocery Store,Gym,Fast Food Restaurant,Discount Store,Café,Pet Store
17,"Eringate, Bloordale Gardens, Old Burnhamthorpe...",0,Pizza Place,Sandwich Place,Pharmacy,Coffee Shop,Grocery Store,Gym,Fast Food Restaurant,Discount Store,Café,Pet Store
70,Westmount,0,Pizza Place,Sandwich Place,Pharmacy,Coffee Shop,Grocery Store,Gym,Fast Food Restaurant,Discount Store,Café,Pet Store
77,"Kingsview Village, St. Phillips, Martin Grove ...",0,Pizza Place,Sandwich Place,Pharmacy,Coffee Shop,Grocery Store,Gym,Fast Food Restaurant,Discount Store,Café,Pet Store
88,"New Toronto, Mimico South, Humber Bay Shores",0,Pizza Place,Sandwich Place,Pharmacy,Coffee Shop,Grocery Store,Gym,Fast Food Restaurant,Discount Store,Café,Pet Store
89,"South Steeles, Silverstone, Humbergate, Jamest...",0,Pizza Place,Sandwich Place,Pharmacy,Coffee Shop,Grocery Store,Gym,Fast Food Restaurant,Discount Store,Café,Pet Store
93,"Alderwood, Long Branch",0,Pizza Place,Sandwich Place,Pharmacy,Coffee Shop,Grocery Store,Gym,Fast Food Restaurant,Discount Store,Café,Pet Store
94,"Northwest, West Humber - Clairville",0,Pizza Place,Sandwich Place,Pharmacy,Coffee Shop,Grocery Store,Gym,Fast Food Restaurant,Discount Store,Café,Pet Store
98,"The Kingsway, Montgomery Road, Old Mill North",0,Pizza Place,Sandwich Place,Pharmacy,Coffee Shop,Grocery Store,Gym,Fast Food Restaurant,Discount Store,Café,Pet Store


In [47]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
76,Canada Post Gateway Processing Centre,1,Coffee Shop,Hotel,Sandwich Place,Middle Eastern Restaurant,Intersection,Gym,Fried Chicken Joint,Gas Station,American Restaurant,Burrito Place


In [48]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Parkwoods,2,Coffee Shop,Clothing Store,Restaurant,Grocery Store,Japanese Restaurant,Pizza Place,Sandwich Place,Bank,Fast Food Restaurant,Park
1,Victoria Village,2,Coffee Shop,Clothing Store,Restaurant,Grocery Store,Japanese Restaurant,Pizza Place,Sandwich Place,Bank,Fast Food Restaurant,Park
2,"Regent Park, Harbourfront",2,Coffee Shop,Café,Restaurant,Hotel,Japanese Restaurant,Italian Restaurant,Gym,Park,Bakery,Clothing Store
3,"Lawrence Manor, Lawrence Heights",2,Coffee Shop,Clothing Store,Restaurant,Grocery Store,Japanese Restaurant,Pizza Place,Sandwich Place,Bank,Fast Food Restaurant,Park
4,"Queen's Park, Ontario Provincial Government",2,Coffee Shop,Café,Restaurant,Hotel,Japanese Restaurant,Italian Restaurant,Gym,Park,Bakery,Clothing Store
7,Don Mills,2,Coffee Shop,Clothing Store,Restaurant,Grocery Store,Japanese Restaurant,Pizza Place,Sandwich Place,Bank,Fast Food Restaurant,Park
9,"Garden District, Ryerson",2,Coffee Shop,Café,Restaurant,Hotel,Japanese Restaurant,Italian Restaurant,Gym,Park,Bakery,Clothing Store
10,Glencairn,2,Coffee Shop,Clothing Store,Restaurant,Grocery Store,Japanese Restaurant,Pizza Place,Sandwich Place,Bank,Fast Food Restaurant,Park
13,Don Mills,2,Coffee Shop,Clothing Store,Restaurant,Grocery Store,Japanese Restaurant,Pizza Place,Sandwich Place,Bank,Fast Food Restaurant,Park
15,St. James Town,2,Coffee Shop,Café,Restaurant,Hotel,Japanese Restaurant,Italian Restaurant,Gym,Park,Bakery,Clothing Store


In [49]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,Humewood-Cedarvale,3,Park,Convenience Store,Women's Store,Breakfast Spot,Bar,Field,Hockey Arena,Pool,Bus Line,Pizza Place
21,Caledonia-Fairbanks,3,Park,Convenience Store,Women's Store,Breakfast Spot,Bar,Field,Hockey Arena,Pool,Bus Line,Pizza Place
56,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",3,Park,Convenience Store,Women's Store,Breakfast Spot,Bar,Field,Hockey Arena,Pool,Bus Line,Pizza Place
63,"Runnymede, The Junction North",3,Park,Convenience Store,Women's Store,Breakfast Spot,Bar,Field,Hockey Arena,Pool,Bus Line,Pizza Place
64,Weston,3,Park,Convenience Store,Women's Store,Breakfast Spot,Bar,Field,Hockey Arena,Pool,Bus Line,Pizza Place


In [50]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,"Malvern, Rouge",4,Coffee Shop,Bank,Bakery,Breakfast Spot,Fast Food Restaurant,Chinese Restaurant,Indian Restaurant,Pizza Place,Skating Rink,Park
8,"Parkview Hill, Woodbine Gardens",4,Bank,Coffee Shop,Pizza Place,Pharmacy,Park,Sporting Goods Shop,Burger Joint,Grocery Store,Fast Food Restaurant,Restaurant
12,"Rouge Hill, Port Union, Highland Creek",4,Coffee Shop,Bank,Bakery,Breakfast Spot,Fast Food Restaurant,Chinese Restaurant,Indian Restaurant,Pizza Place,Skating Rink,Park
14,Woodbine Heights,4,Bank,Coffee Shop,Pizza Place,Pharmacy,Park,Sporting Goods Shop,Burger Joint,Grocery Store,Fast Food Restaurant,Restaurant
18,"Guildwood, Morningside, West Hill",4,Coffee Shop,Bank,Bakery,Breakfast Spot,Fast Food Restaurant,Chinese Restaurant,Indian Restaurant,Pizza Place,Skating Rink,Park
22,Woburn,4,Coffee Shop,Bank,Bakery,Breakfast Spot,Fast Food Restaurant,Chinese Restaurant,Indian Restaurant,Pizza Place,Skating Rink,Park
23,Leaside,4,Bank,Coffee Shop,Pizza Place,Pharmacy,Park,Sporting Goods Shop,Burger Joint,Grocery Store,Fast Food Restaurant,Restaurant
26,Cedarbrae,4,Coffee Shop,Bank,Bakery,Breakfast Spot,Fast Food Restaurant,Chinese Restaurant,Indian Restaurant,Pizza Place,Skating Rink,Park
29,Thorncliffe Park,4,Bank,Coffee Shop,Pizza Place,Pharmacy,Park,Sporting Goods Shop,Burger Joint,Grocery Store,Fast Food Restaurant,Restaurant
32,Scarborough Village,4,Coffee Shop,Bank,Bakery,Breakfast Spot,Fast Food Restaurant,Chinese Restaurant,Indian Restaurant,Pizza Place,Skating Rink,Park


# The End