In [None]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')


#### Scrap web data using BeautifulSoup

In [312]:
# import BeautifulSoup for web scraping
from bs4 import BeautifulSoup

In [323]:
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(website_url,'lxml')
#print(soup.prettify())

In [314]:
# Find the table that stores the data
table = soup.find('table',{'class':'wikitable sortable'})
#table

In [315]:
# scrap column names
th = table.find_all('th')
cols = []
for i in th:
    cols.append(i.string.rstrip("\n\r"))
cols

['Postcode', 'Borough', 'Neighbourhood']

In [324]:
# list of rows
l = []
table_rows = table.findAll('tr')
table_rows.pop(0)
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.rstrip("\n\r") for tr in td]
    if td[1].string != "Not assigned": 
        l.append(row)
l[0:5]

[['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights']]

In [325]:
# creating the dataframe
df = pd.DataFrame(l, columns=cols)

# correct the column names
df.rename(columns={'Postcode':'PostalCode', 'Neighbourhood':'Neighborhood'}, inplace=True)

# If neighborhood is not assigned, use borough for neighborhood.
mask = df.Neighborhood == 'Not assigned'
df.loc[mask, 'Neighborhood'] = df.loc[mask, 'Borough']

df=df.groupby(['PostalCode','Borough'],as_index=False).agg(lambda x: "%s" % ', '.join(x))
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [326]:
df.shape

(103, 3)

In [327]:
# get geographical coordinates
url = 'https://cocl.us/Geospatial_data'
df2 = pd.read_csv(url)
df2.columns.values

array(['Postal Code', 'Latitude', 'Longitude'], dtype=object)

In [328]:
# renaming column name
df2.rename(columns={'Postal Code':'PostalCode'}, inplace=True)
df2.columns.values

array(['PostalCode', 'Latitude', 'Longitude'], dtype=object)

In [329]:
# merge two dataframes
df = pd.merge(df, df2, on='PostalCode')
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [330]:
df.shape

(103, 5)

#### Use geopy library to get the latitude and longitude values of Toronto

In [331]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


#### Create a map of Toronto with neighborhoods superimposed on top.

In [296]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

We will be working with only boroughs that contain the word Toronto 

In [297]:
toronto_data = df[df['Borough'].str.contains('Toronto')].sort_values(by=['Neighborhood']).reset_index(drop=True)
toronto_data.reset_index()
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568
1,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
2,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191
3,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558
4,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442


Visualize neighborhoods with borough containing the word Toronto. 

In [300]:
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### Define Foursquare Credentials and Version

In [243]:
CLIENT_ID = 'xxx' # your Foursquare ID
CLIENT_SECRET = 'xxxx' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: xxxx
CLIENT_SECRET:xxxx


In [244]:
toronto_data.loc[0, 'Neighborhood']

'Adelaide, King, Richmond'

Get the neighborhood's latitude and longitude values.

In [245]:
neighborhood_latitude = toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Adelaide, King, Richmond are 43.65057120000001, -79.3845675.


#### Now, let's get the top 100 venues that are in Rouge Malvern within a radius of 500 meters.

First, let's create the GET request URL. Name your URL **url**.

In [246]:
# type your answer here
radius = 500
LIMIT = 100

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=xxxx&client_secret=xxxx&v=20180605&ll=43.65057120000001,-79.3845675&radius=500&limit=100'

Send GET request

In [247]:
results = requests.get(url).json()

From the Foursquare lab in the previous module, we know that all the information is in the *items* key. Before we proceed, let's borrow the **get_category_type** function from the Foursquare lab.

In [248]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Now we are ready to clean the json and structure it into a *pandas* dataframe.

In [249]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()
nearby_venues.shape

(100, 4)

And how many venues were returned by Foursquare?

In [250]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

100 venues were returned by Foursquare.


<a id='item2'></a>

## 2. Explore Neighborhoods in Toronto

#### Let's create a function to repeat the same process to all the neighborhoods in Toronto

In [268]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Now write the code to run the above function on each neighborhood and create a new dataframe called *toronto_venues*.

In [269]:
# type your answer here
toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

Adelaide, King, Richmond
Berczy Park
Brockton, Exhibition Place, Parkdale Village
Business Reply Mail Processing Centre 969 Eastern
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Cabbagetown, St. James Town
Central Bay Street
Chinatown, Grange Park, Kensington Market
Christie
Church and Wellesley
Commerce Court, Victoria Hotel
Davisville
Davisville North
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Design Exchange, Toronto Dominion Centre
Dovercourt Village, Dufferin
First Canadian Place, Underground city
Forest Hill North, Forest Hill West
Harbord, University of Toronto
Harbourfront East, Toronto Islands, Union Station
Harbourfront, Regent Park
High Park, The Junction South
Lawrence Park
Little Portugal, Trinity
Moore Park, Summerhill East
North Toronto West
Parkdale, Roncesvalles
Rosedale
Roselawn
Runnymede, Swansea
Ryerson, Garden District
St. James Town
Stn A PO Boxes 25 The Esplanade
Studio District
T

#### Let's check the size of the resulting dataframe

In [272]:
print(toronto_venues.shape)
toronto_venues.head(3)

(298, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Adelaide, King, Richmond",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,Berczy Park,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,Berczy Park,43.784535,-79.160497,Affordable Toronto Movers,43.787919,-79.162977,Moving Target


Let's check how many venues were returned for each neighborhood

In [273]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",1,1,1,1,1,1
Berczy Park,2,2,2,2,2,2
"Brockton, Exhibition Place, Parkdale Village",6,6,6,6,6,6
Business Reply Mail Processing Centre 969 Eastern,4,4,4,4,4,4
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",8,8,8,8,8,8
"Cabbagetown, St. James Town",2,2,2,2,2,2
Central Bay Street,6,6,6,6,6,6
"Chinatown, Grange Park, Kensington Market",10,10,10,10,10,10
Christie,3,3,3,3,3,3
Church and Wellesley,4,4,4,4,4,4


#### Let's find out how many unique categories can be curated from all the returned venues

In [274]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 117 uniques categories.


<a id='item3'></a>

## 3. Analyze Each Neighborhood

In [275]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head(1)

Unnamed: 0,Women's Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Baseball Field,Beer Store,Bike Shop,Boutique,Breakfast Spot,Bridal Shop,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Line,Bus Station,Bus Stop,Butcher,Café,Candy Store,Caribbean Restaurant,Chinese Restaurant,Clothing Store,Coffee Shop,College Stadium,Construction & Landscaping,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Electronics Store,Fast Food Restaurant,Food & Drink Shop,Food Court,Food Truck,Fried Chicken Joint,Frozen Yogurt Shop,Furniture / Home Store,Gastropub,General Entertainment,Golf Course,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Hakka Restaurant,Health Food Store,Hockey Arena,Hotel,Indian Restaurant,Indonesian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Juice Bar,Kids Store,Korean Restaurant,Latin American Restaurant,Liquor Store,Lounge,Luggage Store,Massage Studio,Medical Center,Mediterranean Restaurant,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Motel,Movie Theater,Moving Target,Neighborhood,Noodle House,Park,Pet Store,Pharmacy,Pizza Place,Playground,Plaza,Pool,Portuguese Restaurant,Pub,Ramen Restaurant,Rental Car Location,Restaurant,Rock Climbing Spot,Salon / Barbershop,Sandwich Place,Shopping Mall,Skating Rink,Smoke Shop,Smoothie Shop,Soccer Field,Spa,Sporting Goods Shop,Steakhouse,Supplement Shop,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Thrift / Vintage Store,Toy / Game Store,Train Station,Video Game Store,Video Store,Vietnamese Restaurant,Wings Joint
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Adelaide, King, Richmond",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


And let's examine the new dataframe size.

In [276]:
toronto_onehot.shape

(298, 117)

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [277]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

#### Let's confirm the new size

In [278]:
toronto_grouped.shape

(35, 117)

#### Let's print each neighborhood along with the top 5 most common venues

In [279]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                  venue  freq
0  Fast Food Restaurant   1.0
1         Women's Store   0.0
2    Mexican Restaurant   0.0
3            Playground   0.0
4           Pizza Place   0.0


----Berczy Park----
           venue  freq
0  Moving Target   0.5
1            Bar   0.5
2  Women's Store   0.0
3  Metro Station   0.0
4     Playground   0.0


----Brockton, Exhibition Place, Parkdale Village----
                 venue  freq
0   Mexican Restaurant  0.17
1          Pizza Place  0.17
2  Rental Car Location  0.17
3    Electronics Store  0.17
4       Medical Center  0.17


----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0         Coffee Shop  0.50
1            Pharmacy  0.25
2   Korean Restaurant  0.25
3  Mexican Restaurant  0.00
4               Plaza  0.00


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
                  venue  freq
0   Fried Chicken Joint

#### Let's put that into a *pandas* dataframe

First, let's write a function to sort the venues in descending order.

In [280]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [281]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Fast Food Restaurant,Wings Joint,Food Court,Construction & Landscaping,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner
1,Berczy Park,Moving Target,Bar,Wings Joint,Food Court,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner
2,"Brockton, Exhibition Place, Parkdale Village",Breakfast Spot,Rental Car Location,Mexican Restaurant,Electronics Store,Medical Center,Pizza Place,Fast Food Restaurant,Construction & Landscaping,Cosmetics Shop,Curling Ice
3,Business Reply Mail Processing Centre 969 Eastern,Coffee Shop,Pharmacy,Korean Restaurant,Food Court,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Lounge,Athletics & Sports,Fried Chicken Joint,Bank,Thai Restaurant,Caribbean Restaurant,Bakery,Hakka Restaurant,Department Store,Deli / Bodega
5,"Cabbagetown, St. James Town",Playground,Construction & Landscaping,Food & Drink Shop,College Stadium,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner
6,Central Bay Street,Discount Store,Chinese Restaurant,Train Station,Department Store,Coffee Shop,Wings Joint,Food & Drink Shop,Construction & Landscaping,Cosmetics Shop,Curling Ice
7,"Chinatown, Grange Park, Kensington Market",Bakery,Bus Line,Park,Soccer Field,Metro Station,Intersection,Bus Station,Fast Food Restaurant,Discount Store,Dog Run
8,Christie,American Restaurant,Intersection,Motel,Wings Joint,Food Court,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant
9,Church and Wellesley,College Stadium,Café,General Entertainment,Skating Rink,Fast Food Restaurant,Construction & Landscaping,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store


In [282]:
toronto_data

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568
1,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
2,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191
3,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558
4,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442
5,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M5T,Downtown Toronto,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049
8,M6G,Downtown Toronto,Christie,43.669542,-79.422564
9,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316


For some reason I got different results for neighborhoods_venues_sorted and toronto_data shapes. I have removed the neighborhoods that are not in neighborhoods_venues_sorted to prevent further problems when assigning labels.

In [303]:
neighborhoods_venues_sorted.shape
toronto_data = toronto_data[toronto_data.Neighborhood != 'Forest Hill North, Forest Hill West']
toronto_data = toronto_data[toronto_data.Neighborhood != 'Harbourfront, Regent Park']
toronto_data = toronto_data[toronto_data.Neighborhood != 'High Park, The Junction South']

<a id='item4'></a>

In [304]:
neighborhoods_venues_sorted.sort_values(by=['Neighborhood'])
toronto_data.reset_index()
toronto_data.sort_values(by=['Neighborhood'])

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568
1,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
2,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191
3,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558
4,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442
5,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M5T,Downtown Toronto,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049
8,M6G,Downtown Toronto,Christie,43.669542,-79.422564
9,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316


## 4. Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [285]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 
kmeans.labels_.size

35

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [286]:
toronto_merged = toronto_data

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto data (df) to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568,1,Fast Food Restaurant,Wings Joint,Food Court,Construction & Landscaping,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner
1,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,3,Moving Target,Bar,Wings Joint,Food Court,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner
2,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191,3,Breakfast Spot,Rental Car Location,Mexican Restaurant,Electronics Store,Medical Center,Pizza Place,Fast Food Restaurant,Construction & Landscaping,Cosmetics Shop,Curling Ice
3,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,0,Coffee Shop,Pharmacy,Korean Restaurant,Food Court,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner
4,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442,3,Lounge,Athletics & Sports,Fried Chicken Joint,Bank,Thai Restaurant,Caribbean Restaurant,Bakery,Hakka Restaurant,Department Store,Deli / Bodega


Finally, let's visualize the resulting clusters

In [305]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

<a id='item5'></a>

## 5. Examine Clusters

Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster. I will leave this exercise to you.

#### Cluster 1: Coffee Shops

In [306]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,East Toronto,0,Coffee Shop,Pharmacy,Korean Restaurant,Food Court,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner
6,Downtown Toronto,0,Discount Store,Chinese Restaurant,Train Station,Department Store,Coffee Shop,Wings Joint,Food & Drink Shop,Construction & Landscaping,Cosmetics Shop,Curling Ice
8,Downtown Toronto,0,American Restaurant,Intersection,Motel,Wings Joint,Food Court,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant
24,Central Toronto,0,Coffee Shop,Grocery Store,Pharmacy,Butcher,Pizza Place,Gym,Dog Run,Hakka Restaurant,Construction & Landscaping,Cosmetics Shop
29,West Toronto,0,Coffee Shop,Bar,Miscellaneous Shop,Furniture / Home Store,Massage Studio,Grocery Store,Fast Food Restaurant,Cosmetics Shop,Curling Ice,Deli / Bodega
34,Central Toronto,0,Coffee Shop,Hockey Arena,Intersection,Portuguese Restaurant,Food & Drink Shop,Construction & Landscaping,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store
37,East Toronto,0,Coffee Shop,Gym / Fitness Center,Pub,Wings Joint,Fast Food Restaurant,Construction & Landscaping,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store


#### Cluster 2: Restaurants

In [307]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,1,Fast Food Restaurant,Wings Joint,Food Court,Construction & Landscaping,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner
25,Central Toronto,1,Food & Drink Shop,Park,Fast Food Restaurant,Wings Joint,Construction & Landscaping,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant


#### Cluster 3: Playground

In [308]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Downtown Toronto,2,Playground,Construction & Landscaping,Food & Drink Shop,College Stadium,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner


#### Cluster 4: Restaurant and Shops are common. Probably downtown area.

In [309]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Downtown Toronto,3,Moving Target,Bar,Wings Joint,Food Court,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner
2,West Toronto,3,Breakfast Spot,Rental Car Location,Mexican Restaurant,Electronics Store,Medical Center,Pizza Place,Fast Food Restaurant,Construction & Landscaping,Cosmetics Shop,Curling Ice
4,Downtown Toronto,3,Lounge,Athletics & Sports,Fried Chicken Joint,Bank,Thai Restaurant,Caribbean Restaurant,Bakery,Hakka Restaurant,Department Store,Deli / Bodega
7,Downtown Toronto,3,Bakery,Bus Line,Park,Soccer Field,Metro Station,Intersection,Bus Station,Fast Food Restaurant,Discount Store,Dog Run
9,Downtown Toronto,3,College Stadium,Café,General Entertainment,Skating Rink,Fast Food Restaurant,Construction & Landscaping,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store
10,Downtown Toronto,3,Indian Restaurant,Latin American Restaurant,Vietnamese Restaurant,Pet Store,Chinese Restaurant,Furniture / Home Store,Wings Joint,Electronics Store,Dog Run,Discount Store
11,Central Toronto,3,Smoke Shop,Shopping Mall,Breakfast Spot,Sandwich Place,Auto Garage,Bakery,Discount Store,Fast Food Restaurant,Electronics Store,Dog Run
12,Central Toronto,3,Breakfast Spot,Sandwich Place,Lounge,Skating Rink,Food & Drink Shop,Construction & Landscaping,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store
13,Central Toronto,3,Pizza Place,Pharmacy,Fast Food Restaurant,Chinese Restaurant,Noodle House,Italian Restaurant,Thai Restaurant,Fried Chicken Joint,Department Store,Diner
15,West Toronto,3,Chinese Restaurant,Fast Food Restaurant,Breakfast Spot,Japanese Restaurant,Pizza Place,Cosmetics Shop,Sandwich Place,Coffee Shop,Pharmacy,Grocery Store


#### Cluster 5: Park

In [310]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,Downtown Toronto,4,Park,Playground,Wings Joint,Food & Drink Shop,Construction & Landscaping,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant
30,Downtown Toronto,4,Playground,Airport,Park,Bus Stop,Food & Drink Shop,Construction & Landscaping,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store


#### Result: 
It is hard to distinguish the clusters since they are skewed. However, we can clearly see that cluster 1 has coffee many shops and cluster 4 mainly consists of restaurants and shops.