In [1]:
!pip install pandas
import pandas as pd
import numpy as np
import requests
!pip install bs4
from bs4 import BeautifulSoup
!pip install lxml
import lxml



# **Part 1 Scrap the data from Wikipedia**

**1 Transform the data that is in the table of postal codes into a pandas dataframe**

In [2]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df=pd.read_html(url, header=0)[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [3]:
df.rename(columns={'Neighbourhood': 'Neighborhood'}, inplace=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


**Using the shape method to find rows and columns**

In [4]:
df.shape

(287, 3)

**2 Ignore cells with a borough value of Not assigned**

In [5]:
df1= df[df['Borough'] != 'Not assigned']
df1.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


**shape after filtering out the borough values Not assigned**

In [6]:
df1.shape

(210, 3)

**3 Combining rows with the same postal code**

In [7]:
df_rows=df1.groupby(['Postcode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()
df_rows.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


**Shape of the dataframe after removing duplicate Postcode**

In [8]:
df_rows.shape

(103, 3)

**4 If the Cell has a borough but a Not assigned neighborhood then the neighborhood will be same as borough**

In [9]:
#get index of rows where condition is satisfied
Notass = df_rows.index[df_rows['Neighborhood'] == 'Not assigned']  

# then replace
for idx in Notass:
    df_rows['Neighborhood'][idx] = neighborhood_df['Borough'][idx]
    
df_rows.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


**5 Use the shape method to print the number of rows in the dataframe**

In [10]:
df_rows.shape

(103, 3)

In [11]:
df_rows

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


# **Part 2 Load the geospatial data**

**Now that you have built a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name, in order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood.**

In [12]:
!wget -q -O 'Toronto_data.csv'  http://cocl.us/Geospatial_data
df_loc = pd.read_csv('Toronto_data.csv')
df_loc.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


**1 Rename the column header**

In [13]:
df_loc.rename(columns={'Postal Code': 'Postcode'}, inplace=True)
df_loc.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


**2 Merge the two datasets**

In [14]:
df_m = pd.merge(df, df_loc, on='Postcode')
df_m.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,Lawrence Manor,43.718518,-79.464763


# **Part 3 Explore and cluster the neighborhoods in Toronto**

In [15]:
import folium
# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install geopy
from geopy.geocoders import Nominatim

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

from pandas.io.json import json_normalize



**Dataframe with boroughs that contain the word Toronto**

In [16]:
df_toronto = df_m[df_m['Borough'].str.contains('Toronto')]
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
5,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
12,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
13,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
26,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418


**Details of the dataframe**

In [17]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df_toronto['Borough'].unique()),
        df_toronto.shape[0]
    )
)

The dataframe has 4 boroughs and 74 neighborhoods.


**Get geographic coordinates of boroughs that have their name containing Toronto**

In [18]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronoto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronoto are 43.653963, -79.387207.


**Create a map of Toronto with neighborhoods superimposed on top**

In [19]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## **We will segment and cluster only the neighborhoods in Downtown Toronto. So let's slice the original dataframe and create a new dataframe of the Downtown Toronto data.**

In [20]:
downtown_data = df_toronto[df_toronto['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
downtown_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
2,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418


In [21]:
address = 'Downtown Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.6563221, -79.3809161.


**create and visualize Downtown Toronto**

In [22]:
# create map of Manhattan using latitude and longitude values
map_downtown = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(downtown_data['Latitude'], downtown_data['Longitude'], downtown_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown)  
    
map_downtown

**Define foursquare credentials and version**

In [23]:
CLIENT_ID = 'QLSEKAXEBNSVSJ1BRCQ5003GIJGYCHEURXXRYAEGK23SPTCY' # your Foursquare ID
CLIENT_SECRET = 'OJLMU1Y1V1EDAFONQOTIEMS3XDUCNZTIYX44L40IYGPOOMMN' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: QLSEKAXEBNSVSJ1BRCQ5003GIJGYCHEURXXRYAEGK23SPTCY
CLIENT_SECRET:OJLMU1Y1V1EDAFONQOTIEMS3XDUCNZTIYX44L40IYGPOOMMN


**Let's explore the first neighborhood**

In [24]:
downtown_data.loc[0, 'Neighborhood']

'Harbourfront'

**Get the latitude and longitude of Harbourfront**

In [25]:
neighborhood_latitude = downtown_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = downtown_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = downtown_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Harbourfront are 43.6542599, -79.3606359.


**Get the 100 Venues that are with 500 meters**

In [26]:
# type your answer here
LIMIT=100

radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

url


'https://api.foursquare.com/v2/venues/explore?&client_id=QLSEKAXEBNSVSJ1BRCQ5003GIJGYCHEURXXRYAEGK23SPTCY&client_secret=OJLMU1Y1V1EDAFONQOTIEMS3XDUCNZTIYX44L40IYGPOOMMN&v=20180605&ll=43.6542599,-79.3606359&radius=500&limit=100'

**get request sent and results received**

In [27]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e63b27eaba297001bf310f4'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 48,
  'suggestedBounds': {'ne': {'lat': 43.6587599045, 'lng': -79.3544279001486},
   'sw': {'lat': 43.6497598955, 'lng': -79.36684389985142}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '54ea41ad498e9a11e9e13308',
       'name': 'Roselle Desserts',
       'location': {'address': '362 King St E',
        'crossStreet': 'Trinity St',
        'lat': 43.653446723052674,
        'lng': -79.3620167174383,
        'labeledLatLngs': [{'label': 'display',
 

In [28]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

**lets find the venues**

In [29]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Cooper Koo Family YMCA,Distribution Center,43.653249,-79.358008
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Morning Glory Cafe,Breakfast Spot,43.653947,-79.361149


**Number of venues returned**

In [30]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

48 venues were returned by Foursquare.


**Number of venues by categories**

In [31]:
nearby_venues.groupby('categories').count()

Unnamed: 0_level_0,name,lat,lng
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Antique Shop,1,1,1
Art Gallery,1,1,1
Asian Restaurant,1,1,1
Bakery,3,3,3
Bank,1,1,1
Beer Store,1,1,1
Breakfast Spot,2,2,2
Café,3,3,3
Chocolate Shop,1,1,1
Coffee Shop,7,7,7


**Explore Neighborhoods in Downtown Toronoto**

**Write a function to get the nearby venues**

In [32]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [33]:
toronto_venues = getNearbyVenues(names=downtown_data['Neighborhood'],
                                   latitudes=downtown_data['Latitude'],
                                   longitudes=downtown_data['Longitude']
                                  )

Harbourfront
Queen's Park
Ryerson
Garden District
St. James Town
Berczy Park
Central Bay Street
Christie
Adelaide
King
Richmond
Harbourfront East
Toronto Islands
Union Station
Design Exchange
Toronto Dominion Centre
Commerce Court
Victoria Hotel
Harbord
University of Toronto
Chinatown
Grange Park
Kensington Market
CN Tower
Bathurst Quay
Island airport
Harbourfront West
King and Spadina
Railway Lands
South Niagara
Rosedale
Stn A PO Boxes 25 The Esplanade
Cabbagetown
St. James Town
First Canadian Place
Underground city
Church and Wellesley


In [34]:
#Check the size of the dataframe

print(toronto_venues.shape)
toronto_venues.head()

(2471, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Harbourfront,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Harbourfront,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Harbourfront,43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,Harbourfront,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,Harbourfront,43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


In [35]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adelaide,100,100,100,100,100,100
Bathurst Quay,17,17,17,17,17,17
Berczy Park,56,56,56,56,56,56
CN Tower,17,17,17,17,17,17
Cabbagetown,47,47,47,47,47,47
Central Bay Street,79,79,79,79,79,79
Chinatown,86,86,86,86,86,86
Christie,18,18,18,18,18,18
Church and Wellesley,85,85,85,85,85,85
Commerce Court,100,100,100,100,100,100


**Let's find out how many unique categories can be curated from all the returned venues**

In [36]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 206 uniques categories.


**Analyze Each Neighborhood**

In [37]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Let us examine the new dataframe**

In [38]:
toronto_onehot.shape

(2471, 206)

**Let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category**

In [39]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Adelaide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.01
1,Bathurst Quay,0.0,0.0,0.058824,0.058824,0.058824,0.117647,0.176471,0.117647,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0,0.0
3,CN Tower,0.0,0.0,0.058824,0.058824,0.058824,0.117647,0.176471,0.117647,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Cabbagetown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021277,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Central Bay Street,0.012658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012658,...,0.0,0.0,0.0,0.012658,0.0,0.0,0.012658,0.0,0.0,0.0
6,Chinatown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.034884,0.0,0.05814,0.011628,0.0,0.0,0.0
7,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Church and Wellesley,0.011765,0.011765,0.0,0.0,0.0,0.0,0.0,0.0,0.011765,...,0.0,0.0,0.0,0.0,0.0,0.011765,0.0,0.011765,0.011765,0.0
9,Commerce Court,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0


**Lets confirm the size of the new dataframe**

In [40]:
toronto_grouped.shape

(36, 206)

**Lets print each neighborhood along with the top 5 most common venues**

In [41]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide----
              venue  freq
0       Coffee Shop  0.07
1        Restaurant  0.05
2   Thai Restaurant  0.04
3              Café  0.04
4  Sushi Restaurant  0.03


----Bathurst Quay----
              venue  freq
0   Airport Service  0.18
1    Airport Lounge  0.12
2  Airport Terminal  0.12
3       Coffee Shop  0.06
4           Airport  0.06


----Berczy Park----
                venue  freq
0         Coffee Shop  0.09
1        Cocktail Bar  0.04
2          Restaurant  0.04
3         Cheese Shop  0.04
4  Seafood Restaurant  0.04


----CN Tower----
              venue  freq
0   Airport Service  0.18
1    Airport Lounge  0.12
2  Airport Terminal  0.12
3       Coffee Shop  0.06
4           Airport  0.06


----Cabbagetown----
                venue  freq
0         Coffee Shop  0.09
1          Restaurant  0.06
2  Italian Restaurant  0.04
3                Café  0.04
4                Park  0.04


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.16
1  Ital

**Write a function to sort the venues in descending order**

In [42]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

**Display top 10 venues of each neighborhood**

In [43]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Restaurant,Café,Thai Restaurant,Sushi Restaurant,Bar,Gastropub,Cosmetics Shop,Concert Hall,Steakhouse
1,Bathurst Quay,Airport Service,Airport Terminal,Airport Lounge,Boutique,Plane,Harbor / Marina,Coffee Shop,Sculpture Garden,Boat or Ferry,Airport Gate
2,Berczy Park,Coffee Shop,Bakery,Seafood Restaurant,Beer Bar,Farmers Market,Restaurant,Cocktail Bar,Café,Cheese Shop,Breakfast Spot
3,CN Tower,Airport Service,Airport Terminal,Airport Lounge,Boutique,Plane,Harbor / Marina,Coffee Shop,Sculpture Garden,Boat or Ferry,Airport Gate
4,Cabbagetown,Coffee Shop,Restaurant,Bakery,Italian Restaurant,Park,Café,Pharmacy,Pizza Place,Flower Shop,Pub


# **Cluster the neighborhoods**

In [44]:
# Run k-means

# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 1, 0, 1, 0, 0, 2, 4, 0, 0], dtype=int32)

**Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.**

In [45]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = downtown_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,0,Coffee Shop,Bakery,Café,Park,Pub,Theater,Mexican Restaurant,Breakfast Spot,Restaurant,Electronics Store
1,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,0,Coffee Shop,Burger Joint,Park,Portuguese Restaurant,Seafood Restaurant,Sandwich Place,Italian Restaurant,Japanese Restaurant,Distribution Center,Juice Bar
2,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937,0,Coffee Shop,Clothing Store,Bubble Tea Shop,Middle Eastern Restaurant,Café,Japanese Restaurant,Ramen Restaurant,Restaurant,Bookstore,Burger Joint
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937,0,Coffee Shop,Clothing Store,Bubble Tea Shop,Middle Eastern Restaurant,Café,Japanese Restaurant,Ramen Restaurant,Restaurant,Bookstore,Burger Joint
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Coffee Shop,Restaurant,Café,Italian Restaurant,Diner,Bakery,Hotel,Park,Breakfast Spot,Clothing Store


In [46]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## **Examine the clusters (only first three as an example)**

**The first cluster**

In [47]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,0,Coffee Shop,Bakery,Café,Park,Pub,Theater,Mexican Restaurant,Breakfast Spot,Restaurant,Electronics Store
1,Downtown Toronto,0,Coffee Shop,Burger Joint,Park,Portuguese Restaurant,Seafood Restaurant,Sandwich Place,Italian Restaurant,Japanese Restaurant,Distribution Center,Juice Bar
2,Downtown Toronto,0,Coffee Shop,Clothing Store,Bubble Tea Shop,Middle Eastern Restaurant,Café,Japanese Restaurant,Ramen Restaurant,Restaurant,Bookstore,Burger Joint
3,Downtown Toronto,0,Coffee Shop,Clothing Store,Bubble Tea Shop,Middle Eastern Restaurant,Café,Japanese Restaurant,Ramen Restaurant,Restaurant,Bookstore,Burger Joint
4,Downtown Toronto,0,Coffee Shop,Restaurant,Café,Italian Restaurant,Diner,Bakery,Hotel,Park,Breakfast Spot,Clothing Store
5,Downtown Toronto,0,Coffee Shop,Bakery,Seafood Restaurant,Beer Bar,Farmers Market,Restaurant,Cocktail Bar,Café,Cheese Shop,Breakfast Spot
6,Downtown Toronto,0,Coffee Shop,Italian Restaurant,Burger Joint,Sandwich Place,Ice Cream Shop,Japanese Restaurant,Thai Restaurant,Juice Bar,Salad Place,Café
8,Downtown Toronto,0,Coffee Shop,Restaurant,Café,Thai Restaurant,Sushi Restaurant,Bar,Gastropub,Cosmetics Shop,Concert Hall,Steakhouse
9,Downtown Toronto,0,Coffee Shop,Restaurant,Café,Thai Restaurant,Sushi Restaurant,Bar,Gastropub,Cosmetics Shop,Concert Hall,Steakhouse
10,Downtown Toronto,0,Coffee Shop,Restaurant,Café,Thai Restaurant,Sushi Restaurant,Bar,Gastropub,Cosmetics Shop,Concert Hall,Steakhouse


**The seconds cluster**

In [48]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
23,Downtown Toronto,1,Airport Service,Airport Terminal,Airport Lounge,Boutique,Plane,Harbor / Marina,Coffee Shop,Sculpture Garden,Boat or Ferry,Airport Gate
24,Downtown Toronto,1,Airport Service,Airport Terminal,Airport Lounge,Boutique,Plane,Harbor / Marina,Coffee Shop,Sculpture Garden,Boat or Ferry,Airport Gate
25,Downtown Toronto,1,Airport Service,Airport Terminal,Airport Lounge,Boutique,Plane,Harbor / Marina,Coffee Shop,Sculpture Garden,Boat or Ferry,Airport Gate
26,Downtown Toronto,1,Airport Service,Airport Terminal,Airport Lounge,Boutique,Plane,Harbor / Marina,Coffee Shop,Sculpture Garden,Boat or Ferry,Airport Gate
27,Downtown Toronto,1,Airport Service,Airport Terminal,Airport Lounge,Boutique,Plane,Harbor / Marina,Coffee Shop,Sculpture Garden,Boat or Ferry,Airport Gate
28,Downtown Toronto,1,Airport Service,Airport Terminal,Airport Lounge,Boutique,Plane,Harbor / Marina,Coffee Shop,Sculpture Garden,Boat or Ferry,Airport Gate
29,Downtown Toronto,1,Airport Service,Airport Terminal,Airport Lounge,Boutique,Plane,Harbor / Marina,Coffee Shop,Sculpture Garden,Boat or Ferry,Airport Gate


**The third cluster**

In [49]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,Downtown Toronto,2,Café,Bookstore,Italian Restaurant,Japanese Restaurant,Restaurant,Bakery,Bar,Chinese Restaurant,Comfort Food Restaurant,College Gym
19,Downtown Toronto,2,Café,Bookstore,Italian Restaurant,Japanese Restaurant,Restaurant,Bakery,Bar,Chinese Restaurant,Comfort Food Restaurant,College Gym
20,Downtown Toronto,2,Bar,Vietnamese Restaurant,Café,Bakery,Coffee Shop,Chinese Restaurant,Dumpling Restaurant,Vegetarian / Vegan Restaurant,Mexican Restaurant,Cocktail Bar
21,Downtown Toronto,2,Bar,Vietnamese Restaurant,Café,Bakery,Coffee Shop,Chinese Restaurant,Dumpling Restaurant,Vegetarian / Vegan Restaurant,Mexican Restaurant,Cocktail Bar
22,Downtown Toronto,2,Bar,Vietnamese Restaurant,Café,Bakery,Coffee Shop,Chinese Restaurant,Dumpling Restaurant,Vegetarian / Vegan Restaurant,Mexican Restaurant,Cocktail Bar


Observation: these three clusters seems to have totally different venues in 1st , 2nd most common venues.