
# Question 1

## Import Toronto Postcodes from Wikipedia into DF

In [19]:
import pandas as pd
import requests
print('Libs imported')

Libs imported


#### Importing data , reading into Dataframe using Pandas

In [20]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(url)
toronto_raw = pd.read_html(response.content)[0]
print(toronto_raw.shape)
toronto_raw.head()

(180, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Clean up DataFrame

In [21]:
# Removing 'Not assigned' postcodes
toronto_clean = toronto_raw[toronto_raw.Borough != 'Not assigned']
#resetting index
toronto_clean.reset_index(drop=True)
toronto_clean.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [22]:
#checking new data size
toronto_clean.shape

(103, 3)

# Question 2

## Retrieve latitude, longitude for all postcodes for all entries

In [23]:
# obtain csv file with coordinates for all postcode, read into pandas df
!wget -q -O 'Toronto_lat_lng_data.csv' http://cocl.us/Geospatial_data
toronto_lat_lng = pd.read_csv('Toronto_lat_lng_data.csv')
toronto_lat_lng.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [24]:
# merge with existing postal codes. Since columns have same names, pandas autofill data.
toronto_agg = pd.merge(toronto_clean, toronto_lat_lng)
print(toronto_agg.shape)
toronto_agg

(103, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


# Question 3

## Selecting Boroughs with "Toronto" only

In [25]:
#obtaining list of boroughs
toronto_borough= toronto_agg.groupby(['Borough'])['Borough'].count()
toronto_borough

Borough
Central Toronto      9
Downtown Toronto    19
East Toronto         5
East York            5
Etobicoke           12
Mississauga          1
North York          24
Scarborough         17
West Toronto         6
York                 5
Name: Borough, dtype: int64

In [26]:
# Selecting 'Toronto' boroughs only, reindexing
toronto_central = toronto_agg[(toronto_agg['Borough']== 'Central Toronto')
                              |(toronto_agg['Borough']== 'Downtown Toronto')
                              |(toronto_agg['Borough']== 'East Toronto')
                              |(toronto_agg['Borough']== 'West Toronto') ]
print(toronto_central.shape)
toronto_central.reset_index(level=None, drop=True, inplace=True, col_level=0, col_fill='')
toronto_central

(39, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


## Explore and cluster Toronto neighborhoods

### Import required libraries

In [27]:
# Matplotlib and associated plotting modules
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

import json 
from pandas.io.json import json_normalize 

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
#! conda install -c conda - forge geopy --yes
from geopy.geocoders import Nominatim

print('Libraries imported.')

Libraries imported.


### Obtain Toronto coordinates, create original map

In [30]:
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent = 'Toronto')
location = geolocator.geocode(address)
toronto_lat = location.latitude
toronto_lng = location.longitude
print('Coordinates for {} are {},{}:'.format(address, toronto_lat, toronto_lng))

Coordinates for Toronto, Canada are 43.6534817,-79.3839347:


In [113]:
#create initial map of Toronto
map_toronto = folium.Map(location=[toronto_lat, toronto_lng], zoom_start=12)

# add markers for each neighborhood to map
for lat, lng, borough, neighborhood in zip(toronto_central['Latitude'], toronto_central['Longitude'], toronto_central['Borough'], toronto_central['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Define Foursquare Credential and version

In [32]:
CLIENT_ID = 'XXX'
CLIENT_SECRET = 'XXX'
VERSION = '20180605'

In [33]:
# Foursquare parameters
radius = 500
limit = 100

### Define function to retrieve venues, run for all Toronto neighborhoods

In [34]:
def getNearbyVenues(names,latitudes,longitudes):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        #create API call
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, limit)
        # obtain response, place in df
        results = requests.get(url).json()['response']['groups'][0]['items']
        # retain useful info only
        venues_list.append([(name, lat, lng,
                             v['venue']['name'], 
                             v['venue']['location']['lat'],
                             v['venue']['location']['lng'],  
                             v['venue']['categories'][0]['name']) for v in results])
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)     

In [35]:
# run function on all neighborhoods
toronto_venues = getNearbyVenues(toronto_central['Neighborhood'],toronto_central['Latitude'],toronto_central['Longitude'])

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West, Forest Hill Road Park
High Park, The Junction South
North Toronto West, Lawrence Park
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
R

#### Look at results

In [36]:
print(toronto_venues.shape)
toronto_venues.head(10)

(1620, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Dominion Pub and Kitchen,43.656919,-79.358967,Pub
5,"Regent Park, Harbourfront",43.65426,-79.360636,Corktown Common,43.655618,-79.356211,Park
6,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant
7,"Regent Park, Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
8,"Regent Park, Harbourfront",43.65426,-79.360636,The Extension Room,43.653313,-79.359725,Gym / Fitness Center
9,"Regent Park, Harbourfront",43.65426,-79.360636,The Distillery Historic District,43.650244,-79.359323,Historic Site


#### Look at results by neighborhood

In [37]:
toronto_venues.groupby(['Neighborhood']).count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,56,56,56,56,56,56
"Brockton, Parkdale Village, Exhibition Place",24,24,24,24,24,24
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",16,16,16,16,16,16
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",16,16,16,16,16,16
Central Bay Street,64,64,64,64,64,64
Christie,17,17,17,17,17,17
Church and Wellesley,75,75,75,75,75,75
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,33,33,33,33,33,33
Davisville North,9,9,9,9,9,9


### Analysing Each Neighborhood

In [41]:
#One-hot encoding to list occurences of each Caetgory by neighborhood
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
print(toronto_onehot.shape)
toronto_onehot.head()

(1620, 237)


Unnamed: 0,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Group by neighborhood, look at mean of frequency of occurrence for each venue category

In [48]:
toronto_grouped= toronto_onehot.groupby(['Neighborhood']).mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0625,0.0625,0.0625,0.125,0.1875,0.125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015625,0.0,0.0,0.015625,0.0,0.0,0.015625


#### Extract top 5 most frequent venue types for each neighborhood

In [64]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("*** "+hood+" ***")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index() # extract data for specified hood, tranpose results to DF
    temp.columns = ['venue','frequency'] # rename columns
    temp = temp.iloc[1:] # keep only venue and frequency data (discard hood name, etc in first row)
    temp['frequency'] = temp['frequency'].astype(float)
    temp = temp.round({'frequency':2})
    print(temp.sort_values('frequency', ascending = False).reset_index(drop=True).head(num_top_venues))
    print('\n')


*** Berczy Park ***
                venue  frequency
0         Coffee Shop       0.09
1        Cocktail Bar       0.05
2                Café       0.04
3  Seafood Restaurant       0.04
4         Cheese Shop       0.04


*** Brockton, Parkdale Village, Exhibition Place ***
            venue  frequency
0            Café       0.12
1     Coffee Shop       0.08
2  Breakfast Spot       0.08
3          Bakery       0.08
4     Yoga Studio       0.04


*** Business reply mail Processing Centre, South Central Letter Processing Plant Toronto ***
           venue  frequency
0    Pizza Place       0.06
1  Auto Workshop       0.06
2     Restaurant       0.06
3        Butcher       0.06
4  Burrito Place       0.06


*** CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport ***
                 venue  frequency
0      Airport Service       0.19
1       Airport Lounge       0.12
2     Airport Terminal       0.12
3  Rental Car Location       0.06
4  

### Transfer results in dataframe for further analysis

#### Write function to extract N top venues for a given 'hood'

In [72]:
def return_N_top_venues(row, N_top_venues):
    row_categories = row.iloc[1:] # select all row from second row
    row_categories_sorted = row_categories.sort_values(ascending = False) # sorts df in descending order
    return row_categories_sorted.index.values[0:N_top_venues] # obtain labels for N top venues

#### Aggregate data for all 'hoods' in DF

In [77]:
N_top_venues = 10

indicators = ['st', 'nd', 'rd']

# creating column headers for DF
columns = ['Neighborhood'] # create single item list containing Neighborhood, to be appended with another one of each top venue to select
for ind in np.arange(N_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new DF from the above columns as headers, populates 'hoods' names
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# populates frequency data for each 'hood', using above function
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_N_top_venues(toronto_grouped.iloc[ind, :], N_top_venues)

neighborhoods_venues_sorted.head() # check results

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Beer Bar,Bakery,Seafood Restaurant,Cheese Shop,Restaurant,Café,Gourmet Shop,Pub
1,"Brockton, Parkdale Village, Exhibition Place",Café,Bakery,Coffee Shop,Breakfast Spot,Yoga Studio,Convenience Store,Performing Arts Venue,Pet Store,Climbing Gym,Restaurant
2,"Business reply mail Processing Centre, South C...",Park,Pizza Place,Skate Park,Brewery,Spa,Burrito Place,Farmers Market,Fast Food Restaurant,Restaurant,Butcher
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport Terminal,Sculpture Garden,Airport,Airport Food Court,Airport Gate,Boutique,Boat or Ferry,Rental Car Location
4,Central Bay Street,Coffee Shop,Sandwich Place,Italian Restaurant,Café,Japanese Restaurant,Thai Restaurant,Salad Place,Bubble Tea Shop,Burger Joint,Department Store


## Cluster Analysis of Neighborhoods

In [95]:
Kclusters = 5 # choose clusters numbers 

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1) # preparing Features data, dropping 'hoods' name

# built and fit KNN model to DF of Features
Kmeans = KMeans(n_clusters = Kclusters, random_state = 0).fit(toronto_grouped_clustering)
Kmeans.labels_ # obtain cluster labels output for all 'hoods'

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 4, 0, 1, 0,
       0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

#### Reintegrate results into original DF

In [106]:
# add Cluster label to top N venues DF
neighborhoods_venues_sorted.insert(0, 'Cluster_Labels', Kmeans.labels_)

toronto_merged = toronto_central # initialize merger of initial lat, lng data with venues analysis results

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

#checking results
print(toronto_merged.shape)
toronto_merged.head()

(39, 16)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster_Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Park,Pub,Bakery,Café,Theater,Restaurant,Breakfast Spot,Yoga Studio,Shoe Store
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Diner,Sushi Restaurant,Discount Store,Bar,Beer Bar,Smoothie Shop,Sculpture Garden,Sandwich Place,Distribution Center
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Clothing Store,Coffee Shop,Italian Restaurant,Bubble Tea Shop,Cosmetics Shop,Middle Eastern Restaurant,Café,Japanese Restaurant,Ramen Restaurant,Tea Room
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Café,Coffee Shop,Restaurant,Gastropub,American Restaurant,Cocktail Bar,Department Store,Clothing Store,Creperie,Lingerie Store
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Asian Restaurant,Pub,Trail,Diner,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Yoga Studio


## Vizualize clusters on map

In [112]:
map_clusters = folium.Map(location=[toronto_lat, toronto_lng], zoom_start=12)

x = np.arange(Kclusters)
ys = [i+x+(i*x)**2 for i in range(Kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'],Kmeans.labels_):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker([lat, lon], radius=5, popup=label, color=rainbow[cluster-1], fill=True, fill_color=rainbow[cluster-1], fill_opacity=0.7).add_to(map_clusters)
map_clusters