## Diversity Our Strength: Toronto City

#### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import folium
from geopy.geocoders import Nominatim
import requests
import lxml.html as lh
from sklearn.cluster import KMeans

#### Get the HTML page of Wiki as dataset, and using read_html and convert the html data into list of Data frame object

In [2]:
wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_page = requests.get(wiki)

#Drop cells with a borough that is "Not assigned"
wiki_raw = pd.read_html(wiki_page.content, header = 0)[0]
df = wiki_raw[wiki_raw.Borough != "Not assigned"].reset_index(drop=True)
df.head(12)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


#### For Neighborhood="Not assigned", make the value the same as Borough

In [3]:
for index, row in df.iterrows():
    if row["Neighbourhood"] == "Not assigned":
        row["Neighbourhood"] = row["Borough"]
df.head(12)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [4]:
url = 'http://cocl.us/Geospatial_data'
df_geo = pd.read_csv(url)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Number of rows of the cleaned dataframe

In [5]:
df = df.join(df_geo.set_index('Postal Code'), on='Postal Code')
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


#### Define Foursquare Credentials and Version

In [6]:
CLIENT_ID = '3YOHBI0EHVVD03BNRROV1O2KENPZYIZO5IP1R20YPDF0HRRT' # Enter your Foursquare Client ID
CLIENT_SECRET = 'QXAPPEYIMUIV5YTTJ4UAEETXIDISEPE4ANXHEE3K40RQK5NV' # Enter your Foursquare Client Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 3YOHBI0EHVVD03BNRROV1O2KENPZYIZO5IP1R20YPDF0HRRT
CLIENT_SECRET:QXAPPEYIMUIV5YTTJ4UAEETXIDISEPE4ANXHEE3K40RQK5NV


#### Find the top 300 venues that are within a radius of 500 meters for each Postal Code

In [7]:
radius = 500
limit = 300
venues = []

for lat, long, post, borough, neighbourhood in zip(df['Latitude'], df['Longitude'], df['Postal Code'], df['Borough'], df['Neighbourhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,        CLIENT_SECRET,        VERSION,        lat,        long,        radius,         limit)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post,
            borough,
            neighbourhood,
            lat,
            long,
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))
        venues_df = pd.DataFrame(venues)
        venues_df.head()

In [8]:
venues_df.columns = ['Postal Code', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(2130, 9)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M3A,North York,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,M3A,North York,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,M4A,North York,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,M4A,North York,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
4,M4A,North York,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [9]:
venues_df.shape

(2130, 9)

In [10]:
venues_df.groupby(['Postal Code', 'Borough', 'Neighbourhood']).count().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Postal Code,Borough,Neighbourhood,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
M1B,Scarborough,"Malvern, Rouge",1,1,1,1,1,1
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",2,2,2,2,2,2
M1E,Scarborough,"Guildwood, Morningside, West Hill",7,7,7,7,7,7
M1G,Scarborough,Woburn,3,3,3,3,3,3
M1H,Scarborough,Cedarbrae,8,8,8,8,8,8


In [11]:
print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))

There are 271 uniques categories.


In [12]:
venues_df['VenueCategory'].unique()[:20]

array(['Park', 'Food & Drink Shop', 'Hockey Arena',
       'Portuguese Restaurant', 'Coffee Shop', 'French Restaurant',
       'Intersection', 'Pizza Place', 'Bakery', 'Distribution Center',
       'Breakfast Spot', 'Spa', 'Restaurant', 'Gym / Fitness Center',
       'Historic Site', 'Chocolate Shop', 'Farmers Market',
       'Dessert Shop', 'Pub', 'Performing Arts Venue'], dtype=object)

#### Analyze Each Postal Code for Venue Category

In [13]:
#one hot encoding
toronto_encoding = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add postal, borough and neighborhood column back to dataframe
toronto_encoding['Postal Code'] = venues_df['Postal Code'] 
toronto_encoding['Borough'] = venues_df['Borough'] 
toronto_encoding['Neighbourhood'] = venues_df['Neighbourhood'] 

# move postal, borough and neighborhood column to the first column
fixed_columns = list(toronto_encoding.columns[-3:]) + list(toronto_encoding.columns[:-3])
toronto_encoding = toronto_encoding[fixed_columns]

print(toronto_encoding.shape)
toronto_encoding.head()

(2130, 274)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M3A,North York,Parkwoods,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,North York,Parkwoods,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4A,North York,Victoria Village,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4A,North York,Victoria Village,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4A,North York,Victoria Village,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
grouped = toronto_encoding.groupby(["Postal Code", "Borough", "Neighbourhood"]).mean().reset_index()

print(grouped.shape)
grouped.head()

(99, 274)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1B,Scarborough,"Malvern, Rouge",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,Scarborough,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,Scarborough,Cedarbrae,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### List and display the top 5 existing facilities for each Borough

In [15]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
areaColumns = ["Postal Code", "Borough", "Neighbourhood"]
freqColumns = []
for ind in np.arange(num_top_venues):
    try:
        freqColumns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        freqColumns.append('{}th Most Common Venue'.format(ind+1))
columns = areaColumns+freqColumns

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Postal Code'] = grouped['Postal Code']
neighborhoods_venues_sorted['Borough'] = grouped['Borough']
neighborhoods_venues_sorted['Neighbourhood'] = grouped['Neighbourhood']

for ind in np.arange(grouped.shape[0]):
    row_categories = grouped.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues_sorted.iloc[ind, 3:] = row_categories_sorted.index.values[0:num_top_venues]

# neighborhoods_venues_sorted.sort_values(freqColumns, inplace=True)
print(neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted.head()

(99, 8)


Unnamed: 0,Postal Code,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",Fast Food Restaurant,Donut Shop,Dim Sum Restaurant,Diner,Discount Store
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",Construction & Landscaping,Bar,Yoga Studio,Drugstore,Discount Store
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",Intersection,Breakfast Spot,Electronics Store,Restaurant,Medical Center
3,M1G,Scarborough,Woburn,Coffee Shop,Korean BBQ Restaurant,Yoga Studio,Drugstore,Discount Store
4,M1H,Scarborough,Cedarbrae,Hakka Restaurant,Gas Station,Bakery,Bank,Athletics & Sports


In [16]:
# Set manually to get proper fit in the map
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [17]:
merged = df.copy()
merged = merged.join(neighborhoods_venues_sorted[["Postal Code", "1st Most Common Venue"]].set_index("Postal Code"), on="Postal Code")
print(merged.shape)
merged.head()

(103, 6)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,1st Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,Food & Drink Shop
1,M4A,North York,Victoria Village,43.725882,-79.315572,Hockey Arena
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Coffee Shop
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,Clothing Store
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,Coffee Shop


#### Map visualization of each venues

In [18]:
map = folium.Map(location=[latitude, longitude], zoom_start=11)
# add markers to map
for lat, lng, label1,common in zip(merged['Latitude'], merged['Longitude'], merged['Postal Code'],merged['1st Most Common Venue'] ):
    labelnew =  'Post office : {} , Top Existing Infrastructure  : {}'.format(label1,common)
    label = folium.Popup( labelnew, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map)  
map

#### Feature Engineering for Business Problem

In [19]:
venues_df['VenueCategory'].unique()

array(['Park', 'Food & Drink Shop', 'Hockey Arena',
       'Portuguese Restaurant', 'Coffee Shop', 'French Restaurant',
       'Intersection', 'Pizza Place', 'Bakery', 'Distribution Center',
       'Breakfast Spot', 'Spa', 'Restaurant', 'Gym / Fitness Center',
       'Historic Site', 'Chocolate Shop', 'Farmers Market',
       'Dessert Shop', 'Pub', 'Performing Arts Venue', 'Event Space',
       'Yoga Studio', 'Café', 'Mexican Restaurant', 'Theater',
       'Shoe Store', 'Brewery', 'Art Gallery', 'Cosmetics Shop',
       'Electronics Store', 'Beer Store', 'Bank', 'Hotel',
       'Health Food Store', 'Antique Shop', 'Boutique',
       'Furniture / Home Store', 'Vietnamese Restaurant',
       'Clothing Store', 'Accessories Store', 'Arts & Crafts Store',
       'Miscellaneous Shop', 'Italian Restaurant', 'Beer Bar', 'Creperie',
       'Diner', 'Sushi Restaurant', 'Hobby Shop', 'Burrito Place',
       'Fried Chicken Joint', 'Discount Store', 'Japanese Restaurant',
       'Smoothie Shop', 'S

In [20]:
# Quality Infrastructure Picked
search_query= ['Restaurant', 'Hotel', 'Farmers Market', 'Shopping Mall', ' Shopping Plaza', 'Gym / Fitness Center', 'Pharmacy',
               'Electronics Store', 'Indie Movie Theater', 'Light Rail Station','Metro Station', 'Train','Train Station', 'Garden',
               'Theater', 'ATM', 'Office', 'Bus Station', 'Bank', 'Market' , 'Business Service', 'Monument / Landmark' ,
               'Resort', 'Hospital', 'Police Station', 'School', 'College', 'Café' , 'Park', 'Playground', 'Gas Station',
               'Convention Center', 'College Auditorium', 'Government Building', 'Airport Terminal', 'Lounge', 'Lounge', 'University']
print(len(search_query))

38


In [21]:
quality_dataframe = []
quality_dataframe= venues_df.loc[venues_df['VenueCategory'].isin(search_query)]
quality_dataframe.shape

(409, 9)

In [22]:
quality_dataframe

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M3A,North York,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.332140,Park
13,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,Impact Kitchen,43.656369,-79.356980,Restaurant
14,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,The Extension Room,43.653313,-79.359725,Gym / Fitness Center
15,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,Corktown Common,43.655618,-79.356211,Park
18,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,Distillery Sunday Market,43.650075,-79.361832,Farmers Market
...,...,...,...,...,...,...,...,...,...
2107,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,The Ashbridge Estate,43.664691,-79.321805,Garden
2108,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,TTC Russell Division,43.664908,-79.322560,Light Rail Station
2109,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,Jonathan Ashbridge Park,43.664702,-79.319898,Park
2111,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,TTC Stop #03049,43.664470,-79.325145,Light Rail Station


In [23]:
# one hot encoding
infrastructure = pd.get_dummies(quality_dataframe[['VenueCategory']], prefix="", prefix_sep="")
# add postal, borough and neighborhood column back to dataframe
infrastructure['Postal Code'] = quality_dataframe['Postal Code'] 
infrastructure['Borough'] = quality_dataframe['Borough'] 
infrastructure['Neighbourhood'] = quality_dataframe['Neighbourhood'] 

# move postal, borough and neighborhood column to the first column
fixed_columns = list(infrastructure.columns[-3:]) + list(infrastructure.columns[:-3])
infrastructure = infrastructure[fixed_columns]

print(infrastructure.shape)
infrastructure.head()
print(infrastructure.columns.values)

(409, 31)
['Postal Code' 'Borough' 'Neighbourhood' 'Airport Terminal' 'Bank'
 'Bus Station' 'Business Service' 'Café' 'College Auditorium'
 'Electronics Store' 'Farmers Market' 'Garden' 'Gas Station'
 'Gym / Fitness Center' 'Hospital' 'Hotel' 'Indie Movie Theater'
 'Light Rail Station' 'Lounge' 'Market' 'Metro Station'
 'Monument / Landmark' 'Office' 'Park' 'Pharmacy' 'Playground'
 'Restaurant' 'School' 'Shopping Mall' 'Theater' 'Train Station']


In [24]:
Toronto_infrastructure = infrastructure.groupby(["Postal Code", "Borough", "Neighbourhood"]).sum().reset_index()
print(Toronto_infrastructure.shape)
Toronto_infrastructure.head()

(79, 31)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Airport Terminal,Bank,Bus Station,Business Service,Café,College Auditorium,Electronics Store,...,Monument / Landmark,Office,Park,Pharmacy,Playground,Restaurant,School,Shopping Mall,Theater,Train Station
0,M1E,Scarborough,"Guildwood, Morningside, West Hill",0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,M1H,Scarborough,Cedarbrae,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1J,Scarborough,Scarborough Village,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [25]:

Toronto_infrastructure['Total infrastructure'] =  Toronto_infrastructure[Toronto_infrastructure.drop(["Postal Code", "Borough", "Neighbourhood"], axis=1).columns.values].sum(axis=1)

In [26]:
Toronto_infrastructure.shape

(79, 32)

#### The best place in Toronto with the most quality infrastructure

In [27]:
Toronto_infrastructure[Toronto_infrastructure['Total infrastructure'] == Toronto_infrastructure['Total infrastructure'].max()].transpose()


Unnamed: 0,48
Postal Code,M5K
Borough,Downtown Toronto
Neighbourhood,"Toronto Dominion Centre, Design Exchange"
Airport Terminal,0
Bank,0
Bus Station,0
Business Service,0
Café,5
College Auditorium,0
Electronics Store,0


#### place in Toronto with low infrastructure quality

In [28]:
bad_infrastructure = Toronto_infrastructure[Toronto_infrastructure['Total infrastructure'] == Toronto_infrastructure['Total infrastructure'].min()]
bad_infrastructure

Unnamed: 0,Postal Code,Borough,Neighbourhood,Airport Terminal,Bank,Bus Station,Business Service,Café,College Auditorium,Electronics Store,...,Office,Park,Pharmacy,Playground,Restaurant,School,Shopping Mall,Theater,Train Station,Total infrastructure
2,M1J,Scarborough,Scarborough Village,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
5,M1N,Scarborough,"Birch Cliff, Cliffside West",0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
6,M1R,Scarborough,"Wexford, Maryvale",0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
7,M1S,Scarborough,Agincourt,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
13,M2M,North York,"Willowdale, Newtonbrook",0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
15,M2P,North York,York Mills West,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
16,M2R,North York,"Willowdale, Willowdale West",0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
17,M3A,North York,Parkwoods,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


### Clustering

In [29]:
grup = Toronto_infrastructure.groupby(["Postal Code"]).sum().reset_index()

print(grup.shape)
grup.head()

(79, 30)


Unnamed: 0,Postal Code,Airport Terminal,Bank,Bus Station,Business Service,Café,College Auditorium,Electronics Store,Farmers Market,Garden,...,Office,Park,Pharmacy,Playground,Restaurant,School,Shopping Mall,Theater,Train Station,Total infrastructure
0,M1E,0,1,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,3
1,M1H,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,M1J,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,M1K,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,M1L,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [30]:
grup['Total infrastructure'] =  grup[grup.drop(["Postal Code"], axis=1).columns.values].sum(axis=1)

In [31]:
grup_max = grup[grup['Total infrastructure'] == grup['Total infrastructure'].max()]
print("Best place to stay within a city for vital infrastructure facilities :")
grup_max[['Postal Code', 'Total infrastructure']]
print(grup_max.shape)

Best place to stay within a city for vital infrastructure facilities :
(1, 30)


In [32]:
merged2 = grup.copy()
merged2 = merged2.join(df[["Postal Code",'Latitude', 'Longitude', 'Borough' ]].set_index('Postal Code'), on='Postal Code')
merged2

Unnamed: 0,Postal Code,Airport Terminal,Bank,Bus Station,Business Service,Café,College Auditorium,Electronics Store,Farmers Market,Garden,...,Playground,Restaurant,School,Shopping Mall,Theater,Train Station,Total infrastructure,Latitude,Longitude,Borough
0,M1E,0,1,0,0,0,0,1,0,0,...,0,1,0,0,0,0,6,43.763573,-79.188711,Scarborough
1,M1H,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,4,43.773136,-79.239476,Scarborough
2,M1J,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,2,43.744734,-79.239476,Scarborough
3,M1K,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,2,43.727929,-79.262029,Scarborough
4,M1L,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,43.711112,-79.284577,Scarborough
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,M8X,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,43.653654,-79.506944,Etobicoke
75,M9C,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,4,43.643515,-79.577201,Etobicoke
76,M9N,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,4,43.706876,-79.518188,York
77,M9R,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,43.688905,-79.554724,Etobicoke


In [33]:
# set number of clusters
kclusters = 3

clustering_group = merged2[["Total infrastructure"]]

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(clustering_group)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [34]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
result = merged2.copy()
# add clustering labels
result["Cluster Labels"] = kmeans.labels_
print(result.shape)
result.head() # check the last columns!

(79, 34)


Unnamed: 0,Postal Code,Airport Terminal,Bank,Bus Station,Business Service,Café,College Auditorium,Electronics Store,Farmers Market,Garden,...,Restaurant,School,Shopping Mall,Theater,Train Station,Total infrastructure,Latitude,Longitude,Borough,Cluster Labels
0,M1E,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,6,43.763573,-79.188711,Scarborough,0
1,M1H,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,4,43.773136,-79.239476,Scarborough,0
2,M1J,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,43.744734,-79.239476,Scarborough,0
3,M1K,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,2,43.727929,-79.262029,Scarborough,0
4,M1L,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,43.711112,-79.284577,Scarborough,0


In [35]:
# Set manually to get proper fit in the map
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


#### Map visualization

In [36]:
map_clusters  = folium.Map(location=[latitude, longitude], zoom_start=11)
# set color scheme for the clusters
x = np.arange(kclusters)
rainbow = [    'red',    'blue',    'orange',    'darkgreen',    'darkblue',    'black']
# add markers to map
markers_colors = []
for lat, lng, label1,common, cluster in zip(result['Latitude'], result['Longitude'], result['Postal Code'],result['Total infrastructure'] , result['Cluster Labels']):
    labelnew =  'Postal Code : {} , Total infrastructure : {}'.format(label1,common)
    label = folium.Popup( labelnew, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7,
        parse_html=False).add_to(map_clusters)
map_clusters

#### Cluster 1

In [37]:
result.loc[result['Cluster Labels'] == 0]

Unnamed: 0,Postal Code,Airport Terminal,Bank,Bus Station,Business Service,Café,College Auditorium,Electronics Store,Farmers Market,Garden,...,Restaurant,School,Shopping Mall,Theater,Train Station,Total infrastructure,Latitude,Longitude,Borough,Cluster Labels
0,M1E,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,6,43.763573,-79.188711,Scarborough,0
1,M1H,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,4,43.773136,-79.239476,Scarborough,0
2,M1J,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,43.744734,-79.239476,Scarborough,0
3,M1K,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,2,43.727929,-79.262029,Scarborough,0
4,M1L,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,43.711112,-79.284577,Scarborough,0
5,M1N,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,2,43.692657,-79.264848,Scarborough,0
6,M1R,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,2,43.750072,-79.295849,Scarborough,0
7,M1S,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,43.7942,-79.262029,Scarborough,0
8,M1T,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,4,43.781638,-79.304302,Scarborough,0
9,M1V,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,4,43.815252,-79.284577,Scarborough,0


#### Cluster 2

In [38]:
result.loc[result['Cluster Labels'] == 1]

Unnamed: 0,Postal Code,Airport Terminal,Bank,Bus Station,Business Service,Café,College Auditorium,Electronics Store,Farmers Market,Garden,...,Restaurant,School,Shopping Mall,Theater,Train Station,Total infrastructure,Latitude,Longitude,Borough,Cluster Labels
41,M5A,0,1,0,0,3,0,1,1,0,...,1,0,0,2,0,28,43.65426,-79.360636,Downtown Toronto,1
42,M5B,0,1,0,0,4,0,1,0,0,...,1,0,1,2,0,32,43.657162,-79.378937,Downtown Toronto,1
43,M5C,0,0,0,0,4,0,1,2,0,...,2,0,0,1,0,30,43.651494,-79.375418,Downtown Toronto,1
46,M5H,0,0,0,0,5,0,1,0,0,...,4,0,0,1,0,42,43.650571,-79.384568,Downtown Toronto,1
47,M5J,0,1,0,0,4,0,0,0,0,...,3,0,0,1,1,40,43.640816,-79.381752,Downtown Toronto,1
48,M5K,0,0,0,0,5,0,0,0,1,...,4,0,1,1,1,46,43.647177,-79.381576,Downtown Toronto,1
49,M5L,0,0,0,0,6,0,0,0,0,...,7,0,0,0,0,44,43.648198,-79.379817,Downtown Toronto,1
57,M5W,0,0,0,0,3,0,0,2,0,...,3,0,1,0,0,32,43.646435,-79.374846,Downtown Toronto,1
58,M5X,0,0,0,0,7,0,0,0,0,...,4,0,0,1,1,42,43.648429,-79.38228,Downtown Toronto,1


#### Cluster 3

In [39]:
result.loc[result['Cluster Labels'] == 2]

Unnamed: 0,Postal Code,Airport Terminal,Bank,Bus Station,Business Service,Café,College Auditorium,Electronics Store,Farmers Market,Garden,...,Restaurant,School,Shopping Mall,Theater,Train Station,Total infrastructure,Latitude,Longitude,Borough,Cluster Labels
11,M2J,0,2,1,0,0,0,1,0,0,...,3,0,1,1,0,20,43.778517,-79.346556,North York,2
14,M2N,0,1,0,0,2,0,1,0,0,...,2,0,2,0,0,20,43.77012,-79.408493,North York,2
20,M3H,0,2,0,0,0,0,0,0,0,...,1,0,1,0,0,14,43.754328,-79.442259,North York,2
35,M4S,0,0,0,0,2,0,0,1,0,...,1,0,0,0,0,18,43.704324,-79.38879,Central Toronto,2
39,M4X,0,1,0,0,2,0,0,0,0,...,2,0,0,0,0,22,43.667967,-79.367675,Downtown Toronto,2
40,M4Y,0,0,0,0,2,0,0,0,0,...,3,0,0,1,0,18,43.66586,-79.38316,Downtown Toronto,2
44,M5E,0,0,0,0,1,0,0,2,0,...,2,0,1,0,0,18,43.644771,-79.373306,Downtown Toronto,2
45,M5G,0,1,0,0,3,0,0,0,0,...,1,0,0,0,0,18,43.657952,-79.387383,Downtown Toronto,2
54,M5S,0,1,0,0,6,0,0,0,0,...,1,0,0,1,0,18,43.662696,-79.400049,Downtown Toronto,2
55,M5T,0,0,0,0,5,0,0,2,0,...,0,0,0,0,0,20,43.653206,-79.400049,Downtown Toronto,2
