In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import urllib.request

In [2]:
# Empty frame with the three postal-code columns and a single all-NaN
# placeholder row at index 0.
m_codes = pd.DataFrame(index=[0], columns=['Postal Code', 'Borough', 'Neighborhood'])

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Only the download needs the open response; parsing works on the bytes
# after the connection has been closed.
with urllib.request.urlopen(url) as response:
    wikipage = response.read()
markup = BeautifulSoup(wikipage, 'html.parser')

# The postal-code table is taken to be the first <tbody> on the page.
table = markup.find_all('tbody')[0]

postal_columns = ['Postal Code', 'Borough', 'Neighborhood']

# Collect one record per table row and build the frame once, instead of
# enlarging the frame row by row while scraping.
records = []
for row in table.find_all('tr'):
    cells = [cell.get_text().strip() for cell in row.find_all('td')]
    # Rows without <td> cells (e.g. the header) or with an unexpected number
    # of cells cannot be mapped onto the three columns, so they are skipped.
    if len(cells) != len(postal_columns):
        continue
    records.append(cells)

m_codes = pd.DataFrame(records, columns=postal_columns)

# Treat an empty neighbourhood as missing, then drop incomplete rows.
# Assigning the result back (rather than a chained inplace replace on the
# column) guarantees the change lands in m_codes itself.
m_codes['Neighborhood'] = m_codes['Neighborhood'].replace('', np.nan)
m_codes = m_codes.dropna(how='any')

m_codes.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [4]:
# Collapse the neighbourhoods that share a postal code and borough into one
# comma-separated string. Selecting 'Neighborhood' before transform yields a
# Series aligned with m_codes, rather than assigning a whole DataFrame to a
# single column.
m_codes['Neighborhood'] = (
    m_codes
    .groupby(['Postal Code', 'Borough'])['Neighborhood']
    .transform(lambda names: ', '.join(names))
)

# Every row of a group now carries the same joined string, so the repeated
# rows are exact duplicates and can be dropped.
m_codes = m_codes.drop_duplicates()

m_codes.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Harbourfront, Regent Park"


In [5]:
# Keep only postal codes that have been assigned to a borough. The explicit
# copy makes m_codes an independent frame, so the .loc assignment into it in
# the next cell is not a write into a slice of the unfiltered frame.
m_codes = m_codes[m_codes['Borough'] != "Not assigned"].copy()

m_codes.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Harbourfront, Regent Park"
6,M6A,North York,"Lawrence Heights, Lawrence Manor"
8,M7A,Queen's Park,Not assigned


In [6]:
# Where the neighbourhood is the "Not assigned" placeholder, fall back to the
# borough name of the same row.
unnamed = m_codes['Neighborhood'] == "Not assigned"
m_codes.loc[unnamed, 'Neighborhood'] = m_codes.loc[unnamed, 'Borough']

# Renumber the rows 0..n-1; the earlier filtering left gaps in the index.
m_codes = m_codes.reset_index(drop=True)

m_codes.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [7]:
# (rows, columns) of the cleaned postal-code frame.
m_codes.shape

(103, 3)

# ------------Break for 1st GitHub publishing--------------

## not using this code b/c geocoder was crazy slow

! pip install geocoder

import geocoder # import geocoder

# Upper bound on lookups per postal code, so a code that never resolves
# cannot keep the notebook spinning forever.
MAX_GEOCODE_ATTEMPTS = 5

latitudes = []
longitudes = []

for postal_code in m_codes['Postal Code']:

    print('For code:' + postal_code)

    # retry until coordinates come back or the attempts run out
    lat_lng_coords = None
    for attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        if lat_lng_coords:
            break

    if lat_lng_coords:
        latitude = lat_lng_coords[0]
        longitude = lat_lng_coords[1]
    else:
        # no result after all attempts: record the coordinates as missing
        latitude = longitude = float('nan')

    # format() rather than '+': the coordinates are not strings, and
    # concatenating a str with a number raises TypeError
    print('Lat:{}, Long:{}'.format(latitude, longitude))

    latitudes.append(latitude)
    longitudes.append(longitude)

# Assign whole columns at the end: the row handed out by iterrows() is a
# copy, so writing into it never reached m_codes.
m_codes['Latitude'] = latitudes
m_codes['Longitude'] = longitudes

m_codes.head()
    

In [8]:
# Download the postal-code coordinate CSV; the next cell reads it from the
# working directory as 'Geospatial_data'.
!wget 'https://cocl.us/Geospatial_data'

--2019-06-27 01:50:07--  https://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 169.48.113.201
Connecting to cocl.us (cocl.us)|169.48.113.201|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-06-27 01:50:10--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.27.197
Connecting to ibm.box.com (ibm.box.com)|107.152.27.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-06-27 01:50:10--  https://ibm.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Reusing existing connection to ibm.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.ent.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-06-27 

In [9]:
# Load the coordinate table from the file named 'Geospatial_data'.
geocoded_df = pd.read_csv('Geospatial_data')

# Left join on the postal code: every row of m_codes is kept and gains the
# coordinate columns of its matching row in geocoded_df.
m_codes_geo = pd.merge(m_codes, geocoded_df, on='Postal Code', how='left')

m_codes_geo.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [10]:
# (rows, columns) after merging in the coordinates.
m_codes_geo.shape

(103, 5)

# ------------Break for 2nd GitHub publishing--------------

In [11]:
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [12]:
address = 'Toronto, Ontario, CA'

# Geocode the city once; latitude/longitude are reused below as the centre
# of every map.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [13]:
# create map of Toronto using latitude and longitude values
map_tor = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip( m_codes_geo['Latitude'], 
                                            m_codes_geo['Longitude'], 
                                            m_codes_geo['Borough'],
                                            m_codes_geo['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_tor)  
    
map_tor

Originally I was going to use only the postal codes in the Toronto boroughs, but there ended up being only a few dozen with not much variety, so I doubled back to this point and replaced the line that copied the Toronto data into a new dataframe, with a simple copying of the original dataframe. I kept the same name for the new dataframe, "toronto_only", for the sake of not having to edit the remaining code cells below.

Because the postal codes are no longer filtered down to the Toronto boroughs, the map below is redundant with the one above.

In [14]:
#toronto_only = m_codes_geo[m_codes_geo['Borough'].str.contains("Toronto")].reset_index(drop=True)

# Independent copy of the full frame under the old name (copy() is deep by
# default), so the cells below keep working unchanged.
toronto_only = m_codes_geo.copy()

toronto_only.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [15]:
# create map of Toronto using latitude and longitude values
map_tor_only = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip( toronto_only['Latitude'], 
                                            toronto_only['Longitude'], 
                                            toronto_only['Borough'],
                                            toronto_only['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_tor_only)  
    
map_tor_only

Hidden cell below this to store Foursquare credentials

In [16]:
# The code was removed by Watson Studio for sharing.

In [17]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category attached to a venue record.

    The category list is read from ``row['categories']``; when that key is
    missing, ``row['venue.categories']`` is used instead.

    Returns:
        The ``'name'`` of the first category, or None when the list is empty.

    Raises:
        KeyError: if neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other failure is a
        # real error and is allowed to propagate.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']
    
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare explore endpoint around each location and tabulate the venues.

    Relies on the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.

    Parameters:
        names: labels for the locations (stored in the 'Postal Code' column).
        latitudes, longitudes: coordinates, parallel to ``names``.
        radius: value sent as the request's ``radius`` parameter.

    Returns:
        A DataFrame with one row per returned venue and the columns
        'Postal Code', 'Code Latitude', 'Code Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude', 'Venue Category'. When no venues
        are returned (or no locations are given) the frame is empty but still
        has these columns. A venue without categories gets None as its
        category.
    """
    venue_columns = ['Postal Code',
                     'Code Latitude',
                     'Code Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # an empty category list would otherwise raise IndexError
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor (instead of assigning them
    # afterwards) also works when there are no rows at all.
    return pd.DataFrame(venue_rows, columns=venue_columns)

Import up to 100 venues per postal code, keeping only venues within 500 meters of the latitude/longitude recorded for that postal code.

In [18]:
# Maximum number of venues requested per postal code; getNearbyVenues reads
# this as a global.
LIMIT = 100

# Search radius around each postal code's coordinates.
RADIUS = 500

# RADIUS is passed explicitly so that changing the constant above actually
# changes the request, instead of silently using the function's default.
tor_venues = getNearbyVenues(   names=toronto_only['Postal Code'],
                                latitudes=toronto_only['Latitude'],
                                longitudes=toronto_only['Longitude'],
                                radius=RADIUS
                            )

print('Done!')

Done!


Double-checking that the resulting dataframe looks alright

In [19]:
# (rows, columns): one row per venue returned across all postal codes.
tor_venues.shape

(2258, 7)

In [20]:
# Preview the first venue rows.
tor_venues.head()

Unnamed: 0,Postal Code,Code Latitude,Code Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,M3A,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
2,M3A,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,M4A,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,M4A,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


The limit of 100 venues per postal code probably prevented certain densely-developed areas from having their full set of venues represented; I wanted to get an idea of how many postal codes hit this limit.

In [21]:
# Ten postal codes with the most venues; a count of 100 means the request
# limit was reached.
(
    tor_venues
    .groupby('Postal Code')
    .count()
    .sort_values(by=['Venue'], ascending=False)
    .head(10)
)

Unnamed: 0_level_0,Code Latitude,Code Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M5H,100,100,100,100,100,100
M5B,100,100,100,100,100,100
M5X,100,100,100,100,100,100
M5T,100,100,100,100,100,100
M5C,100,100,100,100,100,100
M5L,100,100,100,100,100,100
M5K,100,100,100,100,100,100
M5J,100,100,100,100,100,100
M5W,94,94,94,94,94,94
M5G,88,88,88,88,88,88


In order to cluster the postal codes using the KMeans function, each category of venues needs to be made into a feature representing which venues (represented by rows) were labeled with that category. The venue name and location info are dropped, but this data is going to be reduced to one row per postal code in the next cell anyway, where the rest of the columns will represent the portion of the venues in that row (postal code) that were labeled with each category.

If you're confused by the commented-out line about renaming a "Neighborhood" column, I noticed that one of the categories for venues is actually "Neighborhood" and it was causing problems when trying to import the Neighborhood names column from the prior dataframes, so I renamed it at one point during some exploratory manipulation. Even though I ended up not needing to bring that column, I figured it would be wise to keep that code visible to remind me of the potential confusion if I revisit this notebook later on.

In [22]:
# one hot encoding
tor_onehot = pd.get_dummies(tor_venues[['Venue Category']], prefix="", prefix_sep="")

#print(tor_onehot.shape)

#tor_onehot.rename(columns = {"Neighborhood":"Neighborhood (category)"}, inplace=True)

#print(tor_onehot['Neighborhood venue'])

# add Postal Code column back to dataframe
tor_onehot['Postal Code'] = tor_venues['Postal Code']

#print(tor_onehot.shape)

# move neighborhood column to the first column
fixed_columns = [tor_onehot.columns[-1]] + list(tor_onehot.columns[:-1])
tor_onehot = tor_onehot[fixed_columns]

tor_onehot.head()

Unnamed: 0,Postal Code,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Mean of each indicator column per postal code, i.e. the share of that
# postal code's venues falling in each category.
tor_grouped = tor_onehot.groupby('Postal Code', as_index=False).mean()
tor_grouped.head()

Unnamed: 0,Postal Code,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped; the remaining entries are sorted
    in descending order and the index labels of the first ``num_top_venues``
    of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

The following block incorporates the full KMeans clustering process. I integrated it into a single block because changing the number of clusters and/or top venues required re-running a few other steps afterward anyway; this way one cell resets the resulting dataframe, runs the clustering, and prepares the results for further analysis.

In [25]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# Column headers: '1st', '2nd', '3rd', then 'Nth' for every later rank.
# The suffix is chosen by an explicit bounds check rather than by catching
# whatever exception an out-of-range lookup happens to raise.
columns = ['Postal Code']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# One row per postal code, filled with its most frequent venue categories.
pcodes_venues_sorted = pd.DataFrame(columns=columns)
pcodes_venues_sorted['Postal Code'] = tor_grouped['Postal Code']

for ind in range(tor_grouped.shape[0]):
    pcodes_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tor_grouped.iloc[ind, :], num_top_venues)

# set number of clusters
kclusters = 4

# Cluster on the category frequencies only; the postal code is an identifier,
# not a feature. `columns=` is used because newer pandas no longer accepts
# the axis as a positional argument.
tor_grouped_clustering = tor_grouped.drop(columns='Postal Code')

# run k-means clustering (fixed random_state so re-runs give the same labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tor_grouped_clustering)

# add clustering labels
pcodes_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach the cluster label and top venues to the coordinates of each postal
# code. join() returns a new frame, so toronto_only itself is left untouched.
tor_merged = toronto_only.join(pcodes_venues_sorted.set_index('Postal Code'), on='Postal Code')

# Postal codes missing from pcodes_venues_sorted come out of the join with
# NaN in the new columns; drop them.
tor_merged = tor_merged.dropna(how='any')

tor_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0.0,Fast Food Restaurant,Park,Food & Drink Shop,Dumpling Restaurant,Diner
1,M4A,North York,Victoria Village,43.725882,-79.315572,2.0,Intersection,Coffee Shop,Hockey Arena,Portuguese Restaurant,Drugstore
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,2.0,Coffee Shop,Pub,Bakery,Park,Theater
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,2.0,Clothing Store,Furniture / Home Store,Women's Store,Coffee Shop,Fraternity House
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,2.0,Coffee Shop,Sushi Restaurant,Gym,Japanese Restaurant,Park


Standard mapping code below, nothing fancy. In a proper map, I would choose specific colors for the markers b/c the defaults are often hard to differentiate from each other and the map background.

In [26]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster, postal_code in zip(   tor_merged['Latitude'], 
                                                  tor_merged['Longitude'], 
                                                  tor_merged['Neighborhood'], 
                                                  tor_merged['Cluster Labels'],
                                                  tor_merged['Postal Code']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster+1) + ' Postal ' + str(postal_code), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

I chose to adapt the grouped print-out code from the NY neighborhood analysis to help illustrate the types of venues in each cluster, as well as the number of postal codes to identify any clusters that might be catching outliers from the other, larger clusters.

In [27]:
pcodes_clustered = pcodes_venues_sorted[['Cluster Labels', 'Postal Code']]

# Attach each postal code's cluster label to its venue-category frequencies.
tor_grouped_clustered = tor_grouped.join(pcodes_clustered.set_index('Postal Code'), on='Postal Code')

by_cluster = tor_grouped_clustered.groupby(['Cluster Labels'])

# numeric_only=True keeps the string 'Postal Code' column out of the mean;
# averaging it is meaningless and newer pandas raises instead of dropping it.
clusters_venue_freq = by_cluster.mean(numeric_only=True).reset_index()

# Non-null entries per column and cluster; the 'Postal Code' column gives the
# number of postal codes in each cluster.
count_per_cluster = by_cluster.count()

num_top_venues = 5

for cluster in clusters_venue_freq['Cluster Labels']:
    print("----"+str(cluster+1)+"----" + str(count_per_cluster.loc[cluster, 'Postal Code']) + ' postal code(s)' )
    # Transpose the single row of this cluster into (venue, freq) pairs.
    temp = clusters_venue_freq[clusters_venue_freq['Cluster Labels'] == cluster].T.reset_index()
    temp.columns = ['venue','freq']
    # The first row is the 'Cluster Labels' entry, not a venue category.
    # copy() so the assignment below does not write into a slice.
    temp = temp.iloc[1:].copy()
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----1----13 postal code(s)
                  venue  freq
0                  Park  0.38
1              Bus Line  0.04
2  Fast Food Restaurant  0.04
3           Pizza Place  0.04
4     Convenience Store  0.04


----2----2 postal code(s)
                 venue  freq
0       Baseball Field   1.0
1    Accessories Store   0.0
2   Miscellaneous Shop   0.0
3                Motel   0.0
4  Monument / Landmark   0.0


----3----83 postal code(s)
            venue  freq
0     Coffee Shop  0.07
1     Pizza Place  0.04
2            Café  0.03
3            Bank  0.03
4  Sandwich Place  0.03


----4----2 postal code(s)
                venue  freq
0          Playground  0.75
1        Tennis Court  0.25
2  Miscellaneous Shop  0.00
3       Movie Theater  0.00
4               Motel  0.00




## Summary

The biggest differentiation appears to be the level of commercial development: the clusters fall along a spectrum that ranges from heavily commercial areas at one end to wide open spaces at the other.

Cluster 1 (red dots) appears to be less developed land, left mostly for parks with a few other venues scattered about.

Cluster 3 (teal dots) appears to be heavily developed land, and Canadians really love their coffee (Tim Hortons?).

Cluster 4 (light green dots) and Cluster 2 (purple dots) appear to be almost devoid of businesses, so they may be exclusively residential areas? For Cluster 2, I confirmed that neither M8Y nor M9M is the postal code for the Blue Jays' stadium, which is in M5V.