# Part 1: Creating DataFrame with all the postal codes, boroughs and neighborhoods

In [126]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

#parsing and collecting the data
url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
Data = url.text
soup = BeautifulSoup(Data, 'html.parser')

#creating lists to collect the information
PostalCode = list()
Borough = list()
Neighborhood = list()
i = len(soup.table.tbody.find_all('tr'))

#parsing the data to collect the information and store it in the lists
for x in range(i-1):
    th = soup.table.tbody.find_all('tr')[x+1].find_all('td')
    PostalCode.append(th[0].string)
    Borough.append(th[1].string)
    Neighborhood.append(th[2].string)
    
#cleaning data
PostalCode = PostalCode[3:]
Borough = Borough[3:]
Neighborhood = Neighborhood[3:]

In [127]:
#cleaning the borough data by eliminating null values in Borough column
lst = list()
for i in range(len(Borough)):
    if Borough[i] != 'Not assigned':
        lst.append(i)
PostalCode = [PostalCode[i] for i in lst]
Borough = [Borough[i] for i in lst]
Neighborhood = [Neighborhood[i] for i in lst]

#Changing neighborhood None values to empty string
test = list() 
for i in range(len(Neighborhood)):
    if Neighborhood[i] is not None:
        test.append(Neighborhood[i][:-1])
    else:
        test.append('')
Neighborhood = test 

In [128]:
#Creating dataframe and assigning lists
df = pd.DataFrame(columns = ['Postal Code','Borough','Neighborhood'])
df['Postal Code'] = PostalCode
df['Borough'] = Borough
df['Neighborhood'] = Neighborhood

In [129]:
#dropping rows with empty neighborhood values
for i in df['Postal Code'].unique():
    present = df[df['Postal Code'] == i]
    for j in present.index:
        if present.loc[j,'Neighborhood'] == '' or present.loc[j,'Neighborhood'] == 'Not assigned':
            df = df.drop(j)
df = df.reset_index(drop = True)

In [130]:
#creating dictionary with index as PostalCode and value as lists of Neighborhoods in that code
neigh_dict = dict()
for i in range(len(df['Postal Code'])):
    index = df.loc[i, 'Postal Code']
    existing = neigh_dict.get(index, list())
    existing.append(df.loc[i, 'Neighborhood'])
    neigh_dict[index] = existing

In [131]:
#Match the boroughs to the new postalcode/neighborhood lists
lst = list()
for i in range(len(neigh_dict)):
    x = df[df['Postal Code'] == list(neigh_dict.keys())[i]]['Borough'].values[0]
    lst.append(x)
Borough = lst

In [132]:
#empty out old dataframe
df.iloc[0:0]

Unnamed: 0,Postal Code,Borough,Neighborhood


In [133]:
#assign new values to dataframe and drop all none values
df['Postal Code'] = pd.Series(list(neigh_dict.keys()))
df['Neighborhood'] = pd.Series(list(neigh_dict.values()))
df['Borough'] = pd.Series(Borough)
df = df.dropna()

In [134]:
#change the lists in the neighborhood to strings separated by commas
df['Neighborhood'] = [', '.join(x) for x in df['Neighborhood']]

In [135]:
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3B,North York,Don Mills North
1,M5B,Downtown Toronto,"Ryerson, Garden District"
2,M6B,North York,Glencairn
3,M9B,Etobicoke,"Cloverdale, Martin Grove"
4,M3C,North York,Don Mills South
5,M9C,Etobicoke,"Bloordale Gardens, Eringate, Old Burnhamthorpe"
6,M1E,Scarborough,Guildwood
7,M6E,York,Caledonia-Fairbanks
8,M5G,Downtown Toronto,Central Bay Street
9,M6G,Downtown Toronto,Christie


In [136]:
print("Shape of dataframe:", df.shape)

Shape of dataframe: (59, 3)


# Part 2: Creating dataframe with latitude and longitude coordinates

In [137]:
#getting coordinates for latitude and longitude from csv and putting it in the dataframe
import geocoder
latitude = list()
longitude = list()

df_coord = pd.read_csv('Geospatial_Coordinates.csv')

for i in range(len(df)):
    postalcode = df_coord['Postal Code'][i]
    latitude.append(df_coord[df_coord['Postal Code'] == postalcode].values[0][1])
    longitude.append(df_coord[df_coord['Postal Code'] == postalcode].values[0][2])    
df['Latitude'] = latitude
df['Longitude'] = longitude

In [138]:
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3B,North York,Don Mills North,43.806686,-79.194353
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.784535,-79.160497
2,M6B,North York,Glencairn,43.763573,-79.188711
3,M9B,Etobicoke,"Cloverdale, Martin Grove",43.770992,-79.216917
4,M3C,North York,Don Mills South,43.773136,-79.239476
5,M9C,Etobicoke,"Bloordale Gardens, Eringate, Old Burnhamthorpe",43.744734,-79.239476
6,M1E,Scarborough,Guildwood,43.727929,-79.262029
7,M6E,York,Caledonia-Fairbanks,43.711112,-79.284577
8,M5G,Downtown Toronto,Central Bay Street,43.716316,-79.239476
9,M6G,Downtown Toronto,Christie,43.692657,-79.264848


# Part 3: Exploring the data

I will be clustering and mapping all the boroughs of Toronto onto a map

In [139]:
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [140]:
#getting geographical coordinates
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent = 'toronto_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

#creating map of toronto with neighborhoods superimposed on top
map_toronto = folium.Map(location = [latitude, longitude], zoom_start = 10)
for lat, lng, postal, borough in zip(df['Latitude'], df['Longitude'], df['Postal Code'], df['Borough']):
    label = 'Borough: {}, Postal Code: {}'.format(borough, postal)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
map_toronto

In [141]:
#creating map of the North York Borough
map_NorthYork = folium.Map(location = [latitude, longitude], zoom_start = 10)
df_NorthYork = df[df['Borough'] == 'North York']
for lat, lng, postal, borough in zip(df_NorthYork['Latitude'], df_NorthYork['Longitude'], df_NorthYork['Postal Code'], df_NorthYork['Borough']):
    label = 'Borough: {}, Postal Code: {}'.format(borough, postal)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='red',
        fill_opacity=0.5,
        parse_html=False).add_to(map_NorthYork)  
map_NorthYork

In [142]:
#Exploring the North York borough
CLIENT_ID = 'O01XON5X0HRHLTYIS21HGKLGFJXLEADJ0TRWK1HYKMMPEJ0M' # your Foursquare ID
CLIENT_SECRET = 'ZKG0YDR2I0I5I2ZZD3F4N4ZCR4SQN4QC4RQJBL251CMD1O5O' # your Foursquare Secret
VERSION = '20200208' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: O01XON5X0HRHLTYIS21HGKLGFJXLEADJ0TRWK1HYKMMPEJ0M
CLIENT_SECRET:ZKG0YDR2I0I5I2ZZD3F4N4ZCR4SQN4QC4RQJBL251CMD1O5O


In [143]:
address = 'North York, Ontario'
geolocator = Nominatim(user_agent = 'northyork_explorer')
location = geolocator.geocode(address)
boro_latitude = location.latitude
boro_longitude = location.longitude
# type your answer here
radius = 500
limit = 100
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, 
                                                                                                                           CLIENT_SECRET, 
                                                                                                                           boro_latitude, 
                                                                                                                           boro_longitude, 
                                                                                                                           VERSION, 
                                                                                                                           radius, 
                                                                                                                           limit)

In [144]:
results = requests.get(url).json()
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [145]:
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Grill Gate,Mediterranean Restaurant,43.753123,-79.45169
1,Orly Restaurant & Grill,Middle Eastern Restaurant,43.754493,-79.443507
2,Tim Hortons,Coffee Shop,43.754767,-79.44325
3,Domino's Pizza,Pizza Place,43.753127,-79.450926


In [146]:
def getNearbyVenues(names, postalcodes, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, postalcode, lat, lng in zip(names, postalcodes, latitudes, longitudes):            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            postalcode,
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Borough', 
                  'Postal Code',
                  'Borough Latitude', 
                  'Borough Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [147]:
#getting venues for all postal codes
toronto_venues = getNearbyVenues(names=df['Borough'], 
                                 postalcodes = df['Postal Code'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )
print(len(toronto_venues['Venue Category'].unique()))

208


In [148]:
#one hot encoding variables
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Borough'] = toronto_venues['Borough'] 
toronto_onehot['Postal Code'] = toronto_venues['Postal Code']
fixed_columns = [toronto_onehot.columns[-2]] + [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-2])
toronto_onehot = toronto_onehot[fixed_columns]
toronto_onehot

Unnamed: 0,Borough,Postal Code,Afghan Restaurant,Airport,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Trail,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,North York,M3B,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,North York,M3B,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Downtown Toronto,M5B,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Downtown Toronto,M5B,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Downtown Toronto,M5B,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1152,Etobicoke,M8Z,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1153,Etobicoke,M8Z,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1154,Etobicoke,M8Z,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1155,Etobicoke,M8Z,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [149]:
#cleaning and preparing dataframe for kmeans
toronto_grouped = toronto_onehot.groupby('Postal Code').mean().reset_index()
pc = toronto_grouped['Postal Code']
toronto_grouped

Unnamed: 0,Postal Code,Afghan Restaurant,Airport,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,...,Trail,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0
2,M1M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M1V,0.0,0.0,0.0,0.0,0.0,0.0,0.028571,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M1W,0.012346,0.0,0.012346,0.0,0.0,0.012346,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.012346,0.0,0.0,0.0,0.0,0.012346
7,M2J,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,M2L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,M2N,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [150]:
#top 5 venues for each postal code
num_top_venues = 5

for hood in toronto_grouped['Postal Code']:
    print("----"+hood+"----")
    print("Borough:", toronto_onehot.loc[toronto_onehot[toronto_onehot['Postal Code'] == hood].index[0], 'Borough'])
    temp = toronto_grouped[toronto_grouped['Postal Code'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----M1E----
Borough: Scarborough
                   venue  freq
0             Hobby Shop  0.25
1       Department Store  0.25
2            Coffee Shop  0.25
3     Chinese Restaurant  0.25
4  Performing Arts Venue  0.00


----M1H----
Borough: Scarborough
                   venue  freq
0      Indian Restaurant   0.4
1     Chinese Restaurant   0.2
2              Pet Store   0.2
3  Vietnamese Restaurant   0.2
4      Afghan Restaurant   0.0


----M1M----
Borough: Scarborough
           venue  freq
0    Pizza Place  0.17
1       Pharmacy  0.17
2        Butcher  0.17
3    Coffee Shop  0.17
4  Grocery Store  0.17


----M1N----
Borough: Scarborough
                  venue  freq
0        Massage Studio  0.25
1           Coffee Shop  0.25
2                   Bar  0.25
3  Caribbean Restaurant  0.25
4     Afghan Restaurant  0.00


----M1T----
Borough: Scarborough
               venue  freq
0     Sandwich Place  0.14
1  Food & Drink Shop  0.14
2              Hotel  0.14
3     Breakfast Spot  0.14
4 

In [151]:
#fitting kmeans model
kclusters = 5
toronto_clustering_grouped = toronto_grouped.drop('Postal Code', 1)
kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(toronto_clustering_grouped)

In [152]:
#pairing postal codes with latitude and longitude
lat = list()
long = list()
for i in pc:
    lat.append(df.loc[df[df['Postal Code'] == i].index[0], 'Latitude'])
    long.append(df.loc[df[df['Postal Code'] == i].index[0], 'Longitude'])

In [153]:
#create dataframe for mapping
mapped_toronto = pd.DataFrame(columns = ['Postal Code', 'Latitude', 'Longitude', 'Cluster Labels'])
mapped_toronto['Postal Code'] = pc
mapped_toronto['Latitude'] = lat
mapped_toronto['Longitude'] = long
mapped_toronto['Cluster Labels'] = kmeans.labels_

In [154]:
mapped_toronto.head()

Unnamed: 0,Postal Code,Latitude,Longitude,Cluster Labels
0,M1E,43.727929,-79.262029,0
1,M1H,43.75741,-79.273304,0
2,M1M,43.782736,-79.442259,0
3,M1N,43.76798,-79.487262,0
4,M1T,43.712751,-79.390197,0


In [156]:
# create map that's clustered
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map that are clustered
markers_colors = []
for lat, lon, poscode, cluster in zip(mapped_toronto['Latitude'], mapped_toronto['Longitude'], mapped_toronto['Postal Code'], mapped_toronto['Cluster Labels']):
    boro = df.loc[df[df['Postal Code'] == poscode].index[0], 'Borough']
    label = folium.Popup('Borough: ' + 
                         str(boro) + ', ' + '\n' +
                         'Postal Code: ' + 
                         str(poscode) + ',' + '\n' +
                         'Cluster: ' + 
                         str(cluster), parse_html = True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters