# 1. Fetching Data

In [176]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

#BeautifulSoup for Web Scraping
from bs4 import BeautifulSoup

#Convert Address to Latitude, Longitude
import geocoder

#Time functionality
import time


print('Libraries imported.')

Libraries imported.


In [177]:
raw_data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

parsed =  BeautifulSoup(raw_data.text, 'lxml')
tables = parsed.find_all('table')
table = tables[0]
subtags = table.find_all('tr') #Find all the <tr> Tags and fetch into array
initTag = subtags[3]  #First 3 rows are unassigned and hence ignoring them
subtags = subtags[4:] 

# 2. Creating and Building Dataframe

In [178]:
column_names = [ 'PostalCode','Borough', 'Neighborhood']
df = pd.DataFrame(columns= column_names)


In [179]:
#A function that takes input an array of <td> and returns the code, borough and neighbourhood as per the required format
def getValues(values):
    if(True):
        code = values[0].get_text()
        borough = values[1].get_text()
        neibr = values[2].get_text().rstrip()
        if(borough != 'Not assigned'):
            if(neibr != 'Not assigned'):
                return code, borough, neibr
            else:
                return code, borough, borough
        else:
            return None, None, None
    else:
        return None,None,None

#Iteratively traversing the list of <tr> tags and cumulating the data into a dataframe
code, bor, nei = getValues(initTag.find_all('td'))
for tag in subtags:
    next_code, next_bor, next_nei = getValues(tag.find_all('td'))
    if(next_bor==None):
        continue
    if(next_code==code):
        nei = nei + ', '+next_nei
    else:
        df = df.append({'PostalCode':code,'Borough':bor, 'Neighborhood':nei}, ignore_index=True)
        code = next_code
        bor = next_bor
        nei = next_nei

df = df.append({'PostalCode':code,'Borough':bor, 'Neighborhood':nei}, ignore_index=True)    
       

In [180]:
#Shape
df.shape


(103, 3)

In [181]:
#This cell is using Geocoding Libraries. It can be skipped if using the location files

# lat_coords = [None]*103
# long_coords = [None]*103

# for index, row in df.iterrows():
#     code = row['PostalCode']
#     latlong = None
#     while(latlong is None):
#         geo = geocoder.google('{}, Toronto, Ontario'.format(code))
#         latlong = geo.latlng
#     lat_coords[index] = latlong[0]
#     long_coords[index] = latlong[1]
#     display(str(index) +' '+ str(lat_coords[index])+ ' '+ str(long_coords[index]))
# df['Latitude'] = lat_coords
# df['Longitude'] = long_coords


In [182]:
df.sort_values(by='PostalCode', ascending=True, inplace=True)
coords = pd.read_csv('TorontoCoords.csv')

df['Latitude'] = coords['Latitude']
df['Longitude'] = coords['Longitude']

display(df.head())

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
6,M1B,Scarborough,"Rouge, Malvern",43.727929,-79.262029
12,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.7942,-79.262029
18,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.778517,-79.346556
22,M1G,Scarborough,Woburn,43.77012,-79.408493
26,M1H,Scarborough,Cedarbrae,43.745906,-79.352188


In [183]:
latlng = None
while(latlng is None):
    center = geocoder.google('Toronto, Ontario')
    latlng = center.latlng

In [184]:
toronto = folium.Map(location=[latlng[0], latlng[1]], zoom_start=10)
for lat, long, label in zip(df['Latitude'], df['Longitude'], df['PostalCode']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat,long], radius=5, fill=True,fill_color='#3186cc', popup=label).add_to(toronto)
toronto

In [185]:
CLIENT_ID = 'FHG3YAB3252T53LFLCZ0BCHM2PUUX4TSGBDZRA2HJGMEGI1I' # your Foursquare ID
CLIENT_SECRET = '2WEEFSQF5RWIT1BJIR0CJZFNTQSSJJNAJAOOBVEDDK2MHXR0' # your Foursquare Secret
VERSION = '20180604'
limit = 100
radius = 300

def getURL(lat, long):
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        lat, 
        long, 
        radius, 
        limit)
    return url
def getItemList(res):
    if(len(res['response']['groups'][0]['items'])==0):
        return None
    return json_normalize(res['response']['groups'][0]['items']).loc[:, ['venue.id', 'venue.name', 'venue.location.lat', 'venue.location.lng', 'venue.categories']]
def getCategoryType(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

df_temp = df[df['PostalCode']=='M6C']
getItemList(requests.get(getURL(df_temp.loc[16,"Latitude"], df_temp.loc[16, 'Longitude'])).json())

In [None]:
def getNearbyVenues(names, lats, longs):
    venues_list= []
    for name, lat, long in zip(names, lats, longs):
        print(name)
        results = requests.get(getURL(lat, long)).json()
        time.sleep(1)
        items = getItemList(results)
        if(items is not None):
            venues_list.append([(name,
                             lat,
                             long,
                             row['venue.name'],
                             row['venue.location.lat'],
                             row['venue.location.lng'],
                             getCategoryType(row))for index, row in items.iterrows()])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postal Code', 
                  'Area Latitude', 
                  'Area Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

venues = getNearbyVenues(df['PostalCode'], df['Latitude'], df['Longitude'])

M1B
M1C
M1E
M1G
M1H
M1J
M1K
M1L
M1M
M1N
M1P
M1R
M1S
M1T
M1V
M1W
M1X
M2H
M2J
M2K
M2L
M2M
M2N
M2P
M2R
M3A
M3B
M3C
M3H
M3J
M3K
M3L
M3M
M3N
M4A
M4B
M4C
M4E
M4G
M4H
M4J
M4K
M4L
M4M
M4N
M4P
M4R
M4S
M4T
M4V
M4W
M4X
M4Y
M5A
M5B
M5C
M5E
M5G
M5H
M5J
M5K
M5L
M5M
M5N
M5P
M5R
M5S
M5T
M5V
M5W
M5X
M6A
M6B
M6C
M6E
M6G
M6H
M6J
M6K
M6L
M6M
M6N
M6P
M6R
M6S
M7A
M7R
M7Y
M8V
M8W


In [None]:
print('There are {} uniques categories.'.format(len(venues['Venue Category'].unique())))
print('\nNumber of venues per Postal Code:')
display(venues.groupby("Postal Code").count())
venues.to_csv("venues.csv")

In [None]:
venueOHE= pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")
venueOHE['PostalCode'] = venues['Postal Code']
columns = [venueOHE.columns[-1]] + list(venueOHE.columns[:-1])
venueOHE = venueOHE[columns]
venueOHE.head()

In [None]:
categories = venueOHE.groupby("PostalCode").sum().reset_index()
categories

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']
# create columns according to number of top venues
columns = ['PostalCode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = categories['PostalCode']

for ind in np.arange(categories.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(categories.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

In [None]:
k_clusters = 5
clustering = categories.drop("PostalCode", 1)

kmeans = KMeans(n_clusters = k_clusters, random_state=0).fit(clustering)

kmeans.labels_[0:10]

In [None]:
torontoData = neighborhoods_venues_sorted
torontoData['Cluster'] = kmeans.labels_
torontoData = torontoData.join(df.set_index("PostalCode"), on="PostalCode")


columns = ['PostalCode', 'Borough','Neighborhood', 'Latitude', 
         'Longitude', 'Cluster', '1st Most Common Venue', '2nd Most Common Venue',
       '3rd Most Common Venue', '4th Most Common Venue',
       '5th Most Common Venue', '6th Most Common Venue',
       '7th Most Common Venue', '8th Most Common Venue',
       '9th Most Common Venue', '10th Most Common Venue' ]
torontoData = torontoData[columns]


display(torontoData.head())

In [None]:

# create map
map_clusters = folium.Map(location=[latlng[0], latlng[1]], zoom_start=11)

# set color scheme for the clusters
x = np.arange(k_clusters)
ys = [i+x+(i*x)**2 for i in range(k_clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, nbrs, cluster in zip(torontoData['Latitude'], torontoData['Longitude'], torontoData['Neighborhood'], torontoData['Cluster']):
    label = folium.Popup(str(nbrs) + ' : Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters