#  Applied Data Science Capstone: Week 3

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from unicodedata import normalize
import requests
#from bs4 import BeautifulSoup
import html5lib

## Scraping the Table of Canada's Postal Codes

In [2]:
url =  "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M" 
page = requests.get(url)
table = pd.read_html(url)

In [3]:
print(f'Total tables: {len(table)}')

Total tables: 3


## We want the first table

In [4]:
table = table[0]

## Because the values we want as our separate columns are listed within each box of the table, we will flatten the table into a vector and then use slicing to pull out the features we want.

In [5]:
postal = table.values.flatten()

In [6]:
postal = pd.DataFrame(postal)

### The Postal Code is contained in the first three positions of the string. We extract it and give it its own column.

In [7]:
postal.columns = ['source']
postal['Postal_Code'] = postal['source'].str[:3]
postal['source'] = postal['source'].str[3:]

### We remove any unassigned Postal Codes.

In [8]:
postal.drop(postal[postal['source'] == "Not assigned"].index, inplace = True) 

### Cleaning up any extra characters to make for easy feature selection

In [9]:
#postal["source"] = postal.source.str.replace('', ' ')
postal["Neighborhood"] = postal.source.apply(lambda st: st[st.find("(")+1:st.find(")")])
postal.head()

Unnamed: 0,source,Postal_Code,Neighborhood
2,North York(Parkwoods),M3A,Parkwoods
3,North York(Victoria Village),M4A,Victoria Village
4,Downtown Toronto(Regent Park / Harbourfront),M5A,Regent Park / Harbourfront
5,North York(Lawrence Manor / Lawrence Heights),M6A,Lawrence Manor / Lawrence Heights
6,Queen's Park(Ontario Provincial Government),M7A,Ontario Provincial Government


### Splitting the "source" column into multiple columns, using space as a delimiter

In [10]:
new = postal.source.str.split(pat = "(", expand = True)
new.head()

Unnamed: 0,0,1,2
2,North York,Parkwoods),
3,North York,Victoria Village),
4,Downtown Toronto,Regent Park / Harbourfront),
5,North York,Lawrence Manor / Lawrence Heights),
6,Queen's Park,Ontario Provincial Government),


### Borough is the first column. Then, we concatenate the other columns to form the Neighborhoods column and remove all the "nones"

In [11]:
postal["Borough"] = new.iloc[:,0]
new = postal.Neighborhood.str.split(pat = "/", expand = True)
new.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
2,Parkwoods,,,,,,,,
3,Victoria Village,,,,,,,,
4,Regent Park,Harbourfront,,,,,,,
5,Lawrence Manor,Lawrence Heights,,,,,,,
6,Ontario Provincial Government,,,,,,,,


In [12]:
postal["Neighborhood"] = new.iloc[:, 0:8].apply(lambda row:', '.join(row.values.astype(str)), axis = 1)

In [13]:
postal["Neighborhood"] = postal.Neighborhood.str.replace(', None', '')

### Here are our final dataframe and its shape

In [14]:
postal = postal.drop(['source'], axis = 1)
postal.head()

Unnamed: 0,Postal_Code,Neighborhood,Borough
2,M3A,Parkwoods,North York
3,M4A,Victoria Village,North York
4,M5A,"Regent Park , Harbourfront",Downtown Toronto
5,M6A,"Lawrence Manor , Lawrence Heights",North York
6,M7A,Ontario Provincial Government,Queen's Park


In [15]:
postal.shape

(103, 3)

## Geocoder didn't work, so I'm loading the values from CSV

In [16]:
#!pip install geocoder
#import geocoder # import geocoder

# initialize your variable to None
#lat_lng_coords = None

# loop until you get the coordinates
#while(lat_lng_coords is None):
  #g = geocoder.google('{}, Toronto, Ontario'.format(postal.Postal_Code))
  #lat_lng_coords = g.latlng

#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

In [17]:
data = pd.read_csv("Geospatial_Coordinates.csv") 
data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
data.columns = ['Postal_Code', 'Latitude', 'Longitude']
merged = pd.merge(postal, data)
merged.head()

Unnamed: 0,Postal_Code,Neighborhood,Borough,Latitude,Longitude
0,M3A,Parkwoods,North York,43.753259,-79.329656
1,M4A,Victoria Village,North York,43.725882,-79.315572
2,M5A,"Regent Park , Harbourfront",Downtown Toronto,43.65426,-79.360636
3,M6A,"Lawrence Manor , Lawrence Heights",North York,43.718518,-79.464763
4,M7A,Ontario Provincial Government,Queen's Park,43.662301,-79.389494


## Clustering Section

In [19]:
import numpy as np
from sklearn.cluster import KMeans

### Select the Boroughs that contain "Toronto"

In [20]:
Toronto = merged[merged['Borough'].str.contains('Toronto')].reset_index(drop = True)
Toronto.head()

Unnamed: 0,Postal_Code,Neighborhood,Borough,Latitude,Longitude
0,M5A,"Regent Park , Harbourfront",Downtown Toronto,43.65426,-79.360636
1,M5B,"Garden District, Ryerson",Downtown Toronto,43.657162,-79.378937
2,M5C,St. James Town,Downtown Toronto,43.651494,-79.375418
3,M4E,The Beaches,East Toronto,43.676357,-79.293031
4,M5E,Berczy Park,Downtown Toronto,43.644771,-79.373306


### Importing Libraries to make the Folium map

In [21]:
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 

import matplotlib.cm as cm
import matplotlib.colors as colors

#!conda install -c conda-forge folium=0.5.0 --yes 
import folium 


In [22]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


## Using the Foursquare API to gather venues in Toronto

In [23]:
CLIENT_ID = '4OQLMH5TFBS0FUP2G4KE0XNJXFCZYYCHM1UJRNHHQC4VRS33' # your Foursquare ID
CLIENT_SECRET = 'KZQDU1KS5GYGQOCRTTOHA35XBGJVPDOCUIP4AMVUBNUHUPGO' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 500 # A default Foursquare API limit value


print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 4OQLMH5TFBS0FUP2G4KE0XNJXFCZYYCHM1UJRNHHQC4VRS33
CLIENT_SECRET:KZQDU1KS5GYGQOCRTTOHA35XBGJVPDOCUIP4AMVUBNUHUPGO


In [24]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [25]:
toronto_venues = getNearbyVenues(names = Toronto['Neighborhood'],
                                  latitudes = Toronto['Latitude'],
                                  longitudes = Toronto['Longitude'])

Regent Park ,  Harbourfront
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond ,  Adelaide ,  King
Dufferin ,  Dovercourt Village
The Danforth East
Harbourfront East ,  Union Station ,  Toronto Islands
Little Portugal ,  Trinity
The Danforth West ,  Riverdale
Toronto Dominion Centre ,  Design Exchange
Brockton ,  Parkdale Village ,  Exhibition Place
India Bazaar ,  The Beaches West
Commerce Court ,  Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
High Park ,  The Junction South
North Toronto West
The Annex ,  North Midtown ,  Yorkville
Parkdale ,  Roncesvalles
Davisville
University of Toronto ,  Harbord
Runnymede ,  Swansea
Moore Park ,  Summerhill East
Kensington Market ,  Chinatown ,  Grange Park
Summerhill West ,  Rathnelly ,  South Hill ,  Forest Hill SE ,  Deer Park
CN Tower ,  King and Spadina ,  Railway Lands ,  Harbourfront West ,  Bathurst Quay ,  South Niagara ,  Island airport
R

In [26]:
print(toronto_venues.shape)
toronto_venues.groupby('Neighborhood').count()
toronto_data = toronto_venues.iloc[:,0:3].groupby('Neighborhood').mean().reset_index()
toronto_data.columns = ['Neighborhood', 'Latitude', 'Longitude']
toronto_venues.head()

(1586, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park , Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park , Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park , Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park , Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park , Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


## One Hot Encoding and Grouping venues by Neighborhood

In [27]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
first_col = toronto_onehot.pop('Neighborhood')
toronto_onehot.insert(0, 'Neighborhood', first_col)

toronto_onehot.head()

Unnamed: 0,Neighborhood,Adult Boutique,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,"Regent Park , Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park , Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park , Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park , Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park , Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Adult Boutique,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.016949,0.0,0.0,0.0,0.0,0.0
1,"Brockton , Parkdale Village , Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower , King and Spadina , Railway Lands ...",0.0,0.0625,0.0625,0.125,0.125,0.125,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.016667,0.0,0.0,0.016667,0.0,0.016667
4,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=7).fit(toronto_grouped_clustering)


In [30]:
toronto_grouped.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data.join(toronto_grouped.set_index('Neighborhood'), on = "Neighborhood")
toronto_merged.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,Adult Boutique,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Berczy Park,43.644771,-79.373306,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.016949,0.0,0.0,0.0,0.0,0.0
1,"Brockton , Parkdale Village , Exhibition Place",43.636847,-79.428191,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower , King and Spadina , Railway Lands ...",43.628947,-79.39442,0,0.0,0.0625,0.0625,0.125,0.125,0.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,43.657952,-79.387383,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.016667,0.0,0.0,0.016667,0.0,0.016667
4,Christie,43.669542,-79.422564,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Creating the Folium Map

In [31]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters