#  Applied Data Science Capstone Project Notebook

## Week 1 Capstone Project

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Week 1 check: print the course greeting.
print("Hello Capstone Project Course")

Hello Capstone Project Course


## Week 3 Segmenting and Clustering

### Part 1 Create the Data Frame

The next lines of code are my attempt at web scraping the postal code data from Wikipedia.

In [3]:
# pd.read_html needs an HTML parser such as lxml (`pip install lxml` if it is missing).

# Collect every table on the page whose text matches 'Borough';
# the result is a list of DataFrames.
df = pd.read_html(
    io="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    match='Borough',
)
len(df)

1

In [4]:
# read_html returned a list of tables; work with the first one.
t_df = df[0]
# Preview the first rows of the scraped table.
t_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Dropping rows that do not have a borough assigned.

In [5]:
# Keep only the rows whose borough is assigned.
# Filtering into a new frame (rather than drop(..., inplace=True)) leaves the
# scraped table in df[0] untouched, so earlier cells still show what they ran on.
# .copy() makes t_df an independent frame, so the column assignment in the next
# step does not trigger a SettingWithCopyWarning. Original index labels are kept.
t_df = t_df.loc[t_df['Borough'] != 'Not assigned'].copy()
t_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Replacing neighbourhoods not assigned with the borough names.

In [6]:
# Where a neighbourhood is 'Not assigned', fall back to that row's borough name;
# every other neighbourhood value is left as it is.
t_df['Neighbourhood'] = t_df['Neighbourhood'].mask(
    t_df['Neighbourhood'] == 'Not assigned',
    t_df['Borough'],
)
t_df

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [7]:
# Sanity check: (rows, columns) of the cleaned postal code table.
t_df.shape

(103, 3)

### Part 2 Get Geo Location Data

Merge the two data frames on postal code so that the latitude and longitude of each postal code are listed.

In [8]:
# Redisplay the cleaned postal code table before merging in the coordinates.
t_df

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [9]:
# Load the CSV of postal codes with their latitude and longitude.
geo_df = pd.read_csv(filepath_or_buffer='http://cocl.us/Geospatial_data')
geo_df

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [10]:
# Attach Latitude/Longitude to each postal code row.
# The default (inner) join keeps only postal codes present in both frames.
t_df2 = t_df.merge(geo_df, on='Postal Code')
t_df2

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


Getting borough names to narrow search in next part of project.

In [11]:
# Distinct borough names, in order of first appearance.
t_df2['Borough'].unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

### Part 3 Exploring the Boroughs of Toronto

Importing (and, where necessary, installing) the libraries needed to map and analyze the data.

In [14]:
# Libraries for geocoding, HTTP requests, colour maps, clustering and mapping.
# folium may need installing first: `pip install folium` if the import fails.
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import requests
from geopy.geocoders import Nominatim
# Image and HTML are both exported by the public IPython.display module.
from IPython.display import HTML, Image
# json_normalize is imported from the pandas top level: the old
# pandas.io.json location is deprecated (since pandas 1.0) and emits a FutureWarning.
from pandas import json_normalize
from sklearn.cluster import KMeans

print('Libraries imported.')

Libraries imported.


In [15]:
address = 'Toronto, Canada'

# Look the address up with the Nominatim geocoder, identifying this client
# through the user_agent string.
geolocater = Nominatim(user_agent="tor_explorer")
location = geolocater.geocode(address)
# geocode() returns None when nothing matches; fail here with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geographical cooridinate of Toronto is {}, {}'.format(latitude, longitude))

The geographical cooridinate of Toronto is 43.6534817, -79.3839347


In [16]:
# Base map centred on the coordinates geocoded for Toronto.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# One small circle marker per row of t_df2, with a "<neighbourhood>, <borough>" popup.
for lat, lng, borough, neighbourhood in zip(
    t_df2['Latitude'],
    t_df2['Longitude'],
    t_df2['Borough'],
    t_df2['Neighbourhood'],
):
    # parse_html=True makes folium treat the label as text rather than markup.
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=3,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

Isolating the borough of Downtown Toronto.

In [17]:
# Restrict the merged table to Downtown Toronto and renumber the rows from 0.
is_downtown = t_df2['Borough'].eq('Downtown Toronto')
toronto_data = t_df2.loc[is_downtown].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [18]:
address='Downtown Toronto'

# Re-geocode for the borough itself; latitude/longitude are overwritten so the
# following maps centre on Downtown Toronto.
geolocater = Nominatim(user_agent="dt_explorer")
location = geolocater.geocode(address)
# geocode() returns None when nothing matches; fail here with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geographical cooridinate of Downtown Toronto is {}, {}'.format(latitude, longitude))


The geographical cooridinate of Downtown Toronto is 43.6541737, -79.38081164513409


In [21]:
# Base map centred on the coordinates geocoded for Downtown Toronto.
map_dttoronto = folium.Map(location=[latitude, longitude], zoom_start=13)

# One circle marker per Downtown Toronto row, with the neighbourhood name as popup.
for lat, lng, label in zip(
    toronto_data['Latitude'],
    toronto_data['Longitude'],
    toronto_data['Neighbourhood'],
):
    # parse_html=True makes folium treat the label as text rather than markup.
    label = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_dttoronto)

map_dttoronto

Now I will use Foursquare to explore the neighbourhoods.

In [26]:
# Foursquare API credentials are read from the environment so that no secret is
# stored in (or shared with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel; a missing variable
# raises KeyError here rather than producing a failed API call later.
# NOTE(review): the key pair that used to be hard-coded in this cell has been
# exposed and should be revoked/regenerated in the Foursquare developer console.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']
VERSION = '20180605'  # sent as the `v` query parameter
LIMIT = 100  # sent as the `limit` query parameter

# Name of the neighbourhood in row 8, the one explored in the following cells.
toronto_data.loc[8, 'Neighbourhood']

'Harbourfront East, Union Station, Toronto Islands'

In [28]:
# Coordinates and name of the neighbourhood to explore (row 8 of toronto_data).
neighbourhood_latitude = toronto_data.loc[8, 'Latitude']
neighbourhood_longitude = toronto_data.loc[8, 'Longitude']
neighbourhood_name = toronto_data.loc[8, 'Neighbourhood']

# Not used by the request below, which sends LIMIT; kept so the name stays defined.
limit = 100

radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, neighbourhood_latitude, neighbourhood_longitude, radius, LIMIT)

# `url` embeds CLIENT_ID and CLIENT_SECRET, so it is deliberately never displayed.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a KeyError when the
# response is unpacked in a later cell.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

{'meta': {'code': 200, 'requestId': '600ef8af512f380ec1adfeab'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Harbourfront',
  'headerFullLocation': 'Harbourfront, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 151,
  'suggestedBounds': {'ne': {'lat': 43.6453157045, 'lng': -79.37554568947675},
   'sw': {'lat': 43.636315695499995, 'lng': -79.38795891052322}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bfaa3494a67c928d08528cf',
       'name': 'Harbourfront',
       'location': {'lat': 43.639525632239106,
        'lng': -79.38068838052389,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.639525632239106,
          'lng': -79.3806

In [33]:
# The venues sit in the 'items' list of the first group of the response.
first_group = results['response']['groups'][0]
items = first_group['items']
# Show the first two entries to inspect their structure.
items[:2]

[{'reasons': {'count': 0,
   'items': [{'summary': 'This spot is popular',
     'type': 'general',
     'reasonName': 'globalInteractionReason'}]},
  'venue': {'id': '4bfaa3494a67c928d08528cf',
   'name': 'Harbourfront',
   'location': {'lat': 43.639525632239106,
    'lng': -79.38068838052389,
    'labeledLatLngs': [{'label': 'display',
      'lat': 43.639525632239106,
      'lng': -79.38068838052389}],
    'distance': 167,
    'cc': 'CA',
    'city': 'Toronto',
    'state': 'ON',
    'country': 'Canada',
    'formattedAddress': ['Toronto ON', 'Canada']},
   'categories': [{'id': '4f2a25ac4b909258e854f55f',
     'name': 'Neighborhood',
     'pluralName': 'Neighborhoods',
     'shortName': 'Neighborhood',
     'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/parks_outdoors/neighborhood_',
      'suffix': '.png'},
     'primary': True}],
   'photos': {'count': 0, 'groups': []}},
  'referralId': 'e-0-4bfaa3494a67c928d08528cf-0'},
 {'reasons': {'count': 0,
   'items': [{'summary'

In [55]:
# Flatten the nested venue records into one row per item with dotted column
# names (e.g. 'venue.location.lat'). pd.json_normalize is the supported entry
# point; the pandas.io.json import path is deprecated and emits a FutureWarning.
df_dt = pd.json_normalize(items)

# Keep the venue name, categories, every location field and the venue id.
filtered_columns = ['venue.name', 'venue.categories'] + [col for col in df_dt.columns if col.startswith('venue.location')] + ['venue.id']
dataframe_filtered = df_dt.loc[:, filtered_columns]

# Shorten each column name to the part after its last dot ('venue.location.lat' -> 'lat').
dataframe_filtered.columns = [col.split('.')[-1] for col in dataframe_filtered.columns]

dataframe_filtered.head(10)

  if __name__ == '__main__':


Unnamed: 0,name,categories,lat,lng,labeledLatLngs,distance,cc,city,state,country,formattedAddress,address,crossStreet,postalCode,neighborhood,id
0,Harbourfront,"[{'id': '4f2a25ac4b909258e854f55f', 'name': 'N...",43.639526,-79.380688,"[{'label': 'display', 'lat': 43.63952563223910...",167,CA,Toronto,ON,Canada,"[Toronto ON, Canada]",,,,,4bfaa3494a67c928d08528cf
1,Roundhouse Park,"[{'id': '4bf58dd8d48988d163941735', 'name': 'P...",43.641745,-79.384279,"[{'label': 'display', 'lat': 43.64174513889102...",228,CA,Toronto,ON,Canada,"[255 Bremner Blvd. (at Lower Simcoe St.), Toro...",255 Bremner Blvd.,at Lower Simcoe St.,M5V 3M9,Entertainment District,4b642db1f964a520b7a22ae3
2,BeaverTails,"[{'id': '4bf58dd8d48988d1d0941735', 'name': 'D...",43.639736,-79.380068,"[{'label': 'display', 'lat': 43.639736, 'lng':...",181,CA,Toronto,ON,Canada,"[145 Queen’s Quay W (York St), Toronto ON M5J ...",145 Queen’s Quay W,York St,M5J 2H4,Harbourfront,55a19437498eeea53fa58b54
3,Lake Ontario,"[{'id': '4bf58dd8d48988d161941735', 'name': 'L...",43.638945,-79.379665,"[{'label': 'display', 'lat': 43.63894493157648...",267,CA,Toronto,ON,Canada,"[Toronto ON, Canada]",,,,,4d07f8041657a35d19272ae7
4,Harbourfront Centre,"[{'id': '4bf58dd8d48988d1f2931735', 'name': 'P...",43.638556,-79.38319,"[{'label': 'display', 'lat': 43.63855578926675...",276,CA,Toronto,ON,Canada,"[235 Queens Quay West (at Lower Simcoe St.), T...",235 Queens Quay West,at Lower Simcoe St.,M5J 2G8,,4ad4c05ef964a52094f620e3
5,Maple Leaf Square,"[{'id': '4bf58dd8d48988d164941735', 'name': 'P...",43.642925,-79.380892,"[{'label': 'display', 'lat': 43.64292522840183...",244,CA,Toronto,ON,Canada,"[15 York St. (Bremner Blvd.), Toronto ON, Canada]",15 York St.,Bremner Blvd.,,,4bdb8c1cc79cc928a77583e9
6,iQ Food Co,"[{'id': '4bf58dd8d48988d1bd941735', 'name': 'S...",43.642851,-79.382081,"[{'label': 'display', 'lat': 43.642851, 'lng':...",228,CA,Toronto,ON,Canada,"[18 York Street (Bremner Ave), Toronto ON M5J ...",18 York Street,Bremner Ave,M5J 0B2,,5346c98a498ed612110d0f60
7,Longo's Maple Leaf Square,"[{'id': '52f2ab2ebcbc57f1066b8b46', 'name': 'S...",43.642517,-79.381393,"[{'label': 'display', 'lat': 43.64251679604069...",191,CA,Toronto,ON,Canada,"[15 York St. (enter on Bremner), Toronto ON M5...",15 York St.,enter on Bremner,M5J 0A3,,4caaabd8f47ea143763b8521
8,Real Sports Apparel,"[{'id': '4bf58dd8d48988d1f2941735', 'name': 'S...",43.64286,-79.380184,"[{'label': 'display', 'lat': 43.64285984835777...",260,CA,Toronto,ON,Canada,"[15 York St. Unit B (Maple Leaf Square), Toron...",15 York St. Unit B,Maple Leaf Square,M5J,,4be46c832468c92887c1fe42
9,Delta Hotels by Marriott Toronto,"[{'id': '4bf58dd8d48988d1fa931735', 'name': 'H...",43.642882,-79.383949,"[{'label': 'display', 'lat': 43.6428819, 'lng'...",290,CA,Toronto,ON,Canada,"[75 Lower Simcoe Street, Toronto ON M5J 3A6, C...",75 Lower Simcoe Street,,M5J 3A6,,53357710498e20817350cfb4


In [58]:
# Map centred on the latitude/longitude geocoded earlier (Downtown Toronto).
venues_map = folium.Map(location=[latitude, longitude], zoom_start=15)

# Yellow marker at the map centre.
folium.CircleMarker(
    [latitude, longitude],
    radius = 10,
    popup = 'Downtown Toronto',
    fill = True,
    color = 'yellow',
    fill_color = 'yellow',
    fill_opacity = 0.6,
    ).add_to(venues_map)

# Blue marker for every venue. Each `categories` cell holds the raw list of
# category dicts from the API response, so the popup shows the first category's
# name instead of the repr of the whole list.
for lat, lng, categories in zip(dataframe_filtered.lat, dataframe_filtered.lng, dataframe_filtered.categories):
    if isinstance(categories, list) and categories:
        category_name = categories[0].get('name', 'Uncategorized')
    else:
        # Missing or empty category list.
        category_name = 'Uncategorized'
    # parse_html=True escapes the text, consistent with the popups on the other maps.
    label = folium.Popup(category_name, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        fill = True,
        color = 'blue',
        fill_color = 'blue',
        fill_opacity = 0.6,
        parse_html=False).add_to(venues_map)
    
venues_map


Map of the venues returned by Foursquare for the postal code area covering the neighbourhoods of Harbourfront East, Union Station and Toronto Islands.