# Toronto segmentation and clustering neighbourhoods

### Week 3 assignment of the Applied Data Science Capstone 

Importing some libraries and dependencies

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.9.1
  latest version: 4.9.2

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be UPDATED:

  ca-certificates    anaconda::ca-certificates-2020.10.14-0 --> conda-forge::ca-certificates-2020.11.8-ha878542_0
  certifi                anaconda::certifi-2020.6.20-py36_0 --> conda-forge::certifi-2020.11.8-py36h5fab9bb_0

The following packages will be SUPERSEDED by a higher-priority channel:

  openssl               anaconda::openssl-1.1.1h-h7b6447c_0 --> conda-forge::openssl-1.1.1h-h516909a_0


Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.9.1
  latest version: 4.9.2

P

## Part 1: Creating the data frame
Scraping the Wikipedia page to a data frame in pandas

In [2]:
!conda install -c anaconda lxml --yes
print("ok")

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.9.1
  latest version: 4.9.2

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - lxml


The following packages will be SUPERSEDED by a higher-priority channel:

  ca-certificates    conda-forge::ca-certificates-2020.11.~ --> anaconda::ca-certificates-2020.10.14-0
  certifi            conda-forge::certifi-2020.11.8-py36h5~ --> anaconda::certifi-2020.6.20-py36_0
  openssl            conda-forge::openssl-1.1.1h-h516909a_0 --> anaconda::openssl-1.1.1h-h7b6447c_0


Preparing transaction: done
Verifying transaction: done
Executing transaction: done
ok


In [3]:
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Dropping the rows that does not have a borough assigned

In [4]:
df.drop(df.loc[df['Borough']=='Not assigned'].index, inplace=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Checking duplicated values of Postal Code

In [5]:
df_dup = df['Postal Code'].duplicated()
if df_dup is True: print(df_dup)
else: print("Not duplicated columns")

Not duplicated columns


Checking for any 'Not assigned' values in neighborhood

In [6]:
df_not_neigh = df['Neighbourhood']=='Not assigned'
if df_not_neigh is True: print(df_not_neigh)
else: print("All neighbourhood values are valid")

All neighbourhood values are valid


In [31]:
df.shape

(103, 3)

## Part 2: Getting the latitude and the longitude coordinates of each neighborhood

In [7]:
#Using the csv file to get the location
df_lat_lon = pd.read_csv("https://cocl.us/Geospatial_data")

In [8]:
print("The shape of the csv file is ",df_lat_lon.shape)
df_lat_lon.head()

The shape of the csv file is  (103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Concatenating both dfs

In [9]:
new_df = pd.merge(df, df_lat_lon, on="Postal Code")
print("Shape of the new df is", new_df.shape)
new_df.head()

Shape of the new df is (103, 5)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Part 3: Exploring and clustering the neighborhoods in Toronto

Partitioning the df

In [19]:
df_toronto = new_df[new_df['Borough'].str.contains('Toronto')] 
df_toronto.reset_index(drop=True, inplace=True)
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


Coordinates of Toronto to visualize its boroughs

In [32]:
lat_toronto =  43.651070
lon_toronto = -79.347015

# creating the map
map_toronto = folium.Map(location=[lat_toronto, lon_toronto], zoom_start=12)

# add markers to map
for lat, lon, borough, neighborhood, pc in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighbourhood'], df_toronto['Postal Code']):
    label = '{} - {}; {}'.format( pc, neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Utilizing the Foursquare API to explore the 5th neighborhood in 'df_toronto' and segment it

In [44]:
CLIENT_ID = 'DH4TQH3EWR0BHBFBB2YJFBTH4CEOM3N5HN2QZM53ISAU0OML' # your Foursquare ID
CLIENT_SECRET = 'W4HAAL2RMJ1SSXKIX43Y0TNSUSOYAZ5UZTTIEVKXQRBQST1Y' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

#getting the 5th neighbourhood
neighborhood_latitude = df_toronto.loc[4, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_toronto.loc[4, 'Longitude'] # neighborhood longitude value
print ("The 5th neighbourhood in 'df_toronto' is:", df_toronto.loc[4, 'Neighbourhood'], ", its latitude is:",neighborhood_latitude," and its longitude is:",neighborhood_longitude )

The 5th neighbourhood in 'df_toronto' is: The Beaches , its latitude is: 43.67635739999999  and its longitude is: -79.2930312


Getting the top 10 venues that are in The Beaches within a radius of 500 meters

In [45]:
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    500, 
    10)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=DH4TQH3EWR0BHBFBB2YJFBTH4CEOM3N5HN2QZM53ISAU0OML&client_secret=W4HAAL2RMJ1SSXKIX43Y0TNSUSOYAZ5UZTTIEVKXQRBQST1Y&v=20180605&ll=43.67635739999999,-79.2930312&radius=500&limit=10'

Sending the GET request and examining the resutls

In [46]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5fc5310d0e79470f4e8503ad'},
 'response': {'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.67682094413784,
          'lng': -79.29394208780985}],
        'distanc

In [47]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Cleaning the data in JSON file

In [51]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues


Only 4 venues were returned by Foursquare.


  This is separate from the ipykernel package so we can avoid doing imports until


In [52]:
print('Only {} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

Only 4 venues were returned by Foursquare.


#### Exploring all Neighbourhoods in Toronto