<h1 align=center><font size = 5>Toronto Neighborhoods</font></h1>

In [1]:
import pandas as pd

<a id='item1'></a>

## Web Scrapping Wikipedia

### Data Import

In [2]:
path = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
wiki_df = pd.read_html(path, header=0)[0]

In [4]:
wiki_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Ignore Rows where "Borough" is "Not assigned"

In [5]:
wiki_df2 = wiki_df.loc[wiki_df['Borough']!='Not assigned'].reset_index()

### "Neighborhood" to match "Borough" in Rows where "Neighborhood" is "Not assigned"

In [6]:
wiki_df3 = wiki_df2.copy()

In [7]:
wiki_df3.loc[wiki_df3['Neighborhood'] == 'Not assigned', 'Neighborhood'] = wiki_df3.loc[wiki_df3['Neighborhood'] == 'Not assigned']['Borough']

In [8]:
wiki_df3.head()

Unnamed: 0,index,Postal Code,Borough,Neighborhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [9]:
wiki_df3.shape

(103, 4)

## Getting Geolocation Coordinates

In [10]:
import geocoder # import geocoder

In [11]:
# initialize your variable to None
postal_code = 'M7A'

lat_lng_coords = None

In [12]:
geo_df = pd.read_csv("Geospatial_Coordinates.csv")

In [13]:
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
df = pd.merge(wiki_df3, geo_df, left_on='Postal Code', right_on='Postal Code', how='outer').drop('index', axis=1)

In [15]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [16]:
geo_df.loc[geo_df['Postal Code'] == 'M3A']

Unnamed: 0,Postal Code,Latitude,Longitude
25,M3A,43.753259,-79.329656


In [17]:
df.shape

(103, 5)

## Get Map Data from Foursquare API

#### Access Details

In [18]:
import requests
from pandas import json_normalize

In [33]:
from access_details import CLIENT_ID_
from access_details import CLIENT_SECRET_

In [45]:
CLIENT_ID = CLIENT_ID_
CLIENT_SECRET = CLIENT_SECRET_
VERSION = '20200620'

#### Parameters

In [117]:
radius = 100000
LIMIT = 10000

In [118]:
lat = df.iloc[4,3]
long = df.iloc[4,4]

#### Define URL & Get JSON

In [119]:
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}' \
.format(CLIENT_ID, CLIENT_SECRET, lat, long, VERSION, radius, LIMIT)

In [120]:
results = requests.get(url).json()

#### Transform to Pandas DF

In [122]:
pd.set_option('display.max_columns', None)

In [123]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [124]:
venues = results['response']['groups'][0]['items']
venues = json_normalize(venues)

In [125]:
venues.tail()

Unnamed: 0,referralId,reasons.count,reasons.items,venue.id,venue.name,venue.location.lat,venue.location.lng,venue.location.labeledLatLngs,venue.location.distance,venue.location.cc,venue.location.city,venue.location.state,venue.location.country,venue.location.formattedAddress,venue.categories,venue.photos.count,venue.photos.groups,venue.location.address,venue.location.crossStreet,venue.location.postalCode,venue.venuePage.id,venue.location.neighborhood
95,e-0-4ae33298f964a520909121e3-95,0,"[{'summary': 'This spot is popular', 'type': '...",4ae33298f964a520909121e3,Sorauren Avenue Park,43.648443,-79.443405,"[{'label': 'display', 'lat': 43.64844330662797...",4607,CA,Toronto,ON,Canada,"[289 Sorauren Ave. (at Wabash Ave.), Toronto O...","[{'id': '4bf58dd8d48988d163941735', 'name': 'P...",0,[],289 Sorauren Ave.,at Wabash Ave.,,,
96,e-0-50d4940de4b0d27a203d4b1e-96,0,"[{'summary': 'This spot is popular', 'type': '...",50d4940de4b0d27a203d4b1e,Cabin Fever,43.655442,-79.456639,"[{'label': 'display', 'lat': 43.65544217064788...",5461,CA,Toronto,ON,Canada,"[1669 Bloor St. W, Toronto ON M6P 1A6, Canada]","[{'id': '4bf58dd8d48988d116941735', 'name': 'B...",0,[],1669 Bloor St. W,,M6P 1A6,45319624.0,
97,e-0-572e0547498ed619e59df508-97,0,"[{'summary': 'This spot is popular', 'type': '...",572e0547498ed619e59df508,Hale Coffee,43.665532,-79.449949,"[{'label': 'display', 'lat': 43.66553169801808...",4881,CA,Toronto,ON,Canada,"[300 Campbell St (Dupont St), Toronto ON, Canada]","[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",0,[],300 Campbell St,Dupont St,,,
98,e-0-4b93f74df964a5208f5d34e3-98,0,"[{'summary': 'This spot is popular', 'type': '...",4b93f74df964a5208f5d34e3,Humber River Footbridge,43.631851,-79.471321,"[{'label': 'display', 'lat': 43.63185092304897...",7411,CA,Toronto,ON,Canada,"[Humber River (at Lake Ontario), Toronto ON, C...","[{'id': '4bf58dd8d48988d1df941735', 'name': 'B...",0,[],Humber River,at Lake Ontario,,,
99,e-0-4bcb17a7cc8cd13a0ca5becf-99,0,"[{'summary': 'This spot is popular', 'type': '...",4bcb17a7cc8cd13a0ca5becf,Sunnybrook Park,43.722317,-79.355496,"[{'label': 'display', 'lat': 43.72231713758858...",7219,CA,Toronto,ON,Canada,"[Toronto ON, Canada]","[{'id': '4bf58dd8d48988d162941735', 'name': 'O...",0,[],,,,,


In [126]:
venues['venue.location.postalCode'].isnull().sum()

30

Now we are ready to clean the json and structure it into a *pandas* dataframe.

In [127]:
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng', 'venue.location.postalCode']
venues =venues.loc[:, filtered_columns]

venues_filtered = venues.copy()

# filter the category for each row
venues_filtered['venue.categories'] = venues.apply(get_category_type, axis=1)
venues_filtered['venue.location.postalCode'] = venues['venue.location.postalCode'].str.split(" ", 1).str[1]

# clean columns
venues_filtered.columns = [col.split(".")[-1] for col in venues.columns]

venues_filtered.head()

Unnamed: 0,name,categories,lat,lng,postalCode
0,Downtown Toronto,Neighborhood,43.653232,-79.385296,
1,DanceLifeX Centre,Dance Studio,43.666956,-79.385297,1Z5
2,Seven Lives - Tacos y Mariscos,Mexican Restaurant,43.654418,-79.400545,2L6
3,Art Gallery of Ontario,Art Gallery,43.654003,-79.392922,1G4
4,Alo,French Restaurant,43.648574,-79.396243,2L6


## Groupby categories and One-hot Encode

In [128]:
venues_filtered.groupby(['postalCode']).count()

Unnamed: 0_level_0,name,categories,lat,lng
postalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1A1,1,1,1,1
1A6,2,2,2,2
1A9,1,1,1,1
1B5,1,1,1,1
1C3,1,1,1,1
...,...,...,...,...
3W6,1,1,1,1
3W9,1,1,1,1
3X8,1,1,1,1
3Y9,1,1,1,1


## Join Mapdata table and Neighborhood table

# Clustering

In [41]:
from sklearn.cluster import KMeans

In [42]:
# set number of clusters
kclusters = 5

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(venues)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

ValueError: could not convert string to float: 'Antique Shop'