# Segmenting and Clustering Neighborhoods in Toronto

### Data frame preparation

In [1]:
#import libraries

import requests
import lxml.html as lh
import pandas as pd
import numpy as np

In [2]:
# Download every HTML table found on the Wikipedia list of "M" postal codes

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
dfs = pd.read_html(io=url)

# The page yields 3 tables; the postal-code table is the first of them
df = dfs[0]

In [3]:
# Clean the data
# Rule 1: keep only rows with an assigned borough. Rows whose borough is
# 'Not assigned' are removed in place and the index is renumbered from 0.
unassigned_borough = df.index[df['Borough'] == 'Not assigned']
df.drop(index=unassigned_borough, inplace=True)
df.reset_index(drop=True, inplace=True)

# Rule 2: a row that has a borough but a 'Not assigned' neighbourhood takes
# the borough name as its neighbourhood; every other row is left unchanged.
df['Neighbourhood'] = df['Neighbourhood'].where(df['Neighbourhood'] != 'Not assigned', df['Borough'])


In [4]:
# Check the structure: preview the first rows of the cleaned table
df.head()


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
# Check the shape: (number of rows, number of columns) of the cleaned table
df.shape

(103, 3)

### Geocoder data collection 

[Since 2018, the Google geocoder no longer works for free or without an API key](https://cloud.google.com/maps-platform/user-guide/?_ga=2.124905161.212364110.1538961567-1131107758.1527643759), so this notebook uses OpenCage instead.



In [6]:
#install needed package
!pip install opencage

Collecting opencage
  Downloading opencage-1.2.2-py3-none-any.whl (6.1 kB)
Collecting backoff>=1.10.0
  Downloading backoff-1.10.0-py2.py3-none-any.whl (31 kB)
Installing collected packages: backoff, opencage
Successfully installed backoff-1.10.0 opencage-1.2.2


In [7]:
from opencage.geocoder import OpenCageGeocode
import time

The next cell is hidden because it contains secret keys and API settings:

key: OpenCage key

CLIENT_ID: Foursquare ID

CLIENT_SECRET: Foursquare Secret

VERSION: Foursquare API version

LIMIT: default Foursquare API limit value

In [8]:
# The code was removed by Watson Studio for sharing.

In [9]:
# Add coordinates: geocode every postal code and store the first match
geocoder = OpenCageGeocode(key)

latitudes = []
longitudes = []
for postal_code in df['Postal Code']:
    query = '{}, Toronto, Ontario'.format(postal_code)
    results = geocoder.geocode(query)
    if not results:
        # Fail with the offending query instead of an opaque indexing error
        raise ValueError('OpenCage returned no result for {!r}'.format(query))
    geometry = results[0]['geometry']
    latitudes.append(geometry['lat'])
    longitudes.append(geometry['lng'])
    time.sleep(1) #no more than 1 request per second

# Assign both columns in one step rather than cell by cell inside the loop
df['Latitude'] = latitudes
df['Longitude'] = longitudes

# Sanity check: distinct postal codes should not share the exact same point.
# When they do, the geocoder most likely fell back to a coarser match (for
# example the city centre), so those rows should be reviewed before use.
shared_point = df.duplicated(subset=['Latitude', 'Longitude'], keep=False)
if shared_point.any():
    print('Warning: {} postal codes share identical coordinates: {}'.format(
        int(shared_point.sum()),
        ', '.join(df.loc[shared_point, 'Postal Code'].astype(str))))




In [10]:
# Preview the first rows to confirm the Latitude/Longitude columns were filled in
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.653482,-79.383935
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.653482,-79.383935


### Toronto area exploration 

In [11]:
!pip install folium
import folium

address = 'Toronto, Canada'

results = geocoder.geocode('Toronto, Ontario')
tor_latitude = results[0]['geometry']['lat']
tor_longitude = results[0]['geometry']['lng']
print('The geograpical coordinates of Toronto are {}, {}.'.format(tor_latitude, tor_longitude))

Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 5.5 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1
The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


### Create a map of areas

In [12]:

# Base map centred on the Toronto coordinates looked up earlier
map_toronto = folium.Map(location=[tor_latitude, tor_longitude], zoom_start=11)

# One blue circle marker per row of df, with a "(neighbourhood), borough" popup
marker_rows = df[['Latitude', 'Longitude', 'Borough', 'Neighbourhood']].itertuples(index=False, name=None)
for lat, lng, borough, neighborhood in marker_rows:
    popup = folium.Popup('({}), {}'.format(neighborhood, borough), parse_html=True)
    # NOTE(review): parse_html looks like a Popup option; confirm that
    # CircleMarker accepts (or ignores) it before relying on it here.
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Last expression of the cell: render the map
map_toronto

#### Getting venue data for each area

In [13]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=10):
    """Collect the Foursquare venues found around each given location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label and coordinates of each location to explore. They are consumed
        in lockstep with ``zip``, so extra items in a longer one are ignored.
    radius : int, default 500
        Search radius sent to the API as the ``radius`` parameter.
    timeout : float, default 10
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty, but still has these columns, when no venue
        is found. 'Venue Category' is None for a venue without categories.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.

    Notes
    -----
    Reads CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT from the notebook's
    global namespace.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; surface HTTP errors instead of failing later
        # on a missing key in the error payload
        response = requests.get(endpoint, params=params, timeout=timeout)
        response.raise_for_status()
        items = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # A venue may come without any category; do not crash on it
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing the column names to the constructor keeps an empty result valid
    return pd.DataFrame(rows, columns=columns)

In [14]:
# Look up nearby venues for every row of df, using getNearbyVenues' default
# radius; the neighbourhood name is used as the label of each location
toronto_venues = getNearbyVenues(names=df['Neighbourhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

#### Some venues have the category 'Neighborhood'; we don't need them, so we remove them

In [15]:
# Filter out, in place, the rows whose venue category is 'Neighborhood'

neighborhood_rows = toronto_venues.index[toronto_venues['Venue Category'] == 'Neighborhood']
toronto_venues.drop(index=neighborhood_rows, inplace=True)

# (rows, columns) left after the filter
toronto_venues.shape

(2517, 7)

In [16]:
# Preview the remaining venues; the index keeps its original labels because
# the preceding drop did not reset it
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
1,Parkwoods,43.653482,-79.383935,Nathan Phillips Square,43.65227,-79.383516,Plaza
2,Parkwoods,43.653482,-79.383935,Japango,43.655268,-79.385165,Sushi Restaurant
3,Parkwoods,43.653482,-79.383935,Poke Guys,43.654895,-79.385052,Poke Place
4,Parkwoods,43.653482,-79.383935,Textile Museum of Canada,43.654396,-79.3865,Art Museum
5,Parkwoods,43.653482,-79.383935,Chatime 日出茶太,43.655542,-79.384684,Bubble Tea Shop
