# Segmenting and Clustering Neighborhoods in Toronto
###  Visualising Neighborhoods

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import requests
import json
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

!pip install folium
import folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 8.0MB/s eta 0:00:011
[?25hCollecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1


In [5]:
from geopy.geocoders import Nominatim

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### First we have to build the Neighbourhood dataframe as we did in Task 1 and 2

In [3]:
# Retrieve neighbourhood data from wikipedia page
nb_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
nb_html = requests.get(nb_url).text
nb_soup = BeautifulSoup(nb_html, 'html.parser')

nb_data = []
for tr in nb_soup.tbody.find_all('tr'):
    nb_data.append([ td.get_text().strip() for td in tr.find_all('td')])

# Read data into a dataframe
nb_df = pd.DataFrame(nb_data, columns=['PostalCode','Borough','Neighbourhood'])

# Find rows of Boroughs that have "Not assigned"
NA_indx = nb_df[(nb_df['Borough'] == "Not assigned")].index

# Drop the unnecessary first row -- that marked as None
nb_df.dropna(inplace=True)

# Drop all rows of Boroughs that have "Not assigned"
nb_df.drop(NA_indx, inplace=True)

# Merge duplicate rows based on PostalCode and Borough
nb_df = nb_df.groupby(['PostalCode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()

# Fix Not assigned Neighbourhoods with the Borough Names
def fix_NB(data):
    if data['Neighbourhood'] == 'Not assigned':
        x = data['Borough']
    else:
        x = data['Neighbourhood']
    return x

nb_df['Neighborhood'] = nb_df.apply(fix_NB, axis='columns')

# Check whether the fix has worked
print("Not assigned Neighborhood count = {}".format(len(nb_df[nb_df['Neighborhood']=='Not assigned'])))

# We dont need the old Neighbourhood column anymore
nb_df.drop(columns='Neighbourhood', inplace=True)

# Reading geo data from sv to dataframe
ll_df = pd.read_csv('http://cocl.us/Geospatial_data')

# Rename Postal Code as PostalCode -- make equal the 2 column names
ll_df.rename(columns={"Postal Code": "PostalCode"}, inplace=True)

# Merging the 2 dataframes
nbll_df = pd.merge(nb_df, ll_df, on='PostalCode', how='outer')
nbll_df.head()

Not assigned Neighborhood count = 0


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Now we can play with these data and generate maps

In [6]:
# Get lat/long data for Toronto

address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="Tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The lat/longs of Toronto are {}, {}.'.format(latitude, longitude))

The lat/longs of Toronto are 43.6534817, -79.3839347.


In [7]:
# create map of Toronto using latitude and longitude values
TN_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to the map
for lat, lng, borough, neighborhood in zip(nbll_df['Latitude'], nbll_df['Longitude'], nbll_df['Borough'], nbll_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(TN_map)  
    
TN_map

In [8]:
CLIENT_ID = 'JC204Q4EIQ3VEXQJN1UNDFPRGVCZMUN5MV4SZJPBNWBLMJOX0' # your Foursquare ID
CLIENT_SECRET = 'KZVQJQ1MCKRRFO1UVRJPGZGUBPSZZGLPJMCJUBIKAPJ4RCMXK' # your Foursquare Secret
VERSION = '20190516' # Foursquare API version

In [9]:
# Get the lat/long data of the first row of the dataframe

nh1_lat = nbll_df.loc[0, 'Latitude']
nh1_long = nbll_df.loc[0, 'Longitude']
nh1_name = nbll_df.loc[0, 'Neighborhood']

print('Lat/long values of {} neighbourhood are {}, {}.'.format(nh1_name, nh1_lat, nh1_long))

Lat/long values of Malvern / Rouge neighbourhood are 43.806686299999996, -79.19435340000001.


In [10]:
# Build the foursquare URL to get venues for the above neighbourhood

LIMIT = 100
RADIUS = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    nh1_lat, 
    nh1_long, 
    RADIUS, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=JC204Q4EIQ3VEXQJN1UNDFPRGVCZMUN5MV4SZJPBNWBLMJO0&client_secret=KZVQJQ1MCKRRFO1UVRJPGZGUBPSZZGLPJMCJUBIKAPJ4RCMK&v=20190516&ll=43.806686299999996,-79.19435340000001&radius=500&limit=100'

In [11]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ea5d55778a484001b756801'},
  'headerLocation': 'Malvern',
  'headerFullLocation': 'Malvern, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 1,
  'suggestedBounds': {'ne': {'lat': 43.8111863045, 'lng': -79.18812958073042},
   'sw': {'lat': 43.80218629549999, 'lng': -79.2005772192696}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bb6b9446edc76b0d771311c',
       'name': 'Wendy’s',
       'location': {'crossStreet': 'Morningside & Sheppard',
        'lat': 43.80744841934756,
        'lng': -79.19905558052072,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.80744841934756,
          'lng': -79.19905558052072}],
        'distance': 387,
        'cc': 'CA',
        'city': 'Toronto',
    

In [12]:
# function that extracts the category of the venue

def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [13]:
# Get nearby venues and display

venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Wendy’s,Fast Food Restaurant,43.807448,-79.199056


In [14]:
# Get nearby venues for all Toronto neighbourhoods

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            RADIUS, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [15]:
Tor_venues = getNearbyVenues(names=nbll_df['Neighborhood'], latitudes = nbll_df['Latitude'], longitudes = nbll_df['Longitude'])
Tor_venues.head()

Unnamed: 0,Neighbourhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Malvern / Rouge,43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497,Scarborough Historical Society,43.788755,-79.162438,History Museum
3,Guildwood / Morningside / West Hill,43.763573,-79.188711,RBC Royal Bank,43.76679,-79.191151,Bank
4,Guildwood / Morningside / West Hill,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


In [16]:
Tor_venues.shape

(2127, 7)

In [17]:
Tor_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,5,5,5,5,5,5
Alderwood / Long Branch,8,8,8,8,8,8
Bathurst Manor / Wilson Heights / Downsview North,20,20,20,20,20,20
Bayview Village,4,4,4,4,4,4
Bedford Park / Lawrence Manor East,23,23,23,23,23,23
Berczy Park,58,58,58,58,58,58
Birch Cliff / Cliffside West,4,4,4,4,4,4
Brockton / Parkdale Village / Exhibition Place,23,23,23,23,23,23
Business reply mail Processing CentrE,15,15,15,15,15,15
CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst Quay / South Niagara / Island airport,14,14,14,14,14,14
