#### This project will explore, segment, and cluster the neighborhoods in the city of Toronto

##### Importing required libraries

In [12]:
import pandas as pd
import numpy as np
import requests
import time
import geocoder
from bs4 import BeautifulSoup
import ssl
import json
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

ssl._create_default_https_context = ssl._create_unverified_context

##### Getting the contents of the website with 'requests' library

In [75]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url)
response

<Response [200]>

##### Scraping the contents from html using BeautifulSoup and building an initial Pandas dataframe

In [78]:
# instantiating the soup object with response text, and html.parser option
soup = BeautifulSoup(response.text, "html.parser")

# parsing the table part of the response by looking at wikitable sortable class-type
postal_table = soup.find(class_="wikitable sortable")

# building the initial dataframe from table's contents 
table_rows = postal_table.find_all('tr')
row_values = []
for tr in table_rows:
    td = tr.find_all('td')
    row_text = [tr.text.strip() for tr in td if tr.text.strip()]
    if row_text:
        row_values.append(row_text)

toronto_df = pd.DataFrame(row_values, columns=["PostalCode", "Borough", "Neighborhood"])
# toronto_df.head(10)
toronto_df.shape


(288, 3)

##### Cleaning the dataframe (dropping, combining, and truncating multiple cells)

In [80]:
# ignoring cells with a Borough that is Not assigned.
borough_df = toronto_df[toronto_df.Borough != 'Not assigned']

# replacing 'Not assigned' neighborhood value with the corresponding Borough value
borough_df['Neighborhood'].replace('Not assigned', "Queen's Park", inplace=True)


# combining neighborhoods with the same PostalCode into single row 
combined_df = borough_df.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index(drop=False)

#combined_df.head(10)
combined_df.shape


(103, 3)

##### Getting the latitude and longitude coordinates of each neighborhood to utilize the Foursquare location data

In [81]:
# geocoder seems to be very unreliable, so we can use the link to the csv file to get latitude and longitude
url = 'http://cocl.us/Geospatial_data'
lat_long_df = pd.read_csv(url)

# since the latitude-longitude table has the same data ordered as of combined_df, we can just create a new dataframe
# with the required columns
detailed_df = pd.DataFrame({'PostalCode':combined_df['PostalCode'], 
                            'Borough':combined_df['Borough'], 
                            'Neighborhood':combined_df['Neighborhood'], 
                            'Latitude':lat_long_df['Latitude'], 
                            'Longitude':lat_long_df['Longitude']})


# detailed_df.head(20)
detailed_df.shape


(103, 5)

##### Cluster analysis of the neighborhoods in Toronto 

Toronto neighborhood has a total of 11 boroughs and 211 neighborhoods. It will become a tedious task to analyse all 11 boroughs, so we will work on boroughs that has the word 'Toronto' in them. There are totally 4 boroughs that has 'toronto' in their name: "Downtown Toronto", "Central Toronto", "West Toronto", and "East Toronto".

In [17]:
# Analysing Downtown Toronto borough
detailed_df[detailed_df.Borough == 'Downtown Toronto']

Unnamed: 0,Borough,Latitude,Longitude,Neighborhood,PostalCode
50,Downtown Toronto,43.679563,-79.377529,Rosedale,M4W
51,Downtown Toronto,43.667967,-79.367675,"Cabbagetown, St. James Town",M4X
52,Downtown Toronto,43.66586,-79.38316,Church and Wellesley,M4Y
53,Downtown Toronto,43.65426,-79.360636,"Harbourfront, Regent Park",M5A
54,Downtown Toronto,43.657162,-79.378937,"Ryerson, Garden District",M5B
55,Downtown Toronto,43.651494,-79.375418,St. James Town,M5C
56,Downtown Toronto,43.644771,-79.373306,Berczy Park,M5E
57,Downtown Toronto,43.657952,-79.387383,Central Bay Street,M5G
58,Downtown Toronto,43.650571,-79.384568,"Adelaide, King, Richmond",M5H
59,Downtown Toronto,43.640816,-79.381752,"Harbourfront East, Toronto Islands, Union Station",M5J


Downtown Toronto has 18 different postal codes and around 34 neighborhoods on the whole

In [18]:
# Analysing Central Toronto borough
detailed_df[detailed_df.Borough == 'Central Toronto'].count()


Borough         9
Latitude        9
Longitude       9
Neighborhood    9
PostalCode      9
dtype: int64

Central Toronto has 9 different postal codes and around 16 neighborhoods

In [19]:
# Analysing West Toronto borough
detailed_df[detailed_df.Borough == 'West Toronto'].count()

Borough         6
Latitude        6
Longitude       6
Neighborhood    6
PostalCode      6
dtype: int64

West Toronto has 6 different postal codes and around 13 neighborhoods

In [20]:
# Analysing East Toronto borough
detailed_df[detailed_df.Borough == 'East Toronto'].count()

Borough         5
Latitude        5
Longitude       5
Neighborhood    5
PostalCode      5
dtype: int64

East Toronto has 5 different postal codes and around 7 neighborhoods

In [21]:
# Creating a new dataframe for cluster analysis of 'Toronto' Boroughs
d_t = detailed_df[detailed_df['Borough'] == 'Downtown Toronto']
c_t = detailed_df[detailed_df['Borough'] == 'Central Toronto']
w_t = detailed_df[detailed_df['Borough'] == 'West Toronto']
e_t = detailed_df[detailed_df['Borough'] == 'East Toronto']

toronto_cluster = pd.concat([d_t, c_t, w_t, e_t], sort=False)

toronto_cluster.head()
# toronto_cluster.shape

Unnamed: 0,Borough,Latitude,Longitude,Neighborhood,PostalCode
50,Downtown Toronto,43.679563,-79.377529,Rosedale,M4W
51,Downtown Toronto,43.667967,-79.367675,"Cabbagetown, St. James Town",M4X
52,Downtown Toronto,43.66586,-79.38316,Church and Wellesley,M4Y
53,Downtown Toronto,43.65426,-79.360636,"Harbourfront, Regent Park",M5A
54,Downtown Toronto,43.657162,-79.378937,"Ryerson, Garden District",M5B


In [22]:
# Using geopy to get the latitude and longitude values of Toronto

address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Coordinates of Toronto are {}, {}. '.format(latitude, longitude))

Coordinates of Toronto are 43.653963, -79.387207. 


##### Creating a map of Toronto with all its neighborhoods superimposed on Top (filtered by Borough that has the word 'toronto')

In [23]:
# creating a map of toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# adding markers
for lat, lng, borough, neighborhood in zip(toronto_cluster['Latitude'], toronto_cluster['Longitude'], toronto_cluster['Borough'], toronto_cluster['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
        parse_html=False).add_to(map_toronto)
    
map_toronto

The above map shows 4 different Boroughs ("Downtown Toronto", "Central Toronto", "West Toronto", and "East Toronto") with it's associated neighborhoods.

##### Creating map of Downtown Toronto and its neighborhoods

Let us dig deep into the Downtown Toronto borough for further analysis. So we'll slice the dataframe to create a new dataframe only with Downtown Toronto data.

In [24]:
downtown_toronto = toronto_cluster[toronto_cluster['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
downtown_toronto

Unnamed: 0,Borough,Latitude,Longitude,Neighborhood,PostalCode
0,Downtown Toronto,43.679563,-79.377529,Rosedale,M4W
1,Downtown Toronto,43.667967,-79.367675,"Cabbagetown, St. James Town",M4X
2,Downtown Toronto,43.66586,-79.38316,Church and Wellesley,M4Y
3,Downtown Toronto,43.65426,-79.360636,"Harbourfront, Regent Park",M5A
4,Downtown Toronto,43.657162,-79.378937,"Ryerson, Garden District",M5B
5,Downtown Toronto,43.651494,-79.375418,St. James Town,M5C
6,Downtown Toronto,43.644771,-79.373306,Berczy Park,M5E
7,Downtown Toronto,43.657952,-79.387383,Central Bay Street,M5G
8,Downtown Toronto,43.650571,-79.384568,"Adelaide, King, Richmond",M5H
9,Downtown Toronto,43.640816,-79.381752,"Harbourfront East, Toronto Islands, Union Station",M5J


In [25]:
# Geographical coordinates of Downtown Toronto

address = 'Downtown Toronto, Toronto'
geolocator = Nominatim(user_agent='toronto_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The coordinates of {} are {}, {}.'.format(address, latitude, longitude))

The coordinates of Downtown Toronto, Toronto are 43.6541737, -79.3808116451.


In [33]:
# Visualizing the neighborhoods of Downtown Toronto
downtown_neighborhood = folium.Map(location=[latitude, longitude], zoom_start=12)

# adding markers
for lat, long, label in zip(downtown_toronto['Latitude'], downtown_toronto['Longitude'], downtown_toronto['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.2,
        parse_html=False).add_to(downtown_neighborhood)
    
downtown_neighborhood

##### Exploring Downtown Toronto's first neighborhood using FourSquare APIs

In [39]:
# foursquare credentials
CLIENT_ID = 'PEQPJEZXPG4HRTFTMTZHL3P3QRZ1SV55525PX2YO22F3Q0JB'
CLIENT_SECRET = 'JEDPFGTRY4SISHI1S2W3RTQGIG3UUROA5OG4I3NW4UXBBLB1'
VERSION = '20180605'

In [40]:
# exploring first neighborhood's details
neighborhood_latitude = downtown_toronto.loc[0, 'Latitude']
neighborhood_longitude = downtown_toronto.loc[0, 'Longitude']
neighborhood_name = downtown_toronto.loc[0, 'Neighborhood']

print('Coordinates of {} is {}, {}.'.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

Coordinates of Rosedale is 43.6795626, -79.3775294.


##### Getting top venues that are in Rosedale within a radius of 500 meters

In [43]:
# top 100 venues in a 500 meter radius around Rosedale (first neighborhood in Downtown Toronto)
no_of_venues = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, CLIENT_SECRET, VERSION, neighborhood_latitude, neighborhood_longitude, radius, no_of_venues)

response = requests.get(url).json()
response

{u'meta': {u'code': 200, u'requestId': u'5d1d2701d9a6e6002fbbb20c'},
 u'response': {u'groups': [{u'items': [{u'reasons': {u'count': 0,
       u'items': [{u'reasonName': u'globalInteractionReason',
         u'summary': u'This spot is popular',
         u'type': u'general'}]},
      u'referralId': u'e-0-4bae2150f964a520df873be3-0',
      u'venue': {u'categories': [{u'icon': {u'prefix': u'https://ss3.4sqi.net/img/categories_v2/building/default_',
          u'suffix': u'.png'},
         u'id': u'4bf58dd8d48988d130941735',
         u'name': u'Building',
         u'pluralName': u'Buildings',
         u'primary': True,
         u'shortName': u'Building'}],
       u'id': u'4bae2150f964a520df873be3',
       u'location': {u'address': u'146 Crescent Rd.',
        u'cc': u'CA',
        u'city': u'Toronto',
        u'country': u'Canada',
        u'crossStreet': u'btwn. Lamport Ave. and Mt. Pleasant Rd.',
        u'distance': 230,
        u'formattedAddress': [u'146 Crescent Rd. (btwn. Lamport Ave. 

In [58]:
# all the information is in the items key
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
    
# cleaning the json response and fitting it into a pandas dataframe
venues = response['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)

#filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:,filtered_columns]

#filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

#clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Mooredale House,Building,43.678631,-79.380091
1,Rosedale Park,Playground,43.682328,-79.378934
2,Whitney Park,Park,43.682036,-79.373788
3,Alex Murray Parkette,Park,43.6783,-79.382773
4,Milkman's Lane,Trail,43.676352,-79.373842


##### Exploring all Neighborhoods in Downtown Toronto using the same (above) process

In [65]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        # creating the url
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, CLIENT_SECRET, VERSION, neighborhood_latitude, neighborhood_longitude, radius, no_of_venues)
        
        # make the GET request
        results = requests.get(url).json()["response"]["groups"][0]['items']
        
        # only relevant information
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
        
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 'Neighboorhood Latitude', 'Neighborhood Longitude', 
                                 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
        
    return nearby_venues
        

In [66]:
# writing code to run the above function on each neighborhood and create a new dataframe called toronto_venues
toronto_venues = getNearbyVenues(names=downtown_toronto['Neighborhood'],
                                latitudes=downtown_toronto['Latitude'],
                                longitudes=downtown_toronto['Longitude'])

Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie


In [71]:
# printing all the venues for all the neighborhoods in Downtown Toronto
# toronto_venues.shape
toronto_venues.head(10)

Unnamed: 0,Neighborhood,Neighboorhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Mooredale House,43.678631,-79.380091,Building
1,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
2,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
3,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
4,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
5,"Cabbagetown, St. James Town",43.667967,-79.367675,Mooredale House,43.678631,-79.380091,Building
6,"Cabbagetown, St. James Town",43.667967,-79.367675,Rosedale Park,43.682328,-79.378934,Playground
7,"Cabbagetown, St. James Town",43.667967,-79.367675,Whitney Park,43.682036,-79.373788,Park
8,"Cabbagetown, St. James Town",43.667967,-79.367675,Alex Murray Parkette,43.6783,-79.382773,Park
9,"Cabbagetown, St. James Town",43.667967,-79.367675,Milkman's Lane,43.676352,-79.373842,Trail


In [73]:
# number of venues returned for each neighborhood
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighboorhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",5,5,5,5,5,5
Berczy Park,5,5,5,5,5,5
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",5,5,5,5,5,5
"Cabbagetown, St. James Town",5,5,5,5,5,5
Central Bay Street,5,5,5,5,5,5
"Chinatown, Grange Park, Kensington Market",5,5,5,5,5,5
Christie,5,5,5,5,5,5
Church and Wellesley,5,5,5,5,5,5
"Commerce Court, Victoria Hotel",5,5,5,5,5,5
"Design Exchange, Toronto Dominion Centre",5,5,5,5,5,5
