#### This project will explore, segment, and cluster the neighborhoods in the city of Toronto

##### Importing required libraries

In [1]:
import pandas as pd
import numpy as np
import urllib.request
import requests
import time
import geocoder
from bs4 import BeautifulSoup
import ssl
import json
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

ssl._create_default_https_context = ssl._create_unverified_context

##### Getting the contents of the website with 'requests' library

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url)
response

<Response [200]>

##### Scraping the contents from html using BeautifulSoup and building an initial Pandas dataframe

In [3]:
# instantiating the soup object with response text, and html.parser option
soup = BeautifulSoup(response.text, "html.parser")

# parsing the table part of the response by looking at wikitable sortable class-type
postal_table = soup.find(class_="wikitable sortable")

# building the initial dataframe from table's contents 
table_rows = postal_table.find_all('tr')
row_values = []
for tr in table_rows:
    td = tr.find_all('td')
    row_text = [tr.text.strip() for tr in td if tr.text.strip()]
    if row_text:
        row_values.append(row_text)

toronto_df = pd.DataFrame(row_values, columns=["PostalCode", "Borough", "Neighborhood"])
#toronto_df.head(10)
toronto_df.shape


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


##### Cleaning the dataframe (dropping, combining, and truncating multiple cells)

In [4]:
# ignoring cells with a Borough that is Not assigned.
borough_df = toronto_df[toronto_df.Borough != 'Not assigned']

# replacing 'Not assigned' neighborhood value with the corresponding Borough value
borough_df['Neighborhood'].replace('Not assigned', "Queen's Park", inplace=True)


# combining neighborhoods with the same PostalCode into single row 
combined_df = borough_df.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index(drop=False)

# combined_df.head(10)
combined_df.shape


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


##### Getting the latitude and longitude coordinates of each neighborhood to utilize the Foursquare location data

In [5]:
# geocoder seems to be very unreliable, so we can use the link to the csv file to get latitude and longitude
url = 'http://cocl.us/Geospatial_data'
lat_long_df = pd.read_csv(url)

# since the latitude-longitude table has the same data ordered as of combined_df, we can just create a new dataframe
# with the required columns
detailed_df = pd.DataFrame({'PostalCode':combined_df['PostalCode'], 
                            'Borough':combined_df['Borough'], 
                            'Neighborhood':combined_df['Neighborhood'], 
                            'Latitude':lat_long_df['Latitude'], 
                            'Longitude':lat_long_df['Longitude']})


# detailed_df.head(20)
detailed_df.shape


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [9]:
# counting the number of Boroughs and neighborhoods
print('This dataframe has {} boroughs and {} neighborhoods.'.format(len(detailed_df['Borough'].unique()), len(detailed_df['Neighborhood'].unique())))

This dataframe has 11 boroughs and 103 neighborhoods.


##### Cluster analysis of the neighborhoods in Toronto using Foursquare API

Toronto neighborhood has a total of 11 boroughs and 100+ neighborhoods. It will become a tedious task to analyse all 11 boroughs, so we will work on boroughs that has the word 'Toronto' in them. There are totally 4 boroughs that has 'toronto' in their name: "Downtown Toronto", "Central Toronto", "West Toronto", and "East Toronto".

In [61]:
# Analysing number of postal codes in each of the Boroughs that ends with the word 'Toronto'
borough_list = ['Downtown Toronto', 'Central Toronto', 'West Toronto', 'East Toronto']
for bor in borough_list:
    print("There are {} postal codes in {}.".format(detailed_df.PostalCode[detailed_df.Borough == bor].count(), bor))


There are 18 postal codes in Downtown Toronto.
There are 9 postal codes in Central Toronto.
There are 6 postal codes in West Toronto.
There are 5 postal codes in East Toronto.


In [120]:
# Creating a new dataframe for cluster analysis of 'Toronto' Boroughs
d_t = detailed_df[detailed_df['Borough'] == 'Downtown Toronto']
c_t = detailed_df[detailed_df['Borough'] == 'Central Toronto']
w_t = detailed_df[detailed_df['Borough'] == 'West Toronto']
e_t = detailed_df[detailed_df['Borough'] == 'East Toronto']

combined = pd.concat([d_t, c_t, w_t, e_t], sort=False)
toronto_df = combined.reset_index(drop=True)

# toronto_df.head()
toronto_df.shape

(38, 5)

In [121]:
# Using geopy to get the latitude and longitude values of Toronto
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Coordinates of Toronto are {}, {}. '.format(latitude, longitude))

Coordinates of Toronto are 43.653963, -79.387207. 


##### Creating a map of Toronto with neighborhoods superimposed on Top

In [122]:
# creating a map of toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# adding markers
for lat, lng, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
        parse_html=False).add_to(map_toronto)

# displaying toronto map with 4 boroughs and it's neighborhoods
map_toronto

The above map shows 4 different Boroughs ("Downtown Toronto", "Central Toronto", "West Toronto", and "East Toronto") with it's associated neighborhoods.

##### Using Foursquare APIs to explore first neighborhood in toronto_df

In [123]:
# defining foursquare credentials
client_id = 'PEQPJEZXPG4HRTFTMTZHL3P3QRZ1SV55525PX2YO22F3Q0JB'
client_secret = 'JEDPFGTRY4SISHI1S2W3RTQGIG3UUROA5OG4I3NW4UXBBLB1'
version = '20180605'

In [124]:
# exploring first neighborhoods in our toronto_df dataframe
toronto_df.loc[0, 'Neighborhood']

'Rosedale'

In [127]:
# getting first neighborhoods latitude and longitude value
rosedale_latitude = toronto_df.loc[0, 'Latitude']
rosedale_longitude = toronto_df.loc[0, 'Longitude']
print('Rosedale\'s latitude and longitude values are {}, {}.'.format(rosedale_latitude, rosedale_longitude))


Rosedale's latitude and longitude values are 43.6795626, -79.37752940000001.


In [130]:
# getting top 100 venues that are in Rosedale within a radius of 500 radius
no_of_venues = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(client_id, client_secret, version, rosedale_latitude, rosedale_longitude, radius, no_of_venues)
url


'https://api.foursquare.com/v2/venues/explore?&client_id=PEQPJEZXPG4HRTFTMTZHL3P3QRZ1SV55525PX2YO22F3Q0JB&client_secret=JEDPFGTRY4SISHI1S2W3RTQGIG3UUROA5OG4I3NW4UXBBLB1&v=20180605&ll=43.6795626,-79.37752940000001&radius=500&limit=100'