#### This project will explore, segment, and cluster the neighborhoods in the city of Toronto

##### Importing required libraries

In [3]:
import pandas as pd
import numpy as np
import requests
import time
import geocoder
from bs4 import BeautifulSoup
import ssl
import json
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

ssl._create_default_https_context = ssl._create_unverified_context



##### Getting the contents of the website with 'requests' library

In [4]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url)
response

<Response [200]>

##### Scraping the contents from html using BeautifulSoup and building an initial Pandas dataframe

In [5]:
# instantiating the soup object with response text, and html.parser option
soup = BeautifulSoup(response.text, "html.parser")

# parsing the table part of the response by looking at wikitable sortable class-type
postal_table = soup.find(class_="wikitable sortable")

# building the initial dataframe from table's contents 
table_rows = postal_table.find_all('tr')
row_values = []
for tr in table_rows:
    td = tr.find_all('td')
    row_text = [tr.text.strip() for tr in td if tr.text.strip()]
    if row_text:
        row_values.append(row_text)

toronto_df = pd.DataFrame(row_values, columns=["PostalCode", "Borough", "Neighborhood"])
# toronto_df.head(10)
toronto_df.shape


(288, 3)

##### Cleaning the dataframe (dropping, combining, and truncating multiple cells)

In [6]:
# ignoring cells with a Borough that is Not assigned.
borough_df = toronto_df[toronto_df.Borough != 'Not assigned']

# replacing 'Not assigned' neighborhood value with the corresponding Borough value
borough_df['Neighborhood'].replace('Not assigned', "Queen's Park", inplace=True)


# combining neighborhoods with the same PostalCode into single row 
combined_df = borough_df.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index(drop=False)

#combined_df.head(10)
combined_df.shape


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


(103, 3)

##### Getting the latitude and longitude coordinates of each neighborhood to utilize the Foursquare location data

In [7]:
# geocoder seems to be very unreliable, so we can use the link to the csv file to get latitude and longitude
url = 'http://cocl.us/Geospatial_data'
lat_long_df = pd.read_csv(url)

# since the latitude-longitude table has the same data ordered as of combined_df, we can just create a new dataframe
# with the required columns
detailed_df = pd.DataFrame({'PostalCode':combined_df['PostalCode'], 
                            'Borough':combined_df['Borough'], 
                            'Neighborhood':combined_df['Neighborhood'], 
                            'Latitude':lat_long_df['Latitude'], 
                            'Longitude':lat_long_df['Longitude']})


# detailed_df.head(20)
detailed_df.shape


(103, 5)

##### Cluster analysis of the neighborhoods in Toronto 

Toronto neighborhood has a total of 11 boroughs and 211 neighborhoods. It will become a tedious task to analyse all 11 boroughs, so we will work on boroughs that has the word 'Toronto' in them. There are totally 4 boroughs that has 'toronto' in their name: "Downtown Toronto", "Central Toronto", "West Toronto", and "East Toronto".

In [8]:
# Analysing Downtown Toronto borough
detailed_df[detailed_df.Borough == 'Downtown Toronto']

Unnamed: 0,Borough,Latitude,Longitude,Neighborhood,PostalCode
50,Downtown Toronto,43.679563,-79.377529,Rosedale,M4W
51,Downtown Toronto,43.667967,-79.367675,"Cabbagetown, St. James Town",M4X
52,Downtown Toronto,43.66586,-79.38316,Church and Wellesley,M4Y
53,Downtown Toronto,43.65426,-79.360636,"Harbourfront, Regent Park",M5A
54,Downtown Toronto,43.657162,-79.378937,"Ryerson, Garden District",M5B
55,Downtown Toronto,43.651494,-79.375418,St. James Town,M5C
56,Downtown Toronto,43.644771,-79.373306,Berczy Park,M5E
57,Downtown Toronto,43.657952,-79.387383,Central Bay Street,M5G
58,Downtown Toronto,43.650571,-79.384568,"Adelaide, King, Richmond",M5H
59,Downtown Toronto,43.640816,-79.381752,"Harbourfront East, Toronto Islands, Union Station",M5J


Downtown Toronto has 18 different postal codes and around 34 neighborhoods on the whole

In [12]:
# Creating a new dataframe for cluster analysis of 'Toronto' Boroughs
d_t = detailed_df[detailed_df['Borough'] == 'Downtown Toronto']
c_t = detailed_df[detailed_df['Borough'] == 'Central Toronto']
w_t = detailed_df[detailed_df['Borough'] == 'West Toronto']
e_t = detailed_df[detailed_df['Borough'] == 'East Toronto']

toronto_cluster = pd.concat([d_t, c_t, w_t, e_t], sort=False)

toronto_cluster.head()
# toronto_cluster.shape

Unnamed: 0,Borough,Latitude,Longitude,Neighborhood,PostalCode
50,Downtown Toronto,43.679563,-79.377529,Rosedale,M4W
51,Downtown Toronto,43.667967,-79.367675,"Cabbagetown, St. James Town",M4X
52,Downtown Toronto,43.66586,-79.38316,Church and Wellesley,M4Y
53,Downtown Toronto,43.65426,-79.360636,"Harbourfront, Regent Park",M5A
54,Downtown Toronto,43.657162,-79.378937,"Ryerson, Garden District",M5B


In [13]:
# Using geopy to get the latitude and longitude values of Toronto

address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Coordinates of Toronto are {}, {}. '.format(latitude, longitude))

Coordinates of Toronto are 43.653963, -79.387207. 


##### Creating a map of Toronto with all its neighborhoods superimposed on Top (filtered by Borough that has the word 'toronto')

In [14]:
# creating a map of toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# adding markers
for lat, lng, borough, neighborhood in zip(toronto_cluster['Latitude'], toronto_cluster['Longitude'], toronto_cluster['Borough'], toronto_cluster['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
        parse_html=False).add_to(map_toronto)
    
map_toronto