# Segmenting and Clustering Neighborhoods in Toronto

### Part 1: Web Scraping

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

from geopy.geocoders import Nominatim

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

Solving environment: done

# All requested packages already installed.



In [2]:
# Download the Wikipedia page listing the "M" postal codes and parse it into `soup`.
link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(link, timeout=30)
# Stop on HTTP errors instead of parsing an error page as if it held the table.
response.raise_for_status()
reqlink = response.text
soup = BeautifulSoup(reqlink, 'lxml')

In [3]:
# Take the first table on the parsed page and flatten it into its <td> cells.
table = soup.find('table')
tablerow = table.find_all('td')
ele_len = len(tablerow)
PostalCode, Borough, Neighborhood = [], [], []

# Cells are read in consecutive groups of three; within a group the cells go
# to PostalCode, Borough and Neighborhood in that order.
column_lists = (PostalCode, Borough, Neighborhood)
for row_start in range(0, ele_len, 3):
    for offset, column in enumerate(column_lists):
        column.append(tablerow[row_start + offset].text.strip())

In [4]:
# Assemble the three scraped lists into one frame, one list per column.
toronto_columns = ['PostalCode', 'Borough', 'Neighborhood']
df_toronto = pd.DataFrame(
    {
        'PostalCode': PostalCode,
        'Borough': Borough,
        'Neighborhood': Neighborhood,
    },
    columns=toronto_columns,
)

# Keep only the rows whose borough is something other than 'Not assigned'.
has_borough = df_toronto['Borough'] != 'Not assigned'
df_toronto = df_toronto[has_borough]
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
# Drop 'Not assigned' boroughs onto an explicit copy, so the column assignment
# below writes to an independent frame rather than a filtered slice (which
# would raise SettingWithCopyWarning).
df_toronto = df_toronto[df_toronto['Borough'] != 'Not assigned'].copy()

# Remove any newline characters left inside the neighborhood names.
df_toronto['Neighborhood'] = df_toronto['Neighborhood'].str.replace('\n', '')

# Save only after cleaning, so the file on disk matches the frame shown below.
df_toronto.to_csv('toronto_p1.csv')
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# Display the (rows, columns) dimensions of df_toronto.
df_toronto.shape

(103, 3)

### Part 2: Extracting Longitude and Latitude

In [7]:
# Pull the <td> cells of the first table on the parsed page.
tableloc = soup.find('table')
tablelocrow = tableloc.find_all('td')
ele_len = len(tablelocrow)

# Every third cell starts a new group: the cell at `start` goes to PC, the
# next to BH, and the one after that to ND.
row_starts = range(0, ele_len, 3)
PC = [tablelocrow[start].text.strip() for start in row_starts]
BH = [tablelocrow[start + 1].text.strip() for start in row_starts]
ND = [tablelocrow[start + 2].text.strip() for start in row_starts]

In [8]:
# Pair the three lists up row by row and label the resulting columns.
postal_records = list(zip(PC, BH, ND))
df_can_PC = pd.DataFrame(postal_records, columns=['PostalCode', 'Borough', 'Neighborhood'])

# Discard rows whose borough is 'Not assigned'.
df_can_PC = df_can_PC.loc[df_can_PC['Borough'] != 'Not assigned']
df_can_PC.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [9]:
# One row per (PostalCode, Borough) pair; the neighborhood names that share a
# pair are joined into a single ', '-separated string.
neighborhoods_by_code = df_can_PC.groupby(['PostalCode', 'Borough'])['Neighborhood']
df_cangrp = neighborhoods_by_code.apply(lambda names: ', '.join(names)).reset_index()
df_cangrp.columns = ['PostalCode', 'Borough', 'Neighborhood']

In [10]:
# Load the coordinate table; header=0 together with `names` swaps the file's
# own header row for the column names used in the rest of the notebook.
longlat = pd.read_csv(
    'https://cocl.us/Geospatial_data',
    header=0,
    names=['PostalCode', 'Latitude', 'Longitude'],
)

In [11]:
# Attach coordinates to the grouped frame; the inner join keeps only postal
# codes that appear in both tables.
df_longlat = df_cangrp.merge(longlat, how='inner', on='PostalCode')

In [12]:
# Preview the first ten merged rows rather than rendering the whole frame.
df_longlat.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


### Part 3: Exploration and Clustering of Toronto Neighborhoods

In [13]:
# Collect the <td> cells of the first table on the parsed page.
tableclus = soup.find('table')
tableclusrow = tableclus.find_all('td')
ele_len = len(tableclusrow)
Pos, Bou, Nei = [], [], []

# Step through the cells three at a time and file each cell's stripped text
# into Pos, Bou and Nei respectively.
for base in range(0, ele_len, 3):
    code_cell = tableclusrow[base]
    borough_cell = tableclusrow[base + 1]
    neighborhood_cell = tableclusrow[base + 2]
    Pos.append(code_cell.text.strip())
    Bou.append(borough_cell.text.strip())
    Nei.append(neighborhood_cell.text.strip())

In [14]:
# Each list becomes a row first; transposing turns the lists into columns.
df_clus = pd.DataFrame([Pos, Bou, Nei]).T
df_clus.columns = ['PostalCode', 'Borough', 'Neighborhood']

# Filter out the rows whose borough is 'Not assigned'.
assigned_mask = df_clus['Borough'].ne('Not assigned')
df_clus = df_clus[assigned_mask]
df_clus.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [15]:
# Group by postal code and borough, joining each group's neighborhood names
# with ', ' so every pair ends up on a single row.
df_clusgrp = (
    df_clus
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(', '.join)
    .reset_index()
)
df_clusgrp.columns = ['PostalCode', 'Borough', 'Neighborhood']

In [16]:
# Read the coordinate table, then relabel its columns positionally with the
# names used by the merge that follows.
geo_column_names = ['PostalCode', 'Latitude', 'Longitude']
ll = pd.read_csv('https://cocl.us/Geospatial_data')
ll.columns = geo_column_names

In [17]:
# Join the grouped frame with its coordinates on the shared PostalCode column;
# the inner join keeps only postal codes present in both tables.
df_ll = pd.merge(df_clusgrp, ll, on=['PostalCode'], how='inner')

# Preview the first ten rows rather than rendering the whole frame.
df_ll.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [18]:
# Report the number of distinct boroughs and the number of rows in df_ll.
borough_count = len(df_ll['Borough'].unique())
row_count = df_ll.shape[0]
summary_template = 'The dataframe has {} boroughs and {} neighborhoods.'
print(summary_template.format(borough_count, row_count))

The dataframe has 10 boroughs and 103 neighborhoods.


In [19]:
# Look up the coordinates used to centre the map.
address = 'Toronto, CA'

# Pass an explicit user agent: relying on geopy's default one is deprecated
# and geopy 2.x refuses to construct the geocoder without it.
geolocator = Nominatim(user_agent='toronto_explorer')
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [20]:
# Base map centred on the coordinates obtained for Toronto.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Draw one circle marker per row of df_ll, with a "<neighborhood>, <borough>" popup.
marker_columns = df_ll[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_columns.itertuples(index=False, name=None):
    popup_label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=popup_label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto