### Segmenting and Clustering Neighborhoods in the city of Toronto, Canada

In [317]:
from bs4 import BeautifulSoup
import requests
import csv
import lxml
import re
import pandas as pd
import numpy as np


In [322]:
# Download the Wikipedia page listing the Canadian postal codes starting with "M".
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # fail instead of hanging forever on a stalled connection
)
response.raise_for_status()  # surface HTTP errors instead of parsing an error page
source = response.text

soup = BeautifulSoup(source, 'lxml')

# soup.find returns the first matching table, or None when nothing matches.
my_table = soup.find('table', {'class': 'wikitable sortable'})
if my_table is None:
    raise ValueError("No 'wikitable sortable' table found on the page")

# Column headers: take the text of every <th> directly.  Reading the text from
# the parsed tags (rather than stringifying the whole list and splitting on
# commas) keeps working when a header carries attributes or contains a comma.
thread = [th.get_text(strip=True) for th in my_table.find_all('th')]

# All data cells in document order; every three consecutive cells form one row.
tbody = my_table.find_all('td')
if len(tbody) % 3 != 0:
    raise ValueError(
        'Expected a multiple of 3 <td> cells, found {}'.format(len(tbody)))

# Each row stays a list of three bs4 tags; later cells convert them to text.
table = [list(tbody[start:start + 3]) for start in range(0, len(tbody), 3)]
def pattern_recognition(word):
    """Find the first ``title="..."`` attribute inside ``word``.

    Parameters
    ----------
    word : str
        Text to search, e.g. the serialized HTML of a table cell.

    Returns
    -------
    str or None
        The whole matched attribute, including the ``title="`` prefix and the
        closing quote, or ``None`` when ``word`` contains no such attribute.
    """
    found = re.search('title=\"(.*?)\"', word)
    # Non-greedy match: stops at the first closing quote after title=".
    return found.group(0) if found else None
    
    
    
# Convert every row of <td> tags into a list of plain strings.
frame = []
for unit in table:
    temp_frame = []
    for el in unit:
        # Serialize the tag and drop the bare cell wrapper and newlines.
        el = str(el).replace("<td>", "").replace("</td>", "").replace("\n", "")
        pattern = pattern_recognition(el)
        if pattern is not None:
            # Linked cell: keep only the value of the link's title attribute.
            # NOTE(review): the title is the linked article's name, so values
            # such as "Harbourfront (Toronto)" keep their disambiguation
            # suffix; consider using the cell's visible text instead.
            temp_frame.append(pattern.replace("title=\"", "").replace("\"", ""))
        else:
            # Unlinked cell (e.g. "Not assigned"): keep the remaining text.
            temp_frame.append(el)
    frame.append(temp_frame)

# Name the columns from the first three scraped headers at construction time,
# which also works when no rows were scraped.
df = pd.DataFrame(frame, columns=thread[:3])
# np.nan is the portable spelling; the np.NaN alias was removed in NumPy 2.0.
df = df.replace('Not assigned', np.nan)
df.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront (Toronto)
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park (Toronto),
9,M8A,,


In [323]:
# (rows, columns) of df at this point in the notebook.
df.shape

(288, 3)

In [324]:
# Distinct values in the Borough column (missing values included).
df['Borough'].unique()

array([nan, 'North York', 'Downtown Toronto', "Queen's Park (Toronto)",
       'Etobicoke', 'Scarborough, Toronto', 'East York', 'York',
       'East Toronto', 'West Toronto', 'Central Toronto', 'Mississauga'],
      dtype=object)

In [325]:
# How many rows have (True) / do not have (False) a missing Neighbourhood.
df['Neighbourhood'].isna().value_counts()

False    210
True      78
Name: Neighbourhood, dtype: int64

In [326]:
# How many rows have (True) / do not have (False) a missing Borough.
df['Borough'].isna().value_counts()

False    211
True      77
Name: Borough, dtype: int64

In [327]:
# Keep only the rows that have a borough.  The explicit copy makes the result
# an independent frame, so assigning columns to it in later cells does not
# raise SettingWithCopyWarning.
df = df[df['Borough'].notnull()].copy()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront (Toronto)
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [328]:
df.shape

(211, 3)

In [329]:
# Where a row has no neighbourhood, fall back to its borough name.
# fillna with the Borough column is index-aligned, so each missing value takes
# the borough of the same row; it replaces the row-by-row apply and also works
# on an empty frame.  assign returns a new frame instead of writing into a
# filtered slice.
df = df.assign(Neighbourhood=df['Neighbourhood'].fillna(df['Borough']))
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront (Toronto)
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park (Toronto),Queen's Park (Toronto)
10,M9A,Etobicoke,Islington Avenue
11,M1B,"Scarborough, Toronto","Rouge, Toronto"
12,M1B,"Scarborough, Toronto","Malvern, Toronto"


In [330]:
# Renumber the rows 0..n-1, discarding the gaps left by the removed rows.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront (Toronto)
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [331]:
# Collapse the frame to one row per postcode:
#   * Borough       -> the borough of the postcode's first row
#   * Neighbourhood -> the postcode's neighbourhood names joined with ", "
#
# sort=False keeps postcodes in order of first appearance.  dict.fromkeys
# drops repeated names while preserving their order, so a repeated
# neighbourhood no longer produces a second row for the same postcode.
# Grouping also merges rows of one postcode that are not adjacent, and an
# empty frame simply yields an empty result.
data = (
    df.groupby('Postcode', sort=False, as_index=False)
      .agg({
          'Borough': 'first',
          'Neighbourhood': lambda names: ', '.join(dict.fromkeys(names)),
      })
)
data.head(20)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront (Toronto), Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park (Toronto),Queen's Park (Toronto)
5,M9A,Etobicoke,Islington Avenue
6,M1B,"Scarborough, Toronto","Rouge, Toronto, Malvern, Toronto"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [333]:
# (rows, columns) of the per-postcode frame.
data.shape

(103, 3)

In [334]:
# Load the coordinates table from a CSV file in the current working directory.
df_coord = pd.read_csv('Geospatial_Coordinates.csv')
df_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [344]:
# (rows, columns) of the coordinates table.
df_coord.shape

(103, 3)

In [345]:
# Give the key column the same name as in `data` so the two frames can be
# joined on it.
df_coord = df_coord.rename(columns={'Postal Code': 'Postcode'})
df_coord.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [346]:
frames = [data, df_coord]  # not referenced by the other cells shown here
# Join on Postcode (pandas' default inner join: only postcodes present in
# both frames are kept).
df_keys = data.merge(df_coord, on='Postcode')
df_keys.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront (Toronto), Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park (Toronto),Queen's Park (Toronto),43.662301,-79.389494


In [347]:
# Rename the key column of the merged frame to PostalCode.
df_keys = df_keys.rename(columns={'Postcode': 'PostalCode'})
df_keys.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront (Toronto), Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park (Toronto),Queen's Park (Toronto),43.662301,-79.389494


In [351]:
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
from geopy.geocoders import Nominatim

In [363]:
# Look up the coordinates of the city to centre the map on.
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
# Report the place that was actually looked up (the message used to say
# "New York City" although the address is Toronto).
print('The geographical coordinates of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of New York City are 43.653963, -79.387207.


In [364]:
# Create the map, centred on the latitude/longitude computed above.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of df_keys, with a "neighbourhood, borough" popup.
for lat, lng, borough, neighborhood in zip(df_keys['Latitude'], df_keys['Longitude'], df_keys['Borough'], df_keys['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True makes folium escape the label text in the popup.
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option, so it is not passed to CircleMarker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

# The previous variable name is kept as an alias so any cell that still
# refers to it keeps working.
map_newyork = map_toronto
map_toronto