# Segmenting and Clustering Neighborhoods in Toronto (Part 2 and 3)
## Week 3 Assignment of Applied Data Science Capstone
### Task 2: Add the latitude and longitude coordinates to the dataframe and cluster the neighborhoods in Toronto

Import Libraries

In [17]:
import numpy as np
import pandas as pd

In [18]:
from bs4 import BeautifulSoup
import requests

# Fetch the Wikipedia page that lists Canadian postal codes starting with "M".
website = r"https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging forever if the server never answers.
webdata = requests.get(website, timeout=30)
# Stop here on an HTTP error instead of parsing an error page further down.
webdata.raise_for_status()
data = webdata.text
# Parse the returned HTML with the lxml backend.
soup = BeautifulSoup(data, "lxml")

In [19]:
# Collect the inner HTML of every <td> found in the first <tbody>, in document
# order, as one flat list.
temp_content = []
table = soup.find("tbody")
if table is None:
    # Without this guard a page-layout change shows up as an opaque
    # AttributeError on the next line.
    raise ValueError("no <tbody> element found in the downloaded page")
for row in table.find_all("tr"):
    cells = row.find_all("td")
    for cell in cells:
        # decode_contents() renders only the tag's children, so the result is
        # still right when the <td> carries attributes; slicing the rendered
        # tag with [4:-5] is only right for a bare "<td>".
        temp_content.append(cell.decode_contents())
print(temp_content[0:10])

['M1A', 'Not assigned', 'Not assigned\n', 'M2A', 'Not assigned', 'Not assigned\n', 'M3A', '<a href="/wiki/North_York" title="North York">North York</a>', '<a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>\n', 'M4A']


In [20]:
import re

# Reduce every raw cell to plain text. A cell that starts with markup is
# replaced by the title attribute of its first link; every other cell is
# only whitespace-stripped.
content = []
for u in temp_content:
    c = u.strip()
    # startswith() is safe on an empty cell, unlike indexing c[0].
    if c.startswith("<"):
        # [^"]* stops at the closing quote of the first title attribute, so
        # attributes after the title or a second link cannot leak into the value.
        title_match = re.search(r'title="([^"]*)"', c)
        if title_match:
            c = title_match.group(1)
        else:
            # Markup without a title attribute: keep the text between the tags.
            c = re.sub(r"<[^>]+>", "", c).strip()
    content.append(c)
print(content[0:10])

['M1A', 'Not assigned', 'Not assigned', 'M2A', 'Not assigned', 'Not assigned', 'M3A', 'North York', 'Parkwoods', 'M4A']


In [21]:
# `content` is a flat list laid out as postcode, borough, neighbourhood,
# postcode, ... so every third element belongs to the same column.
data_dict = {
    "Postcode": content[0::3],
    "Borough": content[1::3],
    "Neighbourhood": content[2::3],
}
df = pd.DataFrame(data_dict)
# Drop postcodes that have no borough assigned.
df = df[df["Borough"] != "Not assigned"].reset_index(drop=True)
# Where only the neighbourhood is missing, reuse the borough name for it.
# A single .loc[row, column] on both sides avoids chained indexing.
not_as_fil = df["Neighbourhood"] == "Not assigned"
df.loc[not_as_fil, "Neighbourhood"] = df.loc[not_as_fil, "Borough"]
print(df.head())

  Postcode           Borough           Neighbourhood
0      M3A        North York               Parkwoods
1      M4A        North York        Victoria Village
2      M5A  Downtown Toronto  Harbourfront (Toronto)
3      M5A  Downtown Toronto             Regent Park
4      M6A        North York        Lawrence Heights


Assign postcode, borough and neighbourhood into the list

In [26]:
# Load the postal-code coordinates table and normalise the key column name in
# the same step, so re-running this cell never leaves the frame with the raw
# 'Postal Code' header while the merge below joins on 'PostalCode'.
lat_long = pd.read_csv("https://cocl.us/Geospatial_data", sep=",", header=0).rename(
    columns={"Postal Code": "PostalCode"}
)

Load latitude and longitude data

In [24]:
# Rename the key column to 'PostalCode'. Rebinding the result instead of using
# inplace=True keeps the cell safe to re-run and leaves the loaded frame untouched.
lat_long = lat_long.rename(columns={'Postal Code': 'PostalCode'})
print(lat_long.head(10))

  PostalCode   Latitude  Longitude
0        M1B  43.806686 -79.194353
1        M1C  43.784535 -79.160497
2        M1E  43.763573 -79.188711
3        M1G  43.770992 -79.216917
4        M1H  43.773136 -79.239476
5        M1J  43.744734 -79.239476
6        M1K  43.727929 -79.262029
7        M1L  43.711112 -79.284577
8        M1M  43.716316 -79.239476
9        M1N  43.692657 -79.264848


Associate postal codes with latitude and longitude data

In [25]:
# Attach coordinates to each (postcode, borough, neighbourhood) row. The inner
# join keeps only postcodes that appear in both frames.
final = pd.merge(
    df,
    lat_long,
    left_on="Postcode",
    right_on="PostalCode",
    how="inner",
)
print(final.head(10).to_string())

  Postcode                 Borough           Neighbourhood PostalCode   Latitude  Longitude
0      M3A              North York               Parkwoods        M3A  43.753259 -79.329656
1      M4A              North York        Victoria Village        M4A  43.725882 -79.315572
2      M5A        Downtown Toronto  Harbourfront (Toronto)        M5A  43.654260 -79.360636
3      M5A        Downtown Toronto             Regent Park        M5A  43.654260 -79.360636
4      M6A              North York        Lawrence Heights        M6A  43.718518 -79.464763
5      M6A              North York          Lawrence Manor        M6A  43.718518 -79.464763
6      M7A  Queen's Park (Toronto)  Queen's Park (Toronto)        M7A  43.662301 -79.389494
7      M9A               Etobicoke        Islington Avenue        M9A  43.667856 -79.532242
8      M1B    Scarborough, Toronto          Rouge, Toronto        M1B  43.806686 -79.194353
9      M1B    Scarborough, Toronto        Malvern, Toronto        M1B  43.806686 -79.194353

Merge to full data

In [27]:
# Collapse the rows of each postcode into a single row whose cells hold the
# list of distinct values seen for that postcode.
pst = final.groupby("Postcode")
# dict.fromkeys() removes duplicates while keeping first-seen order, whereas
# list(set(x)) orders strings by hash and so can differ from one run to the next.
df_agg = pst.aggregate(lambda x: list(dict.fromkeys(x)))
print(df_agg[4:6].to_string())

                         Borough          Neighbourhood PostalCode      Latitude             Longitude
Postcode                                                                                              
M1H       [Scarborough, Toronto]            [Cedarbrae]      [M1H]   [43.773136]  [-79.23947609999999]
M1J       [Scarborough, Toronto]  [Scarborough Village]      [M1J]  [43.7447342]  [-79.23947609999999]


Group data

In [28]:
# Print a summary of df_agg: index range, column names, non-null counts and dtypes.
df_agg.info()

<class 'pandas.core.frame.DataFrame'>
Index: 103 entries, M1B to M9W
Data columns (total 5 columns):
Borough          103 non-null object
Neighbourhood    103 non-null object
PostalCode       103 non-null object
Latitude         103 non-null object
Longitude        103 non-null object
dtypes: object(5)
memory usage: 4.8+ KB
