# Segmenting and Clustering Neighborhoods in the city of Toronto, Canada

In [41]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from urllib import request
import urllib

In [42]:
wikipedia_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Read the page inside a context manager so the HTTP response (and its
# underlying connection) is closed as soon as the body has been read.
with urllib.request.urlopen(wikipedia_url) as response:
    html = response.read()

# Parse the raw HTML with the parser that ships with the standard library.
soup = BeautifulSoup(html, "html.parser")

### Steps
* Read the table in Wikipedia
* Go through one table row at a time
* Convert each table row to a dictionary and append it to a list, which is then used to create the DataFrame

In [43]:
# Collect one dictionary per data row of the postal code table.
results = []
table = soup.find("table", {"class": "wikitable sortable"})
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError("Could not find a 'wikitable sortable' table on the page")

# Data rows are the <tr> elements that contain no <th> header cell.
trs = [tr for tr in table.find_all('tr') if not tr.find_all('th')]
for tr in trs:
    tds = tr.find_all('td')
    if len(tds) < 3:
        # Rows without the three expected cells cannot be mapped to columns.
        continue
    #     PostalCode, Borough, and Neighborhood
    d = {
        'PostalCode': tds[0].text.strip(),
        'Borough': tds[1].text.strip(),
        'Neighborhood': tds[2].text.strip()
    }
    results.append(d)


### Create the DataFrame

In [44]:
# Turn the list of per-row dictionaries into a frame and preview the top rows
df = pd.DataFrame(data=results)
df.head(n=5)

Unnamed: 0,Borough,Neighborhood,PostalCode
0,Not assigned,Not assigned,M1A
1,Not assigned,Not assigned,M2A
2,North York,Parkwoods,M3A
3,North York,Victoria Village,M4A
4,Downtown Toronto,Harbourfront,M5A


### Ignore cells with a borough that is Not assigned.

In [45]:
# Keep only the rows whose Borough value is something other than 'Not assigned'
df = df.loc[df['Borough'].ne('Not assigned')]

### Combine neighborhoods belonging to the same PostalCode and Borough

In [46]:
# Collapse rows sharing a (PostalCode, Borough) pair into a single row whose
# Neighborhood is the comma-joined list of the group's neighborhoods.
df = (
    df.groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(','.join)
    .reset_index()
)
# Spot-check a postal code that covers several neighborhoods
df.loc[df['PostalCode'] == 'M9V']

Unnamed: 0,PostalCode,Borough,Neighborhood
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [47]:
# Wherever the neighborhood is exactly 'Not assigned', fall back to the borough
df['Neighborhood'] = df['Neighborhood'].mask(
    df['Neighborhood'].eq('Not assigned'),
    df['Borough'],
)

In [48]:
# Spot-check the row for postal code M7A
df.loc[df['PostalCode'].eq('M7A')]

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


### Total Number of Rows

In [49]:
# Display the (rows, columns) tuple for df
df.shape

(103, 3)

## *** Second Question ***

#### I created a separate notebook as the instructions were a bit ambiguous

In [50]:
# CSV of postal codes with their latitude and longitude
lat_lng_file = 'https://cocl.us/Geospatial_data'
lat_lng = pd.read_csv(filepath_or_buffer=lat_lng_file)
# Preview the first three rows
lat_lng.head(n=3)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711


### Merge the latitude/longitude DataFrame with the locations DataFrame

In [51]:
# Left-join the coordinates onto the locations frame by postal code.
# validate='many_to_one' makes pandas raise a MergeError if a postal code
# appears more than once in lat_lng, which is the only way a left merge
# could add rows to df.
df = df.merge(
    lat_lng,
    left_on='PostalCode',
    right_on='Postal Code',
    how='left',
    validate='many_to_one',
)

In [52]:
# Preview the first five rows of the merged frame
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


#### We can drop the redundant `Postal Code` column

In [53]:
# Remove the duplicate key column brought in by the merge. Reassigning (rather
# than inplace=True) together with errors='ignore' keeps this cell safe to
# re-run: a second execution is a no-op instead of a KeyError.
df = df.drop(columns='Postal Code', errors='ignore')

### Make sure the merge has not increased the number of rows. The frame should still have 103 rows, and now 5 columns

In [54]:
# A left merge adds rows only by repeating a left-hand key, so a repeated
# postal code here would mean the merge inflated the frame.
# NOTE(review): assumes each postal code appeared once before the merge — confirm.
assert df['PostalCode'].is_unique, 'merge produced duplicate postal codes'
# Display the (rows, columns) tuple for df
df.shape

(103, 5)

In [55]:
# Preview the first three rows of the final frame
df.head(n=3)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
