# Segmenting and Clustering Neighborhoods in the city of Toronto, Canada

In [7]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from urllib import request
import urllib

In [35]:
wikipedia_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

html = urllib.request.urlopen(wikipedia_url).read()
soup = BeautifulSoup(html, "html.parser")

### Steps
* Read the table in Wikipedia
* Go through one table row at a time
* Add each table row to a dictionary, which can then be used to create the DataFrame

In [62]:
results = []
table = soup.find("table",{"class":"wikitable sortable"})
trs = table.findAll('tr')
trs = [tr for tr in trs if len(tr.find_all('th'))<1 ]
for tr in trs:
    tds = tr.find_all('td')
    #     PostalCode, Borough, and Neighborhood
    d = {
        'PostalCode': str(tds[0].text.strip()),
        'Borough': str(tds[1].text.strip()),
        'Neighborhood': str(tds[2].text.strip())
    }
    results.append(d)


### Create the DataFrame

In [63]:
df = pd.DataFrame(results)
df.head()

Unnamed: 0,Borough,Neighborhood,PostalCode
0,Not assigned,Not assigned,M1A
1,Not assigned,Not assigned,M2A
2,North York,Parkwoods,M3A
3,North York,Victoria Village,M4A
4,Downtown Toronto,Harbourfront,M5A


### Ignore cells with a borough that is Not assigned.

In [64]:
df = df[df.Borough != 'Not assigned']

### Combine Neighbourhoods belonging to same PostalCode and Borough

In [65]:
df = df.groupby(['PostalCode', 'Borough']).agg(','.join).reset_index()
df[df.PostalCode == 'M9V']

Unnamed: 0,PostalCode,Borough,Neighborhood
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [66]:
df['Neighborhood'] = np.where(df.Neighborhood == 'Not assigned', df.Borough, df.Neighborhood)

In [67]:
df[df.PostalCode == 'M7A']

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


### Total Number of Rows

In [68]:
df.shape

(103, 3)