## Segmenting and Clustering Neighbourhoods

### Load Libraries

Load the libraries needed for this exercise

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

###  Fetch Wikipedia Page with Postal Codes

Connect to the Wikipedia page to get the postal codes.

In [2]:
# Download the raw HTML of the Wikipedia page that lists the postal codes.
wikipedia_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the kernel from hanging forever if the server never answers.
raw_page = requests.get(wikipedia_link, timeout=30)

# Fail loudly on an HTTP error (4xx/5xx) instead of handing an error page
# to the parser in the following cells.
raw_page.raise_for_status()

page = raw_page.text


### Create Data Frame for the Postal Codes

In [3]:
# Convert the page to a BeautifulSoup object to parse the postal codes.
# BeautifulSoup is already imported in the "Load Libraries" cell at the top
# of the notebook, so it is not imported again here.

bs = BeautifulSoup(page, 'lxml')

In [46]:
# Create the postal code dataframe
#
# Each data row of the table is expected to hold three <td> cells, in order:
# postal code, borough and neighbourhood. Rows are gathered into a dict keyed
# by postal code and the dataframe is built once at the end; this avoids
# DataFrame.append (removed in pandas 2.0) and the row-by-row growth of the
# frame inside the loop.

postal_table = bs.find ('tbody')
cols = ['PostalCode', 'Borough', 'Neighborhood']

# postal code -> record; a dict keeps insertion order, so the final frame
# lists postal codes in the order they first appear in the table.
records = {}

# first row contains the table header information
for row in postal_table.find_all('tr')[1:]:
    # Strip every cell so stray whitespace/newlines cannot defeat the
    # "Not assigned" comparisons or the duplicate postal code lookup.
    cells = [cell.text.strip() for cell in row.find_all('td')]

    # A row without all three cells is not a data row; skip it rather than
    # inserting a blank record.
    if len(cells) < len(cols):
        continue

    # Any cell beyond the third is unexpected; report it once per extra cell.
    for _ in cells[len(cols):]:
        print ('Should never get here')

    postalcode, borough, neighborhood = cells[:len(cols)]

    # if the borough is not assigned, then skip this row
    if borough == "Not assigned":
        continue

    # Use the borough name if the neighborhood has not been assigned
    if neighborhood == "Not assigned":
        neighborhood = borough

    if postalcode in records:
        # Same postal code seen before: merge the neighbourhoods into one
        # comma-separated entry and keep the first borough.
        records[postalcode]['Neighborhood'] += ', ' + neighborhood
    else:
        records[postalcode] = {
            'PostalCode': postalcode,
            'Borough': borough,
            'Neighborhood': neighborhood,
        }

# Passing `columns` keeps the expected schema even when no rows were found.
postal_df = pd.DataFrame(list(records.values()), columns=cols)

with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 100):
    print(postal_df)
print (postal_df.shape)


    PostalCode           Borough                                       Neighborhood
0          M3A        North York                                          Parkwoods
1          M4A        North York                                   Victoria Village
2          M5A  Downtown Toronto                          Harbourfront, Regent Park
3          M6A        North York                   Lawrence Heights, Lawrence Manor
4          M7A      Queen's Park                                       Queen's Park
5          M9A         Etobicoke                                   Islington Avenue
6          M1B       Scarborough                                     Rouge, Malvern
7          M3B        North York                                    Don Mills North
8          M4B         East York                    Woodbine Gardens, Parkview Hill
9          M5B  Downtown Toronto                           Ryerson, Garden District
10         M6B        North York                                          Gl