# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

### Obtaining the postal codes table

First, we read every table found at the URL into a list of dataframes and preview each one

In [1]:
import pandas as pd
import numpy as np

# Wikipedia page holding the postal code table (the "M" list of Canadian codes).
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# pd.read_html gives back one DataFrame per HTML table found on the page.
html_tables = pd.read_html(WIKI_URL)

# Preview every table (first rows, then its shape) to spot the one we need.
for df in html_tables:
    print(df.head(), df.shape, sep='\n')

  Postcode           Borough     Neighbourhood
0      M1A      Not assigned      Not assigned
1      M2A      Not assigned      Not assigned
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront
(289, 3)
                                                  0   \
0                                                NaN   
1  NL NS PE NB QC ON MB SK AB BC NU/NT YT A B C E...   
2                                                 NL   
3                                                  A   

                                                  1   \
0                              Canadian postal codes   
1  NL NS PE NB QC ON MB SK AB BC NU/NT YT A B C E...   
2                                                 NS   
3                                                  B   

                                                  2    3    4    5    6    7   \
0                                                NaN  NaN  NaN  Na

We can see that the postal code table is the first dataframe in the list, so we assign it to a variable

In [2]:
# The postal code table is the first one on the page. Work on a copy so that
# later cleaning never modifies the raw scrape kept in html_tables, and so that
# re-running this cell always restores the untouched table.
postal_df = html_tables[0].copy()
print(postal_df.shape)
postal_df.head()

(289, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Now we drop all the rows whose Borough is 'Not assigned'

In [3]:
# Keep only the rows that have a real borough (neither missing nor the
# 'Not assigned' placeholder). A boolean mask is used rather than calling
# replace(..., inplace=True) on postal_df['Borough']: that form operates on a
# temporary Series and, under pandas Copy-on-Write, leaves postal_df untouched.
has_borough = postal_df['Borough'].notna() & (postal_df['Borough'] != 'Not assigned')
# .copy() yields an independent frame so later .loc assignments are safe.
postal_df = postal_df.loc[has_borough].copy()
print(postal_df.shape)
postal_df.head()

(212, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


Now we combine all the Neighbourhoods that share a Postcode into a single comma-separated value, keep one row per Postcode, and reset the index

In [4]:
# Collapse the table to one row per Postcode in a single grouped pass:
#   - sort=False keeps the postcodes in order of first appearance,
#   - Borough takes the value from the first row of each group,
#   - Neighbourhood becomes the ', '-joined values of the group, in row order.
# as_index=False keeps Postcode as a regular column and gives a fresh
# 0..n-1 index, so no separate de-duplication or index reset is needed.
postal_df = (
    postal_df
    .groupby('Postcode', sort=False, as_index=False)
    .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
)
print(postal_df.shape)
postal_df.head()

(103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned


Lastly, we replace every 'Not assigned' Neighbourhood with the name of its Borough and show the shape of the final dataframe

In [5]:
# Rows whose neighbourhood is still the 'Not assigned' placeholder take their
# borough's name instead. One masked assignment replaces the per-row loop;
# both sides use the same mask, so values line up by index label.
unnamed = postal_df['Neighbourhood'] == 'Not assigned'
postal_df.loc[unnamed, 'Neighbourhood'] = postal_df.loc[unnamed, 'Borough']
postal_df.shape

(103, 3)