***
# Applied Data Science Capstone Project
## Week 3 -  Segmenting and Clustering Neighborhoods in Toronto
## Notebook 1 - Build the code to scrape the Wikipedia page with Canada Postal Codes
***

### Import Packages

In [1]:
import pandas as pd
import numpy as np

### Read the Canadian postal codes from the Wikipedia page

In [11]:
# Load the postal-code table (codes starting with "M") from Wikipedia.
can_post_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html yields one DataFrame per HTML table found; the first one is used.
wiki_tables = pd.read_html(can_post_URL)

# Expose the 'Postcode' column under the name 'PostalCode'.
df_can = wiki_tables[0].rename(columns={'Postcode': 'PostalCode'})

print('Dimension of the Dataframe is', df_can.shape)
df_can.head()

Dimension of the Dataframe is (287, 3)


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Filter Data
- Ignore cells whose borough is "Not assigned".
- If a cell has a borough but its neighborhood is "Not assigned", use the borough name as the neighborhood.
- Combine the neighborhoods that share one postal code into a single row, separated by commas.

In [13]:
# Keep only the rows whose Borough is something other than "Not assigned",
# then renumber the index from 0 so it has no gaps left by the removed rows.
has_borough = df_can['Borough'].ne('Not assigned')
df_can_filter1 = df_can.loc[has_borough].reset_index(drop=True)

print('The Dataframe Dimension after dropping Boroughs with "Not Assigned" values is', df_can_filter1.shape)
df_can_filter1.head()

The Dataframe Dimension after dropping Boroughs with "Not Assigned" values is (210, 3)


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [15]:
# Wherever Neighbourhood is "Not assigned", substitute that row's Borough.
# assign() returns a new frame, so df_can_filter1 is left untouched.
unnamed = df_can_filter1['Neighbourhood'].eq('Not assigned')
filled_names = df_can_filter1['Neighbourhood'].mask(unnamed, df_can_filter1['Borough'])
df_can_filter2 = (
    df_can_filter1
    .assign(Neighbourhood=filled_names)
    .reset_index(drop=True)
)

print('The following Dataframe does not contain "Not assigned" values')
df_can_filter2.head()

The following Dataframe does not contain "Not assigned" values


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [8]:
# Collapse the table to a single row per PostalCode.
#
# Rows are first ordered by Neighbourhood, descending. groupby keeps the row
# order inside each group, so this also decides the order of the joined names.
df_can_filter2 = df_can_filter2.sort_values('Neighbourhood', ascending=False)

aggregations = {
    'Borough': 'first',          # borough from the first row of each group
    'Neighbourhood': ', '.join,  # names concatenated with ', ' between them
}
postal_groups = df_can_filter2.groupby('PostalCode')

# reset_index turns the PostalCode group key back into an ordinary column.
df_can_group = postal_groups.agg(aggregations).reset_index()
df_can_group.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"West Hill, Morningside, Guildwood"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Display the dimensions (shape) of the final dataframe

In [16]:
# Report the size of the grouped table as a (rows, columns) tuple.
n_rows, n_cols = df_can_group.shape
table_dim = (n_rows, n_cols)
print('The Final Dataframe Dimension is', table_dim)

The Final Dataframe Dimension is (103, 3)
