# Import libraries

In [227]:
import csv

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Website Scraping

## Set up URL and download the web page

In [228]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook on an HTTP error instead of letting
# an error page be parsed further down as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text

## Get a file handle for the output .csv file

In [229]:
filename = "toronto-FSA.csv"
# Write UTF-8 explicitly: without an encoding argument open() uses the
# platform default, which may be unable to represent characters in the scraped
# text and may differ from the UTF-8 that pd.read_csv decodes by default.
f = open(filename, 'w', encoding='utf-8')

## Initialize a BeautifulSoup object

In [230]:
# Parse the downloaded HTML text using the lxml parser.
soup = BeautifulSoup(markup=source, features='lxml')


## Get the major article (Toronto FSA table)

In [231]:
# Locate the first <div class="mw-parser-output"> element of the page.
article = soup.find('div', attrs={'class': 'mw-parser-output'})

# The first <table> inside that element is used as the FSA table.
table = article.find('table')

## Get the entry points for extracting rows and features/columns

In [232]:
# Every <tr> of the table body, including the first (header) row.
rows = table.find('tbody').find_all('tr')

# The <th> cells of the first row carry the column names.
features = rows[0].find_all('th')

## .csv file header (first line of file which contains features' name/ID)

In [233]:
# Accumulator for the comma-separated header line of the .csv file; starts empty.
headers = ""

## Extracting features

In [234]:
# Build the header line from scratch with join(), so re-running this cell
# cannot append a second copy of the column names to `headers`.
# Each name is stripped and the line terminator is added explicitly instead of
# depending on the last <th> text ending with a newline of its own.
headers = ",".join(feature.text.strip() for feature in features) + "\n"

## Writing features to .csv file

In [235]:
# Write the header line to the .csv file; the number echoed below the cell is
# the character count returned by write().
f.write(headers)

31

## Extracting rows/cells and writing to .csv file

In [236]:
# csv.writer quotes any cell whose text contains a comma, so such a cell can
# no longer be split into extra columns when the file is read back.
# lineterminator='\n' keeps the line ending consistent with the header line.
writer = csv.writer(f, lineterminator='\n')
for row in rows:
    # Strip surrounding whitespace/newlines that the HTML leaves in the cell text.
    cells = [cell.text.strip() for cell in row.find_all('td')]
    # The header row contains only <th> cells and therefore has no <td>; skip it.
    if cells:
        writer.writerow(cells)

## Close the .csv file

In [237]:
# Close the handle so all buffered rows are on disk before the file is read back.
f.close()

# Read in the .csv file and store it in a DataFrame

In [238]:
import numpy as np

In [239]:
# Load the scraped table back from disk and preview its first five rows.
df_toronto = pd.read_csv(filepath_or_buffer="toronto-FSA.csv")
df_toronto.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [240]:
# (rows, columns) of the table as loaded, before any cleaning.
df_toronto.shape

(289, 3)

# Data Cleaning

## Define dropping masks

In [241]:
# Despite the "_mask" names, these are Index objects holding the row labels
# whose Borough / Neighbourhood value is the 'Not assigned' placeholder.
borough_mask = df_toronto.index[df_toronto['Borough'] == 'Not assigned']
neighborhood_mask = df_toronto.index[df_toronto['Neighbourhood'] == 'Not assigned']
# Labels present in both. Index.intersection() is the explicit set operation;
# the `&` operator between two Index objects is not a set intersection in
# current pandas versions.
neighborhood_and_borough_mask = borough_mask.intersection(neighborhood_mask)

## Print out the statistics before cleaning

In [242]:
# Distinct-value counts taken on the frame as loaded, before any row is dropped.
postcode_count = len(df_toronto['Postcode'].unique())
# One distinct Borough value is subtracted -- presumably the 'Not assigned'
# placeholder; NOTE(review): the Neighbourhood count is not adjusted the same way.
borough_count = len(df_toronto['Borough'].unique()) - 1
neighbourhood_count = len(df_toronto['Neighbourhood'].unique())

print('Statistics before cleaning:\n')
print('  {} Postal codes'.format(postcode_count))
print('  {} Boroughs'.format(borough_count))
print('  {} Neighborhoods'.format(neighbourhood_count))
print('  {} rows with Not assigned Borough'.format(len(borough_mask)))
print('  {} rows with Not assigned Neighborhood'.format(len(neighborhood_mask)))
print('  {} rows with Not assigned Neighborhood and Borough'.format(len(neighborhood_and_borough_mask)), '\n')

print('The DataFrame shape is {}'.format(df_toronto.shape), '\n')

Statistics before cleaning:

  180 Postal codes
  11 Boroughs
  210 Neighborhoods
  77 rows with Not assigned Borough
  78 rows with Not assigned Neighborhood
  77 rows with Not assigned Neighborhood and Borough 

The DataFrame shape is (289, 3) 



## Dropping rows with "Not assigned" in the "Borough" column

In [243]:
# Keep only the rows whose Borough is assigned, then renumber the index.
# A boolean filter evaluated on the current frame makes this cell safe to
# re-run: it does not depend on row labels computed in an earlier cell, which
# go stale once the index has been reset, and it does not use labels as
# positions the way `df_toronto.index[borough_mask]` did.
df_toronto = df_toronto[df_toronto['Borough'] != 'Not assigned'].reset_index(drop=True)
print(df_toronto.shape)
df_toronto.head(10)

(212, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


## Replace "Not assigned" values in the "Neighbourhood" column with the "Borough" name of the same row

In [244]:
# Recompute the labels of rows with an unassigned Neighbourhood: the index of
# the DataFrame was reset, so the labels collected earlier no longer match.
neighborhood_mask = df_toronto.index[df_toronto['Neighbourhood'] == 'Not assigned']

In [245]:
# Copy the Borough name into Neighbourhood for every row listed in
# neighborhood_mask. A single .loc assignment replaces the per-row chained
# indexing (df[col][idx] = ...), which triggers SettingWithCopyWarning and is
# not guaranteed to write into the original frame.
df_toronto.loc[neighborhood_mask, 'Neighbourhood'] = df_toronto.loc[neighborhood_mask, 'Borough']
print(df_toronto.shape)
df_toronto.head(10)

(212, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


## Print out the statistics after cleaning

In [246]:
# Recompute the row labels that still hold the 'Not assigned' placeholder so
# the statistics below reflect the cleaned frame.
borough_mask = df_toronto.index[df_toronto['Borough'] == 'Not assigned']
neighborhood_mask = df_toronto.index[df_toronto['Neighbourhood'] == 'Not assigned']
# Index.intersection() is the explicit set operation; the `&` operator between
# two Index objects is not a set intersection in current pandas versions.
neighborhood_and_borough_mask = borough_mask.intersection(neighborhood_mask)

print('Statistics after cleaning:\n')
print('  {} Postal codes'.format(df_toronto['Postcode'].unique().shape[0]))
print('  {} Boroughs'.format(df_toronto['Borough'].unique().shape[0]))
print('  {} Neighborhoods'.format(df_toronto['Neighbourhood'].unique().shape[0]))
print('  {} rows with Not assigned Borough'.format(borough_mask.shape[0]))
print('  {} rows with Not assigned Neighborhood'.format(neighborhood_mask.shape[0]))
print('  {} rows with Not assigned Neighborhood and Borough'.format(neighborhood_and_borough_mask.shape[0]),'\n')

print('The DataFrame shape is {}'.format(df_toronto.shape),'\n')

Statistics after cleaning:

  103 Postal codes
  11 Boroughs
  210 Neighborhoods
  0 rows with Not assigned Borough
  0 rows with Not assigned Neighborhood
  0 rows with Not assigned Neighborhood and Borough 

The DataFrame shape is (212, 3) 



##  Group by the 'Postcode' column and consolidate the content in 'Neighbourhood' cells

In [247]:
# Collapse the table to one row per postal code:
#   * Borough       -> the first value within the group ('first' is
#                      deterministic, unlike popping an arbitrary element
#                      from a set)
#   * Neighbourhood -> all names of the group joined with ', '
# as_index=False keeps 'Postcode' as a regular column, so the result already
# has the columns Postcode, Borough, Neighbourhood in that order.
df_toronto_grouped = (
    df_toronto
    .groupby('Postcode', as_index=False)
    .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
)

df_toronto_grouped.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Print out the final DataFrame shape

In [248]:
# Report (rows, columns) of the grouped frame.
print('The final DataFrame shape is {}'.format(df_toronto_grouped.shape),'\n')

The final DataFrame shape is (103, 3) 

