# This is the Coursera Capstone 

## Import the usual crew

In [9]:

import csv

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Connect to the web address

In [10]:
# Download the Wikipedia page that lists Toronto ("M") postal codes.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of letting an
# error page be parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text

# CSV file that the scraped table is written to. The handle stays open on
# purpose: later cells write to `f` and close it.
filename = "toronto-FSA.csv"
# Explicit UTF-8 so the file matches the default encoding pd.read_csv uses,
# whatever the platform's locale encoding is.
f = open(filename, 'w', encoding='utf-8')

## Parse the page with Beautiful Soup

In [11]:
# Parse the downloaded HTML and narrow it down to the article body.
soup = BeautifulSoup(source, 'lxml')
article = soup.find('div', attrs={'class': 'mw-parser-output'})

# The first table inside the article body is the one that gets scraped;
# collect every row of its <tbody>.
table = article.find('table')
rows = table.find('tbody').find_all('tr')

# The <th> cells of the first row hold the column names.
features = rows[0].find_all('th')

## Build the header row and write it to the CSV file

In [13]:
# Build the CSV header from the <th> cells. Each cell's text is stripped so
# that stray whitespace/newlines in the markup cannot end up inside a column
# name, and the line terminator is written explicitly instead of relying on
# the last cell's text happening to end with a newline.
headers = ",".join(feature.text.strip() for feature in features)

f.write(headers + "\n")

31

## Get data entry points for extraction and close the CSV file

In [14]:
# Write one CSV record per table row.
# csv.writer quotes any cell that contains a comma or a quote character, so
# such a value stays in a single column when the file is read back.
# lineterminator='\n' is used because `f` was opened in text mode, which
# already translates '\n' to the platform's line ending.
writer = csv.writer(f, lineterminator='\n')

for row in rows:
    # Strip each cell so markup whitespace/newlines do not leak into the data.
    cells = [cell.text.strip() for cell in row.find_all('td')]
    # Rows without <td> cells (e.g. the header row, which only has <th>)
    # carry no data and are skipped.
    if cells:
        writer.writerow(cells)

f.close()

## Read the CSV file and check the dataframe

In [15]:
# Load the CSV produced by the scraping cells and preview its first five rows.
df_toronto = pd.read_csv("toronto-FSA.csv")
df_toronto.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [16]:
# Shape of the freshly loaded frame as (rows, columns).
df_toronto.shape

(288, 3)

## Start cleaning: build masks for the rows marked "Not assigned"

In [17]:
# Index labels of the rows whose Borough / Neighbourhood holds the
# 'Not assigned' placeholder.
borough_mask = df_toronto.index[df_toronto['Borough'] == 'Not assigned']
neighborhood_mask = df_toronto.index[df_toronto['Neighbourhood'] == 'Not assigned']

# Labels present in both masks. Index.intersection is the explicit set
# operation; in recent pandas versions `&` between two Index objects is an
# element-wise logical operation rather than a set intersection.
neighborhood_and_borough_mask = borough_mask.intersection(neighborhood_mask)

## Now we need to drop specific rows with "not assigned"

In [18]:
# Keep only the rows that have a real borough and renumber them from 0.
# Filtering with a boolean condition (instead of dropping previously computed
# labels in place) keeps this cell idempotent: after the index has been reset,
# re-running a label-based drop would remove unrelated rows.
df_toronto = (
    df_toronto[df_toronto['Borough'] != 'Not assigned']
    .reset_index(drop=True)
)
print(df_toronto.shape)
df_toronto.head(10)

(211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


## Replace "Not assigned" neighbourhoods with the name of their borough

In [19]:
# Index labels of the rows whose Neighbourhood is still the 'Not assigned'
# placeholder, computed against the current state of df_toronto.
neighborhood_mask = df_toronto.index[df_toronto['Neighbourhood'].eq('Not assigned')]

In [20]:
# For every row selected by neighborhood_mask, copy the borough name into the
# Neighbourhood column. A single .loc assignment replaces the per-row chained
# assignment (df[col][idx] = ...), which raises SettingWithCopyWarning and is
# not guaranteed to write into the original frame.
df_toronto.loc[neighborhood_mask, 'Neighbourhood'] = df_toronto.loc[neighborhood_mask, 'Borough']
print(df_toronto.shape)
df_toronto.head(10)

(211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


## So we can now check some basic stats after cleaning

In [24]:
# Recompute the 'Not assigned' masks on the cleaned frame; all three are
# expected to be empty at this point.
borough_mask = df_toronto.index[df_toronto['Borough'] == 'Not assigned']
neighborhood_mask = df_toronto.index[df_toronto['Neighbourhood'] == 'Not assigned']
# Explicit set intersection of the two label sets; `&` between Index objects
# is an element-wise logical operation in recent pandas versions.
neighborhood_and_borough_mask = borough_mask.intersection(neighborhood_mask)

print('Statistics after cleaning:\n')
print('  {} rows with Not assigned Borough'.format(len(borough_mask)))
print('  {} rows with Not assigned Neighborhood'.format(len(neighborhood_mask)))
print('  {} rows with Not assigned Neighborhood and Borough'.format(len(neighborhood_and_borough_mask)),'\n')

print('The DataFrame shape is {}'.format(df_toronto.shape),'\n')

Statistics after cleaning:

  0 rows with Not assigned Borough
  0 rows with Not assigned Neighborhood
  0 rows with Not assigned Neighborhood and Borough 

The DataFrame shape is (211, 3) 



## So now we group the data by Postcode and consolidate the content in the Neighborhood cells

In [22]:
# Collapse the frame to one row per Postcode:
#   Borough       -> the first value in the group. This is deterministic,
#                    unlike popping an arbitrary element from a set, and gives
#                    the same result whenever a postcode has a single borough.
#   Neighbourhood -> every name in the group joined with ', ', in row order.
temp = df_toronto.groupby('Postcode')
df_toronto_grouped = temp.agg({'Borough': 'first', 'Neighbourhood': ', '.join}).reset_index()

# Pin the column order explicitly rather than relying on the order in which
# the aggregation happens to return the columns.
df_toronto_grouped = df_toronto_grouped[['Postcode', 'Borough', 'Neighbourhood']]

df_toronto_grouped.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## We check the final shape

In [23]:
# Report the (rows, columns) shape of the grouped frame; the extra '\n'
# argument leaves a blank line after the message.
print('The final DataFrame shape is {}'.format(df_toronto_grouped.shape),'\n')

The final DataFrame shape is (103, 3) 

