# 1 Parsing Data from Wikipedia (List of postal codes in Canada)

In [18]:
import requests # for accessing Wikipedia Page
from bs4 import BeautifulSoup # Beautifull soap for reading the table data

In [19]:
# First loading the web - page from Wikipedia
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
# Then parsing it 
soup = BeautifulSoup(website_url,'lxml')
# Checking correctness of parsing by scrolling
# print(soup.prettify())

In [20]:
#
# Firstly scanning table in HTML - Page and writing it to a List (append method)
# secondly finding table, then table-body with the data and then rows and there content per iteration
#
data =[]
table = soup.find('table',{'class':'wikitable sortable'})
table_body = table.find('tbody')
rows = table_body.find_all('tr')
for row in rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append(cols)
# then writing Data to a pandas dataframe and 
import pandas as pd
data[0]=['PostalCode','Borough','Neighborhood']
headers = data.pop(0)
df = pd.DataFrame(data,columns = headers)
# Showing first 5 elements of the DataFrame to be shure to find the right data
df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# 2 Process assigned values for column Borough only

In [21]:
# What Values are in column Borough?
df['Borough'].unique()

array(['Not assigned', 'North York', 'Downtown Toronto', "Queen's Park",
       'Etobicoke', 'Scarborough', 'East York', 'York', 'East Toronto',
       'West Toronto', 'Central Toronto', 'Mississauga'], dtype=object)

In [22]:
# just ignore 'not assigned' and drop None values
df.dropna(subset=['Borough'], inplace=True)
df=df[~df['Borough'].str.contains('Not assigned')]
# Checking if all None and 'not assigned' values are filtered in DataFrame 
df['Borough'].unique()


array(['North York', 'Downtown Toronto', "Queen's Park", 'Etobicoke',
       'Scarborough', 'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

# 3 Consolidate Neighborhoods

In [23]:
# sorting dataframe from chapter 2 for sequential reading
df_sorted = df.sort_values(['PostalCode']).reset_index(drop=True)
columns   = ['PostalCode','Borough','Neighborhood']
# creating an empty result - Dataframe and initialising Variables
result = pd.DataFrame(columns=columns)
pc_before  = df_sorted.loc[0]['PostalCode']
bor_before = df_sorted.loc[0]['Borough']
neigbor = ''
for i in range(0,len(df_sorted)):
    if pc_before == df_sorted.loc[i]['PostalCode'] and bor_before == df_sorted.loc[i]['Borough']:
        if i>0:
            neigbor = neigbor + ',' + df_sorted.loc[i]['Neighborhood']
        else:
            neigbor = df_sorted.loc[i]['Neighborhood']
        df2 = pd.DataFrame([[df_sorted.loc[i]['PostalCode'], df_sorted.loc[i]['Borough'], neigbor]], columns=columns)
       
    else:
        result=result.append(df2)
        neigbor = df_sorted.loc[i]['Neighborhood']
        df2 = pd.DataFrame([[df_sorted.loc[i]['PostalCode'], df_sorted.loc[i]['Borough'], neigbor]], columns=columns)
    pc_before = df_sorted.loc[i]['PostalCode']
    bor_before = df_sorted.loc[i]['Borough']
# Reindexing and Testprint of the result
result = result.reset_index(drop=True)
result.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# 4 Using the .shape method to print the number of rows in result - dataframe


In [24]:
print('Number of rows in dataframe =',result.shape[0])

Number of rows in dataframe = 102
