# Webscraping postal codes of Toronto Canada - 1

Part of IBM Capstone Data Science Course Part 1 - webscraping the postal codes of Toronto Canada

## Part 1 Webscraping the postal codes of Canada

## Prepare Dependencies

In [40]:
from bs4 import BeautifulSoup
import requests

# Getting the source from wikipedia

In [42]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [43]:
# put it into the soup

soup = BeautifulSoup(source, 'lxml')
#print(soup.prettify())

## Parsing out the information from the table

In [44]:
table = soup.find('table', class_ = 'wikitable sortable')
#print(table.prettify())

In [45]:
# All table body rows ***

for table_body in soup.find_all('td'):
    print(table_body)

<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td>
<td>M6A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
</td>
<td>M6A</td>
<td

In [46]:
# 4th step from labs/DP0701EN/Webscraping python BeautifulSoup parsing table.ipynb

data = [] #create empty list named data

table = soup.find('table', attrs={'class':'wikitable sortable'})
#print(table)

tbody = table.tbody
#print(tbody)

In [47]:
# 5th step from labs/DP0701EN/Webscraping python BeautifulSoup parsing table.ipynb

#table_body = table.find('tbody')

rows = tbody.find_all('tr')
for row in rows:
    #print(row)
    cols = row.find_all('td')
    #print(cols)
    cols = [ele.text.strip() for ele in cols]
    #print(cols)
    data.append([ele for ele in cols if ele]) # Get rid of empty values
    
#print(data) # print list data with lists of the rows scraped from the wikipedia page

## load data in Pandas dataframe

In [None]:
df = pd.DataFrame(data)
df.columns = ['Postcode','Borough','Neighbourhood']
#df

## Clean up the data in the pandas frame

In [49]:
#drop all entry's with no assigned Borough
df = df[df.Borough != 'Not assigned']
#df

In [50]:
#clean up first row
df = df[df.Postcode.notnull()]
#df

In [51]:
#reindex dataframe
df = df.reset_index(drop=True)
#df

In [52]:
#check if there are rows with value not assigned for the column Neighbourhood
df.loc[df['Neighbourhood'] == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood
6,M7A,Queen's Park,Not assigned


In [55]:
#Copy value of Borough to Neighbourhood when Neighbourhood is not assigned

df.Neighbourhood = df.Borough.where(df.Neighbourhood == 'Not assigned', df.Neighbourhood)
#df.loc[df['Neighbourhood'] == 'Not assigned']

In [56]:
#Check the copied value
df.loc[df['Neighbourhood'] == "Queen's Park"]

Unnamed: 0,Postcode,Borough,Neighbourhood
6,M7A,Queen's Park,Queen's Park


In [57]:
#Group the dataframe by Postcode and Borough, combine Neighbourhood seperated with a comma, reindex data frame

#https://stackoverflow.com/questions/36392735/how-to-combine-multiple-rows-into-a-single-row-with-pandas
df = df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()

# Result of webscraping the postal codes of Canada in the cleaned up pandas data frame

In [58]:
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Shape of the resulting data frame

In [59]:
df.shape

(103, 3)