# Scrape the data from the web

In this assignment, we scrape https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M and make a dataframe out of it.

In [175]:
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize

## Pull HTML and start parsing it

In this step we download the page, feed its HTML to a BeautifulSoup object, and look up the first element with the `wikitable` class, which is the postal-code table on the page (its full class attribute is `wikitable sortable`).

In [176]:
# Download the page. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() surfaces HTTP errors here instead of
# letting an error page be parsed below.
page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
page.raise_for_status()

soup = BeautifulSoup(page.text, 'html.parser')

# find() returns the first element carrying the 'wikitable' class, or None
# when the page has no such element; fail with a clear message in that case.
neighs_list = soup.find(class_='wikitable')
if neighs_list is None:
    raise ValueError("no element with class 'wikitable' found on the page")
neighs_list

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

## Parse the HTML and make a dataframe

In [177]:
import pandas as pd

In [178]:
col_names = ['PostalCode', 'Borough', 'Neighborhood']

# Lower-cased Borough values that mark rows to drop: postal codes without an
# assigned borough, and the table's header row.
skipped_boroughs = {'not assigned', 'borough'}

# Collect the rows as plain dicts and build the frame once at the end.
# Growing a DataFrame with DataFrame.append copies the whole frame on every
# call, and the method was removed in pandas 2.0.
records = []
for tr in neighs_list.find_all('tr'):
    # Read the row's direct <th>/<td> children instead of fixed positions in
    # .contents, which depend on the whitespace text nodes between the cells.
    cells = [cell.text.strip() for cell in tr.find_all(['th', 'td'], recursive=False)]
    if len(cells) < len(col_names):
        continue  # not a complete PostalCode / Borough / Neighborhood row
    row = dict(zip(col_names, cells))  # one table row as {column name: cell text}
    if row['Borough'].lower() not in skipped_boroughs:
        records.append(row)

# columns=col_names fixes the column order and keeps the columns present even
# when no row survived the filter.
df = pd.DataFrame(records, columns=col_names)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


## Aggregate neighborhoods in the same postal code

In [179]:
# custom aggregator: collapse a group of neighborhood names into one string
def comma_sep(series):
    """Join the values of ``series`` into a single comma-separated string.

    The values are joined in their existing order with ',' and no
    surrounding spaces; an empty series yields an empty string.
    """
    separator = ','
    names = series.tolist()
    return separator.join(names)

# Collapse the rows that share a postal code into a single row: keep the
# first borough seen for the code and join its neighborhoods with commas.
grouped = df.groupby(by='PostalCode')
aggregated = grouped.agg({'Borough': 'first', 'Neighborhood': comma_sep})

# reset_index() turns PostalCode back into a regular column; the final
# selection pins the column order.
df = aggregated.reset_index()[['PostalCode', 'Borough', 'Neighborhood']]

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Copy neighborhood from borough, where the neighborhood is 'not assigned'

This step assumes that the dataframe contains no rows whose borough is '_not assigned_'. That is already guaranteed, because such rows were dropped while parsing the table in the web-scraping code above.

In [180]:
# Boolean mask of the rows whose neighborhood is the placeholder text
# (compared case-insensitively).
missing_neigh_cond = df['Neighborhood'].str.lower().eq('not assigned')

# For those rows, use the borough name as the neighborhood.
df.loc[missing_neigh_cond, 'Neighborhood'] = df.loc[missing_neigh_cond, 'Borough']

## The Final Dataframe

In [181]:
# Display the finished dataframe (PostalCode, Borough, Neighborhood).
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [182]:
# (number of rows, number of columns) of the final dataframe
df.shape

(103, 3)