In [50]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [51]:
# Wikipedia page listing the Canadian postal codes that start with "M";
# this is the page the notebook downloads and parses below.
url = (
    "https://en.wikipedia.org/wiki/"
    "List_of_postal_codes_of_Canada:_M"
)

## Loading Table to Pandas DF

Using `requests.get`, we download the contents of the webpage as text and store it in the variable `data`.

In [52]:
# Download the page, failing fast instead of hanging or parsing an error page:
# - timeout bounds how long we wait for the server (requests has no default timeout)
# - raise_for_status() turns a 4xx/5xx response into an exception
response = requests.get(url, timeout=30)
response.raise_for_status()

# Raw HTML of the page as a string; parsed into dataframes in the next step.
data = response.text

To create a list of dataframes from the text in the variable `data`, we use the pandas `read_html` function. As our table is at index `0` of that list, we save the dataframe at index `0` to the variable `df`.

In [53]:
# read_html returns one dataframe per <table> found in the HTML.
# The text is wrapped in StringIO because passing a literal HTML string
# directly is deprecated in recent pandas versions; a file-like object
# is accepted by old and new versions alike.
tables = pd.read_html(StringIO(data), flavor='bs4')

# The postal-code table is the first table on the page.
df = tables[0]

We can see that the above dataframe has some rows of missing data, with the value `'Not assigned'` in both its `Borough` and `Neighbourhood` columns.

In [54]:
# Preview the first five rows of the freshly parsed table.
df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


1. Ignoring rows whose `Borough` is `'Not assigned'`.
2. Replacing `'Not assigned'` values in the `Neighbourhood` column with the value from the `Borough` column of the same row.
3. Resetting the index of the dataframe.

In [55]:
# step 1: keep only rows that have a borough assigned.
# .copy() makes the filtered result an independent dataframe, so the
# assignment in step 2 writes to it directly and does not raise
# SettingWithCopyWarning.
df = df[df['Borough'] != "Not assigned"].copy()

# step 2: where the neighbourhood is missing, fall back to the borough name.
# The right-hand side is aligned on the index, so each row receives the
# borough value from that same row.
neighbourhood_missing = df['Neighbourhood'] == 'Not assigned'
df.loc[neighbourhood_missing, 'Neighbourhood'] = df.loc[neighbourhood_missing, 'Borough']

# step 3: renumber rows 0..n-1 now that some rows were dropped
# (drop=True discards the old index instead of keeping it as a column).
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Number of rows in the above dataframe

In [60]:
# Report how many rows remain after cleaning.
row_count = len(df)
print(f"Number of rows in processed datframe = {row_count}")

Number of rows in processed datframe = 103
