# Import required libraries & parse wiki page into BeautifulSoup object

In [138]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
print("libraries imported")

libraries imported


In [139]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [140]:
import requests
raw_wiki_page = requests.get(url)
raw_wiki_page

page = raw_wiki_page.text
#page

In [141]:
#parse url into BeautifulSoup object
soup = BeautifulSoup(page)
print("wiki url parsed into bs object")

wiki url parsed into bs object


In [142]:
#print(soup.prettify())

# Parse wiki table into pandas DataFrame

In [143]:
table = soup.find_all('table')[0]
wiki_df = pd.DataFrame(columns=['PostalCode', 'Borough', 'Neighborhood'])
wiki_df

Unnamed: 0,PostalCode,Borough,Neighborhood


In [144]:
#for loop in table
l = []
for tr in table.find_all('tr'):
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)
wiki_df = pd.DataFrame(l, columns=['PostalCode', 'Borough', 'Neighborhood'])
wiki_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n


# Start cleaning table

### Drop first tow

In [145]:
wiki_df.drop(0 , inplace=True)
wiki_df

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n
5,M5A,Downtown Toronto,Harbourfront\n
6,M5A,Downtown Toronto,Regent Park\n
7,M6A,North York,Lawrence Heights\n
8,M6A,North York,Lawrence Manor\n
9,M7A,Queen's Park,Not assigned\n
10,M8A,Not assigned,Not assigned\n


### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. 

In [146]:
wiki_df.loc[wiki_df['Neighborhood'].str.contains("Not assigned"), 'Neighborhood'] = wiki_df['Borough']
#wiki_df['Neighborhood'].str.contains("Not assigned")
wiki_df

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n
5,M5A,Downtown Toronto,Harbourfront\n
6,M5A,Downtown Toronto,Regent Park\n
7,M6A,North York,Lawrence Heights\n
8,M6A,North York,Lawrence Manor\n
9,M7A,Queen's Park,Queen's Park
10,M8A,Not assigned,Not assigned


### Remove "Not assigned" rows

In [147]:
wiki_df = wiki_df[~wiki_df['PostalCode'].str.contains("Not assigned")]
wiki_df = wiki_df[~wiki_df['Borough'].str.contains("Not assigned")]
wiki_df = wiki_df[~wiki_df['Neighborhood'].str.contains("Not assigned")]
wiki_df = wiki_df.reset_index()
new_wiki_df = wiki_df[['PostalCode','Borough','Neighborhood']]
new_wiki_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods\n
1,M4A,North York,Victoria Village\n
2,M5A,Downtown Toronto,Harbourfront\n
3,M5A,Downtown Toronto,Regent Park\n
4,M6A,North York,Lawrence Heights\n
5,M6A,North York,Lawrence Manor\n
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue\n
8,M1B,Scarborough,Rouge\n
9,M1B,Scarborough,Malvern\n


### Remove string "\n"

In [148]:
#clean new_wiki_df
new_wiki_df['Neighborhood'] = new_wiki_df['Neighborhood'].str.replace('\n','')
new_wiki_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


### combine rows that 'one neighborhood exist in one postal code area' into one row with the neighborhoods separated with a comma

In [149]:
new_wiki_df_2 = new_wiki_df.groupby(['PostalCode','Borough'])['Neighborhood'].apply(lambda x: "%s" % ', '.join(x))
new_wiki_df_2 = pd.DataFrame(new_wiki_df_2).reset_index()

new_wiki_df_2

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### Print the shape of cleaned datset

In [150]:
print("Shape of cleaned dataset = ",new_wiki_df_2.shape)

Shape of cleaned dataset =  (103, 3)
