### Libraries

In [3]:
### Conda Installs
#!conda install -c conda-forge geopy --yes 
#!conda install -c conda-forge folium=0.5.0 --yes
#!conda install -c anaconda pandas --yes 
#!conda install -c anaconda wget --yes 
#!conda install -c conda-forge matplotlib
#!conda install -c anaconda beautifulsoup4

In [7]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import lxml 

### Import website data. Find table.

In [32]:
weblink = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(weblink.content,'lxml')
table = soup.find_all('tbody')[0]
#print(table) 


#### Select table with data needed.

In [33]:
table = soup.find('table', attrs={'class':'wikitable sortable'})
table_rows = table.find_all('tr')

In [46]:
### Data for table
data = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    data.append(row)
print(data)
#print(df[0].to_json(orient='records'))

[[], ['M1A', 'Not assigned', 'Not assigned\n'], ['M2A', 'Not assigned', 'Not assigned\n'], ['M3A', 'North York', 'Parkwoods\n'], ['M4A', 'North York', 'Victoria Village\n'], ['M5A', 'Downtown Toronto', 'Harbourfront\n'], ['M6A', 'North York', 'Lawrence Heights\n'], ['M6A', 'North York', 'Lawrence Manor\n'], ['M7A', 'Downtown Toronto', "Queen's Park\n"], ['M8A', 'Not assigned', 'Not assigned\n'], ['M9A', "Queen's Park", 'Not assigned\n'], ['M1B', 'Scarborough', 'Rouge\n'], ['M1B', 'Scarborough', 'Malvern\n'], ['M2B', 'Not assigned', 'Not assigned\n'], ['M3B', 'North York', 'Don Mills North\n'], ['M4B', 'East York', 'Woodbine Gardens\n'], ['M4B', 'East York', 'Parkview Hill\n'], ['M5B', 'Downtown Toronto', 'Ryerson\n'], ['M5B', 'Downtown Toronto', 'Garden District\n'], ['M6B', 'North York', 'Glencairn\n'], ['M7B', 'Not assigned', 'Not assigned\n'], ['M8B', 'Not assigned', 'Not assigned\n'], ['M9B', 'Etobicoke', 'Cloverdale\n'], ['M9B', 'Etobicoke', 'Islington\n'], ['M9B', 'Etobicoke', 'M

In [22]:
### Convert imported data to pandas DataFrame
df = pd.DataFrame(data, columns=["Postcode", "Borough", "Neighbourhood"])
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n
...,...,...,...
283,M8Z,Etobicoke,Mimico NW\n
284,M8Z,Etobicoke,The Queensway West\n
285,M8Z,Etobicoke,Royal York South West\n
286,M8Z,Etobicoke,South of Bloor\n


### Clean Data

In [75]:
# Remove \n
df = df.replace(r'\n','', regex=True)
df.columns = df.columns.str.strip()
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge , Malvern"
1,M1C,Scarborough,"Highland Creek , Rouge Hill , Port Union"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village , Martin Grove Gardens , Ric..."
101,M9V,Etobicoke,"Albion Gardens , Beaumond Heights , Humbergate..."


In [76]:
# verify column headers
df.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [77]:
# Drop row with value 'Not assigned'
df = df[df.Borough != 'Not assigned']
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge , Malvern"
1,M1C,Scarborough,"Highland Creek , Rouge Hill , Port Union"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village , Martin Grove Gardens , Ric..."
101,M9V,Etobicoke,"Albion Gardens , Beaumond Heights , Humbergate..."


In [78]:
# Group columns by postal code and borough, removing duplicate postcode value
# and moving the duplicate data in borough to Neighbourhood.
df = df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge , Malvern"
1,M1C,Scarborough,"Highland Creek , Rouge Hill , Port Union"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [95]:
# Verify neighbourhood has values in all rows.
df.isin(['Not Available']).any().any()

False

In [97]:
# Shape of DataFrame
df.shape

(103, 3)