In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# conda install -c conda-forge geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# conda install -c conda-forge folium
import folium # map rendering library

# conda install -c anaconda beautifulsoup4 
from bs4 import BeautifulSoup

## Creating DataFrame for Analysis

#### Scrape website and create initial DataFrame
Website to scrape:
https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content,'lxml')
# Take the first <table> element found on the page
table = soup.find_all('table')[0]
# pd.read_html returns a list of DataFrames, one per table found in the given markup.
# NOTE(review): recent pandas releases warn when literal HTML is passed as a plain string;
# wrapping it in io.StringIO silences that if the environment is upgraded.
df = pd.read_html(str(table))
# Save initial DataFrame to df_toronto; display first 15 rows
df_toronto = df[0]
df_toronto.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


#### Process DataFrame

In [3]:
# Keep only the rows that have a real Borough (anything other than 'Not assigned')
df_toronto = df_toronto.loc[df_toronto['Borough'].ne('Not assigned')]
# Sanity check: selecting the 'Not assigned' boroughs should now return an empty frame
df_toronto.loc[df_toronto['Borough'].eq('Not assigned')]

Unnamed: 0,Postcode,Borough,Neighbourhood


In [5]:
# Collapse the rows of each (Postcode, Borough) pair into a single record whose
# "Neighbourhood" is the comma-delimited list of that pair's neighbourhood names
df_toronto = (
    df_toronto
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(','.join)
    .reset_index()
)
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [6]:
# If Neighbourhood = "Not assigned", assign the Borough value to Neighbourhood.
# The mask is computed once and applied with .loc so that only the "Neighbourhood"
# column of the matching rows is written (instead of rebuilding and re-assigning whole rows).
no_neighbourhood = df_toronto['Neighbourhood'] == 'Not assigned'
df_toronto.loc[no_neighbourhood, 'Neighbourhood'] = df_toronto.loc[no_neighbourhood, 'Borough']
# Show the rows that were just filled in: "Neighbourhood" should now equal "Borough".
# Selecting by the mask rather than a hard-coded index label keeps this check valid
# even if the scraped table (and therefore the row numbering) changes.
df_toronto.loc[no_neighbourhood]

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park


In [7]:
# Verify the fix: no row should still carry the 'Not assigned' placeholder as its Neighbourhood,
# so this selection is expected to come back empty
df_toronto.loc[df_toronto['Neighbourhood'].eq('Not assigned')]

Unnamed: 0,Postcode,Borough,Neighbourhood


In [8]:
# To match the DataFrame in the instruction, rename column "Postcode" to "PostalCode".
# rename() returns a new frame that is bound back to df_toronto; avoiding inplace=True
# keeps the step an explicit assignment rather than a hidden mutation of the frame.
df_toronto = df_toronto.rename(columns={'Postcode': 'PostalCode'})
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
# Finally, report the dimensions of the processed DataFrame; shape is a (rows, columns) tuple
print("There are {} rows, and {} columns in the DataFrame".format(*df_toronto.shape))

There are 103 rows, and 3 columns in the DataFrame
