## Let's pull some data from the Wikipedia page for Canadian postal codes

In [159]:
import pandas as pd

# pd.read_html parses every HTML table on the page and returns them as a list of DataFrames.
# NOTE(review): the name `list` shadows the Python builtin; it is kept only because
# a later cell reads `list[0]`.
list = pd.read_html(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
)

### Now let's convert the first table in the list into a pandas DataFrame

In [160]:
# Take the first table scraped from the page as the working frame and preview it.
df = pd.DataFrame(data=list[0])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### rename the columns

In [161]:
# Overwrite the scraped headers positionally:
# Postcode -> PostalCode, Borough unchanged, Neighbourhood -> Neighborhood.
df.columns = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### are there any postal codes that don't have a Borough assigned?

In [163]:
# Show the rows whose Borough is the placeholder value 'Not assigned'.
df.loc[df['Borough'] == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
9,M8A,Not assigned,Not assigned
13,M2B,Not assigned,Not assigned
20,M7B,Not assigned,Not assigned
21,M8B,Not assigned,Not assigned
30,M2C,Not assigned,Not assigned
36,M7C,Not assigned,Not assigned
37,M8C,Not assigned,Not assigned
45,M2E,Not assigned,Not assigned


### remove "Not assigned" Boroughs

In [166]:
# Keep only the rows that have a real Borough.
# Take an explicit copy: a later cell assigns into df_assigned, and writing into
# the result of a boolean filter is the pattern pandas' view-vs-copy guidance
# warns about. With .copy() the frame is unambiguously independent of df.
df_assigned = df[df.Borough != 'Not assigned'].copy()

# Sanity check: this should display an empty frame.
df_assigned[df_assigned.Borough == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood


### are there any "Not assigned" Neighborhoods?

In [167]:
# Show the remaining rows whose Neighborhood is still the placeholder 'Not assigned'.
df_assigned.loc[df_assigned['Neighborhood'] == "Not assigned"]

Unnamed: 0,PostalCode,Borough,Neighborhood
8,M7A,Queen's Park,Not assigned


### replace "Not assigned" neighborhood with the Borough name

In [171]:
# Fill every 'Not assigned' Neighborhood with that row's Borough name.
# This replaces a patch of one hard-coded row label (8) with a hard-coded value,
# so the cell keeps working if the source table gains, loses, or reorders rows.
# Series.where keeps values where the condition is True and takes the
# replacement (the Borough) elsewhere; assign returns a new frame, so re-running
# the cell is harmless.
df_assigned = df_assigned.assign(
    Neighborhood=df_assigned['Neighborhood'].where(
        df_assigned['Neighborhood'] != 'Not assigned',
        df_assigned['Borough'],
    )
)

In [172]:
# Confirm the neighborhood name change by displaying row label 8
# (postal code M7A, the row that had no neighborhood assigned).
df_assigned.loc[8, :]

PostalCode               M7A
Borough         Queen's Park
Neighborhood    Queen's Park
Name: 8, dtype: object

In [177]:
# Collapse the frame to one row per (PostalCode, Borough) pair.
# When a pair has several neighborhoods, their names are joined into a single
# comma-separated string.
df_city = (
    df_assigned
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)
df_city.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [178]:
# show (number of rows, number of columns) of the grouped frame;
# each row is one PostalCode/Borough pair produced by the groupby above
df_city.shape

(103, 3)