# importing packages for web scrapping wiki table

In [1]:
import requests
import lxml.html as lh

from bs4 import BeautifulSoup as soup

import pandas as pd

In [2]:
my_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

page = requests.get(my_url)
doc = lh.fromstring(page.content)
tr_elements = doc.xpath('//tr')

In [3]:
len(tr_elements)

293

In [4]:
# first row
tr_elements[0].text_content()

'\nPostcode\nBorough\nNeighbourhood\n'

In [5]:
# last row
tr_elements[287].text_content()

'\nM9Z\nNot assigned\nNot assigned\n'

# creating dataframe by iteratively list appending

In [6]:
col = []
i = 0

for j in range(288):
    for t in tr_elements[j]:
        i+= 1
        info = t.text_content()
        col.append(info)

In [7]:
pst = []
bor = []
nei = []

n = 288

for j in range(1,n):
    pst.append(col[3*j])
    bor.append(col[3*j+1])
    nei.append(col[3*j+2])

for i in range(len(nei)):
    nei[i] = nei[i].replace('\n','') # removing the \n 

In [8]:
Dict = {'Postcode':pst, 'Borough':bor, 'Neighbourhood':nei }
df = pd.DataFrame(Dict)

# the first dataframe that we webscrap from wiki page

In [9]:
df.shape

(287, 3)

In [10]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [11]:
df.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor
286,M9Z,Not assigned,Not assigned


# the second dataframe that filter out the rows with Borough = Not assigned

In [13]:
df2 = df[df['Borough'] != 'Not assigned']

In [14]:
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


# generating ref list to build unique postcode for neighbourhood

In [15]:
df2['ref'] = df2['Postcode'] +','+ df2['Borough']  # using concatenation for ref on postcode and borough

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [16]:
ref_list = list(df2['ref'].unique()) # unique ref_list

In [17]:
df2[df2['ref'] == ref_list[3]]

Unnamed: 0,Postcode,Borough,Neighbourhood,ref
5,M6A,North York,Lawrence Heights,"M6A,North York"
6,M6A,North York,Lawrence Manor,"M6A,North York"


In [18]:
nei_list = []
for i in range(len(ref_list)):
    nei_list.append(list(df2[df2['ref'] == ref_list[i]].iloc[:,2]))

In [19]:
# coverting list of list to list of string
nei_list = list(map(','.join, nei_list))

In [20]:
nei_list[0:5]

['Parkwoods',
 'Victoria Village',
 'Harbourfront',
 'Lawrence Heights,Lawrence Manor',
 "Queen's Park"]

In [21]:
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,ref
2,M3A,North York,Parkwoods,"M3A,North York"
3,M4A,North York,Victoria Village,"M4A,North York"
4,M5A,Downtown Toronto,Harbourfront,"M5A,Downtown Toronto"
5,M6A,North York,Lawrence Heights,"M6A,North York"
6,M6A,North York,Lawrence Manor,"M6A,North York"


# the third dataframe for unique postcode with multiple neighbourhood

In [22]:
df3 = pd.DataFrame({'ref':ref_list,'Neighbourhood':nei_list })

In [23]:
df3.head()

Unnamed: 0,ref,Neighbourhood
0,"M3A,North York",Parkwoods
1,"M4A,North York",Victoria Village
2,"M5A,Downtown Toronto",Harbourfront
3,"M6A,North York","Lawrence Heights,Lawrence Manor"
4,"M7A,Downtown Toronto",Queen's Park


In [24]:
df3[['Postcode','Borough']] = df3['ref'].str.split(',',expand=True)

In [25]:
df3.head()

Unnamed: 0,ref,Neighbourhood,Postcode,Borough
0,"M3A,North York",Parkwoods,M3A,North York
1,"M4A,North York",Victoria Village,M4A,North York
2,"M5A,Downtown Toronto",Harbourfront,M5A,Downtown Toronto
3,"M6A,North York","Lawrence Heights,Lawrence Manor",M6A,North York
4,"M7A,Downtown Toronto",Queen's Park,M7A,Downtown Toronto


In [26]:
df4 = df3[['Postcode','Borough','Neighbourhood']]

In [27]:
df4.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


# filtering for Not assigned neighbourhood with known borough

In [28]:
df4[df4['Neighbourhood'] == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood
5,M9A,Queen's Park,Not assigned


In [35]:
df4['Neighbourhood'][5] = "Queen's Park"  # replacing the value

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [36]:
df4[df4['Neighbourhood'] == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood


In [37]:
df4.head(8)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Queen's Park
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North


# final dataframe dimension

In [38]:
df4.shape

(103, 3)

In [39]:
df4.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
