# Assignment

## Part 1 - Import Toronto Data from Wikipedia

In [147]:
import requests
import pandas as pd
import numpy as np

In [148]:
!pip install lxml

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


In [149]:
import lxml.html as lh

#### Get the table data

In [150]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#Create a handle, page, to handle the contents of the website
page = requests.get(url)

#Store the contents of the website under doc
doc = lh.fromstring(page.content)

#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

In [151]:
#Check the length of the first 12 rows
[len(T) for T in tr_elements[:12]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

#### Get the header

In [152]:
tr_elements = doc.xpath('//tr')
#Create empty list
col=[]
i=0
#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print (i,name)
    col.append((name,[]))

1 Postal code

2 Borough

3 Neighborhood



#### Create a dataframe

In [153]:
#Since our first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 3, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

In [154]:
# Check that length of the columns are the same
[len(C) for (title,C) in col]

[181, 181, 181]

In [155]:
#Create the dataframe
columns={title:column for (title,column) in col}
df=pd.DataFrame(columns)

In [156]:
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n


#### Clean Dataframe

In [157]:
df.keys()

Index(['Postal code\n', 'Borough\n', 'Neighborhood\n'], dtype='object')

In [158]:
df.columns = [col.replace('\n','') for col in df.columns]
df.columns = df.columns.str.rstrip()

In [159]:
df.keys()

Index(['Postal code', 'Borough', 'Neighborhood'], dtype='object')

In [160]:
df = df.replace('\n','', regex=True)

In [161]:
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [162]:
# Remove Boroughs that are Not assigned
df=df[df.Borough!='Not assigned']

In [163]:
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [164]:
df['Neighborhood'] = df.Neighborhood.replace(' /',',', regex=True)

In [165]:
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [166]:
#Check if there are any empty neighborhoods
np.where(pd.isnull(df.Neighborhood))

(array([], dtype=int64),)

In [167]:
#Check if there are any neighborhoods with value Not assigned
index=(df.Neighborhood=='Not assigned')
index.value_counts()

False    104
Name: Neighborhood, dtype: int64

In [168]:
# Check the last rows
df.tail()

Unnamed: 0,Postal code,Borough,Neighborhood
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
102,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."
103,,Canadian postal codes,


In [169]:
#Remove last row since it doesn't belong in this table
df=df.drop(df.index[103])

##### Dataset is cleaned!

In [170]:
# Check how many rows / neighborhoods there are
print('The number of rows is {}'.format(df.shape[0]))

The number of rows is 103


In [171]:
df.to_csv('Toronto_Neighborhoods.csv', index=False)