# Capstone Project Part II

We need to scrape data from a Wikipedia page, store the data in a dataframe, and clean the data.

In [1]:
import pandas as pd
import numpy as np
import requests
import lxml.html as lh

First, we pull the HTML from the wiki page by passing the URL to `requests`. Then we use `lxml.html` to parse it and extract every table row (`<tr>` element) on the page.

In [5]:
# URL of wiki page with Toronto neighborhood info
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Load webpage; the explicit timeout keeps this cell from hanging forever if
# the server never responds (requests applies no timeout by default)
wiki = requests.get(wiki_url, timeout=30)

# Fail right here with an HTTPError on a 4xx/5xx response instead of parsing
# an error page and hitting a confusing failure in a later cell
wiki.raise_for_status()

# Parse the raw response bytes into an lxml element tree
wiki_contents = lh.fromstring(wiki.content)

# Select every <tr> element in the document (rows from all tables on the page)
table_contents = wiki_contents.xpath('//tr')

Now we organize the table contents, starting with the column names taken from the first row.

In [20]:
# Read the header labels out of the cells of the first table row
columns = [cell.text_content() for cell in table_contents[0]]

print(columns)

['Postal Code\n', 'Borough\n', 'Neighborhood\n']


In [21]:
# Remove the trailing \n (and any surrounding whitespace) from each header label
columns = list(map(str.strip, columns))
columns

['Postal Code', 'Borough', 'Neighborhood']

In [44]:
# Build one empty list per column to collect that column's cell values
col_data = [[] for _ in columns]

# Walk the rows that follow the header row
for row in table_contents[1:]:
    # Stop at the first row that does not have exactly 3 cells
    if len(row) != 3:
        break

    # File each cell's text, stripped of surrounding whitespace, under its column
    for col_idx, cell in enumerate(row):
        col_data[col_idx].append(cell.text_content().strip())

We can now create a dataframe from the organized data and drop the rows where the borough is not assigned.

In [57]:
# Pair each column name with its list of collected values to build the
# dataframe, then preview the first rows
df = pd.DataFrame({name: col_data[pos] for pos, name in enumerate(columns)})
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [58]:
# Keep only the rows whose borough is assigned, then renumber the index from 0
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough].reset_index(drop=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [60]:
# Show the (rows, columns) dimensions of the dataframe after dropping unassigned boroughs
df.shape

(104, 3)