# Create Neighbourhood Dataframe
## Capstone project, Week 3 Notebook 1

In [75]:
# Get the latest version of BeautifulSoup; use the lxml parser for speed
#!pip install beautifulsoup4
import pandas as pd
from bs4 import BeautifulSoup
from lxml import html
import requests

### Get the data
Read the Wikipedia page using BeautifulSoup

In [None]:
# if we don't submit a User-Agent header, the connection is refused by the server
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
}

# The list of postal codes in Canada where the first letter is M
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Make a GET request to fetch the raw HTML content.
# The headers built above must be passed explicitly, otherwise they are never sent.
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)

# Fail loudly on an HTTP error status instead of silently parsing an error page
response.raise_for_status()
html_content = response.text

# Parse the html content with the lxml parser backend
soup = BeautifulSoup(html_content, "lxml")
# print(soup.prettify()) # print the parsed data of html

### Get HTML table
Get the HTML elements that interest us

In [31]:
# The rows of interest are the tr elements that live inside the first tbody of the page.
# Searching only within that tbody leaves out any extra tr's that come after it.
html_table = soup.find('tbody')
html_rows = html_table.find_all('tr')

# Preview the first two rows (the header row and the first data row)
html_rows[:2]

[<tr>
 <th>Postal Code
 </th>
 <th>Borough
 </th>
 <th>Neighborhood
 </th></tr>,
 <tr>
 <td>M1A
 </td>
 <td>Not assigned
 </td>
 <td>
 </td></tr>]

### Convert HTML table into dictionary
Create a dictionary for each table row, of the form {'Postal Code': '', 'Borough': '', 'Neighborhood': ''}
This will prove useful when we want to use logic to exclude records, change values etc.

In [129]:
# Placeholder text the page uses for a postal code without a borough / neighborhood
NOT_ASSIGNED = 'Not assigned'

# Table header: column names come from the th cells of the first row;
# newlines are replaced and surrounding whitespace stripped before extracting
table_header = [
    th.text.replace('\n', ' ').strip()
    for th in html_rows[0].find_all('th')
]

table_data = []
for tr in html_rows:
    # Each table row is stored in the form {'Postal Code': '', 'Borough': '', 'Neighborhood': ''}.
    # Cell text is stripped the same way as the header so stray whitespace cannot
    # defeat the comparisons below.
    t_row = {
        th: td.text.replace('\n', '').strip()
        for td, th in zip(tr.find_all("td"), table_header)
    }

    # Skip rows that contain no td cells (the header row only has th cells)
    if not t_row:
        continue

    # Keep the record only if a borough IS assigned; a row too short to have a
    # Borough cell is treated as unassigned instead of raising a KeyError
    borough = t_row.get('Borough', NOT_ASSIGNED)
    if borough == NOT_ASSIGNED:
        continue

    # If a row has a borough but no neighborhood (empty cell or the literal
    # 'Not assigned' placeholder), the neighborhood becomes the borough.
    # This is applied BEFORE the row is stored, so the stored record is complete
    # without relying on later mutation of the appended dict.
    if t_row.get('Neighborhood', '') in ('', NOT_ASSIGNED):
        t_row['Neighborhood'] = borough

    table_data.append(t_row)

# Preview the first three records
table_data[0:3]

[{'Postal Code': 'M3A', 'Borough': 'North York', 'Neighborhood': 'Parkwoods'},
 {'Postal Code': 'M4A',
  'Borough': 'North York',
  'Neighborhood': 'Victoria Village'},
 {'Postal Code': 'M5A',
  'Borough': 'Downtown Toronto',
  'Neighborhood': 'Regent Park, Harbourfront'}]

### Put into a dataframe

In [130]:
# Build the dataframe in one step from the list of row dictionaries;
# the dictionary keys become the column names
df = pd.DataFrame(data=table_data)

# Show the first five records
df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [131]:
# Check the number of records: df.shape is a (rows, columns) tuple, displayed as the cell output
df.shape

(103, 3)