<h2 style='color: blue; font-family: verdana;'>Coursera Data Science Capstone Project</h2>

In [1]:
%%capture
# Installing prerequisites
!pip install pandas numpy bs4 requests lxml html5lib

In [2]:
# importing modules required for this project
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from geopy.geocoders import Nominatim

<h4>Part 1: Scraping neighbourhood information from the Wikipedia page</h4>

In [3]:
# Download the Wikipedia page and load it into a BeautifulSoup object
url_canada_neighbourhood = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of letting an
# error page be parsed as if it were the article.
response = requests.get(url_canada_neighbourhood, timeout=30)
response.raise_for_status()
html_text = response.text

soup = BeautifulSoup(html_text, 'lxml')

In [4]:
# Locate the postal-code table on the webpage and store its object
table = soup.find('table', class_='wikitable')
if table is None:
    raise ValueError("No table with class 'wikitable' was found on the page")

# Column captions, with surrounding whitespace (e.g. a trailing newline) removed
header_tag = table.find_all('th')
header_list = [header.text.strip() for header in header_tag]

# Load table information in a dictionary: one list of values per column.
# Every cell is stripped up front so that the 'Not assigned' check and the
# stored values do not depend on whitespace inside the page markup.
table_dict = {header: [] for header in header_list}
min_cells = max(len(header_list), 2)  # the borough lives in the second cell
for row in table.find_all('tr'):
    cells = [cell.text.strip() for cell in row.find_all('td')]
    # Skip the header row (no <td> cells), incomplete rows, and rows whose
    # borough (second column) is not assigned
    if len(cells) < min_cells or cells[1] == 'Not assigned':
        continue
    for header, value in zip(header_list, cells):
        table_dict[header].append(value)

# Convert dictionary into a pandas data frame
df = pd.DataFrame(table_dict)

# Use the column names the following cells work with
df = df.rename(columns={'Neighbourhood': 'Neighborhood', 'Postcode': 'PostalCode'})

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [5]:
# Grouping by postcode: one row per (PostalCode, Borough) pair, with that
# pair's neighbourhood names joined into a single comma-separated string.
# Aggregating the selected column and then resetting the index keeps the
# 'Neighborhood' column name, so no positional column 0 has to be renamed
# afterwards. The `axis` argument is dropped because it is deprecated in
# recent pandas releases and axis=0 is the default anyway.
df = (
    df.groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)

# Replacing the neighbourhood value with the borough name where the value is
# 'Not assigned'. A boolean mask with .loc writes straight into df, avoiding
# an in-place replace on a column selection (chained assignment).
not_assigned = df['Neighborhood'] == 'Not assigned'
df.loc[not_assigned, 'Neighborhood'] = df.loc[not_assigned, 'Borough']
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [6]:
# Show the (rows, columns) dimensions of the data frame
df.shape

(103, 3)

<h4>Part 2: Adding location data (latitude and longitude) to the data frame</h4>

In [7]:
# Read the geospatial CSV and give its postal-code column the same name as
# the one in df, so the two frames share a column to join on.
df_location = pd.read_csv('http://cocl.us/Geospatial_data').rename(
    columns={'Postal Code': 'PostalCode'}
)

# Default merge: inner join on the column names the two frames have in common
df = pd.merge(df, df_location)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
