# Applied Data Science Capstone Week 3 Assignment

## 1. Scraping Toronto Neighborhood data from a webpage

In [1]:
# import required libraries
import pandas as pd
from bs4 import BeautifulSoup
import requests

# get webpage content
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# a timeout keeps the cell from hanging indefinitely if the server never answers
webpage = requests.get(url, timeout=30)
# stop here on an HTTP error instead of parsing an error page further down
webpage.raise_for_status()

# parse webpage content
soupcontent = BeautifulSoup(webpage.content,'html.parser')

# find the first <tbody> element; the rest of this cell assumes it is the postal code table
toronto_FSA = soupcontent.find('tbody')
if toronto_FSA is None:
    # find() returns None when nothing matches; fail with a readable message
    raise ValueError('no <tbody> element found at ' + url)

# select each row (defined as <tr> in an HTML table)
toronto_FSA_r = toronto_FSA.select('tr')

# get the text of each row as one string per row
toronto_FSA_rows = [table_row.get_text() for table_row in toronto_FSA_r]

# convert to a dataframe: one raw text string per table row, in a single column labelled 0
toronto_df = pd.DataFrame(toronto_FSA_rows)

# split each row's text on \n so that every piece gets its own column
toronto_df = toronto_df[0].str.split('\n',expand=True)

# use the values of the first row as the column names
toronto_df = toronto_df.rename(columns=toronto_df.iloc[0])

# drop the first row, which now only duplicates the column names
toronto_df = toronto_df.drop(toronto_df.index[0])

# drop the rows where Borough is Not assigned
toronto_df = toronto_df[toronto_df['Borough'] != 'Not assigned']

# put neighborhoods that have the same postcodes/boroughs in a single row
# (only Postcode, Borough and Neighborhood remain as columns after this step)
toronto_df = (
    toronto_df
    .groupby(['Postcode','Borough'])['Neighborhood']
    .apply(', '.join)
    .reset_index()
)

# rows where Neighborhood is exactly Not assigned
neighborhood_not_assigned = toronto_df['Neighborhood'] == 'Not assigned'

# replace each Not assigned in Column Neighborhood with the entry in the same row of Column Borough
# (vectorised: .loc aligns the Borough values on the row index, no Python-level loop needed)
toronto_df.loc[neighborhood_not_assigned, 'Neighborhood'] = toronto_df.loc[neighborhood_not_assigned, 'Borough']

# make sure no Not assigned is left; a bare expression in the middle of a cell is never displayed,
# so the check is an assertion that fails loudly instead
assert 'Not assigned' not in toronto_df['Neighborhood'].values, "Neighborhood still contains 'Not assigned'"

print('the number of rows of the dataframe is:')
toronto_df.shape

the number of rows of the dataframe is:


(103, 3)

## 2. Add Latitude and Longitude information to the dataframe

In [2]:
# download the csv file with the geographical coordinates of each postal code
# and read it straight into a dataframe
geo_csv_url = 'http://cocl.us/Geospatial_data'
toronto_pc_geo = pd.read_csv(geo_csv_url)
toronto_pc_geo

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [3]:
# update the column name(s) so that they are consistent with the one we just created
# the result is rebound to the same name instead of using inplace=True, so the frame
# that the previous cell displayed is not mutated behind the reader's back
toronto_pc_geo = toronto_pc_geo.rename(columns={'Postal Code': 'Postcode'})
toronto_pc_geo

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [4]:
# merge the two dataframes on the shared Postcode column
# validate='one_to_one' makes pandas raise a MergeError if a postcode appears more than once
# on either side, which would otherwise silently multiply rows
toronto_df_with_geo = pd.merge(toronto_df,toronto_pc_geo,on='Postcode',validate='one_to_one')

# the default inner join silently drops postcodes that have no match in the coordinates file,
# so check explicitly that none went missing
postcodes_without_geo = set(toronto_df['Postcode']) - set(toronto_df_with_geo['Postcode'])
assert not postcodes_without_geo, 'no coordinates found for: ' + ', '.join(sorted(postcodes_without_geo))

toronto_df_with_geo

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437
