<h1> Coursera Capstone Notebook </h1>

In [1]:
### import libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import requests
import folium

In [2]:
### start requests

In [3]:
### download the Wikipedia page that lists the postal codes starting with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
### a timeout stops the cell from hanging forever on a stalled connection
response = requests.get(url, timeout=30)
### fail here, with a clear HTTP error, instead of parsing an error page later
response.raise_for_status()
### check code output
response.status_code

200

In [4]:
### parse the downloaded HTML with Python's built-in parser
soup = bs(markup=response.text, features='html.parser')
### find() returns the first <table> whose class is "wikitable sortable"
table = soup.find(name='table', class_='wikitable sortable')

In [5]:
### one string per non-empty line of the table text, held in a NumPy array
### so that it can be reshaped into rows afterwards
### NOTE: this rebinds `table` from the parsed tag to an array, so the cell
### cannot be re-run without first re-running the cell that builds `table`
table = np.array([line.strip('\n') for line in table.text.split('\n') if line != ''])
table

array(['Postcode', 'Borough', 'Neighbourhood', 'M1A', 'Not assigned',
       'Not assigned', 'M2A', 'Not assigned', 'Not assigned', 'M3A',
       'North York', 'Parkwoods', 'M4A', 'North York', 'Victoria Village',
       'M5A', 'Downtown Toronto', 'Harbourfront', 'M5A',
       'Downtown Toronto', 'Regent Park', 'M6A', 'North York',
       'Lawrence Heights', 'M6A', 'North York', 'Lawrence Manor', 'M7A',
       "Queen's Park", 'Not assigned', 'M8A', 'Not assigned',
       'Not assigned', 'M9A', 'Etobicoke', 'Islington Avenue', 'M1B',
       'Scarborough', 'Rouge', 'M1B', 'Scarborough', 'Malvern', 'M2B',
       'Not assigned', 'Not assigned', 'M3B', 'North York',
       'Don Mills North', 'M4B', 'East York', 'Woodbine Gardens', 'M4B',
       'East York', 'Parkview Hill', 'M5B', 'Downtown Toronto', 'Ryerson',
       'M5B', 'Downtown Toronto', 'Garden District', 'M6B', 'North York',
       'Glencairn', 'M7B', 'Not assigned', 'Not assigned', 'M8B',
       'Not assigned', 'Not assigned', 'M9

In [6]:
### currently the table is one-dimensional: a flat run of cell values
table.shape

(867,)

In [7]:
### input reshaped table into dataframe: three values per row. -1 lets NumPy
### work out the row count, so the cell keeps working when the source table
### gains or loses rows (reshape raises ValueError if the number of values is
### not a multiple of three)
df = pd.DataFrame(table.reshape(-1, 3), columns=['PostCode','Borough','Neighborhood'])
### the first row is the scraped header ('Postcode', 'Borough', 'Neighbourhood')
df = df.drop(df.index[0])

In [8]:
### renumber the rows from 0; the old labels are kept in a new 'index' column
df = df.reset_index()

In [9]:
### discard the 'index' column that reset_index created
df = df.drop('index', axis=1)

In [10]:
### Check DataFrame: preview the first ten rows
df.head(10)

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [11]:
### list the column labels
df.columns

Index(['PostCode', 'Borough', 'Neighborhood'], dtype='object')

In [12]:
### Keep only the PostCodes that have a Borough. Rows whose Neighborhood is
### 'Not assigned' stay in as long as the Borough is known, so the Borough test
### alone expresses the criteria. .copy() makes the result an independent frame,
### so assigning into it later does not raise SettingWithCopyWarning.
df = df.loc[df['Borough'] != 'Not assigned'].copy()

In [13]:
### preview the rows that remain after filtering
df.head(10)

Unnamed: 0,PostCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [14]:
### Look for rows that still have no Neighborhood.
df[df['Neighborhood'] == 'Not assigned']

Unnamed: 0,PostCode,Borough,Neighborhood
8,M7A,Queen's Park,Not assigned


In [15]:
### Where the Neighborhood is 'Not assigned', use the Borough name in its place
df['Neighborhood'] = df['Neighborhood'].where(df['Neighborhood'] != 'Not assigned', df['Borough'])
df.head(10)
        

Unnamed: 0,PostCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [16]:
### Aggregate by PostCode. ', ' is appended to every value first, so that summing
### (string concatenation) within each PostCode yields a comma-separated list; the
### trailing separator and the repeated Borough names still need cleaning up.
### The separator is added to the whole frame in one step rather than through
### groupby().apply(), whose handling of the group keys differs between pandas
### versions and can make the second groupby('PostCode') ambiguous.
df2 = (df + ', ').groupby('PostCode').sum()
df2.head()

Unnamed: 0_level_0,Borough,Neighborhood
PostCode,Unnamed: 1_level_1,Unnamed: 2_level_1
"M1B,","Scarborough, Scarborough,","Rouge, Malvern,"
"M1C,","Scarborough, Scarborough, Scarborough,","Highland Creek, Rouge Hill, Port Union,"
"M1E,","Scarborough, Scarborough, Scarborough,","Guildwood, Morningside, West Hill,"
"M1G,","Scarborough,","Woburn,"
"M1H,","Scarborough,","Cedarbrae,"


In [17]:
### Clean up Fields
def splitter(x):
    """Return the part of ``x`` that comes before its first comma.

    If ``x`` contains no comma, the whole string is returned.
    """
    head, _, _ = x.partition(',')
    return head

def stripper(x):
    """Return ``x`` with every leading and trailing comma or space removed."""
    return x.lstrip(', ').rstrip(', ')

### the Borough name was concatenated once per row of the group - keep the first
df2['Borough'] = df2['Borough'].map(splitter)
### remove the trailing separator from the list of neighbourhoods
df2['Neighborhood'] = df2['Neighborhood'].map(stripper)
### PostCode is the index after the groupby: turn it back into a column, then
### remove its trailing separator as well
df2 = df2.reset_index()
df2['PostCode'] = df2['PostCode'].map(stripper)

In [18]:
### Now let's check our DataFrame (one row per PostCode after the groupby)
df2

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [19]:
### Assignment request for DataFrame shape, as (rows, columns).
df2.shape

(103, 3)

In [20]:
### attempt to scrape lat and long data from geocoder
### (installs the geocoder package with pip; no version is pinned)
!pip install geocoder

[33mYou are using pip version 9.0.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [21]:
### imported here rather than in the first cell because the package is only
### installed by the pip cell just before this one
import geocoder
### Geocode the first three PostCodes as a trial. The ', ' between the PostCode
### and the city matters: without it the query reads 'M3AToronto, Ontario'.
### NOTE: `lats` holds the geocoder result objects, not latitude values.
lats = [geocoder.google('{}, Toronto, Ontario'.format(code)) for code in df.PostCode[:3]]

In [22]:
### at the time of the assignment Google denied every request (REQUEST_DENIED),
### so each result came back empty
lats

[<[REQUEST_DENIED] Google - Geocode [empty]>,
 <[REQUEST_DENIED] Google - Geocode [empty]>,
 <[REQUEST_DENIED] Google - Geocode [empty]>]

In [23]:
### Using the provided csv file instead
### (the path is relative to the notebook's working directory)
df3 = pd.read_csv('Geospatial_Coordinates.csv')

In [24]:
### shape of df, the filtered frame that has not been aggregated by PostCode
### NOTE(review): df3 was loaded in the previous cell - was df3.shape intended? confirm
df.shape

(211, 3)

In [25]:
### preview the first rows of the coordinates table
df3.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [26]:
### join the neighbourhood table to the coordinates on PostCode / Postal Code;
### the join type is spelled out: only codes present in both tables are kept
df4 = df2.merge(df3, how='inner', left_on='PostCode', right_on='Postal Code')

df4

Unnamed: 0,PostCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,M1J,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",M1K,43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",M1L,43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",M1M,43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",M1N,43.692657,-79.264848


In [27]:
### quick reconciliation check: collect every False from the element-wise
### comparison of the two code columns - an empty list means they all match
[matched for matched in (df4['PostCode'] == df4['Postal Code']) if not matched]

[]

In [28]:
### Final DataFrame for Assignment: all rows, only the requested columns
df4.loc[:, ['Postal Code','Borough','Neighborhood','Latitude','Longitude']]

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
