In [1]:
import numpy as np
import pandas as pd

# Scrape the website and preprocess the data

In [2]:
# Parse the HTML tables on the Wikipedia page; read_html returns a list of DataFrames.
# Only the URL is passed: the original call restated pandas' defaults and also passed
# `tupleize_cols`, which newer pandas releases no longer accept (TypeError), and
# `thousands=', '` (comma + space), which looks like a typo for the default ','.
table = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

In [3]:
# The first entry of the scraped list is the postal-code table we want;
# build the working DataFrame from it and preview the top rows.
df = pd.DataFrame(data=table[0])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [33]:
# Keep only the rows whose Borough is something other than "Not assigned".
df = df.loc[df['Borough'].ne('Not assigned')]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [35]:
# Collapse to one row per postcode: keep the first Borough seen for the postcode
# and join all of its neighbourhood names into a single comma-separated string.
df2 = (
    df.groupby('Postcode')
      .agg({
          'Borough': 'first',
          'Neighbourhood': ', '.join,
      })
)
df2.head()


Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge, Malvern"
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [36]:
# If a postcode's neighbourhood is "Not assigned", use its borough name instead.
# NOTE: df3 is another name for the same object as df2 (no copy is made), so the
# replacement below is visible through df2 as well.
df3 = df2
neighbourhood_missing = df3['Neighbourhood'] == 'Not assigned'
# Copy the row's Borough *value*; the previous code stored the literal text 'Borough'.
df3.loc[neighbourhood_missing, 'Neighbourhood'] = df3.loc[neighbourhood_missing, 'Borough']
df3.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge, Malvern"
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [37]:
# Sanity check: for each column, report whether any cell still equals "Not assigned".
df3.eq('Not assigned').any()

Borough          False
Neighbourhood    False
dtype: bool

In [40]:
# Show the dimensions of df3 as a (rows, columns) tuple.
df3.shape


(103, 2)

# Getting geographic coordinates
    

## I first tried to use the geocoder package, but importing it raised errors, so I used the CSV file of coordinates instead.

In [51]:
# Load the postal code -> latitude/longitude table from the hosted CSV file.
geocode = pd.read_csv(filepath_or_buffer="http://cocl.us/Geospatial_data")
geocode.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [52]:
# Merge the two dataframes: match df3's 'Postcode' (its index level, left by the
# groupby) against geocode's 'Postal Code' column. how='inner' is the default and is
# spelled out so it is clear that only postcodes present in both frames are kept.
df4 = pd.merge(
    left=df3,
    right=geocode,
    how='inner',
    left_on='Postcode',
    right_on='Postal Code',
)
# Fixed: the original line was `df4.head())`, whose stray ')' is a SyntaxError.
df4.head()

Unnamed: 0,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,Scarborough,Woburn,M1G,43.770992,-79.216917
4,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [53]:
# Reorder the columns so the postal code comes first, followed by the place names
# and then the coordinates.
df5 = df4.loc[:, [
    'Postal Code',
    'Borough',
    'Neighbourhood',
    'Latitude',
    'Longitude',
]]
df5.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Explore and Cluster Neighbourhoods
