In [2]:
import pandas as pd
import urllib.request

In [3]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = (
    "https://en.wikipedia.org/wiki/"
    "List_of_postal_codes_of_Canada:_M"
)

In [4]:
# import the BeautifulSoup library so we can parse HTML and XML documents
from bs4 import BeautifulSoup

# Download the page inside a context manager so the HTTP connection is
# always released, even if reading the body raises.
with urllib.request.urlopen(url) as response:
    # `page` holds the raw HTML bytes of the response body.
    page = response.read()

# parse the HTML from our URL into the BeautifulSoup parse tree format
soup = BeautifulSoup(page, "lxml")

In [5]:
# Find the postal-code table, collect its three-cell rows and build a
# cleaned dataframe with columns Postal Code / Borough / Neighborhood.
right_table = soup.find('table', class_='wikitable sortable')
if right_table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError(
        "No table with class 'wikitable sortable' was found in the parsed page"
    )

# One [postal code, borough, neighborhood] record per table row.
records = []
for row in right_table.find_all('tr'):
    cells = row.find_all('td')
    # Only rows with exactly three data cells are kept.
    if len(cells) == 3:
        # Keep the first text node of each cell (same value the table shows first).
        records.append([cell.find(string=True) for cell in cells])

df = pd.DataFrame(records, columns=['Postal Code', 'Borough', 'Neighborhood'])

# Collapse whitespace runs (newlines included) and literal "\n" sequences
# into a single space, then trim every column so no value carries a stray
# leading/trailing space into later comparisons or merges.
df = df.replace(r'\s+|\\n', ' ', regex=True)
df = df.apply(lambda column: column.str.strip())

# Drop all rows that don't have a borough assigned, and keep the re-numbered
# index on `df` itself (reset_index returns a new frame; it must be assigned).
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)

df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [6]:
# Show the table dimensions as a (row count, column count) tuple.
(len(df.index), len(df.columns))

(103, 3)

In [7]:
# Read the geospatial coordinates CSV hosted on GitHub.
csv_url = (
    "https://raw.githubusercontent.com/bgarrido46/Coursera_Capstone/"
    "master/Geospatial_Coordinates.csv"
)
# The first CSV column is used as the dataframe index.
df_geo = pd.read_csv(csv_url, index_col=0)
# Print at most the first 100 rows as a plain-text preview.
print(df_geo.iloc[:100])

      Postal Code   Latitude  Longitude
Index                                  
1             M1B  43.806686 -79.194353
2             M1C  43.784535 -79.160497
3             M1E  43.763573 -79.188711
4             M1G  43.770992 -79.216917
5             M1H  43.773136 -79.239476
...           ...        ...        ...
96            M9C  43.643515 -79.577201
97            M9L  43.756303 -79.565963
98            M9M  43.724766 -79.532242
99            M9N  43.706876 -79.518188
100           M9P  43.696319 -79.532242

[100 rows x 3 columns]


In [8]:
# Attach Latitude/Longitude to each postal code. The inner join keeps only
# the postal codes that are present in both tables.
df_final = df.merge(df_geo, how='inner', on='Postal Code')
df_final

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
