### Library declaration

In [8]:
import requests
from bs4 import BeautifulSoup
import lxml
import html5lib
import os
import numpy as np
import pandas as pd

### Declare the Wikipedia URL and parse the page HTML with BeautifulSoup

In [9]:
url_path = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() turns an HTTP error response into an exception
# instead of letting an error page be parsed as if it were the article.
response = requests.get(url_path, timeout=30)
response.raise_for_status()
source = response.text
# Parse the downloaded HTML with Python's built-in parser.
data = BeautifulSoup(source, "html.parser")

#### Table column name assignment

In [10]:
# Take the first <table> element of the parsed page.
table = data.find('table')
if table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # rather than with an AttributeError when the rows are iterated later.
    raise ValueError("No <table> element found in the downloaded page")
# Column names are fixed here rather than read from the page.
column_names = ['Postalcode', 'Borough', 'Neighborhood']
# Empty frame with the target columns, to be filled from the table rows.
df = pd.DataFrame(columns=column_names)

#### Extract the row values from the table's `tr` and `td` elements

In [11]:
# Collect the rows first and build the DataFrame once: appending with
# df.loc[len(df)] inside the loop re-allocates the frame on every row.
rows = []
for tr in table.find_all('tr'):
    # Stripped text of every <td> cell in this row.
    url_data = [td.text.strip() for td in tr.find_all('td')]
    # Keep only rows with exactly three cells; anything else (e.g. a row
    # without <td> cells) does not match the three target columns.
    if len(url_data) == 3:
        rows.append(url_data)
df = pd.DataFrame(rows, columns=column_names)
# Preview the result once, after the loop has finished.
df.head()

#### Ignoring rows with a borough that is Not assigned

In [12]:
# Keep only the rows whose borough is something other than 'Not assigned'.
df_tor = df.loc[df['Borough'].ne('Not assigned')]

In [13]:
# Borough values are strings scraped from the page, so comparing them against
# the number 0 can never exclude a row; display the filtered frame directly.
df_tor

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
...,...,...,...
281,M8Z,Etobicoke,Kingsway Park South West
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West


#### Combine multiple neighborhoods that share one postal code into a single row

In [14]:
# One row per (Postalcode, Borough) pair; the neighborhoods that share a pair
# are concatenated into a single comma-separated string.
toronto = (
    df_tor
    .groupby(['Postalcode', 'Borough'])['Neighborhood']
    .apply(', '.join)
    .reset_index()
)


#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

In [15]:
def neighbor_list(grouped):
    """Return the group's 'Neighborhood' values as one sorted, comma-separated string.

    Parameters
    ----------
    grouped : object indexable by 'Neighborhood' (e.g. a DataFrame group)
        The values under 'Neighborhood' must be strings.

    Returns
    -------
    str
        The values sorted in ascending order and joined with ', '.
    """
    # tolist() yields a fresh list, so sorting in place leaves the frame untouched.
    names = grouped['Neighborhood'].tolist()
    names.sort()
    return ', '.join(names)


In [24]:
# Regroup by (Postalcode, Borough) and build the final frame with a
# 'Neighborhood' column produced by neighbor_list for each group.
grp = toronto.groupby(['Postalcode', 'Borough'])
df2 = grp.apply(neighbor_list).reset_index(name='Neighborhood')
# A postal code that has a borough but whose neighborhood is 'Not assigned'
# takes the borough name as its neighborhood.
unassigned = df2['Neighborhood'] == 'Not assigned'
df2.loc[unassigned, 'Neighborhood'] = df2.loc[unassigned, 'Borough']
df2.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Display the shape (rows, columns) of the dataframe

In [25]:
# Shape of the grouped frame as a (rows, columns) tuple.
df2.shape

(103, 3)

#### Load the CSV of Toronto geographical coordinates from http://cocl.us/Geospatial_data into a dataframe

In [26]:
# Location of the coordinates CSV; read_csv downloads and parses it.
# NOTE(review): this is a plain-http link — confirm whether an https
# endpoint is available before relying on it.
path="http://cocl.us/Geospatial_data"
geocode_df = pd.read_csv(path)
# Preview the first rows of the loaded frame.
geocode_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [30]:
# Align the key column name with df2 so both frames can be joined on it.
# rename() returns a new frame; rebinding the name avoids mutating in place
# the frame that an earlier cell already displayed.
geocode_df = geocode_df.rename(columns={'Postal Code': 'Postalcode'})
# Inner join: only postal codes present in both frames are kept.
geocode_merged = pd.merge(geocode_df, df2, on='Postalcode', how='inner')
geocode_merged.head()

Unnamed: 0,Postalcode,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,"Rouge, Malvern"
1,M1C,43.784535,-79.160497,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


In [31]:
# Put the descriptive columns first, followed by the coordinates.
column_order = ['Postalcode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']
geo_data = geocode_merged[column_order]
geo_data.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
