### Importing Libraries

In [0]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

### 1. Scraping the text and converting it into the required format

In [2]:
# Fetch the Wikipedia page that lists the Canadian postal codes starting with "M".
base_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() surfaces an HTTP error here instead of letting an error
# page be parsed as if it were the article.
response = requests.get(base_url, timeout=30)
response.raise_for_status()
index = response.text
# Name the parser explicitly so the parse does not depend on which optional
# parser libraries happen to be installed.
soup = BeautifulSoup(index, 'html.parser')

# NOTE(review): the cells below assume this table flattens to three text lines
# per row (postcode, borough, neighbourhood) -- confirm against the live page.
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    raise ValueError("No 'wikitable sortable' table found at " + base_url)
# Flatten the table into a list with one entry per line of text.
table = table.get_text().split('\n')

# Keep only the non-empty lines, in a single 'text' column with a fresh
# 0..n-1 index.
df = pd.DataFrame({'text': [line for line in table if line != '']})
df.head()

Unnamed: 0,text
0,Postcode
1,Borough
2,Neighbourhood
3,M1A
4,Not assigned


Now convert this data into the required format.

In [3]:
# The flattened table holds one cell per row. The first HEADER_CELLS rows are
# the column headings; after them the cells repeat in groups of CELLS_PER_ROW
# in the order postcode, borough, neighbourhood.
HEADER_CELLS = 3
CELLS_PER_ROW = 3

# Take the first column by position as a plain list. This avoids integer-key
# lookups on a label-indexed row Series (df.iloc[i][0]), whose positional
# fallback is deprecated in recent pandas.
cells = df.iloc[HEADER_CELLS:, 0].tolist()
if len(cells) % CELLS_PER_ROW != 0:
    # A ragged cell count means the three columns would be misaligned.
    raise ValueError(
        "Expected a multiple of %d data cells, got %d" % (CELLS_PER_ROW, len(cells))
    )

# Stride slicing picks every third cell, starting at each column's offset.
postcodes = cells[0::CELLS_PER_ROW]
boroughs = cells[1::CELLS_PER_ROW]
neighborhoods = cells[2::CELLS_PER_ROW]

toronto_df = pd.DataFrame({'PostalCode': postcodes,'Borough': boroughs,'Neighborhood': neighborhoods})

toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Now removing rows where Borough='Not assigned'.

In [4]:
# Keep only the rows whose borough is known, then renumber the index from 0.
has_borough = toronto_df["Borough"] != "Not assigned"
toronto_df = toronto_df.loc[has_borough].reset_index(drop=True)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


As shown below, there is one record where Neighborhood='Not assigned'.

In [5]:
# Show the rows that still carry the placeholder neighbourhood value.
toronto_df.loc[toronto_df["Neighborhood"] == "Not assigned"]

Unnamed: 0,PostalCode,Borough,Neighborhood
6,M9A,Queen's Park,Not assigned


Assigning Neighborhood = Borough where Neighborhood='Not assigned'.

In [6]:
# Replace the placeholder neighbourhood with the borough name.
# This uses a masked .loc assignment rather than mutating the rows yielded by
# iterrows(): those rows may be copies, so writes to them are not guaranteed
# to reach the DataFrame.
unassigned = toronto_df["Neighborhood"] == "Not assigned"
toronto_df.loc[unassigned, "Neighborhood"] = toronto_df.loc[unassigned, "Borough"]

# Display the Queen's Park rows to confirm the placeholder was replaced.
toronto_df[toronto_df.Borough=='Queen\'s Park']

Unnamed: 0,PostalCode,Borough,Neighborhood
6,M9A,Queen's Park,Queen's Park


Grouping the data as required.

In [7]:
# Collapse to one row per (PostalCode, Borough) pair, joining the values of
# the remaining column with ", ".
group_keys = ["PostalCode", "Borough"]
grouped_df = toronto_df.groupby(group_keys, as_index=False).agg(", ".join)
grouped_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Showing the shape of the data-frame.

In [8]:
# Print the (rows, columns) dimensions of the grouped frame.
row_count, column_count = grouped_df.shape
print((row_count, column_count))

(103, 3)


## 2. Getting geographic coordinates and adding them to the dataframe

In [9]:
# Load the postal-code -> latitude/longitude lookup table from its CSV URL.
coords_url = 'https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv'
coords = pd.read_csv(coords_url)
coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
# Rename the coordinates key column to match grouped_df, then join the two
# frames on it (pandas' default join type, as no `how` is given).
key_rename = {"Postal Code": "PostalCode"}
coords = coords.rename(columns=key_rename)
result_df = pd.merge(grouped_df, coords, on='PostalCode')
result_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
