## Part 1. Build a dataframe of Toronto postal codes together with the borough name and neighborhood name for each

# 1. Import libraries

In [2]:
import pandas as pd # to handle data
import urllib.request # to handle requests
from bs4 import BeautifulSoup # to parse html documents

# 2. Scrape data from the Wikipedia page into a pandas dataframe

In [3]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Download the raw HTML; the context manager closes the HTTP response once it has been read.
with urllib.request.urlopen(url) as response:
    page = response.read()

# Name the parser explicitly: without it bs4 emits a warning and falls back to whichever
# parser happens to be installed, so the parsed tree could differ between machines.
soup = BeautifulSoup(page, 'html.parser')

In [4]:
# Column labels for the scraped table, in display order
column_names = ['Postcode', 'Borough', 'Neighborhood']

# Start from an empty frame that carries only those column labels
df = pd.DataFrame(data=None, columns=column_names)

In [5]:
# Collect one record per usable table row and build the frame once at the end.
# DataFrame.append copied the whole frame on every call and was removed in pandas 2.0.
records = []
for tr in soup.find_all('tr')[2:]:  # start at the third <tr> on the page
    tds = tr.find_all('td')
    if len(tds) == 1:
        # A single-cell row stops the scan entirely (break, not continue);
        # presumably it marks the end of the postal-code table -- TODO confirm against the page.
        break
    pc = tds[0].text
    borough = tds[1].text
    if borough == 'Not assigned':  # skip postal codes that have no borough
        continue
    neigh = tds[2].text.rstrip('\n')  # drop the trailing newline(s) of the last cell
    if neigh == 'Not assigned':
        # A borough without a named neighborhood uses the borough name instead.
        neigh = borough
    records.append({'Postcode': pc, 'Borough': borough, 'Neighborhood': neigh})

# column_names (defined in an earlier cell) fixes the column order even when no rows matched.
df = pd.DataFrame(records, columns=column_names)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [6]:
# Collapse to one row per (Postcode, Borough) pair; the neighborhoods that share
# a pair are joined into a single comma-separated string.
df = (
    df.groupby(['Postcode', 'Borough'])['Neighborhood']
      .apply(','.join)
      .reset_index()
)

# 3. Print the shape of the dataframe

In [7]:
# Display (rows, columns) of df
df.shape

(103, 3)

# Part 2. Get the geographical coordinates for each postal code

# 4. Read the latitude and longitude data into a dataframe

In [8]:
# Load the coordinate table from the remote CSV and preview its first rows
loc = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')
loc.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
# Give the key column the same name as in df so the frames can be merged on 'Postcode'.
# Rebinding the result (instead of inplace=True) avoids mutating the frame that an
# earlier cell already displayed; re-running the cell is still a no-op.
loc = loc.rename(columns={'Postal Code': 'Postcode'})

# 5. Merge the dataframes

In [16]:
# Right join on 'Postcode': every row of loc is kept; a postal code that has no
# match in df gets NaN for Borough and Neighborhood.
new_df = pd.merge(df, loc, on='Postcode', how='right')

In [17]:
# Preview the last rows of the merged frame
new_df.tail()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437
102,M9W,Etobicoke,Northwest,43.706748,-79.594054
