<h2>Importing the Required Libraries</h2>

In [1]:
import numpy as np 
import pandas as pd 
import requests 
from bs4 import BeautifulSoup 

<h2>Scrapping the data from given wikipedia page </h2>

In [2]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r = requests.get(URL) 
  
soup = BeautifulSoup(r.content, 'html5lib') 
table = soup.find('div', attrs = {'id':'container'}) 

In [3]:
postalCodes = []
boroughs= []
neighborhoods = []
columnNum = 1

for row in soup.find_all('td'):
    for cell in row:
        if cell.string and cell.string[0].isalpha() and len(cell.string) > 2:
            if columnNum == 1:
                    postalCodes.append(cell.string.rstrip());   
                    columnNum = 2
            elif columnNum == 2 :
                    boroughs.append(cell.string.rstrip());      
                    columnNum = 3
            elif columnNum == 3 :
                    neighborhoods.append(cell.string.rstrip()); 
                    columnNum = 1
                
print('Data Collected.')

Data Collected.


<h2>Creating the dataframe containing obtained data from wikipedia page</h2>

In [4]:
column_names = ['Postal Code', 'Borough', 'Neighborhood'] 
neighbors = pd.DataFrame(columns=column_names)
for data in range(len(neighborhoods)):
    code = postalCodes[data]
    borough = boroughs[data]
    neighborhood_name = neighborhoods[data]

    neighbors = neighbors.append({ 'PostalCode': code,
                                   'Borough': borough,
                                   'Neighborhood': neighborhood_name}, ignore_index=True)
    

In [5]:
neighbors

Unnamed: 0,Postal Code,Borough,Neighborhood,PostalCode
0,,Not assigned,Not assigned,M1A
1,,Not assigned,Not assigned,M2A
2,,North York,Parkwoods,M3A
3,,North York,Victoria Village,M4A
4,,Downtown Toronto,"Regent Park, Harbourfront",M5A
...,...,...,...,...
175,,Not assigned,Not assigned,M5Z
176,,Not assigned,Not assigned,M6Z
177,,Not assigned,Not assigned,M7Z
178,,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,...",M8Z


<h3>Only processing the cells that have an assigned borough. Ignoring the cells with a borough that is Not assigned. Droping row where borough is "Not assigned"</h3>

In [6]:
neighbors.drop(neighbors[neighbors['Borough']=='Not assigned'].index,inplace=True)
neighbors.reset_index(drop=True, inplace=True)
neighbors

Unnamed: 0,Postal Code,Borough,Neighborhood,PostalCode
0,,North York,Parkwoods,M3A
1,,North York,Victoria Village,M4A
2,,Downtown Toronto,"Regent Park, Harbourfront",M5A
3,,North York,"Lawrence Manor, Lawrence Heights",M6A
4,,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M7A
...,...,...,...,...
98,,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",M8X
99,,Downtown Toronto,Church and Wellesley,M4Y
100,,East Toronto,"Business reply mail Processing Centre, South C...",M7Y
101,,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",M8Y


<h3>If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough</h3>

In [7]:
neighbors.loc[neighbors['Neighborhood']=='Not assigned','Neighborhood']=neighbors['Borough']
neighbors

Unnamed: 0,Postal Code,Borough,Neighborhood,PostalCode
0,,North York,Parkwoods,M3A
1,,North York,Victoria Village,M4A
2,,Downtown Toronto,"Regent Park, Harbourfront",M5A
3,,North York,"Lawrence Manor, Lawrence Heights",M6A
4,,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M7A
...,...,...,...,...
98,,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",M8X
99,,Downtown Toronto,Church and Wellesley,M4Y
100,,East Toronto,"Business reply mail Processing Centre, South C...",M7Y
101,,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",M8Y


<h3>Rows will be same postalcode will combined into one row with the neighborhoods separated with a comma</h3>

In [8]:
result=neighbors.groupby(['PostalCode','Borough'],sort=False).agg(','.join)
neighbors_new=result.reset_index()

<h3>Shape of the resultant dataframe</h3>

In [9]:
neighbors_new.shape

(103, 3)

<h2>Saving the dataset for further analysis</h2>

In [10]:
neighbors_new.to_csv(r'C:\Users\arunr\Desktop\Toronta.csv', index = False, header=True)