# Scraping Wikipedia for Toronto Neighbourhood Information

## The objective of this code is to scrape the neighbourhood data from a Wikipedia URL, store the data in a pandas DataFrame, and finally add geospatial data to that DataFrame

In [65]:
#importing libraries

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from string import Template
from sklearn.cluster import KMeans
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors


In [2]:
# Download the Wikipedia page listing the "M" postal codes and parse the
# returned HTML into a BeautifulSoup object named `soup`.

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() fails loudly on an HTTP error instead of silently
# parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

In [3]:
# Grab the first <table> element found in the parsed page.
# NOTE(review): assumes the postal-code table is the first table on the page — confirm.

table = soup.find('table')

In [4]:
# Empty accumulators filled by the cells below:
#   col_pcode - postal codes
#   col_add   - raw cell text, later split into borough and neighbourhood
#   col_b     - boroughs
#   col_nbd   - neighbourhoods

col_pcode, col_add, col_b, col_nbd = [], [], [], []


In [5]:
# Walk every cell of the table and collect, for each assigned postal code, the
# code itself (the cell's <b> text) and the accompanying <span> text, which
# holds the borough and neighbourhood together.
# The target lists are created in an earlier cell, so re-running only this
# cell appends the same entries a second time.

for row in table.find_all('tr'):
    for element in row.find_all('td'):
        # A cell without a <span> or <b> child cannot be read as a
        # postal-code entry; skip it instead of failing with
        # AttributeError on None.
        if element.span is None or element.b is None:
            continue
        borough = element.span.text
        if borough != 'Not assigned':
            col_pcode.append(element.b.text)
            col_add.append(borough)
            

    

In [6]:
# Split each combined entry at its first '(': the text before it is the
# borough, the text after it (with every ')' removed) is the neighbourhood.

for entry in col_add:
    pieces = entry.partition('(')
    col_b.append(pieces[0])
    col_nbd.append(pieces[2].replace(')', ''))
    
    


In [7]:
# Assemble the three parallel lists into a DataFrame, one row per collected entry.

rows = list(zip(col_pcode, col_b, col_nbd))
column_names = ['Postal_Code', 'Borough', 'Neighbourhood']
df = pd.DataFrame(rows, columns=column_names)

In [8]:
# Display the assembled postal code / borough / neighbourhood table.
df

Unnamed: 0,Postal_Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L
101,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...


In [9]:
# (rows, columns) of the scraped table.
df.shape

(103, 3)

In [10]:
# Load the geospatial coordinates from the local CSV file into a pandas DataFrame.

geo_csv_path = 'Geospatial_Coordinates.csv'
geo = pd.read_csv(geo_csv_path)

In [11]:
# Index both frames by postal code so they can be aligned on it.
# df_dummy is another name for df (not a copy), so df ends up indexed by
# 'Postal_Code' as well.
# The column checks make the cell safe to re-run: once a frame is already
# indexed its key column is gone, and an unconditional set_index would raise
# KeyError.

if 'Postal_Code' in df.columns:
    df = df.set_index('Postal_Code')
df_dummy = df

if 'Postal Code' in geo.columns:
    geo = geo.set_index('Postal Code')

In [12]:
# Reorder the coordinate rows so they follow the index of df_dummy;
# index labels that geo does not contain come back as NaN rows.

geo_indexed = geo.reindex(index=df_dummy.index)
geo_indexed


Unnamed: 0_level_0,Latitude,Longitude
Postal_Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,43.753259,-79.329656
M4A,43.725882,-79.315572
M5A,43.654260,-79.360636
M6A,43.718518,-79.464763
M7A,43.662301,-79.389494
...,...,...
M8X,43.653654,-79.506944
M4Y,43.665860,-79.383160
M7Y,43.662744,-79.321558
M8Y,43.636258,-79.498509


In [13]:
# Place the borough/neighbourhood columns and the coordinate columns side by
# side, aligned on their shared index.

frames = [df_dummy, geo_indexed]
df_main = pd.concat(frames, axis='columns')
df_main

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postal_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,Regent Park / Harbourfront,43.654260,-79.360636
M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...
M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North,43.653654,-79.506944
M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L,43.662744,-79.321558
M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...,43.636258,-79.498509


In [14]:
# Turn the 'Postal_Code' index of the main dataframe back into an ordinary column.
# Guarded so that re-running the cell is a no-op; an unconditional
# reset_index on a second run would insert a spurious 'index' column.
if df_main.index.name == 'Postal_Code':
    df_main = df_main.reset_index()
df_main

Unnamed: 0,Postal_Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.654260,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North,43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L,43.662744,-79.321558
101,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...,43.636258,-79.498509
