# Segmenting neighborhoods in Toronto 

## Scraping data from Wikipedia

https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [474]:
from bs4 import BeautifulSoup


import pandas as pd
import numpy as np

# Disable pandas' display truncation so every column and every row is rendered
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# json_normalize: transform JSON data into a pandas dataframe.
# pandas 1.0 moved it to the top-level namespace and deprecated the
# pandas.io.json import path, so try the current location first.
try:
    from pandas import json_normalize
except ImportError:
    # older pandas releases only expose it under pandas.io.json
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [284]:
# Source page to scrape: Wikipedia's list of Canadian postal codes beginning with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"


In [285]:
# Download the page. The context manager closes the session (and its pooled
# connections) once the request has completed; the response body has already
# been read by then, so response.content stays usable afterwards.
with requests.Session() as s:
    response = s.get(url, timeout=2)

# Fail fast on a 4xx/5xx status instead of parsing an error page downstream.
response.raise_for_status()
response

<Response [200]>

In [286]:
# Parse the raw response bytes into a BeautifulSoup tree with the 'html.parser' backend
soup = BeautifulSoup(response.content, 'html.parser')
# Indented string rendering of the parsed document, kept for manual inspection
pretty_soup = soup.prettify()

In [287]:
# Text of the page's <title> tag, as a quick check that the expected page was fetched
soup.title.string

'List of postal codes of Canada: M - Wikipedia'

In [288]:
# Locate the postal-code table by its class attribute.
table_of_Tronto = soup.find('table',{"class":'wikitable sortable'})

# soup.find returns None when nothing matches; stop here with a clear message
# rather than failing later with an AttributeError on None.
if table_of_Tronto is None:
    raise ValueError("No <table class='wikitable sortable'> found in the downloaded page")

In [289]:
# Column count, read from the <td> cells of the table's final row
row = table_of_Tronto.findAll("tr")[-1]
cells = row.findAll('td')

len(cells)


3

In [290]:
# All <tr> elements of the table (header row included) and how many there are
rows = table_of_Tronto.findAll("tr")
len(rows)

181

## Get the Header Attributes

In [291]:
# Column titles: right-stripped text of every <th> in the first table row
header = [heading.text.rstrip() for heading in rows[0].find_all('th')]
print(header, '------------', len(header), sep='\n')

['Postal Code', 'Borough', 'Neighbourhood']
------------
3


## Get Tabular Data


In [292]:
# One list of right-stripped cell texts per body row (the header row is skipped)
lst_data = []
for row in rows[1:]:
    data = []
    for cell in row.find_all('td'):
        data.append(cell.text.rstrip())
    lst_data.append(data)

In [293]:
# Same extraction again, using select('td') in place of find_all('td')
lst_data1 = []
for row in rows[1:]:
    data = [td.text.rstrip() for td in row.select('td')]
    lst_data1.append(data)

In [294]:
# First three records from the select()-based list
lst_data1[0:3]

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods']]

In [295]:
# Number of fields in the first scraped record (only record 0 is measured)
len(lst_data[0])

3

In [296]:
# Keep the raw <tr> elements in a plain list so a single row's HTML can be inspected

list_row = list(table_of_Tronto.findAll("tr"))

# Row total, then the markup of the row at index 10 between separator lines
print('Number of row :',len(list_row))
print('----------------')
print(list_row[10])
print('----------------')


Number of row : 181
----------------
<tr>
<td>M1B
</td>
<td>Scarborough
</td>
<td>Malvern, Rouge
</td></tr>
----------------


In [297]:
# Number of fields in the first scraped record (same check as an earlier cell)
len(lst_data[0])

3

In [458]:
# Scrape the table body and append each cell to its column's list

c1=[]  # postal codes
c2=[]  # boroughs
c3=[]  # neighbourhoods

for row in table_of_Tronto.findAll("tr"):
    cells = row.findAll('td')
    if len(cells)==3: # only body rows; rows without exactly three <td> cells (e.g. the heading) are skipped
        # get_text() joins every text node inside the cell, so text wrapped in
        # links or other markup is kept, and an empty cell yields '' instead of
        # the None that find(text=True) returns (which would crash on .replace).
        postal_code, borough, neighbourhood = (
            cell.get_text().replace('\n','').strip() for cell in cells
        )
        c1.append(postal_code)
        c2.append(borough)
        c3.append(neighbourhood)
      
        

In [459]:
# Dictionary keyed by the table's column names, each starting at a placeholder 0
neighborhoods_data = dict.fromkeys(header, 0)
neighborhoods_data

{'Postal Code': 0, 'Borough': 0, 'Neighbourhood': 0}

In [460]:
# Replace the placeholder zeros with the scraped column lists
neighborhoods_data.update({
    'Postal Code': c1,
    'Borough': c2,
    'Neighbourhood': c3,
})

In [461]:
# Plain dict (not yet a DataFrame) mapping each column name to a placeholder 0
df = {column: 0 for column in header}
df

{'Postal Code': 0, 'Borough': 0, 'Neighbourhood': 0}

In [462]:
# Swap each placeholder for the matching scraped column list
df.update(zip(('Postal Code', 'Borough', 'Neighbourhood'), (c1, c2, c3)))


In [652]:
# Build the DataFrame from the column-name -> list mapping
df_Toronto = pd.DataFrame.from_dict(df)

# First five rows

df_Toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Test,Not assigned
1,M2A,Test2,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Clean Dataset

In [670]:
# Clean the scraped table in one chain:
#   1. turn the "Not assigned" placeholder into NaN,
#   2. drop rows that have no Borough,
#   3. where Neighbourhood is missing, fall back to the row's Borough.
# Step 3 fills column-to-column explicitly instead of using
# fillna(method='ffill', axis=1): the `method` argument is deprecated in recent
# pandas, and the row-wise forward fill only worked because Neighbourhood happens
# to sit immediately to the right of Borough.
# The result is rebound to df_Toronto rather than mutated with inplace=True.
df_Toronto = (
    df_Toronto
    .replace("Not assigned", np.nan)
    .dropna(subset=['Borough'], axis=0)
    .assign(Neighbourhood=lambda frame: frame['Neighbourhood'].fillna(frame['Borough']))
)

df_Toronto.head()


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Test,Test
1,M2A,Test2,Test2
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [607]:
# Preview the first rows of df_Toronto
df_Toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Test,
1,M2A,Test2,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [473]:
# Dimensions of df_Toronto as (rows, columns)
df_Toronto.shape

(103, 3)