# Segmenting and Clustering Neighborhoods in Toronto : Part 1 
## Scrape data from Wikipedia and build the desired DataFrame

----

### Use Beautiful Soup library to scrape the following Wikipedia URL

Import libraries

In [1]:
import requests
from bs4 import BeautifulSoup

Access the page content at the URL and check its content type in order to choose a parser

In [2]:
# Wikipedia page to scrape (address taken verbatim from the original notebook)
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging indefinitely if the server never answers
wiki = requests.get(url, timeout=30)

# Stop here on an HTTP error status instead of parsing an error page further down
wiki.raise_for_status()

# Last expression of the cell: display the content type so a parser can be chosen
wiki.headers['Content-Type']

'text/html; charset=UTF-8'

use html parser in Beautiful Soup

In [3]:
# Parse the downloaded page text using the "html.parser" backend
soup = BeautifulSoup(markup=wiki.text, features="html.parser")

# Uncomment the line below to display the parsed document
#soup

Extract the cell values of the first table on the page by locating its `td` tags

In [4]:
#import numpy library
import numpy as np

# Query the first table's <td> cells once; repeating find_all inside the loop
# would re-scan the whole table for every single cell.
table_cells = soup.table.find_all('td')

# Keep the string of the first child node of every cell, in document order
newlist = [cell.contents[0].string for cell in table_cells]

Clean up the extracted list by replacing 'Not assigned' with the placeholder string 'NaN' and removing newline characters ('\n')

In [5]:
# Rewrite every scraped string in place: mark unassigned entries with the
# 'NaN' placeholder first, then strip the newline characters.
newlist[:] = [
    text.replace('Not assigned', 'NaN').replace('\n', '')
    for text in newlist
]

###  Copy the webscraped data into Data Frame

In [6]:
#import pandas library
import pandas as pd
pd.set_option("display.max_colwidth",0)

# The scraped list is flat: every group of three consecutive entries forms one
# table row (postcode, borough, neighborhood).
row_starts = range(0, len(newlist), 3)

# Pull each column out of the flat list by its offset within the group of three
Postcode = [newlist[start] for start in row_starts]
Borough = [newlist[start + 1] for start in row_starts]
Neighborhood = [newlist[start + 2] for start in row_starts]

# Assemble the dataframe with the desired columns, in the desired order
df = pd.DataFrame({
    'Postcode': Postcode,
    'Borough': Borough,
    'Neighborhood': Neighborhood,
})

In [7]:
# (rows, columns) of the scraped dataframe, before any rows are dropped
df.shape

(289, 3)

In [8]:
# Preview the first 15 rows of the scraped dataframe
df.head(15)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,
9,M8A,,


### Apply the given operations on Data Frame
Replace all 'NaN' placeholder strings with NumPy's null value (`np.nan`)

In [9]:
# Swap the 'NaN' placeholder strings for real missing values (np.nan)
df = df.replace(to_replace='NaN', value=np.nan)

Remove Null values in column Borough

In [10]:
# Keep only the rows whose Borough is known
df = df.dropna(subset=['Borough'])

Identify Null values in Neighborhood and replace them with Borough values

In [11]:
# Fill every missing Neighborhood with the Borough of the same row.
# fillna aligns the two columns on the row index, so only the null entries are
# touched, and the result is written back as a whole column rather than through
# a Series taken from the dataframe (the pattern behind SettingWithCopyWarning).
df['Neighborhood'] = df['Neighborhood'].fillna(df['Borough'])

In [12]:
# (rows, columns) remaining after the rows without a Borough were dropped
df.shape

(212, 3)

In [13]:
# Preview the first 15 rows after cleaning
df.head(15)

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


Combine the neighborhoods that share the same Postcode into a single comma-separated entry, using the DataFrame `groupby` and `agg` functions.

In [14]:
def _largest_borough(boroughs):
    """Return the maximum Borough value found within one postcode group."""
    return boroughs.max()


def _comma_joined(neighborhoods):
    """Join the Neighborhood values of one postcode group with commas."""
    return ",".join(neighborhoods.tolist())


# One output row per Postcode; sort=False keeps the postcodes in the order in
# which they first appear in df.
rows_by_postcode = df.groupby('Postcode', sort=False)
newdf = rows_by_postcode.agg({
    'Borough': _largest_borough,
    'Neighborhood': _comma_joined,
})

Reset the index so that `Postcode` becomes a regular column again, then check the DataFrame

In [15]:
# Move the Postcode group keys out of the index and back into a regular column
newdf = newdf.reset_index()

# Preview the first 15 combined rows
newdf.head(15)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


In [16]:
# (rows, columns) of the grouped dataframe: one row per distinct Postcode
newdf.shape

(103, 3)

### The dataframe is correctly produced from web scraped data