# 1. Getting the data from the Wiki

In this project I'll explore, segment, and cluster the neighbourhoods in Toronto, Canada. 

To get the data, I'm going to scrape and wrangle it from the wiki page: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M 

Once the data is structured I'll start segmenting and clustering the Toronto neighbourhoods using the Foursquare venue selection as the basis for feature building. 

In [1]:
# Import the required libraries (third-party only; nothing from the stdlib is needed)
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Lift pandas' display limits so DataFrames are shown with every column and every row
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

print('Libraries imported.')

Libraries imported.


In [2]:
# Download the wiki page and parse it to extract the table column values
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Keep `url` bound to the address and hold the response under its own name, so
# this cell can be re-run without sending HTML text back into requests.get().
# The timeout stops a stalled connection from hanging the kernel, and
# raise_for_status() reports HTTP errors here rather than as a parse failure below.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', class_='sortable')
if table is None:
    raise ValueError('No sortable table found at {}'.format(url))

# Collect one [postcode, borough, neighbourhood] record per table row
data = []
for tr in table.find_all('tr'):
    cells = [td.text.strip() for td in tr.find_all('td')]
    # Rows without <td> cells (e.g. the header row) carry no record, and rows
    # with fewer than three cells cannot be unpacked into the three fields.
    if len(cells) < 3:
        continue
    zip_code, borough, neighbourhood = cells[:3]
    data.append([zip_code, borough, neighbourhood])


In [3]:
# Load the scraped rows into a DataFrame with column labels, drop the rows whose
# Borough is 'Not assigned', and merge the neighbourhoods that share a
# (Postcode, Borough) pair into a single comma-separated string.

df = pd.DataFrame(data, columns = ['Postcode', 'Borough', 'Neighbourhood'])
df = (
    df[df['Borough'] != 'Not assigned']
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    # agg() on a single column yields a Series indexed by the group keys on
    # every pandas version. The previous as_index=False + apply() + to_frame()
    # chain fails on releases where apply() already returns a DataFrame,
    # because DataFrame has no to_frame().
    .agg(', '.join)
    # Turn the (Postcode, Borough) index levels back into ordinary columns ...
    .reset_index()
    # ... then keep only Postcode as the index.
    .set_index('Postcode')
)
df.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge, Malvern"
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [4]:
# Helper that resolves 'Not assigned' values of the Neighbourhood column
def fill_missing(input):
    """Return the neighbourhood of one record, falling back to its borough.

    Parameters
    ----------
    input : iterable of exactly two items
        The borough followed by the neighbourhood, in that order.

    Returns
    -------
    The borough when the neighbourhood equals 'Not assigned'; otherwise the
    neighbourhood, unchanged.
    """
    borough, neighbourhood = input
    return borough if neighbourhood == 'Not assigned' else neighbourhood

# Spot-check the helper on postcode M7A. The row is looked up by its label, and
# the two columns are named explicitly, so the check does not depend on the
# row's position or on column order.
fill_missing(df.loc['M7A', ['Borough', 'Neighbourhood']])

"Queen's Park"

In [5]:
# Run the helper on every row so each 'Not assigned' neighbourhood takes its borough's name
borough_and_neighbourhood = df.loc[:, ['Borough', 'Neighbourhood']]
df['Neighbourhood'] = borough_and_neighbourhood.apply(fill_missing, axis='columns')

In [6]:
# Display the row(s) indexed by postcode M7A to inspect the result of the fill
df.loc[df.index == 'M7A']

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M7A,Queen's Park,Queen's Park


In [7]:
# Size of the cleaned table as (rows, columns); Postcode is the index, so it is not counted as a column
df.shape

(103, 2)

In [8]:
# Save the cleaned table; the Postcode index is written out as the first CSV column
df.to_csv('toronto_neigh.csv', index=True)