# Segmenting and Clustering Neighborhoods in Toronto
### Task 1: Building the Neighbourhood Dataframe

In [2]:
# Import libraries

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Retrieve neighbourhood data from the Wikipedia page of Toronto ("M") postal codes

nb_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
nb_response = requests.get(nb_url, timeout=30)
nb_response.raise_for_status()
nb_html = nb_response.text
nb_soup = BeautifulSoup(nb_html, 'html.parser')

# The rows are read from the first <tbody> in the document; fail with a clear
# message if the page no longer contains one.
nb_tbody = nb_soup.tbody
if nb_tbody is None:
    raise ValueError("No <tbody> element found at {}".format(nb_url))

# One list of stripped cell texts per <tr>. A row without any <td> cells
# (presumably the header row) yields an empty list, which later shows up as a
# row of missing values in the dataframe.
nb_data = []
for tr in nb_tbody.find_all('tr'):
    nb_data.append([td.get_text().strip() for td in tr.find_all('td')])

In [4]:
# Build the dataframe: one row per scraped table row, with the three columns named explicitly

nb_columns = ['PostalCode', 'Borough', 'Neighbourhood']
nb_df = pd.DataFrame(data=nb_data, columns=nb_columns)
nb_df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,,,
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


### Cleaning up data

In [5]:
# Remember the index labels of every row whose Borough is "Not assigned"

unassigned_borough = nb_df['Borough'] == "Not assigned"
NA_indx = nb_df.loc[unassigned_borough].index

In [6]:
# Remove every row that contains a missing value -- this gets rid of the
# unnecessary first row, which is shown as None in the preview above

nb_df.dropna(axis='index', how='any', inplace=True)

In [7]:
# Keep only the rows whose Borough is assigned.
# Filtering on the condition itself, rather than dropping index labels saved in an
# earlier cell, makes this cell safe to re-run: it never raises KeyError for labels
# that are already gone, and it cannot remove the wrong rows through stale labels
# once the index has been rebuilt further down the notebook.

nb_df = nb_df.loc[nb_df['Borough'] != "Not assigned"].copy()

In [8]:
# Preview the first five rows after the clean-up steps above
nb_df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Regent Park / Harbourfront
6,M6A,North York,Lawrence Manor / Lawrence Heights
7,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [9]:
# Collapse rows that share the same PostalCode and Borough into a single row,
# joining their Neighbourhood names with ", "

nb_df = (
    nb_df
    .groupby(by=['PostalCode', 'Borough'])['Neighbourhood']
    .apply(', '.join)
    .reset_index()
)

In [10]:
# Fix Not assigned Neighbourhoods with the Borough Names

def fix_NB(data):
    """Return the neighbourhood name for one row, falling back to its borough.

    data: a row-like object indexable by the keys 'Neighbourhood' and 'Borough'.
    Returns data['Borough'] when the neighbourhood equals 'Not assigned',
    otherwise data['Neighbourhood'] unchanged.
    """
    neighbourhood = data['Neighbourhood']
    return data['Borough'] if neighbourhood == 'Not assigned' else neighbourhood

# Apply fix_NB to every row and store the result in a new Neighborhood column
nb_df['Neighborhood'] = nb_df.apply(fix_NB, axis=1)

# Sanity check: no row should still carry the "Not assigned" placeholder
unassigned_left = nb_df.loc[nb_df['Neighborhood'] == 'Not assigned']
print("Not assigned Neighborhood count = {}".format(len(unassigned_left)))

Not assigned Neighborhood count = 0


In [11]:
# The old Neighbourhood column has been replaced by Neighborhood, so remove it

nb_df.drop(labels='Neighbourhood', axis='columns', inplace=True)
nb_df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Shape of the final dataframe

In [12]:
# Report the dimensions of the final dataframe
n_rows, n_cols = nb_df.shape
print("Number of Rows = {}".format(n_rows))
print("Number of Columns = {}".format(n_cols))

Number of Rows = 103
Number of Columns = 3
