<h1>Segmenting and Clustering Neighbourhoods in Toronto</h1>

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import requests
from bs4 import BeautifulSoup
import csv
print("Initial packages imported: \nNumPy, Pandas, Matplotlib, Requests, bs4.BeautifulSoup, CSV.")

Initial packages imported: 
NumPy, Pandas, Matplotlib, Requests, bs4.BeautifulSoup, CSV.


In [36]:
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = (requests.get(wikipedia_link)).text

In [37]:
# Print Wikipedia page title
begin_title_tag_index = page.find("<title>")
end_title_tag_index = page.find("</title>")
start_title_index = begin_title_tag_index + 7
end_title_index = end_title_tag_index

title_string = page[int(start_title_index):int(end_title_index)]
wikipedia_page_title = title_string.strip(' - Wikipedia')
print(wikipedia_page_title)

List of postal codes of Canada: M


In [38]:
soup = BeautifulSoup(page, 'lxml')
# print(soup.prettify())

In [23]:
# Conduct data cleanup on the scraped webpage to generate the table with Canada postal codes
My_table = soup.find('body', {'class' : 'mediawiki'})
# print("My_table = \n", My_table)

My_table1 = soup.find('table', {'class' : 'wikitable sortable'})
# print("My_table1 = \n", My_table1.prettify())

# Generate the array for the webpage's HTML markup for the table rows ('tr'); print out the first 10 table rows 
rows1 = My_table1.findAll('tr')
rows1[0:8]

[<tr>
 <th>Postal Code
 </th>
 <th>Borough
 </th>
 <th>Neighbourhood
 </th></tr>,
 <tr>
 <td>M1A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>,
 <tr>
 <td>M2A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>,
 <tr>
 <td>M3A
 </td>
 <td>North York
 </td>
 <td>Parkwoods
 </td></tr>,
 <tr>
 <td>M4A
 </td>
 <td>North York
 </td>
 <td>Victoria Village
 </td></tr>,
 <tr>
 <td>M5A
 </td>
 <td>Downtown Toronto
 </td>
 <td>Regent Park, Harbourfront
 </td></tr>,
 <tr>
 <td>M6A
 </td>
 <td>North York
 </td>
 <td>Lawrence Manor, Lawrence Heights
 </td></tr>,
 <tr>
 <td>M7A
 </td>
 <td>Downtown Toronto
 </td>
 <td>Queen's Park, Ontario Provincial Government
 </td></tr>]

In [39]:
# Calculate the total number of rows in the table of Canada postal codes
len(rows1)

181

In [25]:
# Generate the array for the Canada Postal Codes Table; eliminate rows with empty values
postalCodesTable = []
for row in rows1:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    postalCodesTable.append([ele for ele in cols if ele]) 
# postalCodesTable

In [40]:
# Confirm the total number of rows in the table of Canada postal codes:
#   make sure the above data cleanup is correct, with no dropped rows
len(postalCodesTable)

180

In [49]:
postalCodesTable = pd.DataFrame(postalCodesTable)
postalCodesTable.columns = ['PostalCode', 'Borough', 'Neighbourhood']
postalCodesTable.head(16)

Unnamed: 0,PostalCode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"


In [52]:
postalCodesTable[postalCodesTable['Borough']=='Not assigned'].count()

PostalCode       77
Borough          77
Neighbourhood    77
dtype: int64

In [53]:
postalCodesTable1 = postalCodesTable.drop(postalCodesTable[postalCodesTable['Borough'].str.contains('Not assigned')].index)
postalCodesTable1.head(16)

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"
12,M3B,North York,Don Mills
13,M4B,East York,"Parkview Hill, Woodbine Gardens"
14,M5B,Downtown Toronto,"Garden District, Ryerson"


In [54]:
postalCodesTable1.tail(16)

Unnamed: 0,PostalCode,Borough,Neighbourhood
140,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."
143,M8V,Etobicoke,"New Toronto, Mimico South, Humber Bay Shores"
144,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
145,M1W,Scarborough,"Steeles West, L'Amoreaux West"
148,M4W,Downtown Toronto,Rosedale
149,M5W,Downtown Toronto,Stn A PO Boxes
152,M8W,Etobicoke,"Alderwood, Long Branch"
153,M9W,Etobicoke,"Northwest, West Humber - Clairville"
154,M1X,Scarborough,Upper Rouge
157,M4X,Downtown Toronto,"St. James Town, Cabbagetown"


In [55]:
print("Shape  of 'postalCodesTable1' is: ", postalCodesTable1.shape)

Shape  of 'postalCodesTable1' is:  (103, 3)


In [56]:
# Make a copy of postalCodesTable1 to generate postalCodesTable2, to ease with program debug
postalCodesTable2 = postalCodesTable1
print("Shape of 'postalCodesTable2' = ", postalCodesTable2.shape)

Shape of 'postalCodesTable2' =  (103, 3)


In [57]:

# In the loop below, the two pointers 'm' and 'n' point to records (rows) in the dataframe.
#  'm' will vary from 0 to 210, and 'n' will vary from 1 to 211
m=0
n=0
neigh1 = ''
neigh2 = ''
nrows2 = len(postalCodesTable2)-1

while m < nrows2 :
    n = m+1
    pcode1 = postalCodesTable2.iloc[m,0]
    pcode2 = postalCodesTable2.iloc[n,0]
    # print("\nm = ", m, ";  n = ", n)
    # print("pcode1 = ", pcode1)
    # print("pcode2 = ", pcode2)
    neigh1 = postalCodesTable2.iloc[m,2]
    neigh2 = postalCodesTable2.iloc[n,2]
    # print("neigh1 = ", neigh1)
    # print("neigh2 = ", neigh2)

    if pcode1 == pcode2:
        if neigh2 not in neigh1:
            neigh1 = neigh1 + ', ' + neigh2
        # print("Append Neighborhoods = ", neigh1)
        # print("postalCodesTable2.shape = ", postalCodesTable2.shape)
        # print("m = ", m, ";  n = ", n)
        postalCodesTable2.iloc[m,2] = neigh1
        # print("postalCodesTable2.iloc[m,2] = ", postalCodesTable2.iloc[m,2])
        postalCodesTable2 = postalCodesTable2[postalCodesTable2.Neighbourhood != 'neigh2']
        
        postalCodesTable2 = postalCodesTable2.drop(postalCodesTable2.index[n])
        nrows2 = nrows2-1
        postalCodesTable2 = postalCodesTable2.reset_index(drop=True)
        
    else:
        m = m+1

In [58]:
print("Shape of 'postalCodesTable2' is: ", postalCodesTable2.shape)

Shape of 'postalCodesTable2' is:  (103, 3)


In [59]:
postalCodesTable2.head(16)

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"
12,M3B,North York,Don Mills
13,M4B,East York,"Parkview Hill, Woodbine Gardens"
14,M5B,Downtown Toronto,"Garden District, Ryerson"


In [60]:
postalCodesTable2.tail(16)

Unnamed: 0,PostalCode,Borough,Neighbourhood
140,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."
143,M8V,Etobicoke,"New Toronto, Mimico South, Humber Bay Shores"
144,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
145,M1W,Scarborough,"Steeles West, L'Amoreaux West"
148,M4W,Downtown Toronto,Rosedale
149,M5W,Downtown Toronto,Stn A PO Boxes
152,M8W,Etobicoke,"Alderwood, Long Branch"
153,M9W,Etobicoke,"Northwest, West Humber - Clairville"
154,M1X,Scarborough,Upper Rouge
157,M4X,Downtown Toronto,"St. James Town, Cabbagetown"


In [61]:
grouped = postalCodesTable2.groupby(['PostalCode','Borough'], as_index=False)

postalCodesTable3 = pd.DataFrame(grouped.sum())
postalCodesTable3.rename(columns={'PostalCode':'Postal Code'}, inplace = True)
postalCodesTable3.head(20)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [62]:
postalCodesTable3.tail(20)

Unnamed: 0,Postal Code,Borough,Neighbourhood
83,M6R,West Toronto,"Parkdale, Roncesvalles"
84,M6S,West Toronto,"Runnymede, Swansea"
85,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
86,M7R,Mississauga,Canada Post Gateway Processing Centre
87,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
88,M8V,Etobicoke,"New Toronto, Mimico South, Humber Bay Shores"
89,M8W,Etobicoke,"Alderwood, Long Branch"
90,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
91,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
92,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [63]:
print("Shape of 'postalCodesTable3' is: ", postalCodesTable3.shape)

Shape of 'postalCodesTable3' is:  (103, 3)
