# Segmenting and Clustering Neighborhoods in Toronto

### Scrape Wikipedia page and obtain a dataframe with postal code's data

#### Import library we use to open URLs

In [1]:
import urllib.request

#### specify which URL/web page we are going to be scraping

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

#### Put the html content of the web page into the page variable

In [3]:
page = urllib.request.urlopen(url)

#### Import library to parse HTML and XML documents

In [4]:
from bs4 import BeautifulSoup

#### parse the HTML from our URL into the BeautifulSoup parse tree format

In [5]:
soup = BeautifulSoup(page, "lxml")

#### Use the 'find_all' function to bring back all instances of the 'table' tag in the HTML and store in 'all_tables' variable

In [6]:
all_tables=soup.find_all("table")

#### Extract the table containing the targeted data

In [7]:
postal_table=soup.find('table', class_='wikitable sortable')

#### Processing the table html content so that we get data in a dataframe

In [9]:
A=[]

B=[]

C=[]

for row in postal_table.findAll('tr'):

    cells=row.findAll('td')

    if len(cells)==3:

        A.append(cells[0].find(text=True))

        B.append(cells[1].find(text=True))

        C.append(cells[2].find(text=True))

In [10]:
import pandas as pd

df=pd.DataFrame(A,columns=['PostalCode'])

df['Borough']=B

df['Neighborhood']=C

df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


#### We eliminate cells with a borough that is Not assigned

In [11]:
#Setting the dataframe
df = df[df['Borough'] != 'Not assigned\n']
df

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


#### Combine row with same PostalCode and concatenate Neighborhood content

In [12]:
# We sort dataframe's data by PostalCode
df.sort_values(by=['PostalCode'], inplace=True)
df.reset_index(drop = True, inplace = True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [13]:
for ind in df.index:
    if (ind < len(df.index) - 1):
        if (df['PostalCode'][ind] == df['PostalCode'][ind+1]):
            df['Neighborhood'][ind] = df['Neighborhood'][ind] + ', ' + df['Neighborhood'][ind+1]
            df.drop(df.index[ind+1], inplace=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


#### Make Neighborhood the same as Borough if it's Not Assigned

In [14]:
# Resetting indexes
df.reset_index(drop = True, inplace = True)
# process
for ind in df.index:
    if (df['Neighborhood'][ind] == 'Not assigned'):
        df['Neighborhood'][ind] = df['Borough'][ind]
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [15]:
df.shape

(103, 3)