# Segmenting and Clustering Neighborhoods in Toronto (Part 1 of 3)

In [11]:
# Import libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup

### Scrape the postal codes, boroughs & neighborhoods of Canada from the Wikipedia page.

In [12]:
# Download the Wikipedia page listing the postal codes that start with "M".
url_canada_list = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url_canada_list, timeout=30)  # never hang forever on a stalled connection
response.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
source_code = response.text

# The page is HTML, so use an HTML parser; the strict 'xml' parser is meant
# for XML documents and can mis-parse HTML markup.
soup = BeautifulSoup(source_code, 'html.parser')

# The first <table> on the page is taken to be the postal-code table.
table_postal_codes = soup.find('table')
if table_postal_codes is None:
    raise ValueError('No <table> element found at ' + url_canada_list)

# <tbody> is optional in HTML; fall back to the table itself when it is absent
# so the row lookup in the following cell still works.
row_postal_codes = table_postal_codes.find('tbody')
if row_postal_codes is None:
    row_postal_codes = table_postal_codes

### Create an empty dataframe.

In [13]:
# Column labels of the postal-code table, in the order they are filled
column_names = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]

# Start from a frame that has the labels above but no rows yet
df_postal_codes = pd.DataFrame(columns=column_names)
df_postal_codes

Unnamed: 0,PostalCode,Borough,Neighborhood


###  Loop through the data and fill the dataframe.

In [14]:
# Collect the stripped text of every data row first, then build the frame in a
# single call. Growing the frame one row at a time with .loc[len(df)] is slow,
# and it appended duplicate rows whenever this cell was re-run; rebuilding the
# frame from scratch makes the cell safe to run repeatedly.
records = []
for row in row_postal_codes.find_all('tr'):
    data = [cell.text.strip() for cell in row.find_all('td')]

    # Rows without any <td> cell (e.g. a header row) yield no data: skip them.
    if not data:
        continue

    # A row whose cell count differs from the expected columns means the page
    # layout is not what this notebook expects; stop with a clear message.
    if len(data) != len(column_names):
        raise ValueError(
            'Expected {} cells per table row but found {}: {!r}'.format(
                len(column_names), len(data), data
            )
        )
    records.append(data)

df_postal_codes = pd.DataFrame(records, columns=column_names)
df_postal_codes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Remove rows with a borough that is 'Not assigned'.

In [15]:
# Find the index labels of rows whose borough is 'Not assigned' ...
is_unassigned = df_postal_codes['Borough'] == 'Not assigned'
unassigned_labels = df_postal_codes.index[is_unassigned]

# ... and continue with a frame that no longer contains those rows
df_postal_codes = df_postal_codes.drop(index=unassigned_labels)
df_postal_codes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### If a row has a borough but its neighborhood is 'Not assigned', use the borough as the neighborhood.

In [16]:
# Rows that have a borough but whose neighborhood is still 'Not assigned'
needs_fallback = df_postal_codes['Neighborhood'] == 'Not assigned'

# For those rows only, copy the borough name into the neighborhood column
df_postal_codes.loc[needs_fallback, 'Neighborhood'] = df_postal_codes.loc[needs_fallback, 'Borough']
df_postal_codes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Combine neighborhoods if the same postal code exists in multiple rows.

In [17]:
def join_neighborhood(dataframe):
    """Join the 'Neighborhood' values of ``dataframe`` into one string.

    The values are sorted in ascending order and separated by ', '.
    """
    names = list(dataframe['Neighborhood'])
    names.sort()
    return ', '.join(names)
                    
# Group the rows that share a postal code and borough.
temp_df = df_postal_codes.groupby(['PostalCode', 'Borough'])

# Collapse each group's neighborhoods into one string. Only the 'Neighborhood'
# column is handed to apply(): applying a function to groups that still carry
# the grouping columns is deprecated in recent pandas releases, and
# join_neighborhood reads nothing but that column anyway.
df_postal_codes = (
    temp_df[['Neighborhood']]
    .apply(join_neighborhood)
    .reset_index(name='Neighborhood')
)

In [18]:
# Display up to the first 20 rows of the dataframe for a visual check
df_postal_codes.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### Print the number of rows of the dataframe.

In [19]:
# .shape is a (rows, columns) tuple; the first entry is the number of rows
df_postal_codes.shape

(103, 3)