<h1 align=center>Part 1 - Segment & Cluster Neighborhoods in Toronto</h1>

<h4>Importing the necessary packages and libraries</h4>

In [34]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [35]:
# Download the Wikipedia page that lists Canadian postal codes beginning with "M".
List_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of silently handing an
# error page to the parser in the following cells.
response = requests.get(List_url, timeout=30)
response.raise_for_status()
source = response.text

In [36]:
# The downloaded page is HTML, so parse it with an HTML parser rather than the
# 'xml' feature (which is meant for XML documents and needs lxml installed).
# 'html.parser' ships with Python, so no extra dependency is required.
soup = BeautifulSoup(source, 'html.parser')

In [37]:
# Grab the first <table> element in the parsed page.
table = soup.find(name='table')

<h4>Dataframe with three columns: PostalCode, Borough, and Neighborhood</h4>

In [40]:
# Start from an empty frame that already carries the three target columns:
# postal code, borough and neighbourhood.
column_names = [
    'Postalcode',
    'Borough',
    'Neighbourhood',
]
df = pd.DataFrame(columns=column_names)

In [41]:
# Collect the text of every table row that has exactly three <td> cells
# (postcode, borough, neighbourhood); rows with any other cell count are skipped.
rows = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    if len(row_data) == 3:
        rows.append(row_data)

# Build the frame in one step instead of appending with df.loc[len(df)] inside
# the loop: growing a DataFrame row by row is slow, and re-running the cell
# used to append the same rows a second time.
df = pd.DataFrame(rows, columns=column_names)

In [42]:
# Preview the first five scraped rows.
df.head(n=5)

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


<h4>Data Cleaning</h4>
<h5>Remove rows where Borough is 'Not assigned', and set Neighbourhood to the Borough name where Neighbourhood is 'Not assigned'</h5>

In [43]:
# Keep only rows whose borough is assigned. The explicit .copy() makes the
# result an independent frame, so the column assignments in the following
# cells do not raise SettingWithCopyWarning.
df = df[df['Borough'] != 'Not assigned'].copy()

In [45]:
# Normalise missing neighbourhoods to the 'Not assigned' marker.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection is a chained operation that is not guaranteed to
# modify df.
# NOTE(review): the scraped cells are strings, so the original integer 0 never
# matches; the preview of the scraped table shows blank neighbourhood cells,
# so empty strings are treated as missing as well -- confirm this is intended.
df['Neighbourhood'] = df['Neighbourhood'].replace([0, ''], 'Not assigned')

In [46]:
# Where the neighbourhood is 'Not assigned', use the borough name instead.
# This is done with a boolean mask and .loc because writing to the `row`
# yielded by DataFrame.iterrows() is not guaranteed to change the frame
# (the yielded row may be a copy).
unassigned = df['Neighbourhood'] == 'Not assigned'
df.loc[unassigned, 'Neighbourhood'] = df.loc[unassigned, 'Borough']

In [47]:
# Turn the '/' separators between neighbourhood names into commas.
df['Neighbourhood'] = df['Neighbourhood'].map(
    lambda name: name.replace('/', ',')
)

In [48]:
# Preview the first five rows after cleaning.
df.head(n=5)

Unnamed: 0,Postalcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


<h4>More than one neighborhood can exist in one postal code area</h4>

In [49]:
# For each postal code, join all of its neighbourhood names into a single
# comma-separated string, stored in a 'Neighbourhood_joined' column.
temp_df = (
    df.groupby('Postalcode')['Neighbourhood']
    .apply(lambda names: ', '.join(names))
    .reset_index()
    .rename(columns={'Neighbourhood': 'Neighbourhood_joined'})
)

In [50]:
# Attach the joined neighbourhood string to every row with the same postal code.
df_merge = df.merge(temp_df, on='Postalcode')

In [51]:
# The per-row neighbourhood column is superseded by the joined one; remove it.
df_merge.drop(columns='Neighbourhood', inplace=True)

In [53]:
# Drop fully duplicated rows, keeping the first occurrence of each.
df_merge.drop_duplicates(keep='first', inplace=True)

In [54]:
# Give the joined column the final 'Neighbourhood' name.
df_merge.rename(
    {'Neighbourhood_joined': 'Neighbourhood'},
    axis='columns',
    inplace=True,
)

In [55]:
# Preview the first five rows of the merged frame.
df_merge.head(n=5)

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


<h4>Use the .shape attribute to print the dimensions of the dataframe</h4>

In [56]:
# Dimensions of the final frame as a (rows, columns) tuple.
df_merge.shape

(103, 3)