# Segmenting and Clustering Neighborhoods in Toronto (Q2)

In [50]:
!pip install beautifulsoup4
##parse html
!pip install lxml 
!pip install html5lib
#request library
!pip install requests



### 1. Import the necessary packages

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

### 2. Set the parameters and initiate web scraping with pandas to find the table

In [29]:
# Wikipedia page: list of Canadian postal codes, "M" series
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [30]:
# Read the HTML tables found at `url` into a list of DataFrames,
# then report how many were found.
tables = pd.read_html(io=url)
print('Tables found:', len(tables))

Tables found: 3


In [31]:
# The first parsed table is the postal-code table we are looking for.
df = tables[0]
df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### 4. Clean up the extracted table

In [32]:
# Tidy the scraped table in one chained expression:
#   1. rename 'Postal Code' to 'PostalCode' for easier reference,
#   2. discard rows whose borough is 'Not assigned',
#   3. renumber the remaining rows from zero.
df = (
    df.rename(columns={'Postal Code': 'PostalCode'})
      .loc[lambda frame: frame['Borough'] != 'Not assigned']
      .reset_index(drop=True)
)
# Preview the first 12 rows
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [33]:
# Dimensions (rows, columns) of the cleaned table, as requested in the question
df.shape

(103, 3)

### 5. Import CSV file containing longitude and latitude

In [34]:
# Location of the CSV file holding latitude/longitude per postal code
longlat_url = "https://cocl.us/Geospatial_data"

In [35]:
# Load the coordinate lookup table (Postal Code, Latitude, Longitude)
df2 = pd.read_csv(filepath_or_buffer=longlat_url)
df2.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### 6. Merge the latitude and longitude into the existing table

In [36]:
# Attach the coordinates to each postal code. A left join keeps every row of
# the cleaned table; the key column is named differently in the two frames.
df = df.merge(
    df2,
    how='left',
    left_on='PostalCode',
    right_on='Postal Code',
)

# Preview the first 12 rows of the merged table
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",M5A,43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",M6A,43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M7A,43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",M9A,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",M1B,43.806686,-79.194353
7,M3B,North York,Don Mills,M3B,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",M4B,43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",M5B,43.657162,-79.378937


In [37]:
# Check the dimensions of the table after the merge
df.shape

(103, 6)

### 7. Drop the unnecessary column and finalize the result

In [38]:
# 'Postal Code' (from the coordinates file) duplicates 'PostalCode', so remove it
df = df.drop(columns=['Postal Code'])
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [39]:
# Check the dimensions of the final table
df.shape

(103, 5)