In [1]:
import pandas as pd 
import numpy as np 
from bs4 import BeautifulSoup
import requests

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
# Download the page HTML. A timeout keeps a stalled connection from hanging
# the kernel, and raise_for_status() stops the notebook on an HTTP error
# instead of letting an error page be parsed as if it were the data table.
response = requests.get(url, timeout=30)
response.raise_for_status()
req = response.text

In [4]:
# Parse the downloaded HTML text using Python's built-in 'html.parser' backend.
neighborhood_soup = BeautifulSoup(req, 'html.parser')

## Creating the DataFrame

In [60]:
# Collect one record per <p> element found inside the first 180 <td> cells,
# then build the DataFrame in a single step. Appending row by row with
# DataFrame.append is quadratic, and that method was removed in pandas 2.0.
# NOTE(review): the 180-cell limit presumably matches the size of the postal
# code table on the page -- confirm against the current page layout.
records = []

for box in neighborhood_soup.find_all('td')[0:180]:
    for element in box.find_all('p'):
        postalcode = element.b.text

        # The span text is split on '(' : the part before the first '(' is the
        # borough, the part between the first '(' and the next ')' holds the
        # neighborhoods, with '/' separators rewritten as ','.
        parts = element.span.text.split('(')
        borough = parts[0]
        if len(parts) > 1:
            neighborhood = parts[1].split(')')[0].replace('/', ',')
        else:
            # No parenthesised part in the span text.
            neighborhood = "Not assigned"

        records.append({'PostalCode': postalcode, 'Borough': borough, 'Neighborhood': neighborhood})

# Passing `columns` keeps the expected column order even if no records were found.
neighborhood_toronto = pd.DataFrame(records, columns=['PostalCode', 'Borough', 'Neighborhood'])

In [61]:
# Preview the first ten scraped rows.
neighborhood_toronto.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Queen's Park,Ontario Provincial Government
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern , Rouge"


In [62]:
# Inspect the dtype of each column.
neighborhood_toronto.dtypes

PostalCode      object
Borough         object
Neighborhood    object
dtype: object

## Dropping the "Not assigned" rows

In [63]:
# Turn 'Not assigned' boroughs into NaN so they can be dropped with dropna().
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection operates on a temporary object under pandas
# Copy-on-Write and may leave the DataFrame unchanged.
neighborhood_toronto['Borough'] = neighborhood_toronto['Borough'].replace('Not assigned', np.nan)

In [64]:
# Remove every row that contains a missing value. Reassigning the result
# (rather than using inplace=True) produces the same rows and keeps the step
# explicit about which object it yields.
neighborhood_toronto = neighborhood_toronto.dropna()

## Resetting the index

In [65]:
# Renumber the rows from 0 and discard the old index (drop=True),
# then display the resulting frame.
neighborhood_toronto = neighborhood_toronto.reset_index(drop=True)
neighborhood_toronto

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


In [66]:
# Replace lengthy borough labels with short ones.
# Passing a dict to Series.replace swaps only cells whose whole value equals a key.
borough_short_names = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
neighborhood_toronto['Borough'] = neighborhood_toronto['Borough'].replace(borough_short_names)
neighborhood_toronto.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [67]:
# Check the (rows, columns) dimensions of the frame.
neighborhood_toronto.shape

(103, 3)

In [68]:
# Preview the first ten rows of the frame.
neighborhood_toronto.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


## Grouping by postal code

In [86]:
# Collapse to one row per (PostalCode, Borough) pair, concatenating the
# neighborhood strings of each group with ', '.
neighborhoods_by_code = neighborhood_toronto.groupby(['PostalCode', 'Borough'])['Neighborhood']
df_postcode = neighborhoods_by_code.agg(', '.join).reset_index()

In [87]:
# Preview the first five grouped rows.
df_postcode.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Joining the geospatial coordinates from a CSV file already downloaded to local storage

In [91]:
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where the CSV exists at exactly this location.
geo_csv_path = r"C:\Users\balas\Desktop\Data Science\Data Science Materials\Datasets\Data set used for capstone project clustering\Toronto Neighborhood\Geospatial_Coordinates.csv"

# Load the coordinates indexed by their 'Postal Code' column so they can be
# looked up by the 'PostalCode' values of the grouped frame.
locgeo_df = pd.read_csv(geo_csv_path, index_col='Postal Code')

# DataFrame.join defaults to a left join: every row of df_postcode is kept.
toronto_data = df_postcode.join(locgeo_df, on='PostalCode')
toronto_data.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff , Cliffside West",43.692657,-79.264848
