## Importing the required libraries

In [34]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

## Reading data from URL

In [35]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
web_url = requests.get(url).text
soup = BeautifulSoup(web_url,"lxml")


In [36]:
My_table = soup.find("table",{"wikitable sortable"})
My_table

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

In [37]:
rows = My_table.find_all('tr')

## Extracting the table values and create Dataframe

In [38]:
l_Postcode     = np.empty(288 ,dtype="S100")
l_Borough  = np.empty(288 ,dtype="S100")
l_Neighbourhood  =np.empty(288 ,dtype="S100")
for i in np.arange(len(rows)-1):
    c1 = rows[i+1].find_all('td')[0].get_text()
    c2 = rows[i+1].find_all('td')[1].get_text()
    c3 = rows[i+1].find_all('td')[2].get_text()
    l_Postcode[i]=c1
    l_Borough[i]=c2
    l_Neighbourhood[i]=c3

In [39]:
df = pd.DataFrame([l_Postcode,l_Borough,l_Neighbourhood]).T
df.columns=(['Postcode','Borough','Neighbourhood'])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,b'M1A',b'Not assigned',b'Not assigned\n'
1,b'M2A',b'Not assigned',b'Not assigned\n'
2,b'M3A',b'North York',b'Parkwoods\n'
3,b'M4A',b'North York',b'Victoria Village\n'
4,b'M5A',b'Downtown Toronto',b'Harbourfront\n'


## Cleaning the values from unneeded characters

In [40]:
def clean_text(t):
    n =t.replace('\\n',"")
    n = n.replace("b'","")     
    n=n.replace("'","")
    n=n.replace('b"',"")
    n=n.replace('"',"")
    return n.rstrip()

In [41]:
df.Postcode = df.Postcode.astype(str)
df.Borough = df.Borough.astype(str)
df.Neighbourhood = df.Neighbourhood.astype(str)
df['Postcode'] = df['Postcode'].apply(clean_text)
df['Borough'] = df['Borough'].apply(clean_text)
df['Neighbourhood'] = df['Neighbourhood'].apply(clean_text)

In [42]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Dropping the Unassigned Boroughs

In [43]:
df = df[df.Borough!= 'Not assigned']

In [44]:
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queens Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


## Grouping the data and drop duplicated rows

In [45]:
df.Neighbourhood = df.groupby(['Postcode','Borough'])['Neighbourhood'].transform(lambda x: ', '.join(x))

In [46]:
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

In [47]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queens Park,Not assigned


## Assigning the unassigned neighbourhoods to Borough name

In [48]:
def assign_neighbourhood(x):
    if x['Neighbourhood'] == 'Not assigned':
        return x['Borough']
    else:
        return x['Neighbourhood']

In [49]:
df['Neighbourhood']= df.apply(assign_neighbourhood ,axis = 1)   

In [50]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queens Park,Queens Park


## Printing Dataframe's shape

In [51]:
df.shape

(103, 3)

In [52]:
geo_df = pd.read_csv('https://cocl.us/Geospatial_data')

In [53]:
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [54]:
new_df = df.merge(geo_df,left_on='Postcode' , right_on='Postal Code')

In [55]:
new_df = new_df[['Postcode','Borough','Neighbourhood','Latitude','Longitude']]

In [56]:
new_df.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queens Park,Queens Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [57]:
from sklearn.cluster import KMeans
clusters = 5
tor_grouped_clustering = new_df[['Latitude','Longitude']]

# run k-means clustering
kmeans = KMeans(n_clusters= clusters, random_state=0).fit(tor_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 4, 2, 3, 2, 1, 0, 4, 4, 2], dtype=int32)

In [59]:
len(kmeans.labels_)

103

In [60]:
new_df['Cluster Label']= pd.Series(kmeans.labels_)

In [61]:
new_df.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Label
0,M3A,North York,Parkwoods,43.753259,-79.329656,4
1,M4A,North York,Victoria Village,43.725882,-79.315572,4
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,2
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,3
4,M7A,Queens Park,Queens Park,43.662301,-79.389494,2
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242,1
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,0
7,M3B,North York,Don Mills North,43.745906,-79.352188,4
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937,4
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,2


In [68]:
from geopy.geocoders import Nominatim
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

latitude = location.latitude
longitude = location.longitude


The geograpical coordinate of New York City are 43.653963, -79.387207.


In [77]:
import folium 
import matplotlib.cm as cm
import matplotlib.colors as colors
x = np.arange(clusters)
ys = [i + x + (i*x)**2 for i in range(clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

markers_colors = []
for lat, lon, poi, cluster in zip(new_df['Latitude'], new_df['Longitude'], new_df['Neighbourhood'], new_df['Cluster Label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)  
    
map_clusters