### Library declaration

In [42]:
import requests
from bs4 import BeautifulSoup
import lxml
import html5lib
import os
import numpy as np
import pandas as pd

### Wikipedia url path declaration along with the BeautifulSoup api call for reading the html file

In [43]:
# Wikipedia list of Canadian postal codes beginning with "M".
url_path = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of silently parsing
# an error page in the cells below.
response = requests.get(url_path, timeout=30)
response.raise_for_status()
source = response.text

# Parse the downloaded HTML with Python's built-in parser.
data = BeautifulSoup(source, "html.parser")

#### Table column name assignment

In [44]:
# Empty frame with the three columns the scraped rows will be stored under.
column_names = ['Postalcode', 'Borough', 'Neighborhood']
df = pd.DataFrame(columns=column_names)

# First <table> element found in the parsed page.
table = data.find('table')

#### pulling the table column names and the column values based on the tr and td findings

In [45]:
# Collect the stripped text of the <td> cells of every table row. Only rows
# with exactly three <td> cells are kept; any other row is skipped.
rows = []
for tr in table.find_all('tr'):
    url_data = [td.text.strip() for td in tr.find_all('td')]
    if len(url_data) == 3:
        rows.append(url_data)

# Build the frame once from the collected rows: this avoids growing the
# DataFrame one row at a time and makes the cell safe to re-run (rows are
# rebuilt rather than appended a second time).
df = pd.DataFrame(rows, columns=column_names)
df.head()

#### Ignoring rows with a borough that is Not assigned

In [46]:
# Keep only the rows whose borough is something other than 'Not assigned'.
has_borough = df['Borough'].ne('Not assigned')
df_tor = df[has_borough]

In [47]:
# Display df_tor restricted to rows whose Borough differs from the integer 0.
# NOTE(review): Borough holds scraped text, so this mask presumably keeps
# every row -- confirm whether the comparison is still needed.
df_tor[df_tor['Borough']!=0]

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
...,...,...,...
281,M8Z,Etobicoke,Kingsway Park South West
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West


#### Grouping More than one neighborhood in one postal code area

In [48]:
# One row per (postal code, borough) pair; the neighborhoods that share a
# pair are concatenated into a single comma-separated string.
by_postal_area = df_tor.groupby(['Postalcode', 'Borough'])
toronto = by_postal_area['Neighborhood'].apply(', '.join).reset_index()


#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

In [49]:
def neighbor_list(grouped):
    """Return the group's 'Neighborhood' values as one sorted, comma-separated string.

    Parameters
    ----------
    grouped : object indexable by 'Neighborhood' (e.g. a DataFrame group)
        Its 'Neighborhood' column supplies the names to combine.

    Returns
    -------
    str
        The names in ascending order, joined with ', '.
    """
    names = list(grouped['Neighborhood'])
    names.sort()
    return ', '.join(names)


In [50]:
# A postal code that has a borough but a 'Not assigned' neighborhood takes the
# borough name as its neighborhood. Work on a copy so `toronto` is left intact
# and the cell gives the same result when re-run.
toronto_filled = toronto.copy()
unassigned = toronto_filled['Neighborhood'] == 'Not assigned'
toronto_filled.loc[unassigned, 'Neighborhood'] = toronto_filled.loc[unassigned, 'Borough']

# Collapse each (postal code, borough) pair to a single row whose
# 'Neighborhood' value is produced by neighbor_list.
grp = toronto_filled.groupby(['Postalcode', 'Borough'])
df2 = grp.apply(neighbor_list).reset_index(name='Neighborhood')
df2.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### printing the number of rows of the dataframe

In [51]:
# Show the (rows, columns) shape of the grouped frame.
df2.shape

(103, 3)

#### Extracting csv with Toronto geographical coordinates from http://cocl.us/Geospatial_data to dataframe

In [52]:
# Location of the CSV that holds one latitude/longitude pair per postal code.
path = "http://cocl.us/Geospatial_data"

# Read the CSV straight from the URL and preview the first rows.
geocode_df = pd.read_csv(filepath_or_buffer=path)
geocode_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merging the toronto data frame with geo code latitude and longitude based on the postal code

In [53]:
# Give the coordinate frame the same key column name as df2 so the two can be
# joined on it. The renamed frame is rebound instead of mutated with
# inplace=True, so no previously displayed object changes behind the reader.
geocode_df = geocode_df.rename(columns={'Postal Code': 'Postalcode'})

# Default (inner) merge: only postal codes present in both frames are kept.
geocode_merged = geocode_df.merge(df2, on='Postalcode')
geocode_merged.head()

Unnamed: 0,Postalcode,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,"Rouge, Malvern"
1,M1C,43.784535,-79.160497,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


#### Rearranging the data frame columns as required

In [54]:
# Put the descriptive columns first and the coordinates last.
column_order = ['Postalcode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']
geo_data = geocode_merged.loc[:, column_order]
geo_data.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


#### Clustering the neighborhoods in Toronto using only the boroughs that contain the word Toronto

In [70]:
# Literal (non-regex) substring match: keep rows whose borough name contains "Toronto".
in_toronto = geo_data['Borough'].str.contains("Toronto", regex=False)
toronto_data = geo_data[in_toronto]
toronto_data.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


#### Publishing the shape of the required data frame

In [69]:
# Show the (rows, columns) shape of the Toronto-only frame.
toronto_data.shape

(39, 5)

### Clustering the neighborhoods in Toronto

#### Installing library for Visualization

In [57]:
!conda install -c conda-forge folium=0.5.0 --yes
!pip install geocoder
!pip install geopy

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



#### Importing Libraries

In [77]:
import folium
import geocoder
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors

#### Visualization using folium

In [72]:
# Base map centred on the fixed coordinates below.
map_toronto = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# One blue circle marker per row of toronto_data, with a
# "<neighbourhood>, <borough>" popup.
marker_rows = zip(
    toronto_data['Latitude'],
    toronto_data['Longitude'],
    toronto_data['Borough'],
    toronto_data['Neighborhood'],
)
for lat, lng, borough, neighbourhood in marker_rows:
    popup = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

#### Clustering of the neighbourhoods using the K-Means algorithm

In [74]:
# Number of clusters to fit.
k = 5

# Remove labels left over from a previous run of this cell. Without this the
# insert() below raises because the column already exists, and the old labels
# would be fed to KMeans as a feature. drop() also returns a new frame, so the
# insert() below does not write into a slice of geo_data.
toronto_data = toronto_data.drop(columns='Cluster Labels', errors='ignore')

# Cluster on whatever remains after removing the descriptive columns.
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
toronto_clustering = toronto_data.drop(columns=['Postalcode', 'Borough', 'Neighborhood'])

# random_state is fixed so repeated runs give the same labels.
kmeans = KMeans(n_clusters=k, random_state=0).fit(toronto_clustering)

# Put the fitted label of each row in the first column.
toronto_data.insert(0, 'Cluster Labels', kmeans.labels_)

In [75]:
# Display the Toronto frame, now including the 'Cluster Labels' column.
toronto_data

Unnamed: 0,Cluster Labels,Postalcode,Borough,Neighborhood,Latitude,Longitude
37,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,0,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,0,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,0,M4M,East Toronto,Studio District,43.659526,-79.340923
44,1,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,1,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,1,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,1,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,1,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


In [79]:
# create map
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# set color scheme for the clusters: k evenly spaced rainbow colours as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, cluster in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels are used directly as zero-based indices into the colour
    # list; subtracting 1 would map label 0 to the last colour through
    # negative indexing.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters