In [14]:
import json
from io import StringIO
from urllib.request import urlopen

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

import requests
from bs4 import BeautifulSoup as bs

#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes
import folium

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#print('Libraries imported.')

In [6]:
# Download the Wikipedia page of "M" postal codes and parse its first HTML table.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging forever on a stalled connection.
res = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
res.raise_for_status()
soup = bs(res.content, 'lxml')
table = soup.find_all('table')[0]
# read_html returns a list of DataFrames. The markup is wrapped in StringIO
# because passing literal HTML text to read_html is deprecated in recent pandas.
df = pd.read_html(StringIO(str(table)))
# Use the parsed frame directly; the former to_json -> read_json round trip
# added nothing and relied on deprecated literal-string input as well.
data = df[0]

## Q1
#### Let's build a dataframe containing the postal code, borough name, and neighborhood name of each neighborhood

In [7]:
# Preview the first rows of the parsed table (head() defaults to five rows).
data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Keeping only the rows whose Borough field is not "Not assigned"

In [8]:
# Keep every row whose Borough differs from the 'Not assigned' placeholder.
has_borough = data['Borough'].ne('Not assigned')
data_selection = data.loc[has_borough]

In [9]:
# Collapse the table to one row per (Borough, Postal Code) pair, combining the
# neighborhood names of rows that share a pair into a single string.
# Only the Neighborhood column is aggregated, so an extra non-string column
# cannot break the join, and the ", " separator matches the comma-separated
# lists already present in the source values.
data_selection = (
    data_selection
    .groupby(['Borough', 'Postal Code'], as_index=False)['Neighborhood']
    .agg(', '.join)
)
data_selection.head()

Unnamed: 0,Borough,Postal Code,Neighborhood
0,Central Toronto,M4N,Lawrence Park
1,Central Toronto,M4P,Davisville North
2,Central Toronto,M4R,"North Toronto West, Lawrence Park"
3,Central Toronto,M4S,Davisville
4,Central Toronto,M4T,"Moore Park, Summerhill East"


### Replacing the Neighborhood value with the Borough name where the Neighborhood is "Not assigned"

In [10]:
# Wherever the neighborhood is the 'Not assigned' placeholder, substitute the
# borough name from the same row; every other value is left untouched.
neighborhood_missing = data_selection['Neighborhood'] == 'Not assigned'
data_selection['Neighborhood'] = data_selection['Neighborhood'].mask(
    neighborhood_missing, data_selection['Borough']
)

In [11]:
# Show the first rows again to check the frame after the Neighborhood replacement.
data_selection.head()

Unnamed: 0,Borough,Postal Code,Neighborhood
0,Central Toronto,M4N,Lawrence Park
1,Central Toronto,M4P,Davisville North
2,Central Toronto,M4R,"North Toronto West, Lawrence Park"
3,Central Toronto,M4S,Davisville
4,Central Toronto,M4T,"Moore Park, Summerhill East"


## Q2

### Let's use the CSV file to add the latitude and longitude of each postal code to the dataframe

In [12]:
# Location of the coordinates CSV; the merge in the next cell joins it on its
# 'Postal Code' column.
geospatial_url = "https://cocl.us/Geospatial_data"
# Fetch and parse the CSV into a DataFrame (requires network access).
geospatial_data = pd.read_csv(geospatial_url)

In [13]:
# Attach the coordinates to each postal code. This is an inner join, so only
# postal codes present in both frames are kept.
data_selection_geo = data_selection.merge(
    geospatial_data,
    how='inner',
    on='Postal Code',
)
data_selection_geo.head()

Unnamed: 0,Borough,Postal Code,Neighborhood,Latitude,Longitude
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197
2,Central Toronto,M4R,"North Toronto West, Lawrence Park",43.715383,-79.405678
3,Central Toronto,M4S,Davisville,43.704324,-79.38879
4,Central Toronto,M4T,"Moore Park, Summerhill East",43.689574,-79.38316


## Q3

### Clustering the neighborhoods in Toronto

In [15]:
# Cluster the postal-code areas by position with k-means and draw one circle
# marker per area on a folium map, filled with the colour of its cluster.
toronto_map = folium.Map(location=[43.65, -79.4], zoom_start=12)

# One fill colour per cluster. The cluster count is taken from this list so the
# two can never get out of sync. It is deliberately NOT named `colors`, which
# would shadow the `matplotlib.colors as colors` import.
cluster_colors = ['red', 'green', 'blue', 'yellow']

# Feature matrix with one (latitude, longitude) row per postal-code area.
coordinates = data_selection_geo[['Latitude', 'Longitude']].to_numpy()

# n_init is pinned explicitly: scikit-learn changed its default between
# releases, so leaving it implicit makes the labels depend on the installed
# version even with a fixed random_state.
kmeans = KMeans(
    n_clusters=len(cluster_colors),
    n_init=10,
    random_state=0,
).fit(coordinates)

clusters = kmeans.labels_
data_selection_geo['Cluster'] = clusters

for latitude, longitude, borough, cluster in zip(
    data_selection_geo['Latitude'],
    data_selection_geo['Longitude'],
    data_selection_geo['Borough'],
    data_selection_geo['Cluster'],
):
    # The popup shows the borough name when a marker is clicked.
    label = folium.Popup(borough, parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color=cluster_colors[cluster],
        fill_opacity=0.7,
    ).add_to(toronto_map)

# Last expression of the cell: renders the map inline.
toronto_map