# Question 1: Creating the DataFrame

In [25]:
import pandas as pd
from bs4 import BeautifulSoup

In [26]:
import requests
import numpy as np
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import ssl
import csv

In [50]:
!pip install folium



In [39]:
from sklearn.cluster import KMeans

import matplotlib.cm as cm
import matplotlib.colors as colors

In [40]:
#Getting raw table
# Download the Wikipedia page that lists the postal codes starting with "M".
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
source = response.text
# Parse the HTML with the lxml parser; later cells read the table from `soup`.
soup = BeautifulSoup(source, 'lxml')

In [41]:
# Take the first <table> on the page and collect every row in its body.
table = soup.find("table")
table_rows = table.tbody.find_all("tr")

# One list of stripped <td> texts per row. A row without any <td> cell yields
# an empty list, which becomes an all-missing record in the DataFrame.
data = [
    [cell.text.strip() for cell in row.find_all('td')]
    for row in table_rows
]

df = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])
# Discard the records that carry no postal code (the empty rows noted above).
df = df[df['PostalCode'].notna()]
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [42]:
# Collapse the table to one row per postal code (PostalCode becomes the index).
# The borough is taken once per postal code; only the neighbourhood names are
# concatenated. Joining every column would repeat the borough ("X,X") whenever
# a postal code spans several rows.
df = df.groupby('PostalCode').agg(
    {
        'Borough': 'first',
        'Neighbourhood': lambda names: ','.join(names),
    }
)
df.head()

Unnamed: 0_level_0,Borough,Neighbourhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1A,Not assigned,Not assigned
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn


In [43]:
# Keep only the postal codes whose borough is something other than the
# "Not assigned" placeholder.
df = df[df['Borough'] != "Not assigned"]
df.head()

Unnamed: 0_level_0,Borough,Neighbourhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [44]:
# (rows, columns) of df after the "Not assigned" boroughs were removed.
df.shape

(103, 2)

# Adding Coordinates

In [45]:
# Load the postal-code coordinate table (CSV served from this URL) into dfgeo.
dfgeo = pd.read_csv("https://cocl.us/Geospatial_data")

In [46]:
# Preview the first rows of the coordinate table.
dfgeo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [47]:
# Attach coordinates to every postal code in df. A left join keeps all rows of
# df; the key is called 'PostalCode' on the df side and 'Postal Code' in dfgeo.
dfnew = df.merge(
    dfgeo,
    how='left',
    left_on='PostalCode',
    right_on='Postal Code',
)
dfnew.head()

Unnamed: 0,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,Scarborough,"Malvern, Rouge",M1B,43.806686,-79.194353
1,Scarborough,"Rouge Hill, Port Union, Highland Creek",M1C,43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,Scarborough,Woburn,M1G,43.770992,-79.216917
4,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


# Analysis

In [48]:
# Getting only Boroughs with "Toronto"
# Plain substring match (regex=False) on the borough name. The explicit
# .copy() makes newdf an independent frame, so columns added to it later are
# never written to a selection of dfnew.
newdf = dfnew[dfnew['Borough'].str.contains('Toronto', regex = False)].copy()
newdf.head()

Unnamed: 0,Borough,Neighbourhood,Postal Code,Latitude,Longitude
37,East Toronto,The Beaches,M4E,43.676357,-79.293031
41,East Toronto,"The Danforth West, Riverdale",M4K,43.679557,-79.352188
42,East Toronto,"India Bazaar, The Beaches West",M4L,43.668999,-79.315572
43,East Toronto,Studio District,M4M,43.659526,-79.340923
44,Central Toronto,Lawrence Park,M4N,43.72802,-79.38879


In [52]:
import folium

In [53]:
# Folium Map
# Base map centred on fixed Toronto coordinates.
map_toronto = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

# One blue circle marker per row of newdf, with a "neighbourhood, borough" popup.
for lat,lng,borough,neighbourhood in zip(newdf['Latitude'],newdf['Longitude'],newdf['Borough'],newdf['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    # NOTE(review): parse_html=True is presumably there so names containing
    # quotes/apostrophes do not break the rendered popup — confirm.
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option, not a CircleMarker one, so it is only
    # passed to the Popup above.
    folium.CircleMarker(
        [lat,lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)
map_toronto

In [55]:
#Clustering neighborhoods
# Number of K-Means clusters to form.
k = 10
# Cluster on the coordinates only. Selecting the two feature columns by name
# (instead of dropping the others with a positional axis, which pandas 2.x
# rejects) also keeps a label column from an earlier run out of the features.
clustered = newdf[['Latitude', 'Longitude']]
# random_state is fixed so the labels are reproducible between runs.
kmean = KMeans(n_clusters = k, random_state = 0).fit(clustered)
# Remove labels left by a previous run of this cell; otherwise insert() raises
# because the column already exists.
newdf = newdf.drop(columns='Cluster Labels', errors='ignore')
# Put the cluster label of each row in the first column of newdf.
newdf.insert(0, 'Cluster Labels', kmean.labels_)

In [58]:
# new map
# Base map centred on fixed Toronto coordinates.
clusteredmap = folium.Map(location=[43.651070,-79.347015],zoom_start=10)
# One hex colour per cluster, sampled evenly from matplotlib's rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, cluster in zip(newdf['Latitude'], newdf['Longitude'], newdf['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # K-Means labels run from 0 to k-1, so they index the palette directly
        # (no -1 offset, which only worked through negative-index wrap-around).
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(clusteredmap)

clusteredmap