# This notebook has been built in Google Colab due to system constraints.

## Part 1

In [1]:
# Importing relevant libraries
import numpy as np # library to handle data in a vectorized manner

import pandas as pd

import requests # library to handle requests

In [2]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

> Requesting data from the given url.

In [3]:
# Fetch the Wikipedia page. The timeout keeps this cell from hanging forever on
# a stalled connection, and raise_for_status() turns an HTTP error (4xx/5xx)
# into an exception here instead of letting an error page reach the parser.
results = requests.get(url, timeout=30)
results.raise_for_status()
results

<Response [200]>

> As the data is in HTML format, we use Beautiful Soup to parse the page and extract the table.

In [4]:
import bs4

In [5]:
# Parse the raw response bytes into a navigable tree using the lxml parser.
soup = bs4.BeautifulSoup(markup=results.content, features='lxml')

> As there is only one table in the web page, we can select it with `soup.table`.

In [6]:
# Grab the first <table> element in the parsed document.
table = soup.find('table')

> Storing the table into a new dataframe df.

In [7]:
# Column names for the scraped table, in the order the cells appear in each row.
columns = ['PostalCode', 'Borough', 'Neighborhood']

# Collect the text of every <td> cell, one list per <tr> row. A row without any
# <td> cells yields an empty list; it is kept here on purpose because a later
# cell slices that first empty row off the dataframe.
table_rows = table.find_all('tr')
z = [[cell.text for cell in tr.find_all('td')] for tr in table_rows]

In [8]:
df=pd.DataFrame(z,columns=columns)

> Extracting the head of the dataframe.

In [65]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A\n,Not assigned\n,\n
2,M2A\n,Not assigned\n,\n
3,M3A\n,North York\n,Parkwoods\n
4,M4A\n,North York\n,Victoria Village\n


> The first row does not contain any data (that table row has no `<td>` cells), so we slice it out of the dataframe.

In [9]:
# Drop the empty first row. The explicit .copy() makes df an independent frame,
# so the column assignments in the following cells do not write into a slice of
# the original frame (which triggers pandas' SettingWithCopyWarning).
df = df.iloc[1:].copy()

In [10]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A\n,Not assigned\n,\n
2,M2A\n,Not assigned\n,\n
3,M3A\n,North York\n,Parkwoods\n
4,M4A\n,North York\n,Victoria Village\n
5,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


> As we can see, each cell contains a trailing `\n`, so we remove it from every column.

In [11]:
# Remove the newline character from each of the three text columns, then preview.
for column_name in ('PostalCode', 'Borough', 'Neighborhood'):
    df[column_name] = df[column_name].str.replace('\n', '')
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"


> Removing those rows which do not have any borough assigned.

In [12]:
# Keep only the rows whose borough is something other than 'Not assigned'.
df = df.loc[df['Borough'] != 'Not assigned']

In [13]:
# Renumber the rows from 0; the previous row labels are kept in a new 'index' column.
df = df.reset_index(drop=False)
df.head()

Unnamed: 0,index,PostalCode,Borough,Neighborhood
0,3,M3A,North York,Parkwoods
1,4,M4A,North York,Victoria Village
2,5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,6,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [14]:
# Discard the leftover 'index' column that reset_index() produced.
df.drop(columns=['index'], inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


> For neighborhoods that are 'Not assigned', we use the borough name as the neighborhood.

In [16]:
# Fill in missing neighborhood names with the borough name. A boolean mask
# updates all matching rows in one vectorized assignment and, unlike a
# range(len(df)) loop, does not depend on the index being exactly 0..n-1.
# Empty strings are matched as well, because the scraped table shows
# unassigned cells as '' once the trailing newline has been removed.
unassigned = df['Neighborhood'].isin(['Not assigned', ''])
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

> Checking for above operation.

In [17]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Part 2

> Uploading the csv file from https://cocl.us/Geospatial_data for latitude and longitude based on the postal codes.

In [None]:
from google.colab import files

In [None]:
uploaded = files.upload()

In [None]:
# Report the name and size in bytes of every uploaded file.
for fn, payload in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(payload)))

In [None]:
# Storing the csv file into a new dataframe
new_df=pd.read_csv('Geospatial_Coordinates.csv')

> First few rows of the new dataframe.

In [None]:
new_df.head()

> Renaming the Postal Code column in the new dataframe same as that of the df for merging the two dataframes.

In [None]:
new_df=new_df.rename(columns={'Postal Code':'PostalCode'})
new_df.head()

> Merging the df and new_df dataframe into a new dataframe based on postal codes.

In [None]:
# Inner-join the scraped table with the coordinates on the shared PostalCode key.
df_clean = df.merge(new_df, how='inner', on='PostalCode')
df_clean.head()

## Part 3 

> Importing the folium package for plotting the points into map.

In [None]:
import folium

> Creating a map for Toronto and highlighting the locations present in the dataframe.

In [None]:
# Create the base map. map_toronto has to exist before markers can be added to
# it; it is centred on the mean coordinate of the data so no hard-coded
# location is needed.
map_toronto = folium.Map(
    location=[df_clean['Latitude'].mean(), df_clean['Longitude'].mean()],
    zoom_start=10)

# Add one blue circle marker per postal code with a "neighborhood, borough" popup.
for lat, lng, borough, neighborhood in zip(df_clean['Latitude'], df_clean['Longitude'], df_clean['Borough'], df_clean['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

> Importing Kmeans package for clustering.

In [None]:
from sklearn.cluster import KMeans

In [None]:
clus=df_clean.loc[:,['PostalCode','Latitude','Longitude']]
clus.head(10)

> Importing matplotlib.pyplot for plotting the within-cluster sum of squares.

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Elbow plot: fit k-means for k = 1..7 and plot each model's score against k.
# KMeans.score returns the negative of the k-means objective, so values closer
# to zero mean tighter clusters.
K_clusters = range(1, 8)
# A fixed random_state makes the curve reproducible across runs.
kmeans = [KMeans(n_clusters=k, random_state=0) for k in K_clusters]
# Cluster on both coordinates; fitting on Latitude alone ignores the
# east-west position of each postal code.
coords = df_clean[['Latitude', 'Longitude']]
score = [model.fit(coords).score(coords) for model in kmeans]
plt.plot(K_clusters, score)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.show()

> From the above graph it is clear that the locations are quite close to each other, as the maximum score in the graph is -0.25. From 3 clusters onwards the change in score is insignificant, so we choose 3 as the optimum number of clusters.

In [None]:
# Fit the final 3-cluster model on the Latitude and Longitude columns.
# The estimator must be bound to `kmeans`, since that is the name the calls
# below use; a fixed random_state keeps the cluster assignment reproducible.
features = clus[['Latitude', 'Longitude']]
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=0)
# fit_predict fits the model once and returns the cluster of every row, so no
# separate fit() or predict() pass is needed.
labels = kmeans.fit_predict(features)
clus['cluster_label'] = labels
# One row per cluster; columns follow the feature order (Latitude, Longitude).
centers = kmeans.cluster_centers_

> Plotting the cluster of locations depending upon their Latitude and Longitude.

In [None]:
# Scatter the postal codes coloured by cluster and overlay the centroids.
# The centroids are drawn on the Axes returned by pandas, so both layers land
# on the same plot without relying on pyplot's "current axes" state.
# centers[:, 0] holds Latitude and centers[:, 1] holds Longitude, matching the
# column order the model was fitted on.
ax = clus.plot.scatter(x='Latitude', y='Longitude', c=labels, s=50, cmap='viridis')
ax.scatter(centers[:, 0], centers[:, 1], c='black', s=100, alpha=1.0)
ax.set_title('K-means clusters of Toronto postal codes');

> Comparing the result with actual distribution of locations.

In [None]:
# Overlay a solid red circle marker for every postal code on the existing map,
# each with a "neighborhood, borough" popup, then display the map.
location_rows = zip(df_clean['Latitude'], df_clean['Longitude'],
                    df_clean['Borough'], df_clean['Neighborhood'])
for lat, lng, borough, neighborhood in location_rows:
    popup_text = '{}, {}'.format(neighborhood, borough)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='red',
        fill=True,
        fill_opacity=1.0,
        parse_html=False).add_to(map_toronto)
map_toronto