# IBM Data Science Capstone Week 3
## Analysis
### Calvin Todorovich

#### Setting up the data from the other 2 parts, in case this notebook is run independently
#### If maps do not render in Github, please run the notebook in an editor
##### Skip to cell 2 for the new work.

In [1]:
import pandas as pd
import csv
import numpy as np

from bs4 import BeautifulSoup
import requests

wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

class_id = 'wikitable sortable jquery-tablesorter'

response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, "html.parser")

canada_table = soup.find("table",{"class": "wikitable sortable"})

#I'm not sure if I parsed it wrong, but the canada_table saved as a list, which still had all the html tags in it
#So I found some user defined functions to translate html to csv

table = canada_table

def get_table_headers(table):
    headers = []
    for th in table.find("tr").find_all("th"):
        headers.append(th.text.strip())


df = pd.read_csv("can_table.csv")

#drop that extra unnamed row
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

#If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
df.Neighborhood.fillna(df.Borough, inplace = True)

#If a neighborhood is still unassigned, drop it
df = df.replace('Not assigned', np.nan).dropna()

df2 = pd.read_csv(r'C:\Users\Todo\Documents\Geospatial_Coordinates.csv')

merged = pd.merge(left = df, right = df2)
merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [2]:
#!conda install -c conda-forge folium=0.5.0 --yes
#Uncomment above line if you get an error
import folium # map rendering library

In [4]:
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim

In [5]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Analysis:

In [6]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

#k-means from clustering stage
from sklearn.cluster import KMeans

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(merged['Latitude'], merged['Longitude'], merged['Borough'], merged['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Standardizing the Latitude and Longitude

In [7]:
cols = merged
cols.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [8]:
cols = cols.drop(['Postal Code', 'Borough', 'Neighborhood'], axis=1)
cols.head()

Unnamed: 0,Latitude,Longitude
0,43.753259,-79.329656
1,43.725882,-79.315572
2,43.65426,-79.360636
3,43.718518,-79.464763
4,43.662301,-79.389494


In [9]:
from sklearn.preprocessing import StandardScaler

X = cols.values[:,1:]
cluster_merged = StandardScaler().fit_transform(X)
cluster_merged[1:5]

array([[ 0.84388426],
       [ 0.37773518],
       [-0.6993678 ],
       [ 0.07922652]])

### K-Means Clustering, starting with 4 clusters

In [10]:
num_clusters = 4

k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12)
k_means.fit(cluster_merged)
labels = k_means.labels_

print(labels[1:5])

[3 3 0 2]


In [11]:
merged["Labels"] = labels
merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Labels
0,M3A,North York,Parkwoods,43.753259,-79.329656,3
1,M4A,North York,Victoria Village,43.725882,-79.315572,3
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,3
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,2


### I had a difficult time changing the color of a point based on Labels, since it was an integer, so I set up a function that translates integer to a string representation of a color.

In [12]:
merged['marker_color'] = pd.cut(merged['Labels'], bins=4, 
                              labels=['yellow', 'green', 'blue', 'red'])
merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Labels,marker_color
0,M3A,North York,Parkwoods,43.753259,-79.329656,3,red
1,M4A,North York,Victoria Village,43.725882,-79.315572,3,red
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,3,red
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0,yellow
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,2,blue


In [13]:
map_toronto2 = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood, c in zip(merged['Latitude'], merged['Longitude'], merged['Borough'], merged['Neighborhood'], merged['marker_color']):
    label = '{}, {}, {}, {}'.format(neighborhood, borough, lat, lng)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=c,
        fill=True,
        fill_color=c,
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto2)  
map_toronto2

### Repeating the same process for different number of clusters.

In [14]:
num_clusters = 5

k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12)
k_means.fit(cluster_merged)
labels = k_means.labels_

merged["Labels"] = labels

merged['marker_color'] = pd.cut(merged['Labels'], bins=5, 
                              labels=['yellow', 'green', 'blue', 'red', 'purple'])

map_toronto3 = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood, c in zip(merged['Latitude'], merged['Longitude'], merged['Borough'], merged['Neighborhood'], merged['marker_color']):
    label = '{}, {}, {}, {}'.format(neighborhood, borough, lat, lng)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=c,
        fill=True,
        fill_color=c,
        fill_opacity=0.5,
        parse_html=False).add_to(map_toronto3)  
map_toronto3

In [16]:
num_clusters = 6

k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12)
k_means.fit(cluster_merged)
labels = k_means.labels_

merged["Labels"] = labels

merged['marker_color'] = pd.cut(merged['Labels'], bins=6, 
                              labels=['yellow', 'green', 'blue', 'red', 'purple', 'black'])

map_toronto4 = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood, c in zip(merged['Latitude'], merged['Longitude'], merged['Borough'], merged['Neighborhood'], merged['marker_color']):
    label = '{}, {}, {}, {}'.format(neighborhood, borough, lat, lng)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=c,
        fill=True,
        fill_color=c,
        fill_opacity=0.5,
        parse_html=False).add_to(map_toronto4)  
map_toronto4

In [17]:
num_clusters = 7

k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12)
k_means.fit(cluster_merged)
labels = k_means.labels_

merged["Labels"] = labels

merged['marker_color'] = pd.cut(merged['Labels'], bins=7, 
                              labels=['yellow', 'green', 'blue', 'red', 'purple', 'black', 'white'])

map_toronto5 = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood, c in zip(merged['Latitude'], merged['Longitude'], merged['Borough'], merged['Neighborhood'], merged['marker_color']):
    label = '{}, {}, {}, {}'.format(neighborhood, borough, lat, lng)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=c,
        fill=True,
        fill_color=c,
        fill_opacity=0.5,
        parse_html=False).add_to(map_toronto5)  
map_toronto5

### I believe choosing 5 or 6 for our number of clusters yielded the best map in terms of minimizing intra-cluster distance. There seeemed to be some vertical spread in every cluster, which could be fixed by weighting Latitude more stronly than Longitude when normalizing with StandardScalar(). 