# Coursera Assignment - Applied Data Science Capstone - Week 3

#### Student: Anderson Matias
#### Topic: Segmenting and Clustering Neighborhoods in Toronto

## Generate Dataframe out of Wikipedia Page

In [1]:
import random # library for random number generation
import numpy as np # library for vectorized computation
import pandas as pd # library to process data as dataframes

import matplotlib.pyplot as plt # plotting library
# backend for rendering plots within the browser
%matplotlib inline 

from sklearn.cluster import KMeans 
from sklearn.datasets.samples_generator import make_blobs

print('Libraries imported.')

Libraries imported.


In [2]:
# Import the html table into a Dataframe
df = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")[0]
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


## Clean up the Dataframe

In [3]:
# Exclude rows that have the value "Not Assigned" for the Column "Borough"
borough_todrop = df[df['Borough'] == "Not assigned"].index # Generate new dataframe with the rows to be dropped using a str as criteria and getting its index labels
df.drop(borough_todrop, axis = 0, inplace = True ) # Drop rows with no data for the column "Borough"
df = df.stack().str.replace('/',',').unstack() # Replace dashes with commas on column "Neighborhood"
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [4]:
df.rename(columns={'Postal code':'PostalCode'}, 
                 inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


## Getting Geospatial Data

#### I've decided to used the csv with geospatial data, because I was having trouble importing the geocoder library

In [5]:
geo_data = pd.read_csv("http://cocl.us/Geospatial_data")
geo_data.rename(columns={'Postal Code':'PostalCode'}, 
                 inplace=True)
geo_data.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [6]:
df_merged = pd.merge(df, geo_data, on="PostalCode", how = "left")
df_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494


# Clustering Neighborhoods

In [7]:
from sklearn.preprocessing import StandardScaler

X = df_merged[['Latitude','Longitude']].values[:,1:]
X = np.nan_to_num(X)
cluster_dataset = StandardScaler().fit_transform(X)


#### Here I used the clustering algorithm learned in the course and tried different number of clusters to check the distribution plotted in the map

In [60]:
num_clusters = 5

k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12)
k_means.fit(cluster_dataset)
labels = k_means.labels_

print(labels)

[4 4 1 0 1 3 2 4 4 1 0 3 2 4 4 1 1 3 2 4 1 0 2 1 1 1 2 1 0 4 1 0 2 4 0 4 1
 1 2 1 0 4 1 1 4 1 0 4 1 0 3 2 1 0 4 1 0 3 2 1 0 1 1 0 0 2 1 1 1 0 3 4 0 1
 1 0 3 3 2 1 1 0 4 1 1 4 1 1 0 3 4 1 1 3 3 2 1 1 0 1 4 0 0]


#### After generating the respective labels to each point, I added it as a column to group it afterwards

In [61]:
df_merged["Labels"] = labels
df_merged.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Labels
0,M3A,North York,Parkwoods,43.753259,-79.329656,4
1,M4A,North York,Victoria Village,43.725882,-79.315572,4
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,1
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763,0
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494,1


In [62]:
df_merged.groupby('Labels').mean()

Unnamed: 0_level_0,Latitude,Longitude
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1
0,43.696784,-79.480484
1,43.688108,-79.391963
2,43.764024,-79.228896
3,43.683105,-79.56283
4,43.722458,-79.321667


In [11]:
!wget --quiet https://cocl.us/Geospatial_data
!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Folium installed and imported!')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Folium installed and imported!


#### I then plotted a graph with the respective markers distributed and used the labels column to color them according to their own cluster

In [63]:
# create a Stamen Toner map of the world centered around Canada
toronto_map = folium.Map(location=[43.694826, -79.373945], zoom_start=11)

# instantiate a feature group for the incidents in the dataframe
locations = folium.map.FeatureGroup()

# loop through the 100 crimes and add each to the incidents feature group
for lat, lng, labels in zip(df_merged.Latitude, df_merged.Longitude, df_merged.Labels):
    if labels == 0:
        locations.add_child(
        folium.features.CircleMarker(
            [lat, lng],
            radius=5, # define how big you want the circle markers to be
            color='blue',
            fill=True,
            fill_color='blue',
            fill_opacity=0.6))
    elif labels == 1:
        locations.add_child(
        folium.features.CircleMarker(
            [lat, lng],
            radius=5, # define how big you want the circle markers to be
            color='yellow',
            fill=True,
            fill_color='yellow',
            fill_opacity=0.6))
    elif labels == 2:
        locations.add_child(
        folium.features.CircleMarker(
            [lat, lng],
            radius=5, # define how big you want the circle markers to be
            color='green',
            fill=True,
            fill_color='green',
            fill_opacity=0.6))
    elif labels == 3:
        locations.add_child(
        folium.features.CircleMarker(
            [lat, lng],
            radius=5, # define how big you want the circle markers to be
            color='orange',
            fill=True,
            fill_color='orange',
            fill_opacity=0.6))
    else:
        locations.add_child(
        folium.features.CircleMarker(
            [lat, lng],
            radius=5, # define how big you want the circle markers to be
            color='red',
            fill=True,
            fill_color='red',
            fill_opacity=0.6
            
        )
    )
            
toronto_map.add_child(locations)

#### As the map is not visible on the Github Repository, I've added a picture separately, showing the result.