### This jupyter notebook will be used for the capstone project

In [1]:
import pandas as pd
import numpy as np

# Smoke test: prints a greeting to confirm the kernel runs and the imports above succeeded.
print('Hello Capstone Project Course')

Hello Capstone Project Course


### Segmenting and Clustering Neighborhoods in Toronto

### First Section 

##### Scraping wikipedia to get data

In [2]:
# Scrape the Toronto postal-code table from Wikipedia.
# `match` keeps only tables whose text contains "Postal Code", so `tables[0]`
# is the table we want even if the live page gains unrelated tables ahead of
# it; if no table matches, read_html raises instead of returning a wrong one.
WIKI_POSTAL_CODES_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
tables = pd.read_html(WIKI_POSTAL_CODES_URL, match="Postal Code", header=0)

tables[0]

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


##### Ignore cells with a borough that is 'Not assigned'. We don't need to combine different neighborhoods that share one postal code area, nor check for rows that have a borough but a neighborhood value of 'Not assigned', since the Wikipedia table already provides the data in that form (yay).

In [3]:
# Keep only the rows whose Borough does not contain "Not assigned".
# Boolean-mask approach taken from this post:
# https://stackoverflow.com/questions/28679930/how-to-drop-rows-from-pandas-data-frame-that-contains-a-particular-string-in-a-p
postal_table = tables[0]
unassigned_mask = postal_table['Borough'].str.contains("Not assigned")
clean_1 = postal_table[~unassigned_mask]
clean_1

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [4]:
# (rows, columns) of the filtered table, i.e. how many postal codes remain.
clean_1.shape

(103, 3)

### Second Section

###### I preferred to read the spatial data from the csv file

In [5]:
# Load the coordinates from the local CSV file that sits next to this notebook.
coordinates_csv = "./Geospatial_Coordinates.csv"
geospatial_data = pd.read_csv(coordinates_csv)
geospatial_data

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


##### Now we fuse the dataframes clean_1 and geospatial_data into a single more powerful dataframe

<img src="https://www.alfabetajuega.com/wp-content/uploads/2019/11/dragon-ball-fusion-goku-vegeta-770x433.jpg" width="300px"/>

In [6]:
# Inner-join the borough/neighborhood table with the coordinates on their
# shared 'Postal Code' column. The key is given once: listing the same column
# twice in `on` adds nothing and only obscures which column drives the join.
fused_dataframes = pd.merge(clean_1, geospatial_data, how='inner', on='Postal Code')
fused_dataframes

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


### Third Section

##### Explore and cluster the neighborhoods in Toronto.

In [7]:
# Number of distinct boroughs present in the merged data.
borough_column = fused_dataframes['Borough']
borough_column.nunique()

10

In [8]:
# Count how many rows (postal codes) fall in each borough.
fused_dataframes['Borough'].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East Toronto         5
York                 5
East York            5
Mississauga          1
Name: Borough, dtype: int64

In [9]:
# Plot where each postal code is located on a map of Toronto.
import folium

# Coordinates of Toronto, taken from Wikipedia.
toronto_lat = 43.670278
toronto_long = -79.386667

map_toronto = folium.Map(location=[toronto_lat, toronto_long], zoom_start=10)

# Add one circle marker per row (adapted from the New York lab).
marker_rows = zip(
    fused_dataframes['Latitude'],
    fused_dataframes['Longitude'],
    fused_dataframes['Borough'],
    fused_dataframes['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    # Popup text shown when a marker is clicked: "<neighborhood>, <borough>".
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    # parse_html is a Popup option, so it is passed to the Popup above only
    # and not to CircleMarker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

##### Now we proceed to clustering

In [10]:
# Set the number of clusters; 10 was picked because the data contains 10 boroughs.
k = 10

from sklearn.cluster import KMeans

In [11]:
# Build the k-means feature matrix from the coordinates only.
# `ll` is shorthand for latitude and longitude.
#
# The two columns are selected explicitly instead of dropping the others:
# - `drop(labels, 1)` passes `axis` positionally, which newer pandas rejects;
# - re-running this cell after 'Cluster Labels' has been added to the frame
#   would otherwise feed the previous labels back in as a feature.
ll = fused_dataframes[['Latitude', 'Longitude']]

# n_init is pinned so the number of restarts does not depend on the installed
# scikit-learn version's default; random_state keeps the labels reproducible.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(ll)

In [12]:
# Add the clustering labels as the first column.
# Work on a copy: `n = fused_dataframes` would only alias the merged frame, so
# `insert` would mutate it in place and raise ValueError ("already exists")
# the second time this cell is run.
n = fused_dataframes.copy()
n.insert(0, 'Cluster Labels', kmeans.labels_)
n

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,1,M3A,North York,Parkwoods,43.753259,-79.329656
1,1,M4A,North York,Victoria Village,43.725882,-79.315572
2,5,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,0,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,5,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...,...
98,2,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,5,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,1,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,2,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


##### Now we visualize the map with the clusters

In [13]:
# Draw the clustered map (adapted from the New York lab once again).

import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[toronto_lat, toronto_long], zoom_start=10)

# Colour scheme: k evenly spaced colours from the rainbow colormap, as hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(n['Latitude'], n['Longitude'], n['Neighborhood'], n['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # k-means labels run from 0 to k-1, so they index the palette directly
    # (the lab's `cluster-1` only worked for label 0 through negative-index
    # wraparound).
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

And there we have it: a map of the Toronto neighborhoods clustered with k=10.