# Data Science Capstone

### Segmenting and Clustering Neighborhoods in Toronto

Andres Dominguez

----
----

# PART 1 - Creating DataFrame

### Installing and importing the required modules.

In [85]:
# Installing scraping modules:

!pip install lxml html5lib beautifulsoup4

# Importing Pandas Library:

import pandas as pd



### Extracting the table from the HTML page and dropping the "Not Assigned" rows.

In [86]:
# Extracting the table:

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
dfs = pd.read_html(url)

df = dfs[0]

# Dropping rows with "Not assigned" values:

df = df[df.Borough != 'Not assigned']
df.head(15)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [87]:
# Printing the number of rows and columns of the DataFrame:

df.shape

(103, 3)

# PART 2 - Adding geographical coordinates of each postal code. -->(using csv file)<--

###  Extracting the csv file.

In [88]:
# reading csv file and creating pandas DF:

url2 = "http://cocl.us/Geospatial_data/Geospatial_Coordinates.csv"

df2 = pd.read_csv(url2)
df2.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


###  The coordinates provided are not in order as our main Data Frame. In this case, I will reorder the coordintates so they can match with each postal code and add them to the main Data Frame.

In [89]:
# Creating lists:

lat_coords = []
lng_coords = []

# Getting coordinates for every Postal Code:

x = len(df['Postal Code'])
i = 0

for y in range(x):
    while (df.iloc[y, 0] != df2.iloc[i, 0]):
        i = i + 1
    lat_coords.append(df2.iloc[i, 1])
    lng_coords.append(df2.iloc[i, 2])
    i = 0

In [90]:
# Adding coordinates to the main Data Frame:

df["Latitude"] = lat_coords
df["Longitude"] = lng_coords

df.head(15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
9,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
11,M3B,North York,Don Mills,43.745906,-79.352188
12,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
13,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


# PART 3 - Exploring and clustering the neighborhoods in Toronto.

### 1. Exploring the neighbrohood in Toronto using Folium.

In [91]:
# Installing geopy and folium:

!pip install geopy folium

# Importing libraries:

import folium
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim



In [92]:
# Getting Toronto, Canada coordinates:

address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="canada_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto, Canada are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto, Canada are 43.6534817, -79.3839347.


In [93]:
# Using Folium to generate the map:

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### 2. Grouping rows by neighborhood and by taking the mean of the frequency of occurrence of each category.

In [94]:
toronto_grouped = df.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Agincourt,43.794200,-79.262029
1,"Alderwood, Long Branch",43.602414,-79.543484
2,"Bathurst Manor, Wilson Heights, Downsview North",43.754328,-79.442259
3,Bayview Village,43.786947,-79.385975
4,"Bedford Park, Lawrence Manor East",43.733283,-79.419750
5,Berczy Park,43.644771,-79.373306
6,"Birch Cliff, Cliffside West",43.692657,-79.264848
7,"Brockton, Parkdale Village, Exhibition Place",43.636847,-79.428191
8,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
9,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.394420


### 3. Culstering Neighborhoods

### Running *k*-means to cluster the neighborhood into 5 clusters.

In [95]:
# setting the number of clusters:

kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering:

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe:

kmeans.labels_[0:10]

array([2, 1, 3, 3, 3, 4, 0, 4, 0, 4], dtype=int32)

### Adding labels to DataFrame.

In [96]:
toronto_grouped_clustering.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_grouped_clustering.head()

Unnamed: 0,Cluster Labels,Latitude,Longitude
0,2,43.7942,-79.262029
1,1,43.602414,-79.543484
2,3,43.754328,-79.442259
3,3,43.786947,-79.385975
4,3,43.733283,-79.41975


### Visualizing clustered neighborhoods.

In [97]:
# creating map:

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters:

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map

markers_colors = []
for lat, lon, cluster in zip(toronto_grouped_clustering['Latitude'], toronto_grouped_clustering['Longitude'], toronto_grouped_clustering['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters