# Coursera Data Science Capstone

## Segmenting and Clustering Neighbourhoods in Toronto

### Part 1

In [2]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Fetch the Wikipedia page that lists the Toronto ("M") postal codes
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# requests has no default timeout, so set one to avoid hanging the kernel
response = requests.get(url, timeout=30)
# Stop on an HTTP error instead of parsing an error page as if it were data
response.raise_for_status()

# Parse the HTML and take the first <table> element on the page
data = response.text
soup = BeautifulSoup(data,'html.parser')
wiki_table=soup.find('table')
if wiki_table is None:
    raise ValueError(f'No <table> element found at {url}')

# Build the DataFrame. The markup is wrapped in StringIO because passing
# literal HTML text to read_html is deprecated in recent pandas releases;
# a file-like object is accepted by old and new versions alike.
df = pd.read_html(StringIO(str(wiki_table)))[0]
df.head(20)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [4]:
# Keep only the rows whose "Borough" does not contain 'Not assigned';
# reset_index() renumbers the rows and keeps the old labels in an 'index' column
df_NA = (
    df
    .loc[lambda frame: frame['Borough'].str.contains("Not assigned").eq(False)]
    .reset_index()
)
df_NA.head(20)

Unnamed: 0,index,Postal Code,Borough,Neighbourhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,9,M1B,Scarborough,"Malvern, Rouge"
7,11,M3B,North York,Don Mills
8,12,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [5]:
# (rows, columns) of the frame left after removing the 'Not assigned' boroughs
df_NA.shape

(103, 4)

### Part 2

Import Geospatial Data

In [6]:
!wget -O zipcodes.csv http://cocl.us/Geospatial_data
    

--2021-01-02 18:52:24--  http://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 169.63.96.194, 169.63.96.176
Connecting to cocl.us (cocl.us)|169.63.96.194|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cocl.us/Geospatial_data [following]
--2021-01-02 18:52:24--  https://cocl.us/Geospatial_data
Connecting to cocl.us (cocl.us)|169.63.96.194|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2021-01-02 18:52:25--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.29.197
Connecting to ibm.box.com (ibm.box.com)|107.152.29.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2021-01-02 18:52:25--  https://ibm.box.com/public/static/9afzr83p

In [7]:
# Load the downloaded coordinates file into a DataFrame
zipcode_df = pd.read_csv(filepath_or_buffer='zipcodes.csv')

In [8]:
# Preview the first five rows of the coordinates table
zipcode_df.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Join on 'Postal Code' Column

In [9]:
# Attach Latitude/Longitude to each postal code by joining the two frames
# on their shared 'Postal Code' column (merge defaults to an inner join)
df_toronto = df_NA.merge(zipcode_df, on='Postal Code')
df_toronto.head()

Unnamed: 0,index,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,2,M3A,North York,Parkwoods,43.753259,-79.329656
1,3,M4A,North York,Victoria Village,43.725882,-79.315572
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [10]:
# (rows, columns) of the merged frame
df_toronto.shape

(103, 6)

### Part 3

Import Libraries

In [11]:
from IPython.display import Image 
from IPython.core.display import HTML
! pip install folium==0.5.0
import folium

Collecting folium==0.5.0
  Downloading folium-0.5.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 8.8 MB/s  eta 0:00:01
[?25hCollecting branca
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Building wheels for collected packages: folium
  Building wheel for folium (setup.py) ... [?25ldone
[?25h  Created wheel for folium: filename=folium-0.5.0-py3-none-any.whl size=76240 sha256=3081b6dec54b77ebf0157a9838f9bb3913395b53edc6a03e15f373e7f46be8ab
  Stored in directory: /tmp/wsuser/.cache/pip/wheels/b2/2f/2c/109e446b990d663ea5ce9b078b5e7c1a9c45cca91f377080f8
Successfully built folium
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.5.0


### Explore and cluster the neighborhoods in Toronto

In [12]:
# Keep only the boroughs whose name contains 'Toronto', then renumber rows from 0
df_toronto = (
    df_toronto
    .loc[lambda frame: frame['Borough'].str.contains('Toronto')]
    .reset_index(drop=True)
)
df_toronto.head()

Unnamed: 0,index,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,13,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,22,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,30,M4E,East Toronto,The Beaches,43.676357,-79.293031


### Encode Borough as an Integer

In [13]:
# Encode each borough name as an integer in a new 'Index' column so the
# categorical Borough can be used as a cluster number
df_toronto['Index'] = df_toronto['Borough'].replace({
    'Downtown Toronto': 1,
    'Central Toronto': 2,
    'West Toronto': 3,
    'East Toronto': 4,
})
df_toronto.head()

Unnamed: 0,index,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Index
0,4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1
1,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1
2,13,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,1
3,22,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1
4,30,M4E,East Toronto,The Beaches,43.676357,-79.293031,4


Import needed libraries for the map

In [17]:
!pip install geopy
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np 



In [24]:
# Geocode Toronto to get the coordinates on which the map is centred

address = 'Toronto'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; stop with a clear message
# instead of an AttributeError on the attribute access below
if location is None:
    raise ValueError(f'Could not geocode address {address!r}')
latitude = location.latitude
longitude = location.longitude
print(f'The geograpical coordinate of Toronto are {latitude}, {longitude}.')

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


Set up the map of Toronto, then iterate through the rows to add a marker for each postal code, coloured by borough

In [30]:
# Number of clusters = number of distinct borough codes in the 'Index' column
kclusters = len(df_toronto['Index'].unique())

# Create the map of Toronto, centred on the geocoded coordinates
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced colour from the rainbow colormap per cluster, as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add one circle marker per postal code, coloured by its borough code
for lat, lon, borough, cluster in zip(
        df_toronto['Latitude'],
        df_toronto['Longitude'],
        df_toronto['Borough'],
        df_toronto['Index']):
    # Label the popup with this row's borough; stringifying the whole
    # 'Borough' column here would put every borough into every popup
    label = folium.Popup(f'{borough} Cluster {cluster}', parse_html=True)
    # Borough codes start at 1, so shift by one to index the colour list
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

In [31]:
# Display the folium map as the cell output
map_clusters