#### Author: Alina Prendes Roque, assignment Week 3, Course: Applied Data Science Capstone

# Segmenting and Clustering Neighborhoods in Toronto

## Part one of the assignment (Steps 1 to 7)

#### 1. Import the required libraries

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
from geopy.geocoders import Nominatim
import requests 
from pandas.io.json import json_normalize 
import json
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!pip install folium
import folium
print('folium installed')

folium installed


#### 2. Scrape the data from the website

In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error page into an explicit exception instead of
# letting it be parsed as if it were the real article.
response = requests.get(link, timeout=30)
response.raise_for_status()
raw_data = response.text

# The page is HTML, so parse it with an HTML parser; the 'xml' parser is meant for
# well-formed XML documents only.
soup = BeautifulSoup(raw_data, 'html.parser')

#### 3. Transform the table from the website into a dataframe

In [3]:
# Take the first table of the page and collect every data cell (<td>) in document order.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element was found in the scraped page')
fields = table.find_all('td')

# The cells are consumed as consecutive (postal code, borough, neighborhood) triples.
# Fail with a clear message if the cell count does not fit that layout, instead of
# hitting an IndexError halfway through the table.
if len(fields) % 3 != 0:
    raise ValueError(
        'Expected the table cells to come in groups of 3, found {} cells'.format(len(fields))
    )

cell_texts = [field.text.strip() for field in fields]
PostalCode = cell_texts[0::3]
Borough = cell_texts[1::3]
Neighborhood = cell_texts[2::3]

# Build the frame column by column; this avoids the list-of-rows + transpose detour.
df = pd.DataFrame({
    'Postal Code': PostalCode,
    'Borough': Borough,
    'Neighborhood': Neighborhood,
})
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### 4. Delete the rows where the Borough is not assigned

In [4]:
# Keep only the rows that have a real borough: drop the 'Not assigned' placeholder
# and any missing values. The filter is written as a re-assignment because calling
# replace(..., inplace=True) on a selected column is chained in-place mutation, which
# is deprecated in pandas and does not modify the parent frame under Copy-on-Write.
df = df[df['Borough'] != 'Not assigned'].dropna(subset=['Borough'])
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


#### 5. Assign Borough when Neighborhood is not assigned

In [5]:
# Step 1: when a row has a borough but its neighborhood is 'Not assigned',
# use the borough name as the neighborhood (this is what the step heading asks for).
neighborhood_filled = df['Neighborhood'].where(
    df['Neighborhood'] != 'Not assigned', df['Borough']
)

# Step 2: collapse rows that share a postal code and borough into a single row whose
# neighborhoods are joined with commas. reset_index() already yields the columns
# 'Postal Code', 'Borough', 'Neighborhood', so no renaming is needed afterwards.
df = (
    df.assign(Neighborhood=neighborhood_filled)
    .groupby(['Postal Code', 'Borough'])['Neighborhood']
    .apply(','.join)
    .reset_index()
)
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


#### 6. Show the dimensions of the dataframe

In [6]:
# Display (rows, columns) of the cleaned dataframe.
df.shape

(103, 3)

#### 7. Extra step, just to make sure that the output is correct

In [7]:
# Spot check: show the rows for a handful of postal codes, in the order listed.
column_names = ["Postal Code", "Borough", "Neighborhood"]
test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# DataFrame.append was removed in pandas 2.0 and growing a frame row by row is
# quadratic; collect the matching slices and concatenate them once instead.
# Postal codes that are not present simply contribute an empty slice.
test_df = pd.concat(
    [df.loc[df["Postal Code"] == postcode, column_names] for postcode in test_list],
    ignore_index=True,
)
test_df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


## Part two of the assignment (Steps 8 to 10)

#### 8. Read the file that contains the geographical coordinates

In [8]:
# Load the CSV served at this URL into a dataframe and preview its first rows.
# Per the recorded output it has the columns 'Postal Code', 'Latitude', 'Longitude'.
file= "http://cocl.us/Geospatial_data"
coordinates = pd.read_csv(file)
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### 9. Merge both dataframes

In [9]:
# Attach latitude/longitude to every postal-code row. A left join keeps all rows of
# `df`, even if a postal code has no entry in `coordinates`.
df_coordinates = pd.merge(
    df,
    coordinates,
    how='left',
    on="Postal Code",
)
df_coordinates.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


#### 10. This is again an extra step, just to make sure that the output is correct

In [10]:
# Spot check: show the merged rows (including coordinates) for a handful of
# postal codes, in the order listed.
column_names = ["Postal Code", "Borough", "Neighborhood", "Latitude", "Longitude"]
test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# DataFrame.append was removed in pandas 2.0 and growing a frame row by row is
# quadratic; collect the matching slices and concatenate them once instead.
# Postal codes that are not present simply contribute an empty slice.
test_df = pd.concat(
    [
        df_coordinates.loc[df_coordinates["Postal Code"] == postcode, column_names]
        for postcode in test_list
    ],
    ignore_index=True,
)
test_df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442


## Part three of the assignment (Steps 11 to 15)

#### 11. Let's obtain the latitude and longitude of Toronto

In [11]:
# Look up the coordinates of Toronto; they are used to center the maps below.
address = 'Toronto'
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear message
# instead of an AttributeError on `None.latitude`.
if location is None:
    raise ValueError('Could not geocode the address {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


#### 12. Let's create a map of Toronto with its neighborhoods

In [12]:
# Base map centered on the coordinates obtained in the previous step.
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle marker per postal-code row; the popup shows "<neighborhood>, <borough>".
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in df_coordinates[marker_columns].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_Toronto)

map_Toronto

#### 13. Now let's filter the boroughs that contain the word "York" and build a dataframe with them
#### Note: "York" was just randomly selected for the sake of the assignment; it could have been any other word

In [13]:
# Distinct borough names, in order of first appearance.
borough_names = df_coordinates.Borough.unique().tolist()

# Keep the names that contain the substring "York" (case-sensitive).
borough_with_York = [name for name in borough_names if "York" in name]
print(borough_with_York)

# Rows whose borough is one of the selected names, re-indexed from 0.
york_mask = df_coordinates['Borough'].isin(borough_with_York)
df_york = df_coordinates[york_mask].reset_index(drop=True)

# NOTE(review): this prints the shape of the unfiltered frame, not of df_york —
# confirm whether df_york.shape was intended.
print(df_coordinates.shape)
df_york.head()

['North York', 'East York', 'York']
(103, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M2H,North York,Hillcrest Village,43.803762,-79.363452
1,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
2,M2K,North York,Bayview Village,43.786947,-79.385975
3,M2L,North York,"York Mills, Silver Hills",43.75749,-79.374714
4,M2M,North York,"Willowdale, Newtonbrook",43.789053,-79.408493


#### 14. Let's cluster the neighborhoods and then merge the dataframes including the cluster numbers

In [14]:
# Number of clusters to split the postal-code areas into.
kclusters = 4

# Cluster on the coordinates only: drop the descriptive text columns.
# `columns=` is used because passing the axis positionally (`drop(labels, 1)`)
# is no longer accepted by pandas 2.x.
df_toronto_cluster = df_coordinates.drop(columns=["Postal Code", "Borough", "Neighborhood"])

# n_init is set explicitly: its default changed between scikit-learn releases, so
# pinning it (together with random_state) keeps the labels reproducible.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(df_toronto_cluster)

# Work on a copy so df_coordinates stays untouched, and attach one label per row.
df_toronto = df_coordinates.copy()
df_toronto["Cluster Labels"] = kmeans.labels_
print(df_toronto.shape)

# Preview only the first rows instead of dumping the whole frame.
df_toronto.head(10)

(103, 6)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,2
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,2
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,2
3,M1G,Scarborough,Woburn,43.770992,-79.216917,2
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,2
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,2
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029,2
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577,2
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476,2
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848,2


In [15]:
# Order the rows by cluster label so that members of a cluster are listed together.
df_toronto = df_toronto.sort_values(by=["Cluster Labels"])
df_toronto

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels
51,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675,0
56,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0
61,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817,0
60,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576,0
59,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752,0
36,M4C,East York,Woodbine Heights,43.695344,-79.318389,0
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,0
38,M4G,East York,Leaside,43.709060,-79.363452,0
39,M4H,East York,Thorncliffe Park,43.705369,-79.349372,0
40,M4J,East York,"East Toronto, Broadview North (Old East York)",43.685347,-79.338106,0


#### 15. Let's create a map of Toronto with the respective clusters

In [16]:
# Base map centered on Toronto.
map_df_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One evenly spaced rainbow color per cluster, as hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# One circle marker per postal-code row, colored by its cluster label.
for lat, lon, post, bor, poi, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Postal Code'], df_toronto['Borough'], df_toronto['Neighborhood'], df_toronto['Cluster Labels']):
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    # Labels run from 0 to kclusters-1, so index the palette directly; `cluster - 1`
    # only worked because index -1 wraps around to the last color.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_df_toronto)
map_df_toronto