# Part 1: Scraping the neighborhoods DataFrame

In [1]:
import requests
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
print("Libraries OK")

Libraries OK


In [2]:
# Scrape the data required using BeautifulSoup
res = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0] 
df = pd.read_html(str(table))[0]
df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...


In [3]:
# Drop the elements withouth information
df.dropna(inplace = True)

# Reset index
df.reset_index(inplace = True)

# Drop the columns added by reseting index
df.drop(columns = ['index'], inplace = True)

In [4]:
# In the assignment it says that postals codes with more than one neighborhood 
# appear several times, one for each neighborhood. But actually in the wikipedia page it
# appears only once and separates the neighborhoods with '/'. So what i'm going to do next
# is to convert the '/' into commas
i = 0
for p_code in df.loc[:,'Postal code']:     # iterate through all the postal codes
    if '/' in df.loc[i, 'Neighborhood']:
        df.loc[i, 'Neighborhood'] = df.loc[i, 'Neighborhood'].replace('/',',') # replace '/' for comma
    i += 1
        
df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


In [5]:
print('DataFrame.shape: {}'.format(df.shape))
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df['Borough'].unique()),
        df.shape[0]
    )
)

DataFrame.shape: (103, 3)
The dataframe has 10 boroughs and 103 neighborhoods.


# Part 2: Importing Latitude and Longitude

In [6]:
# load csv file
df_cordinates = pd.read_csv('Geospatial_Coordinates.csv')

# Change column name in order to be the same as the previous dataframe (this
# makes easier merging both DataFrames)
df_cordinates.rename(columns = {'Postal Code':'Postal code'}, inplace = True)

In [7]:
# Merge the DataFrames
df_new = pd.merge(df, df_cordinates, on = 'Postal code')

In [8]:
df_new

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business reply mail Processing CentrE,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,...",43.636258,-79.498509


In [9]:
# Generate map of the locations
import folium as fl
world_map = fl.Map(location = [df_new.loc[:,'Latitude'].mean(), df_new.loc[:,'Longitude'].mean()], zoom_start = 10)

i = 0
for p_code in df_new.loc[:, 'Postal code']:
    fl.Marker([df_new.loc[i, 'Latitude'], df_new.loc[i, 'Longitude']]).add_to(world_map)
    i += 1
world_map

# Part 3: Clustering

In [10]:
# Cluster the neighborhoods

from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

df_new_clustering = df_new.drop(columns= ['Neighborhood','Postal code', 'Borough'])

# run k-means clustering
kmeans = KMeans(n_clusters = kclusters, random_state=0).fit(df_new_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 4, 2, 3, 2, 1, 0, 4, 4, 2], dtype=int32)

In [11]:
# add clustering labels
df_new.insert(0, 'Cluster Labels', kmeans.labels_)

In [13]:
# Generate map of the neigborhoods segmentated
world_map = fl.Map(location = [df_new.loc[:,'Latitude'].mean(), df_new.loc[:,'Longitude'].mean()], zoom_start = 11)

# Define a functions which returns the color depending on the cluster label
def getcolor(i):
    if df_new.loc[i, 'Cluster Labels'] == 0:
        return 'red'
    elif df_new.loc[i, 'Cluster Labels'] == 1:
        return 'blue'
    elif df_new.loc[i, 'Cluster Labels'] == 2:
        return 'green'
    elif df_new.loc[i, 'Cluster Labels'] == 3:
        return 'yellow'
    elif df_new.loc[i, 'Cluster Labels'] == 4:
        return 'orange'

# Add a marker for each location
i = 0
for p_code in df_new.loc[:, 'Postal code']:
    fl.CircleMarker(
        [df_new.loc[i, 'Latitude'], df_new.loc[i, 'Longitude']],
        radius=5,
        color=getcolor(i),
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(world_map)
    i += 1
world_map

Looking at this program through GitHub repository, you can't see the map. I couldn't solve this problem but I attached an image in the asignment.