**Explore and cluster the neighborhoods in Toronto. I only work with boroughs that contain the word Toronto**

In [2]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

Scrape the table of postal codes from Wikipedia and load it into a pandas DataFrame.

In [3]:
# Download the Wikipedia page that lists the "M" postal codes.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were data.
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, 'lxml')
# The postal-code table is taken to be the first <table> element on the page.
table = soup.find_all('table')[0]
# Hand read_html a file-like object: passing a raw HTML string is deprecated in
# recent pandas releases.
df = pd.read_html(StringIO(str(table)), header=0)[0]
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


Drop the rows whose borough is "Not assigned".

In [4]:
# Turn the "Not assigned" placeholder into NaN everywhere, keep only the rows
# that still have a borough, and renumber the index from zero.
df = (
    df.replace("Not assigned", np.nan)
      .dropna(subset=['Borough'], axis=0)
      .reset_index(drop=True)
)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


Rows that share a postal code and borough are combined into a single row, with their neighbourhoods separated by commas.

In [5]:
# Collapse rows that share a postal code and borough into one row whose
# Neighbourhood value is the comma-joined list of that group's neighbourhoods.
# Values are stringified with astype(str) before joining.
neighbourhoods_by_code = df.groupby(['Postcode', 'Borough'])['Neighbourhood']
df1 = neighbourhoods_by_code.apply(
    lambda names: ','.join(names.astype(str))
).reset_index()
df1.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


Inspect the tail of the table: some postal codes (e.g. M7A, Queen's Park) have no neighbourhood assigned.

In [6]:
# Show the last 20 rows of the grouped frame.
df1.tail(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
83,M6R,West Toronto,"Parkdale,Roncesvalles"
84,M6S,West Toronto,"Runnymede,Swansea"
85,M7A,Queen's Park,
86,M7R,Mississauga,Canada Post Gateway Processing Centre
87,M7Y,East Toronto,Business reply mail Processing Centre969 Eastern
88,M8V,Etobicoke,"Humber Bay Shores,Mimico South,New Toronto"
89,M8W,Etobicoke,"Alderwood,Long Branch"
90,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
91,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout..."
92,M8Z,Etobicoke,"Kingsway Park South West,Mimico NW,The Queensw..."


Where no neighbourhood is assigned, the borough name is used as the neighbourhood.

In [7]:
# Rows without a neighbourhood are matched on the literal text "nan" (the
# grouping step stringifies values before joining them). Assign through .loc on
# df1 itself: calling replace(..., inplace=True) on the df1['Neighbourhood']
# selection is chained in-place mutation and is not guaranteed to write back to
# df1 (it does not under pandas Copy-on-Write).
no_neighbourhood = df1['Neighbourhood'] == "nan"
df1.loc[no_neighbourhood, 'Neighbourhood'] = df1.loc[no_neighbourhood, 'Borough']
df1.tail(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
83,M6R,West Toronto,"Parkdale,Roncesvalles"
84,M6S,West Toronto,"Runnymede,Swansea"
85,M7A,Queen's Park,Queen's Park
86,M7R,Mississauga,Canada Post Gateway Processing Centre
87,M7Y,East Toronto,Business reply mail Processing Centre969 Eastern
88,M8V,Etobicoke,"Humber Bay Shores,Mimico South,New Toronto"
89,M8W,Etobicoke,"Alderwood,Long Branch"
90,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
91,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout..."
92,M8Z,Etobicoke,"Kingsway Park South West,Mimico NW,The Queensw..."


Print the number of rows of your dataframe

In [8]:
# (rows, columns) of the grouped frame: one row per Postcode/Borough pair.
df1.shape

(103, 3)

We add the geographical coordinates of each postal code

In [9]:
# Read the CSV of postal-code coordinates found at url1 into df2 and display it.
url1 = 'http://cocl.us/Geospatial_data'
df2 = pd.read_csv(url1)
df2

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [10]:
# Attach coordinates by matching postal codes rather than by row position, so
# the result stays correct even if the two frames differ in order or length.
coordinates = df2[['Postal Code', 'Latitude', 'Longitude']].rename(
    columns={'Postal Code': 'Postcode'})
df1 = (
    # Dropping any earlier coordinate columns keeps the cell safe to re-run.
    df1.drop(columns=['Latitude', 'Longitude'], errors='ignore')
       # many_to_one: raise if the coordinates table lists a postal code twice.
       .merge(coordinates, on='Postcode', how='left', validate='many_to_one')
)
df1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


**We select Toronto**

In [11]:
# Keep only the boroughs whose name contains "Toronto" and renumber the rows.
# NOTE(review): this rebinds df2, which previously held the coordinates table.
is_toronto = df1['Borough'].str.contains("Toronto")
df2 = df1.loc[is_toronto].reset_index(drop=True)
df2.head(n=12)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049


**I separate the values of Neighbourhood**

In [12]:
def splitDataFrameList(df,target_column,separator):
    """Expand a delimiter-separated column into one row per separated value.

    Each input row is repeated once for every piece obtained by splitting its
    ``target_column`` value on ``separator``; all other columns are copied
    unchanged. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to expand.
    target_column : str
        Name of the column whose string values are split.
    separator : str
        Literal delimiter passed to ``str.split``.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index and the same columns, in the same
        order, as ``df`` (also when ``df`` has no rows).

    Raises
    ------
    AttributeError
        If a value in ``target_column`` is not a string (e.g. NaN).
    """
    new_rows = []
    # Iterate over plain dict records instead of accumulating rows as a side
    # effect of DataFrame.apply, whose callback is not meant to mutate state.
    for record in df.to_dict(orient='records'):
        for piece in record[target_column].split(separator):
            expanded = dict(record)
            expanded[target_column] = piece
            new_rows.append(expanded)
    # Passing the columns explicitly preserves the schema and column order,
    # including for an empty input.
    return pd.DataFrame(new_rows, columns=df.columns)

In [13]:
# Produce one row per neighbourhood by splitting the comma-joined column.
df3 = splitDataFrameList(df=df2, target_column='Neighbourhood', separator=',')
df3.head(n=20)

Unnamed: 0,Borough,Latitude,Longitude,Neighbourhood,Postcode
0,East Toronto,43.676357,-79.293031,The Beaches,M4E
1,East Toronto,43.679557,-79.352188,The Danforth West,M4K
2,East Toronto,43.679557,-79.352188,Riverdale,M4K
3,East Toronto,43.668999,-79.315572,The Beaches West,M4L
4,East Toronto,43.668999,-79.315572,India Bazaar,M4L
5,East Toronto,43.659526,-79.340923,Studio District,M4M
6,Central Toronto,43.72802,-79.38879,Lawrence Park,M4N
7,Central Toronto,43.712751,-79.390197,Davisville North,M4P
8,Central Toronto,43.715383,-79.405678,North Toronto West,M4R
9,Central Toronto,43.704324,-79.38879,Davisville,M4S


In [14]:
# (rows, columns) after splitting: one row per neighbourhood.
df3.shape

(74, 5)

We create a map of Toronto

In [15]:
# Shell out to conda to install folium pinned to 0.5.0 from the conda-forge
# channel (--yes skips the confirmation prompt), then import it.
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done

# All requested packages already installed.



I take the mean of Latitude and the mean of Longitude to create the map

In [16]:
# Centre the map on the mean latitude/longitude of the selected rows.
# NOTE(review): the name `map` shadows Python's built-in map() in this notebook.
latitude, longitude = (df3[column].mean() for column in ('Latitude', 'Longitude'))
map = folium.Map(location=[latitude, longitude], zoom_start=10)
map

Neighborhoods superimposed on top

In [20]:
# Add one circle marker per row, with a "<neighbourhood>, <borough>" popup.
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
for lat, lon, borough, neighborhood in df3[marker_columns].itertuples(index=False, name=None):
    popup_text = ', '.join([str(neighborhood), str(borough)])
    label = folium.Popup(popup_text, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='yellow',
        fill_opacity=0.5,
        parse_html=False)
    marker.add_to(map)

map

Cluster Neighborhoods  
4 clusters: East, West, Central, Downtown

In [22]:
from sklearn.cluster import KMeans

In [23]:
# Clustering features are the two coordinate columns. Selecting them by name
# (instead of dropping the others with a positional axis argument, which
# pandas 2.x rejects) also keeps any column added to df3 later, such as the
# cluster labels, out of the features when this cell is re-run.
df4 = df3[['Latitude', 'Longitude']].copy()

In [24]:
kclusters = 4
# Build the k-means estimator with a fixed seed, then fit it on the features.
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(df4)
# Preview the cluster label assigned to each row (the first 80 at most).
kmeans.labels_[:80]

array([3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 3], dtype=int32)

In [25]:
# Cluster centres found by the fitted model, one row per cluster, in the same
# column order as the features the model was fitted on.
centroids = kmeans.cluster_centers_
print(centroids)

[[ 43.69305396 -79.39856317]
 [ 43.65273031 -79.44587374]
 [ 43.64784841 -79.38446005]
 [ 43.67081971 -79.3272902 ]]


In [26]:
# add clustering labels
df3['Cluster Labels'] = kmeans.labels_
df3.head(20)

Unnamed: 0,Borough,Latitude,Longitude,Neighbourhood,Postcode,Cluster Labels
0,East Toronto,43.676357,-79.293031,The Beaches,M4E,3
1,East Toronto,43.679557,-79.352188,The Danforth West,M4K,3
2,East Toronto,43.679557,-79.352188,Riverdale,M4K,3
3,East Toronto,43.668999,-79.315572,The Beaches West,M4L,3
4,East Toronto,43.668999,-79.315572,India Bazaar,M4L,3
5,East Toronto,43.659526,-79.340923,Studio District,M4M,3
6,Central Toronto,43.72802,-79.38879,Lawrence Park,M4N,0
7,Central Toronto,43.712751,-79.390197,Davisville North,M4P,0
8,Central Toronto,43.715383,-79.405678,North Toronto West,M4R,0
9,Central Toronto,43.704324,-79.38879,Davisville,M4S,0


In [27]:
import matplotlib.cm as cm
from matplotlib import colors as colors

In [30]:
# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, nei, cluster in zip(df3['Latitude'], df3['Longitude'], df3['Neighbourhood'], df3['Cluster Labels']):
    label = folium.Popup(str(nei) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.2).add_to(map)
       
map
