# Part 1: Acquiring the data

First, import the libraries and create a BeautifulSoup object.

In [13]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Wikipedia list of Canadian postal codes beginning with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error (4xx/5xx) instead of
# letting an error page be parsed as if it were the postal-code table.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(data, 'html.parser')


Next, create a pandas DataFrame with the three requested columns. From inspecting the Wikipedia page, every assigned borough also has at least one neighbourhood listed in brackets, so we can simply split each cell into two columns; boroughs with multiple neighbourhoods have them separated by commas.

In [43]:
# Walk every <td> of the first table on the page and keep one record per
# assigned postal code; cells whose span text is 'Not assigned' are skipped.
table = soup.find('table')
table_contents = []
for td in table.find_all('td'):
    span_text = td.span.text
    if span_text == 'Not assigned':
        continue

    # The first three characters of the <p> text are used as the postal code.
    postal_code = td.p.text[:3]

    # The code relies on the span text containing "(": everything before it is
    # the borough, the part after it holds the neighbourhood list.
    pieces = span_text.split('(')
    borough = pieces[0]

    # Clean the bracketed part: drop closing brackets, turn " /" separators
    # into commas, and trim surrounding spaces.
    neighborhoods = pieces[1]
    neighborhoods = neighborhoods.strip(')')
    neighborhoods = neighborhoods.replace(' /', ',')
    neighborhoods = neighborhoods.replace(')', ' ')
    neighborhoods = neighborhoods.strip(' ')

    table_contents.append({
        'PostalCode': postal_code,
        'Borough': borough,
        'Neighborhood': neighborhoods,
    })

# One row per record, columns in insertion order of the dict keys.
df = pd.DataFrame(table_contents)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [21]:
# (rows, columns) of the scraped DataFrame.
df.shape

(103, 3)

So we have a total of 103 rows, one per assigned postal code (note that several postal codes share the same borough).

# Part 2: Using a CSV file to get the geographical locations of neighbourhoods

First, read data into pandas dataframe

In [30]:
import os

# Location of the CSV that maps each postal code to a latitude/longitude pair.
# The original local path is kept as the default so existing runs are
# unaffected; on any other machine, set the GEOSPATIAL_COORDINATES_CSV
# environment variable to wherever the file lives instead of editing this cell.
coordinates_csv = os.environ.get(
    "GEOSPATIAL_COORDINATES_CSV",
    "c:/Users/alexl/Downloads/Geospatial_Coordinates.csv",
)
coordinates = pd.read_csv(coordinates_csv)
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Then, look up each postal code from the original DataFrame `df` in the coordinates table to build new Latitude and Longitude columns.

In [44]:
# Index the coordinates by postal code once, so each lookup is a direct index
# hit instead of re-filtering the whole coordinates frame twice per row.
coords_by_code = coordinates.set_index('Postal Code')[['Latitude', 'Longitude']]

# Validate up front so a bad input produces a message naming the offending
# postal codes. (The previous float(<array>) conversion raised an opaque
# TypeError in both cases, and converting a 1-element array with float() is
# itself deprecated in recent NumPy releases.)
if not coords_by_code.index.is_unique:
    duplicated_codes = sorted(set(coords_by_code.index[coords_by_code.index.duplicated()]))
    raise ValueError('Duplicate postal codes in coordinates: {}'.format(duplicated_codes))
missing_codes = sorted(set(df['PostalCode']) - set(coords_by_code.index))
if missing_codes:
    raise ValueError('No coordinates found for postal codes: {}'.format(missing_codes))

# Plain Python lists of floats, in the same row order as df.
latitude = df['PostalCode'].map(coords_by_code['Latitude']).astype(float).tolist()
longitude = df['PostalCode'].map(coords_by_code['Longitude']).astype(float).tolist()
df['Latitude'] = latitude
df['Longitude'] = longitude
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


Then, we can visualize those neighbourhoods on a map using the folium library: [static image of the map](https://github.com/a13lee/Coursera_Capstone/blob/f5cdd47477abea0a59971aebe117229a22bab951/image1.JPG)

In [52]:
import folium

# Base map centred on the centre of Toronto.
toronto_center = [43.6532,  -79.3832]
map_toronto = folium.Map(location=toronto_center, zoom_start=10)

# Appearance shared by every marker.
# NOTE(review): parse_html=False is passed to CircleMarker exactly as before;
# presumably it has no effect there (the Popup does the HTML escaping) — confirm.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle marker per row, with a "neighbourhood, borough" popup.
rows = zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood'])
for lat, lng, borough, neighborhood in rows:
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=popup, **marker_style)
    marker.add_to(map_toronto)

map_toronto

# Part 3: Clustering neighbourhoods in Toronto

Here, I will create a simple clustering model for the neighbourhoods based solely on longitude and latitude.

In [48]:
from sklearn.cluster import KMeans

# arbitrarily chosen number of clusters
kclusters = 5

# Keep only the columns used by the model (latitude and longitude).
# `columns=` replaces the positional axis argument `df.drop([...], 1)`, which
# pandas 2.0 no longer accepts.
clustering_df = df.drop(columns=['PostalCode', 'Borough', 'Neighborhood'])

# create the model
# random_state fixes the centroid initialisation so re-runs give the same
# clusters; n_init=10 pins the number of initialisations that older
# scikit-learn versions used by default, so newer versions (whose default
# differs) keep producing the same result without a FutureWarning.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10)

# fit it to the data
kmeans.fit(clustering_df)

KMeans(n_clusters=5, random_state=0)

Next, I add a new column to the above dataframe with the assigned cluster from the KMeans algorithm.

In [54]:
# Attach the label k-means assigned to each row as a 'Cluster' column
# (kmeans.labels_ is in the same row order as the data the model was fitted on).
clustering_df = clustering_df.assign(Cluster=kmeans.labels_)
clustering_df.head()

Unnamed: 0,Latitude,Longitude,Cluster
0,43.753259,-79.329656,4
1,43.725882,-79.315572,4
2,43.65426,-79.360636,2
3,43.718518,-79.464763,0
4,43.662301,-79.389494,2


Finally, we can now visualize the different neighbourhoods with their assigned clusters, represented by different colours. https://github.com/a13lee/Coursera_Capstone/blob/f5cdd47477abea0a59971aebe117229a22bab951/image2.JPG

In [57]:
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np

# One distinct hex colour per cluster, evenly spaced along the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Base map centred on the centre of Toronto.
map_clusters = folium.Map(location=[43.6532,  -79.3832], zoom_start=10)

# One circle marker per row, coloured by its assigned cluster. df and
# clustering_df are zipped together, which relies on them sharing row order.
for lat, lon, poi, cluster in zip(df['Latitude'], df['Longitude'], df['Neighborhood'], clustering_df['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly. The
    # previous `cluster-1` only worked for cluster 0 because index -1 wraps
    # around to the last colour.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters