In [44]:
from io import StringIO

import pandas as pd
import requests 

# Part 1: Get the data from Wikipedia

In [45]:
#Interrogate the target url exactly once. The timeout keeps the notebook from
#hanging on a stalled connection, and raise_for_status() turns an HTTP error
#page into an explicit failure instead of parsing it as if it were the data.
website_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(website_url, timeout = 30)
response.raise_for_status()
result = response.text

#Define the class of the element we're looking for [wikipedia table]
class_to_search = 'wikitable sortable'

#Retrieve all the matching tables from the HTML downloaded above (no second
#request). pd.read_html raises ValueError when nothing matches, so that case is
#translated into the notebook's own error message; the emptiness check is kept
#as a final safety net.
try:
    tables = pd.read_html(StringIO(result), attrs = {'class': class_to_search})
except ValueError as error:
    raise Exception('The source page contains no tables') from error
if not tables:
    raise Exception('The source page contains no tables')

#Get the first table (there is only one in the source page)
df = tables[0]

Clean the imported data: rename the 'Postal Code' column to 'PostalCode' and drop the records whose borough is 'Not assigned'.

In [46]:
#Normalise the key column name so it can be used as a join key later on
column_renames = {'Postal Code' : 'PostalCode'}
df.rename(column_renames, axis = 'columns', inplace = True)

#Remove, in place, every row whose Borough is the placeholder 'Not assigned'
is_unassigned = df['Borough'].eq('Not assigned')
df.drop(index = df.index[is_unassigned], inplace = True)

# Part 2: Add the geographical data

In [47]:
#Load the postal-code coordinates from the remote csv and preview the first rows
csv_source = 'https://cocl.us/Geospatial_data'
df_coordinates = pd.read_csv(filepath_or_buffer = csv_source)
df_coordinates.head(n = 5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [48]:
#Give the coordinates table the same key column name as the neighbourhood table
coordinate_renames = {'Postal Code' : 'PostalCode'}
df_coordinates.rename(coordinate_renames, axis = 'columns', inplace = True)
df_coordinates.head(n = 5)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Add longitude and latitude data to the original neighbourhood dataset

In [49]:
#Attach Latitude/Longitude to each postal code. An inner join on 'PostalCode'
#keeps only the postal codes present in both tables.
df = df.merge(df_coordinates, on = 'PostalCode', how = 'inner')
df.head(n = 5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Part 3: Do the clustering

Let's work only with boroughs that contain the word Toronto

In [50]:
#Drop the records that do not have the keyword 'Toronto' in the 'Borough' field.
#regex = False matches the keyword literally; na = False treats a missing Borough
#as "not Toronto" so the boolean negation below cannot fail on NaN values.
is_toronto = df['Borough'].str.contains('Toronto', regex = False, na = False)

#Drop in place so that later cells keep working on the same dataframe object
df.drop(index = df.index[~is_toronto], inplace = True)

Cluster the geographical data using k-means (scikit-learn implementation). 

In [51]:
from sklearn.cluster import KMeans

#Set the colours for clustering. 
colours = ['#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e']

#Let the number of clusters be the same as the number of colours defined. Add or remove colours
#to change the number of clusters
k = len(colours)

#Do the clustering on the (Latitude, Longitude) pairs.
#n_init is pinned because the library default changed between scikit-learn
#releases; together with random_state this keeps the labels reproducible.
coordinates = df[['Latitude', 'Longitude']].to_numpy()
clusters = KMeans(n_clusters = k, n_init = 10, random_state = 0).fit(coordinates)

#Add the clusters' labels to the dataframe (one label per row, in row order)
df['Cluster'] = clusters.labels_
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0
19,M4E,East Toronto,The Beaches,43.676357,-79.293031,1


Finally, let's visualize the resulting clusters

In [52]:
import folium
from geopy.geocoders import Nominatim

The code in the following cell was used to retrieve the center of the map. Now we use the cached value to avoid calling the service again.

In [53]:
#One-off geocoding lookup for the map center, left disabled so the notebook
#does not call the geocoding service on every run.
#geolocator = Nominatim(user_agent = 'coursera_capstone_project')
#location = geolocator.geocode("Toronto")
#map_center = [location.latitude, location.longitude]

In [56]:
#Use the cached value ([latitude, longitude] of the map center)
map_center = [43.6534817, -79.3839347]

#Create the map
clusters_map = folium.Map(location = map_center, zoom_start = 12)

#Add one circle marker per row, coloured by its cluster label.
#The cluster label doubles as the index into `colours`, which works because
#the number of clusters was set to len(colours).
for lat, lon, poi, cluster in zip(df['Latitude'], df['Longitude'], df['Neighborhood'], df['Cluster']):
    cluster_colour = colours[cluster]
    #parse_html = True makes folium treat the label as text rather than raw HTML
    label = folium.Popup(f'{poi} Cluster {cluster}', parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius = 5,
        popup = label,
        color = cluster_colour,
        fill = True,
        fill_color = cluster_colour,
        fill_opacity=0.7).add_to(clusters_map)

#Last expression of the cell: renders the interactive map
clusters_map