Parse table from Wikipedia webpage

In [28]:
import pandas as pd # library for data analysis
from bs4 import BeautifulSoup # library to parse web pages
import requests # library to handle requests
import csv
import folium # map rendering library
from sklearn.cluster import KMeans
import numpy as np
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

Parse the website with BeautifulSoup - extracting the data from the table

In [30]:
# Fetch the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() reports an HTTP error here rather
# than letting an error page be parsed as if it were the article.
req = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
                   timeout=30)
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')

# Locate the postal code table; fail with a clear message if it is not there.
table = soup.find('table', attrs={'class':'wikitable sortable'})
if table is None:
    raise ValueError("No table with class 'wikitable sortable' found on the page")
# Search the whole table when the markup carries no explicit <tbody>.
table_body = table.find('tbody')
if table_body is None:
    table_body = table

# Column names, taken from the <th> cells in document order.
table_headers = [header.get_text().strip() for header in table_body.find_all('th')]

# One dict per table row, keyed by column name. Rows without <td> cells
# (such as a header-only row) yield an empty dict and are dropped below.
data = []
rows = table_body.find_all('tr')
for row in rows:
    cells = row.find_all('td')
    row_data = {table_headers[position]: cell.get_text().strip()
                for position, cell in enumerate(cells)}

    # keep rows that have some data and whose Borough is not unassigned
    if row_data and row_data.get('Borough', '') != 'Not assigned':
        data.append(row_data)



Load the data into a DataFrame

In [31]:
# Build the frame from the scraped rows and give the postal code column a
# space-free name in the same expression.
df = pd.DataFrame(data).rename(columns={"Postal Code": "PostalCode"})
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [32]:
# show the (rows, columns) shape of the frame built from the scraped rows
df.shape

(103, 3)

In [33]:
# Build a lookup of postal code -> coordinates from the local CSV file.
# The reader gets its own name so that the scraped `data` list, which the
# DataFrame cell reads, is not overwritten when this cell runs.
coordinate_data = {}
# newline='' is what the csv module expects for files handed to its readers.
with open('Geospatial_Coordinates.csv', newline='') as in_file:
    reader = csv.DictReader(in_file)
    for row in reader:
        # Values are stored as the strings read from the file.
        coordinate_data[row['Postal Code']] = {'longitude': row['Longitude'],
                                               'latitude': row['Latitude']}

def get_coordinates(postal_code):
    """Look up the coordinates recorded for ``postal_code``.

    Reads the module-level ``coordinate_data`` mapping and returns a
    ``(longitude, latitude)`` tuple holding whatever values are stored there.
    A component that is not present comes back as ``None``, so an unknown
    postal code gives ``(None, None)``.
    """
    entry = coordinate_data[postal_code] if postal_code in coordinate_data else {}
    return tuple(entry.get(axis) for axis in ('longitude', 'latitude'))

Get the longitude and latitude for each postal code



In [34]:
# Look up every postal code once, then split the (longitude, latitude) pairs
# into two float lists that line up with the rows of df.
coordinate_pairs = [get_coordinates(postal_code=code) for code in df['PostalCode']]
longitude = [float(row_long) for row_long, _ in coordinate_pairs]
latitude = [float(row_lat) for _, row_lat in coordinate_pairs]

df['Latitude'] = latitude
df['Longitude'] = longitude

Show that all the data is in the DataFrame

In [35]:
# display the frame, which now carries the Latitude and Longitude columns
df


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [36]:
# set number of clusters
kclusters = 5

# Run k-means clustering on the coordinates. n_init is pinned explicitly:
# scikit-learn changed its default from 10 to 'auto' in version 1.4 (and warns
# about the unset value in the releases before that), so leaving it out makes
# the resulting labels depend on the installed version.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(df[['Longitude','Latitude']])

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([4, 4, 2, 0, 2, 1, 3, 4, 4, 2])

In [37]:

# Add the clustering labels as the first column. An existing 'Cluster Labels'
# column is dropped first so the cell can be re-run: DataFrame.insert raises
# ValueError when the column is already present.
df = df.drop(columns='Cluster Labels', errors='ignore')
df.insert(0, 'Cluster Labels', kmeans.labels_)

df.head()

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,4,M3A,North York,Parkwoods,43.753259,-79.329656
1,4,M4A,North York,Victoria Village,43.725882,-79.315572
2,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,0,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,2,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [38]:
# Colour scheme for the clusters: one evenly spaced colour from the rainbow
# colormap per cluster, converted to hex strings for use as marker colours.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

ValueError: Location should consist of two numerical values, but [43.7532586, 43.7258823, 43.6542599, 43.718518, 43.6623015, 43.6678556, 43.8066863, 43.7459058, 43.7063972, 43.6571618, 43.709577, 43.6509432, 43.7845351, 43.7258997, 43.6953439, 43.6514939, 43.6937813, 43.6435152, 43.7635726, 43.6763574, 43.6447708, 43.6890256, 43.7709921, 43.7090604, 43.6579524, 43.669542, 43.773136, 43.8037622, 43.7543283, 43.7053689, 43.6505712, 43.6690051, 43.7447342, 43.7785175, 43.7679803, 43.685347, 43.6408157, 43.6479267, 43.7279292, 43.7869473, 43.7374732, 43.6795571, 43.6471768, 43.6368472, 43.7111117, 43.7574902, 43.7390146, 43.6689985, 43.6481985, 43.7137562, 43.7563033, 43.716316, 43.789053, 43.7284964, 43.6595255, 43.7332825, 43.6911158, 43.7247659, 43.692657, 43.7701199, 43.7616313, 43.7280205, 43.7116948, 43.6731853, 43.706876, 43.7574096, 43.7527583, 43.7127511, 43.6969476, 43.6616083, 43.696319, 43.7500715, 43.7827364, 43.7153834, 43.6727097, 43.6489597, 43.6369656, 43.6889054, 43.7942003, 43.7043244, 43.6626956, 43.6515706, 43.7816375, 43.6895743, 43.6532057, 43.8152522, 43.6864123, 43.6289467, 43.6056466, 43.7394164, 43.7995252, 43.6795626, 43.6464352, 43.6024137, 43.7067483, 43.8361247, 43.667967, 43.6484292, 43.6536536, 43.6658599, 43.6627439, 43.6362579, 43.6288408] of type <class 'list'> is not convertible to float.

Find the average position to center the map

In [None]:
def Average(lst):
    """Return the arithmetic mean of the values in ``lst``.

    ``lst`` must work with both ``sum()`` and ``len()``. An empty sequence
    raises ZeroDivisionError.
    """
    total = sum(lst)
    count = len(lst)
    return total / count

# The mean latitude and longitude of all postal codes, used to centre the map.
avg_latitude, avg_longitude = Average(latitude), Average(longitude)

In [None]:
# create the map, centred on the average position of the postal codes
map_center = [avg_latitude, avg_longitude]
map_clusters = folium.Map(location=map_center, zoom_start=11)


Add positions to the map

In [40]:
# Draw one circle marker per postal code, coloured by its k-means cluster.
# The columns are iterated directly (instead of with iterrows) so each value
# keeps the dtype of its own column.
marker_columns = zip(df['PostalCode'], df['Neighborhood'], df['Latitude'],
                     df['Longitude'], df['Cluster Labels'])
for postal_code, neighbour, lat, lon, cluster in marker_columns:
    # Cluster labels are 0-based, so they index the colour list directly.
    marker_color = rainbow[int(cluster)]
    # The popup names the postal code, its neighbourhood(s) and the cluster number.
    label = folium.Popup('{} {} - Cluster {}'.format(postal_code, neighbour, cluster),
                         parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters