Parse table from Wikipedia webpage

In [1]:
import pandas as pd # library for data analysis
from bs4 import BeautifulSoup # library to parse web pages
import requests # library to handle requests
import csv
import folium # map rendering library


Parse the website with BeautifulSoup - extracting the data from the table

In [3]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() turns an HTTP error response into an exception instead of
# letting an error page be parsed as if it were the article.
req = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
                   timeout=30)
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')

# Locate the table by its CSS classes and fail with a clear message if the
# page layout no longer contains it.
table = soup.find('table', attrs={'class':'wikitable sortable'})
if table is None:
    raise ValueError("No table with class 'wikitable sortable' found on the page")

# Search inside <tbody> when the markup has one, otherwise search the table itself.
table_body = table.find('tbody')
if table_body is None:
    table_body = table

# Header texts, in document order; they become the keys of each row dict.
table_headers = [header.get_text().strip() for header in table_body.find_all('th')]

# One dict per table row, keyed by the header at the same cell position.
data = []
rows = table_body.find_all('tr')
for row in rows:
    cells = row.find_all('td')
    row_data = {table_headers[position]: cell.get_text().strip()
                for position, cell in enumerate(cells)}

    # Skip rows without <td> cells (e.g. the header row) and rows whose
    # Borough is 'Not assigned'.
    if row_data and row_data.get('Borough', '') != 'Not assigned':
        data.append(row_data)



Load the data into a DataFrame

In [4]:
# Build the table from the scraped row dicts and give the postal-code
# column a name without a space, in a single chained expression.
df = pd.DataFrame(data).rename(columns={"Postal Code": "PostalCode"})
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [5]:
# Display the (rows, columns) tuple of the DataFrame as the cell output.
df.shape

(103, 3)

In [6]:
# Build a lookup table from the coordinates CSV:
#   postal code -> {'longitude': <str>, 'latitude': <str>}
# Values are stored exactly as read from the file (strings); conversion to
# float happens where they are consumed.
coordinate_data = {}
# newline='' is what the csv module documents for file objects handed to its readers.
with open('Geospatial_Coordinates.csv', newline='') as in_file:
    # Named `reader` rather than `data`: the scraping cell stores its rows in
    # `data` and the DataFrame cell reads that name, so rebinding it here
    # would break a re-run of the DataFrame cell.
    reader = csv.DictReader(in_file)
    for row in reader:
        coordinate_data[row['Postal Code']] = {'longitude': row['Longitude'],
                                               'latitude': row['Latitude']}

def get_coordinates(postal_code, coordinates=None):
    """Look up the coordinates stored for a postal code.

    Args:
        postal_code: Key to look up in the coordinate table.
        coordinates: Optional mapping of postal code to a dict with
            'longitude' and 'latitude' entries. When omitted, the
            notebook-level ``coordinate_data`` table is used.

    Returns:
        A ``(longitude, latitude)`` tuple, longitude first. Values are returned
        exactly as stored in the table (no conversion). Both are ``None``
        when the postal code is not in the table.
    """
    # Only fall back to the global when no table was passed; an explicitly
    # passed empty mapping is honoured.
    lookup = coordinate_data if coordinates is None else coordinates
    entry = lookup.get(postal_code, {})
    return entry.get('longitude'), entry.get('latitude')

Get the longitude and latitude for each postal code



In [7]:
# Look up every postal code once. get_coordinates returns
# (longitude, latitude), with None for a code that has no entry.
postal_codes = list(df['PostalCode'])
coordinate_pairs = [get_coordinates(postal_code=code) for code in postal_codes]

# Fail early and name the offending codes, instead of letting float(None)
# raise a TypeError that does not say which postal code was missing.
missing_codes = [code
                 for code, (row_long, row_lat) in zip(postal_codes, coordinate_pairs)
                 if row_long is None or row_lat is None]
if missing_codes:
    raise ValueError(f"No coordinates found for postal codes: {missing_codes}")

# Plain lists of floats, in DataFrame row order; they are also used below
# to compute the centre of the map.
longitude = [float(row_long) for row_long, _ in coordinate_pairs]
latitude = [float(row_lat) for _, row_lat in coordinate_pairs]

df['Latitude'] = latitude
df['Longitude'] = longitude

Show that all the data is in the DataFrame

In [None]:
# Display the DataFrame; after the previous cell it should include the
# Latitude and Longitude columns.
df



Find the average position to center the map

In [None]:
def Average(lst):
    """Return the arithmetic mean of the numbers in ``lst``.

    ``lst`` must support both ``sum()`` and ``len()``. An empty sequence
    raises ``ZeroDivisionError``.
    """
    total = sum(lst)
    return total / len(lst)

# Mean latitude and mean longitude over all postal codes; used as the map centre.
avg_latitude, avg_longitude = Average(latitude), Average(longitude)

In [26]:
# Create the base map centred on the average latitude/longitude computed above,
# with an initial zoom level of 11.
map_clusters = folium.Map(location=[avg_latitude, avg_longitude], zoom_start=11)

Add positions to the map

In [None]:
# Add one circle marker per postal code, with a popup showing the postal code
# and its neighbourhood(s).
# NOTE: the markers are added to the existing map object, so re-running this
# cell without re-creating the map first adds every marker a second time.
marker_columns = zip(df['PostalCode'], df['Latitude'], df['Longitude'], df['Neighborhood'])
for postal_code, lat, lon, neighbour in marker_columns:
    # !s applies str() to each value, matching plain string concatenation.
    label = folium.Popup(f'{postal_code!s} Cluster {neighbour!s}', parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        fill=True,
        fill_opacity=0.7,
    ).add_to(map_clusters)

# Last expression of the cell: renders the map in the notebook.
map_clusters