In [64]:
# Tutorial on how to scrap data from Wikipedia here: https://simpleanalytical.com/how-to-web-scrape-wikipedia-python-urllib-beautiful-soup-pandas

# Import the libraries

import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
print('Done')

Done


In [65]:
# Get the webpage where to extract the data

URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = urllib.request.urlopen(URL)

In [66]:
# Scrap the data

soup = BeautifulSoup(page, "lxml")
all_tables=soup.find_all("table")
right_table=soup.find('table', class_='wikitable sortable')


# Analyse the source code and classsify the data in columns

A=[]
B=[]
C=[]

for row in right_table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

In [67]:
# Transfrom the data into a readable dataframe with pandas

df=pd.DataFrame(A,columns=['Postal_Code'])
df['Borough']=B
df['Neighborhood']=C
df.head()

Unnamed: 0,Postal_Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [68]:
# drop rows if borough is not assigned

df1=df[df['Neighborhood'].str.strip().astype(bool)]
df1.head()

Unnamed: 0,Postal_Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [69]:
df1.astype('str')
df1.dtypes

Postal_Code     object
Borough         object
Neighborhood    object
dtype: object

In [70]:
df1.shape

(103, 3)

In [71]:
# Importing and reading CSV file with geographical coordinates

url2='http://cocl.us/Geospatial_data'
df2=pd.read_csv(url2)
df2.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [72]:
# Rename df2 column "Postal Code" to have the same key: "Postal_Code" than df1

df2.rename(columns={'Postal Code':'Postal_Code'},inplace=True)
df2.head()

Unnamed: 0,Postal_Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [73]:
# Merge the 2 dataframes 

df3=pd.concat([df1,df2],sort=True, ignore_index=True,axis=1, verify_integrity=True)
df3.head()

Unnamed: 0,0,1,2,3,4,5
0,,,,M1B,43.806686,-79.194353
1,,,,M1C,43.784535,-79.160497
2,M3A,North York,Parkwoods,M1E,43.763573,-79.188711
3,M4A,North York,Victoria Village,M1G,43.770992,-79.216917
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",M1H,43.773136,-79.239476


In [74]:
# Clean the dataframe from NaN Values and undesirable columns

df3.dropna(inplace=True)
df3.columns=["Postal_Code","Borough","Neighborhood","ToErase","Longitude","Latitude"]
df3.drop("ToErase",axis=1, inplace=True)
df3.head()

Unnamed: 0,Postal_Code,Borough,Neighborhood,Longitude,Latitude
2,M3A,North York,Parkwoods,43.763573,-79.188711
3,M4A,North York,Victoria Village,43.770992,-79.216917
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.773136,-79.239476
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.744734,-79.239476
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.727929,-79.262029


In [75]:
df3.dtypes

Postal_Code      object
Borough          object
Neighborhood     object
Longitude       float64
Latitude        float64
dtype: object

In [76]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df3['Borough'].unique()),
        df3.shape[0]
    )
)

The dataframe has 9 boroughs and 68 neighborhoods.


In [77]:
# Import some libraries to cluster and display maps

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
print("Matplot done")

# import k-means from clustering stage
from sklearn.cluster import KMeans
print("Sklearn done")

# !conda install -c conda-forge geopy --yes # uncomment if not installed
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
print ("Geopy done")


# !conda install -c conda-forge folium=0.5.0 --yes #uncomment if not installed
import folium # map rendering library
print("Folium done")


print('All Libraries imported.')

Matplot done
Sklearn done
Geopy done
Folium done
All Libraries imported.


In [78]:
# Define the area

address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [79]:
# create map of Toronto using latitude and longitude values

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df3['Latitude'], df3['Longitude'], df3['Borough'], df3['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [91]:
# Segment and cluster only the neighborhoods in Downtown Toronto

dttoronto_data=df3[df3["Borough"] == "Downtown Toronto"].reset_index(drop=True)
dttoronto_data

Unnamed: 0,Postal_Code,Borough,Neighborhood,Longitude,Latitude
