#### Install pandas and read html

In [1]:
import pandas as pd

url= 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df=pd.read_html(url)

#### Check how many tables are in the website

In [2]:
print(len(df))

3


#### Find out if working data is correct

In [3]:
print(df[0])

    Postal Code           Borough  \
0           M1A      Not assigned   
1           M2A      Not assigned   
2           M3A        North York   
3           M4A        North York   
4           M5A  Downtown Toronto   
..          ...               ...   
175         M5Z      Not assigned   
176         M6Z      Not assigned   
177         M7Z      Not assigned   
178         M8Z         Etobicoke   
179         M9Z      Not assigned   

                                         Neighbourhood  
0                                         Not assigned  
1                                         Not assigned  
2                                            Parkwoods  
3                                     Victoria Village  
4                            Regent Park, Harbourfront  
..                                                 ...  
175                                       Not assigned  
176                                       Not assigned  
177                                       

#### Ensure column title can be read properly

In [4]:
print(df[0]['Postal Code'])

0      M1A
1      M2A
2      M3A
3      M4A
4      M5A
      ... 
175    M5Z
176    M6Z
177    M7Z
178    M8Z
179    M9Z
Name: Postal Code, Length: 180, dtype: object


#### Assign working data frame to its own dataframe variable

In [5]:
df1=df[0]
print(df1)

    Postal Code           Borough  \
0           M1A      Not assigned   
1           M2A      Not assigned   
2           M3A        North York   
3           M4A        North York   
4           M5A  Downtown Toronto   
..          ...               ...   
175         M5Z      Not assigned   
176         M6Z      Not assigned   
177         M7Z      Not assigned   
178         M8Z         Etobicoke   
179         M9Z      Not assigned   

                                         Neighbourhood  
0                                         Not assigned  
1                                         Not assigned  
2                                            Parkwoods  
3                                     Victoria Village  
4                            Regent Park, Harbourfront  
..                                                 ...  
175                                       Not assigned  
176                                       Not assigned  
177                                       

#### Describe current data

In [6]:
df1.describe()

Unnamed: 0,Postal Code,Borough,Neighbourhood
count,180,180,180
unique,180,11,100
top,M6P,Not assigned,Not assigned
freq,1,77,77


#### Removing any borough that is 'Not assigned' and checking it has been removed per step 3

In [7]:
df1.drop(df1.loc[df1['Borough']=='Not assigned'].index,inplace=True)
df1['Borough'].unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

#### Check for any 'Not assigned' values in Neighbourhood column as per step 3

In [8]:
'Not assigned' in df1.Neighbourhood

False

#### Check that there are no duplicate Postal Codes as per step 3

In [9]:
df1.describe()

Unnamed: 0,Postal Code,Borough,Neighbourhood
count,103,103,103
unique,103,10,99
top,M4J,North York,Downsview
freq,1,24,4


In [10]:
df1.set_index('Postal Code')

Unnamed: 0_level_0,Borough,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...
M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
M4Y,Downtown Toronto,Church and Wellesley
M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


 Seeing as there are no unassigned boroughs and Neighborhoods and all postal codes are unique this data can be read. Below will be the output of the shape of the data frame.

In [11]:
df1.shape

(103, 3)

## Start of Part 2
#### Can't use Geocoder so importing data from website

In [12]:
lat_lng_url = 'http://cocl.us/Geospatial_data'
lat_lng_df = pd.read_csv(lat_lng_url)
lat_lng_df.set_index('Postal Code')
lat_lng_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Inner join table based on Postal Code index

In [13]:
# Inner join based on wikipedia Postal Code index

merge_df=df1.merge(lat_lng_df,how='inner')

#### Check if Latitude or Longitude have any null values after merge

In [14]:
merge_df['Latitude'].isnull().any()

False

In [15]:
merge_df['Longitude'].isnull().any()

False

### Final merged dataframe

In [16]:
merge_df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


## Start of Part 3


#### Import libraries for mapping and clustering

In [17]:
# import k-means from clustering stage
from sklearn.cluster import KMeans
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import folium
import numpy as np

#### Place points on map of Toronto

In [18]:
address = 'Toronto, Ontario, Canada'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [19]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(merge_df['Latitude'], merge_df['Longitude'], merge_df['Postal Code'], merge_df['Borough']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Cluster analysis

We will find the number of unique boroughs and make clusters based on that number

In [20]:
merge_df['Borough'].unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [21]:
# set number of clusters for 10 different borough names
kclusters = 10

#Dropping non numeric values
toronto_grouped_clustering = merge_df.drop(['Postal Code','Borough','Neighbourhood'],1)

toronto_grouped_clustering.head()

Unnamed: 0,Latitude,Longitude
0,43.753259,-79.329656
1,43.725882,-79.315572
2,43.65426,-79.360636
3,43.718518,-79.464763
4,43.662301,-79.389494


#### Run K means test and add to dataset 

In [22]:
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 1, 5, 0, 5, 9, 3, 4, 1, 5, 8, 9, 3, 1, 1, 5, 8, 9, 3, 1, 5, 2,
       3, 1, 5, 8, 7, 4, 0, 1, 5, 2, 7, 4, 0, 1, 5, 5, 7, 4, 0, 1, 5, 5,
       1, 4, 0, 1, 5, 0, 6, 7, 4, 0, 1, 8, 2, 6, 1, 4, 0, 8, 8, 2, 6, 7,
       4, 8, 8, 2, 6, 7, 4, 8, 5, 2, 9, 6, 7, 8, 5, 2, 7, 8, 5, 7, 8, 5,
       9, 6, 7, 5, 5, 9, 6, 3, 5, 5, 2, 5, 1, 2, 9])

In [23]:
# add clustering labels to dataset
merge_df.insert(0, 'Cluster Labels', kmeans.labels_)

In [24]:
merge_df

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,1,M3A,North York,Parkwoods,43.753259,-79.329656
1,1,M4A,North York,Victoria Village,43.725882,-79.315572
2,5,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,0,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,5,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...,...
98,2,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,5,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,1,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,2,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


### Final map of Postal Codes

In [25]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(merge_df['Latitude'], merge_df['Longitude'], merge_df['Postal Code'], merge_df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters