Let's install (if needed) and import the required dependencies

In [1]:
import numpy as np
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize
import pandas as pd
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium
import requests
#!pip install geocoder

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



**Question 1**

We fetch the Wikipedia page with an HTTP GET request using the `requests` library, parse the HTML with BeautifulSoup, and pick out the first table on the page

In [2]:
# Source page: Wikipedia's list of Canadian postal codes beginning with "M".
data_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors here instead of letting an error
# page be parsed as if it were the article.
response = requests.get(data_url, timeout=30)
response.raise_for_status()
rdata = response.text

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed, so results can differ between environments.
soup = BeautifulSoup(rdata, 'html.parser')

# The postal-code grid is the first <table> on the page.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found at {}'.format(data_url))

We skip the cells where the borough is not assigned and apply a series of text-formatting steps so that each postal code becomes one row holding its postal code, its borough, and its neighborhoods (separated by commas)

In [3]:
# Build one record per assigned postal code from the <td> cells of `table`.
table_contents = []

for td in table.find_all('td'):
    # Cells whose first <p><span> reads 'Not assigned' carry no borough: skip.
    if td.p.span.text == 'Not assigned':
        continue

    # The span text has the form '<borough>(<neighborhoods>)'; split it once
    # and reuse the pieces for both columns.
    span_parts = td.span.text.split('(')
    neighborhood = (
        span_parts[1]
        .strip(')')
        .replace(' /', ',')   # ' /' separators become commas
        .replace(')', ' ')    # stray inner ')' become spaces
        .strip(' ')
    )

    table_contents.append({
        'PostalCode': td.p.text[:3],   # first three characters of the cell text
        'Borough': span_parts[0],
        'Neighborhood': neighborhood,
    })

df = pd.DataFrame(table_contents)

# A few borough strings come out fused with trailing address text; map those
# exact values to readable borough names.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
df['Borough'] = df['Borough'].replace(borough_fixes)

Our dataframe is ready

In [4]:
# Preview up to the first 12 rows of df.
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [6]:
# Dimensions of df as (rows, columns).
df.shape

(103, 3)

**Question 2 & 3**

In [7]:
import geocoder
def get_latlng(postal_code, max_attempts=10):
    """Geocode a Toronto postal code with the ArcGIS provider of ``geocoder``.

    The lookup is retried because a call may come back with ``latlng`` set to
    ``None``. Retries are capped so the notebook cannot hang forever on a
    postal code that never resolves.

    Parameters
    ----------
    postal_code : str
        Postal code to look up; it is inserted into the query
        ``'<postal_code>, Toronto, Ontario'``.
    max_attempts : int, optional
        Maximum number of geocoder calls before giving up (default 10).

    Returns
    -------
    The ``latlng`` value of the first response for which it is not ``None``.

    Raises
    ------
    RuntimeError
        If no coordinates were obtained within ``max_attempts`` calls.
    """
    query = '{}, Toronto, Ontario'.format(postal_code)
    for _ in range(max_attempts):
        lat_lng_coords = geocoder.arcgis(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
    raise RuntimeError(
        'No coordinates found for {!r} after {} attempts'.format(query, max_attempts)
    )

In [8]:
def get_lat(latlng):
    """Return the first element (latitude) of a [lat, lng] pair."""
    latitude = latlng[0]
    return latitude


def get_lng(latlng):
    """Return the second element (longitude) of a [lat, lng] pair."""
    longitude = latlng[1]
    return longitude


# Geocode every postal code, keeping the raw pairs in a temporary column.
lat_lng_pairs = df['PostalCode'].apply(get_latlng)
df['Lat_Lng'] = lat_lng_pairs

# Split each pair into its own numeric column, then discard the temporary one.
df["Lat"] = df['Lat_Lng'].apply(get_lat)
df['Lng'] = df['Lat_Lng'].apply(get_lng)
df = df.drop(columns=['Lat_Lng'])

# Plain-text summary: dimensions first, then the leading rows.
print(df.shape)
print(df.head())

(103, 5)
  PostalCode           Borough                      Neighborhood       Lat  \
0        M3A        North York                         Parkwoods  43.75245   
1        M4A        North York                  Victoria Village  43.73057   
2        M5A  Downtown Toronto         Regent Park, Harbourfront  43.65512   
3        M6A        North York  Lawrence Manor, Lawrence Heights  43.72327   
4        M7A      Queen's Park     Ontario Provincial Government  43.66253   

        Lng  
0 -79.32991  
1 -79.31306  
2 -79.36264  
3 -79.45042  
4 -79.39188  


In [9]:
# Base map centred on Toronto (latitude 43.65, longitude -79.4).
toronto_map = folium.Map(location=[43.65, -79.4], zoom_start=12)

# One fill colour per cluster. The list length also sets the number of
# clusters, so the two cannot drift apart. It is deliberately NOT called
# `colors`: that name is bound to the `matplotlib.colors` module by the
# imports cell and rebinding it would hide the module from later cells.
cluster_colors = ['red', 'green', 'blue', 'yellow']

# Two-column array with one [Lat, Lng] row per postal code.
coordinates = np.stack((df['Lat'], df['Lng']), axis=1)

# Group postal codes purely by geographic proximity (latitude/longitude).
# random_state fixes the initialisation; n_init is given explicitly because
# its default differs between scikit-learn releases.
kmeans = KMeans(
    n_clusters=len(cluster_colors),
    n_init=10,
    random_state=0,
).fit(coordinates)

clusters = kmeans.labels_
df['Cluster'] = clusters

# Draw one circle per postal code, filled with its cluster's colour.
for latitude, longitude, borough, neighborhood, cluster in zip(
        df['Lat'], df['Lng'], df['Borough'], df['Neighborhood'], df['Cluster']):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color=cluster_colors[cluster],
        fill_opacity=0.7).add_to(toronto_map)

# Last expression of the cell: renders the map.
toronto_map

In [10]:
# Derive a frame without the cluster labels; `df` keeps its 'Cluster' column.
new_df = df.drop(columns=['Cluster'])
new_df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Lat,Lng
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Queen's Park,Ontario Provincial Government,43.66253,-79.39188
5,M9A,Etobicoke,Islington Avenue,43.66263,-79.52831
6,M1B,Scarborough,"Malvern, Rouge",43.81139,-79.19662
7,M3B,North York,Don Mills North,43.74923,-79.36186
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.70718,-79.31192
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804


In [11]:
# Dimensions of new_df as (rows, columns).
new_df.shape

(103, 5)