In [None]:
!pip3 install beautifulsoup4
!pip3 install lxml
!pip3 install geocoder
!conda install -c conda-forge folium=0.5.0 --yes 

### After installing necessary tools, we download all the dependencies that we will need.

In [1]:
import urllib.request, urllib.parse, urllib.error
import pandas as pd
from bs4 import BeautifulSoup
import geocoder
import requests
import numpy as np
from geopy.geocoders import Nominatim

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium 

### In this section we will extract data from the Wikipedia link and we will make Beautiful Soup object in order to parse HTML file.

In [2]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page=urllib.request.urlopen(url) 
soup=BeautifulSoup(page,'html.parser')

In [None]:
#print(soup.prettify())

### We will find all tables from soup object and then we will save the favorite table in right_table.

In [3]:
all_tables=soup.find_all('table')
#all_tables

In [4]:
right_table=all_tables[0]
#right_table

## Now the informtion that we need will be extracted from righte_tabel and the cells with a borough that is Not assigned will be ignored. Also If a cell has a borough but a Not assigned  neighborhood, then the neighborhood will be the same as the borough.

In [5]:
table_contents=[]

for row in right_table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

# print(table_contents)
df=pd.DataFrame(table_contents)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})
    

In [6]:
df.shape

(103, 3)

### Now we have all the neighborhoods in Pandas data frame called df. From PostalCode Column in the data frame, we will find the latitude and longitude of neighborhoods and we will save them in the data frame called df_ll. Then the two data frames (df and df_ll) will be concatenated to form df_t.

In [7]:
lat=[]
long=[]
for code in df['PostalCode']:
    g = geocoder.arcgis('{}, Toronto, Ontario'.format(code))
    while (g.latlng is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(code))
    latlng = g.latlng
    lat.append(latlng[0])
    long.append(latlng[1])

In [8]:
df_ll=pd.DataFrame([lat,long],['latitude','longitude']).T
#df_ll

In [9]:
df_t=pd.concat([df,df_ll],axis=1)
df_t

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Queen's Park,Ontario Provincial Government,43.66253,-79.39188
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.65319,-79.51113
99,M4Y,Downtown Toronto,Church and Wellesley,43.66659,-79.38133
100,M7Y,East Toronto Business,Enclave of M4L,43.64869,-79.38544
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.63278,-79.48945


### We will save only boroughs that contain the word Toronto in the new data frame.

In [10]:
toronto_data =df_t[df_t['Borough'].str.match('.*Toronto')==True].reset_index(drop=True)
toronto_data

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804
2,M5C,Downtown Toronto,St. James Town,43.65215,-79.37587
3,M4E,East Toronto,The Beaches,43.67709,-79.29547
4,M5E,Downtown Toronto,Berczy Park,43.64536,-79.37306
5,M5G,Downtown Toronto,Central Bay Street,43.65609,-79.38493
6,M6G,Downtown Toronto,Christie,43.66869,-79.42071
7,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.6497,-79.38258
8,M6H,West Toronto,"Dufferin, Dovercourt Village",43.66505,-79.43891
9,M4J,East York/East Toronto,The Danforth East,43.68811,-79.33418


### Define Foursquare Credentials and Version

In [46]:
# @hidden_cell
CLIENT_ID = 'B120YYF20FFDHBITQQAWNKY50YB4S5O2G0GIXI4TEQJBYNQ3' 
CLIENT_SECRET = 'EYMLJ3JZH00XVRUJCGDI0VYBXEKXXOU4EXCR2WWKC24UATZM' 
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value


### Let's create a function to explore the venues to all the neighborhoods in Toronto.

In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    n=0
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(n,name)
        n=n+1    
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
             name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [13]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                   latitudes=toronto_data['latitude'],
                                   longitudes=toronto_data['longitude']
                                  )

0 Regent Park, Harbourfront
1 Garden District, Ryerson
2 St. James Town
3 The Beaches
4 Berczy Park
5 Central Bay Street
6 Christie
7 Richmond, Adelaide, King
8 Dufferin, Dovercourt Village
9 The Danforth  East
10 Harbourfront East, Union Station, Toronto Islands
11 Little Portugal, Trinity
12 The Danforth West, Riverdale
13 Toronto Dominion Centre, Design Exchange
14 Brockton, Parkdale Village, Exhibition Place
15 India Bazaar, The Beaches West
16 Commerce Court, Victoria Hotel
17 Studio District
18 Lawrence Park
19 Roselawn
20 Davisville North
21 Forest Hill North & West
22 High Park, The Junction South
23 North Toronto West
24 The Annex, North Midtown, Yorkville
25 Parkdale, Roncesvalles
26 Davisville
27 University of Toronto, Harbord
28 Runnymede, Swansea
29 Moore Park, Summerhill East
30 Kensington Market, Chinatown, Grange Park
31 Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
32 CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South

In [14]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65512,-79.36264,Tandem Coffee,43.653559,-79.361809,Coffee Shop
1,"Regent Park, Harbourfront",43.65512,-79.36264,Roselle Desserts,43.653447,-79.362017,Bakery
2,"Regent Park, Harbourfront",43.65512,-79.36264,Souvlaki Express,43.655584,-79.364438,Greek Restaurant
3,"Regent Park, Harbourfront",43.65512,-79.36264,Berkeley Church,43.655123,-79.365873,Event Space
4,"Regent Park, Harbourfront",43.65512,-79.36264,Figs Breakfast & Lunch,43.655675,-79.364503,Breakfast Spot


In [15]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,American Restaurant,Antique Shop,Aquarium,Art Gallery,Arts & Crafts Store,Asian Restaurant,...,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category.

In [42]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,American Restaurant,Antique Shop,Aquarium,Art Gallery,Arts & Crafts Store,...,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.021277,0.0,0.021277,0.0,...,0.0,0.0,0.0,0.0,0.042553,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.012346,0.012346,0.0,0.0,0.0,0.0,0.0,0.0,0.024691,...,0.0,0.012346,0.0,0.0,0.024691,0.0,0.012346,0.0,0.0,0.0
2,"CN Tower, King and Spadina, Railway Lands, Har...",0.013514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.013514,0.0,0.0,0.013514,0.0,0.0,0.0
3,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.018182,0.018182,0.018182,0.0,0.0
4,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Church and Wellesley,0.0,0.0,0.014706,0.014706,0.014706,0.0,0.0,0.0,0.0,...,0.014706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014706,0.0
6,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0
7,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Davisville North,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Dufferin, Dovercourt Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Run k-means to cluster the neighborhood into 5 clusters.

In [43]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int32)

### Let's create a new dataframe that includes the cluster as well.

In [44]:
toronto_grouped.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged=toronto_data
toronto_merged=toronto_merged.join(toronto_grouped.set_index('Neighborhood'), on='Neighborhood').dropna()
toronto_merged['Cluster Labels']=toronto_merged['Cluster Labels'].astype('int32')
toronto_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude,Cluster Labels,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,...,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.01,0.0,0.0
2,M5C,Downtown Toronto,St. James Town,43.65215,-79.37587,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.012658,0.0,0.0,0.012658,0.0,0.0
3,M4E,East Toronto,The Beaches,43.67709,-79.29547,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M5E,Downtown Toronto,Berczy Park,43.64536,-79.37306,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.042553,0.0,0.0,0.0,0.0,0.0
5,M5G,Downtown Toronto,Central Bay Street,43.65609,-79.38493,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.018182,0.018182,0.018182,0.0,0.0
6,M6G,Downtown Toronto,Christie,43.66869,-79.42071,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.6497,-79.38258,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0
8,M6H,West Toronto,"Dufferin, Dovercourt Village",43.66505,-79.43891,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,M4J,East York/East Toronto,The Danforth East,43.68811,-79.33418,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Finally, let's visualize the resulting clusters.

In [45]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['latitude'], toronto_merged['longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters