#### Importing Libraries

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

### Scraping data using Beautiful Soup

In [3]:
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(URL).text
soup = BeautifulSoup(page, 'xml')

In [4]:
tb=soup.find('table')

#### Transforming the data into a pandas dataframe

In [5]:
col_names=['Postal Code','Borough','Neighborhood']
neigh_df = pd.DataFrame(columns = col_names)

In [6]:
for r in tb.find_all('tr'):
    row_data=[]
    for c in r.find_all('td'):
        row_data.append(c.text.strip())
    if len(row_data)==3:
        neigh_df.loc[len(neigh_df)] = row_data

In [7]:
neigh_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Droping rows in which Borough that is 'Not assigned'.

In [8]:
dt = neigh_df[neigh_df['Borough'] =='Not assigned'].index
neigh_df.drop(dt,inplace=True)

#### Assigning Neighborhood same as Borough (for Neighborhood that is 'Not assigned')

In [9]:
neigh_df.loc[ neigh_df['Neighborhood'] == 'Not assigned', 'Neighborhood'] = neigh_df['Borough']

In [10]:
neigh_df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [11]:
neigh_df.shape

(103, 3)

#### Accessing geographical coordinates of the neighborhood

In [12]:
geo_df=pd.read_csv('http://cocl.us/Geospatial_data')

In [13]:
geo_df.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


#### Merging both the dataframes

In [14]:
neigh_df= pd.merge(neigh_df, geo_df, on='Postal Code')

In [15]:
neigh_df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


#### Use geopy library to get the latitude and longitude values of Toronto,ON.


In [16]:
# convert an address into latitude and longitude values
from geopy.geocoders import Nominatim 

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes 
# map rendering library
import folium 

print('Libraries imported.')

Libraries imported.


In [17]:
address = 'Toronto,ON'

geolocator = Nominatim(user_agent="Toronto")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


#### Create a map of Toronto,ON with neighborhoods superimposed on top.

In [18]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neigh_df['Latitude'], neigh_df['Longitude'], neigh_df['Borough'], neigh_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='cyan',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [19]:
CLIENT_ID = '3VXZ3STQBXO3JC4LNHGE0NPIAY52YWM5SLR4LHXWGSCV14HO' # your Foursquare ID
CLIENT_SECRET = '3VVOI21U0FK5C5MCRC0BMMEWATJ30ZIV32A5NRHCBWPRY5CW' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 3VXZ3STQBXO3JC4LNHGE0NPIAY52YWM5SLR4LHXWGSCV14HO
CLIENT_SECRET:3VVOI21U0FK5C5MCRC0BMMEWATJ30ZIV32A5NRHCBWPRY5CW


In [20]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            100)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood','Neighborhood Latitude','Neighborhood Longitude','Venue','Venue Latitude','Venue Longitude','Venue Category']
    
    return(nearby_venues)

In [21]:
toronto_venues = getNearbyVenues(names=neigh_df['Neighborhood'],latitudes=neigh_df['Latitude'],longitudes=neigh_df['Longitude'])

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmo

In [22]:
toronto_venues.head(10)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
5,Victoria Village,43.725882,-79.315572,The Frig,43.727051,-79.317418,French Restaurant
6,Victoria Village,43.725882,-79.315572,Pizza Nova,43.725824,-79.31286,Pizza Place
7,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
8,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
9,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center


####  Analyzing Each Neighborhood

In [23]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Agincourt,0.000000,0.000000,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.0000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000
1,"Alderwood, Long Branch",0.000000,0.000000,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.0000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000
2,"Bathurst Manor, Wilson Heights, Downsview North",0.000000,0.000000,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.0000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000
3,Bayview Village,0.000000,0.000000,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.0000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000
4,"Bedford Park, Lawrence Manor East",0.000000,0.000000,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.0000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000
5,Berczy Park,0.000000,0.000000,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.0000,...,0.00,0.017241,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000
6,"Birch Cliff, Cliffside West",0.000000,0.000000,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.0000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000
7,"Brockton, Parkdale Village, Exhibition Place",0.000000,0.000000,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.0000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000
8,"Business reply mail Processing Centre, South C...",0.055556,0.000000,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.0000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000
9,"CN Tower, King and Spadina, Railway Lands, Har...",0.000000,0.000000,0.000000,0.062500,0.0625,0.0625,0.125,0.1875,0.0625,...,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000


In [25]:
len(toronto_grouped[toronto_grouped["Gym"] > 0])

20

In [26]:
toronto_grouped.shape

(95, 268)

In [27]:
toronto_gym = toronto_grouped[["Neighborhood","Gym"]]
toronto_gym.shape

(95, 2)

### Cluster Neighborhoods

In [28]:
kclusters = 4
toronto_grouped_clustering = toronto_gym.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 3, 0, 0, 0, 0, 0, 2, 0, 0])

In [29]:
gym_merged= toronto_gym.copy()

In [30]:
# add clustering labels
gym_merged.insert(0, 'Cluster Labels', kmeans.labels_)

In [31]:
gym_merged.head()

Unnamed: 0,Cluster Labels,Neighborhood,Gym
0,0,Agincourt,0.0
1,3,"Alderwood, Long Branch",0.166667
2,0,"Bathurst Manor, Wilson Heights, Downsview North",0.0
3,0,Bayview Village,0.0
4,0,"Bedford Park, Lawrence Manor East",0.0


In [32]:
gym_merged = gym_merged.join(neigh_df.set_index("Neighborhood"), on="Neighborhood")
gym_merged=gym_merged.drop(columns=['Postal Code','Borough'])
print(gym_merged.shape)
gym_merged.head(10) 

(99, 5)


Unnamed: 0,Cluster Labels,Neighborhood,Gym,Latitude,Longitude
0,0,Agincourt,0.0,43.7942,-79.262029
1,3,"Alderwood, Long Branch",0.166667,43.602414,-79.543484
2,0,"Bathurst Manor, Wilson Heights, Downsview North",0.0,43.754328,-79.442259
3,0,Bayview Village,0.0,43.786947,-79.385975
4,0,"Bedford Park, Lawrence Manor East",0.0,43.733283,-79.41975
5,0,Berczy Park,0.017241,43.644771,-79.373306
6,0,"Birch Cliff, Cliffside West",0.0,43.692657,-79.264848
7,2,"Brockton, Parkdale Village, Exhibition Place",0.045455,43.636847,-79.428191
8,0,"Business reply mail Processing Centre, South C...",0.0,43.662744,-79.321558
9,0,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,43.628947,-79.39442


#### Visualizing the resulting clusters using map

In [33]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(gym_merged['Latitude'], gym_merged['Longitude'], gym_merged['Neighborhood'], gym_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Examining Clusters

#### Cluster 1

In [34]:
gym_merged.loc[gym_merged['Cluster Labels'] == 0].shape

(82, 5)

#### Cluster 2

In [38]:
gym_merged.loc[gym_merged['Cluster Labels'] == 1].shape

(1, 5)

#### Cluster 3

In [39]:
gym_merged.loc[gym_merged['Cluster Labels'] == 2].shape

(13, 5)

#### Cluster 4

In [40]:
gym_merged.loc[gym_merged['Cluster Labels'] == 3].shape

(3, 5)

## Results
#### The results from the k-means clustering show that we can categorize the neighborhoods into 4 clusters based on the frequency of occurrence for “Gym”: The information is important as we can see that the highest number of Gyms are in Neighborhood in Cluster 1, while Cluster 4 has less number Gym in the neighborhood. The second greatest number of Gyms are formed around Cluster 3 and the least number of Gyms were found in Cluster 2 with only 1 in that neighborhood. The results of the clustering are visualized in the map below with cluster 0 in red, cluster 1 in purple color, and cluster 3 in blue and cluster 4 in green color respectively.