# Segmenting and Clustering Neighborhoods in Calgary

### Part 1: The code below will be used to scrape the Wikipedia page and generate the requested Pandas DataFrame

In [1]:
import pandas as pd

In [2]:
# Parse every HTML table on the Wikipedia page with pandas' read_html() and
# keep the one at position 1 of the returned list.

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_T'

wiki_tables = pd.read_html(url)
df = wiki_tables[1]
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,T1A,Medicine Hat,Central Medicine Hat,50.036460,-110.679250
1,T2A,Calgary,"Penbrooke Meadows, Marlborough",51.049680,-113.964320
2,T3A,Calgary,"Dalhousie, Edgemont, Hamptons, Hidden Valley",51.126060,-114.143158
3,T4A,Airdrie,East Airdrie,51.272450,-113.986980
4,T5A,Edmonton,"West Clareview, East Londonderry",53.5899,-113.4413
...,...,...,...,...,...
175,T5Z,Edmonton,West Lake District,53.5966,-113.4882
176,T6Z,Not assigned,Not assigned,Not assigned,Not assigned
177,T7Z,Stony Plain,Not assigned,53.5202,-114.0135
178,T8Z,Not assigned,Not assigned,Not assigned,Not assigned


In [3]:
# Keep only the rows whose Borough value does not contain the text "Not assigned"

borough_unassigned = df['Borough'].str.contains("Not assigned")
df = df[~borough_unassigned]
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,T1A,Medicine Hat,Central Medicine Hat,50.036460,-110.679250
1,T2A,Calgary,"Penbrooke Meadows, Marlborough",51.049680,-113.964320
2,T3A,Calgary,"Dalhousie, Edgemont, Hamptons, Hidden Valley",51.126060,-114.143158
3,T4A,Airdrie,East Airdrie,51.272450,-113.986980
4,T5A,Edmonton,"West Clareview, East Londonderry",53.5899,-113.4413
...,...,...,...,...,...
171,T1Z,Rocky View,Not assigned,Not assigned,Not assigned
172,T2Z,Calgary,"Douglas Glen, McKenzie Lake, Copperfield, East...",50.9023,-113.9873
173,T3Z,Redwood Meadows,Not assigned,50.9821,-114.5178
175,T5Z,Edmonton,West Lake District,53.5966,-113.4882


In [4]:
# Collapse the frame to one row per (Postal Code, Borough) pair; the Neighborhood
# strings of the rows sharing a pair are joined into one comma-separated string

neighborhoods_by_code = df.groupby(['Postal Code', 'Borough'])['Neighborhood']
df = neighborhoods_by_code.apply(','.join).reset_index()
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,T1A,Medicine Hat,Central Medicine Hat
1,T1B,Medicine Hat,South Medicine Hat
2,T1C,Medicine Hat,North Medicine Hat
3,T1G,Taber,Not assigned
4,T1H,Lethbridge,North Lethbridge
...,...,...,...
139,T9N,Bonnyville,Not assigned
140,T9S,Athabasca,Not assigned
141,T9V,Lloydminster,Not assigned
142,T9W,Wainwright,Not assigned


In [5]:
# If a row has a borough but a "Not assigned" neighborhood, the neighborhood becomes the borough.
# Series.mask() substitutes the Borough value wherever the condition is True and
# leaves every other row untouched, so no Python-level loop is needed.

unassigned_neighborhood = df['Neighborhood'] == 'Not assigned'
df['Neighborhood'] = df['Neighborhood'].mask(unassigned_neighborhood, df['Borough'])

df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,T1A,Medicine Hat,Central Medicine Hat
1,T1B,Medicine Hat,South Medicine Hat
2,T1C,Medicine Hat,North Medicine Hat
3,T1G,Taber,Taber
4,T1H,Lethbridge,North Lethbridge
...,...,...,...
139,T9N,Bonnyville,Bonnyville
140,T9S,Athabasca,Athabasca
141,T9V,Lloydminster,Lloydminster
142,T9W,Wainwright,Wainwright


In [6]:
# Report the (rows, columns) dimensions of df
df.shape

(144, 3)

### Part 2: The code below will use the geocoder package (ArcGIS provider) to generate location data for the dataframe:

In [7]:
# Since we are only interested in analyzing the boroughs that contain "Calgary" we can first clean the dataframe
# to reduce the workload required for getting location data

# Selecting only Boroughs that contain Calgary. The explicit .copy() makes calgary_df an
# independent frame, so adding columns to it later does not trigger SettingWithCopyWarning.

calgary_df = df[df['Borough'].str.contains("Calgary")].copy()
calgary_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
15,T1Y,Calgary,"Rundle, Whitehorn, Monterey Park"
17,T2A,Calgary,"Penbrooke Meadows, Marlborough"
18,T2B,Calgary,"Forest Lawn, Dover, Erin Woods"
19,T2C,Calgary,"Lynnwood Ridge, Ogden, Foothills Industrial, G..."
20,T2E,Calgary,"Bridgeland, Greenview, Zoo, YYC"


In [8]:
!pip install geopy
!pip install geocoder
import geocoder # import geocoder


Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 8.0 MB/s  eta 0:00:01
[?25hCollecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [None]:
# Geocode each Calgary postal code with the ArcGIS provider.
# A lookup that returns no coordinates is retried, but at most
# MAX_GEOCODE_ATTEMPTS times, so a persistent failure stops the cell with an
# error instead of looping (and querying the service) forever.
MAX_GEOCODE_ATTEMPTS = 5

latitude=[]
longitude=[]
for code in calgary_df['Postal Code']:
    query = '{}, Calgary, Alberta'.format(code)
    latlng = None
    for attempt in range(MAX_GEOCODE_ATTEMPTS):
        latlng = geocoder.arcgis(query).latlng
        print(code, latlng)
        if latlng is not None:
            break
    if latlng is None:
        raise RuntimeError(
            'No coordinates returned for {} after {} attempts'.format(code, MAX_GEOCODE_ATTEMPTS))
    latitude.append(latlng[0])
    longitude.append(latlng[1])

T1Y [51.08309000000003, -113.95673]
T2A [51.051690000000065, -113.95673]
T2B [51.022080000000074, -113.96029999999996]
T2C [50.97847000000007, -113.97686999999996]
T2E [51.09146000000004, -114.02752999999996]
T2G [51.02934000000005, -114.03046999999998]
T2H [50.98864000000003, -114.04734999999994]
T2J [50.94364000000007, -114.04178999999999]
T2K [51.10047000000003, -114.07255999999995]
T2L [51.107200000000034, -114.12117999999998]
T2M [51.07080000000008, -114.09382]
T2N [51.06493000000006, -114.12185999999997]
T2P [51.050410000000056, -114.07343999999995]
T2R [51.04147000000006, -114.07611999999995]
T2S [51.02094000000005, -114.07627999999994]
T2T [51.02119000000005, -114.09937999999994]
T2V [50.98514000000006, -114.09335999999996]
T2W [50.94625000000008, -114.10564999999997]
T2X [50.88561000000004, -114.08200999999997]
T2Y [50.89241000000004, -114.18064999999996]
T2Z [50.928870000000074, -113.95598999999999]
T3A [51.13254000000006, -114.13615999999996]
T3B [51.09483000000006, -114.273

In [None]:
# Attach the geocoded coordinates. assign() returns a new frame instead of writing
# into calgary_df (itself the result of filtering df), which avoids SettingWithCopyWarning.
calgary_df = calgary_df.assign(Latitude=latitude, Longitude=longitude)
calgary_df.tail()

### Part 3: The code below will be used to explore and cluster the neighborhoods in Calgary

In [None]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

!pip -q install folium
import folium # map rendering library
!pip install lxml
print('Libraries imported.')

In [None]:
!pip install geopy
from geopy.geocoders import Nominatim

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import numpy as np

In [None]:
# Getting the geographical coordinates of Calgary
# NOTE: this rebinds latitude/longitude (earlier the per-postal-code lists) to
# the single pair of city coordinates that the map cell uses as its centre.

address = 'Calgary, AB'

geolocator = Nominatim(user_agent="Cal_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; fail with a clear
# message rather than an AttributeError on the attribute reads below
if location is None:
    raise ValueError('Nominatim could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Calgary City are {}, {}.'.format(latitude, longitude))

In [None]:
# @hidden_cell
# Defining Credentials for Foursquare
#
# The client id and secret are read from environment variables so they are never
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel; a missing variable raises KeyError here.
# NOTE(review): the key pair that used to be hard-coded in this cell has been
# exposed and should be revoked/rotated.
import os

CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20201115' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

# Confirm the values were loaded without echoing them
print('Foursquare credentials loaded.')

***3.1 The code below will be used to Explore the Neighbourhoods in Calgary***

In [None]:
# User defined function to extract and process the venues around every neighbourhood passed in

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query Foursquare's venues/explore endpoint around each neighbourhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Neighbourhood label and the coordinates to search around; the three
        iterables are paired element-wise with zip().
    radius : int, default 500
        Forwarded as the ``radius`` query parameter of the request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns listed in ``columns``
        below. When no venue is returned at all the frame is empty but still
        has those columns.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    explore_url = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and encode the query string instead of formatting
        # the credentials into the URL by hand
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; the timeout keeps a stalled connection from
        # hanging the notebook, raise_for_status() surfaces HTTP errors directly
        response = requests.get(explore_url, params=params, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # flatten straight into one list of row tuples (one tuple per venue)
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue with an empty category list gets None instead of raising IndexError
                categories[0]['name'] if categories else None))

    # passing columns= here (rather than assigning .columns afterwards) also
    # works when rows is empty
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Fetch the venues around every Calgary neighbourhood with getNearbyVenues()

calgary_venues = getNearbyVenues(
    names=calgary_df['Neighborhood'],
    latitudes=calgary_df['Latitude'],
    longitudes=calgary_df['Longitude'],
)

In [None]:
# Print the (rows, columns) dimensions of the venues frame, then preview its first rows

print(calgary_venues.shape)
calgary_venues.head()

In [None]:
# Check how many venues were returned for each neighbourhood
# (count() reports the number of non-null entries per column within each group)

calgary_venues.groupby('Neighborhood').count()

In [None]:
# Analyzing each neighbourhood

# one hot encoding of the venue categories
calgary_onehot = pd.get_dummies(calgary_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category that is itself called "Neighborhood" would collide with the
# label column added below (its dummy column would be overwritten and a
# "move the last column to the front" reorder would move the wrong column),
# so rename it first. rename() is a no-op when no such category exists.
calgary_onehot = calgary_onehot.rename(columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# put the neighborhood label in the first column
calgary_onehot.insert(0, 'Neighborhood', calgary_venues['Neighborhood'])

calgary_onehot.head()

In [None]:
# Average the one-hot columns per neighbourhood: each value is the share of that
# neighbourhood's venues falling in the category

category_frequency = calgary_onehot.groupby('Neighborhood').mean()
calgary_grouped = category_frequency.reset_index()
calgary_grouped

In [None]:
# Helper that ranks the venue categories of a single neighbourhood row

def return_most_common_venues(row, num_top_venues):
    """Return the index labels of the `num_top_venues` largest entries of `row`.

    The first element of `row` is skipped (the caller in this notebook passes
    rows whose first entry is the neighbourhood name). The remaining entries
    are sorted in descending order and the leading labels are returned as an
    array; fewer labels come back when the row has fewer entries.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

In [None]:
# Creating new dataframe to display top 10 venues in each neighbourhood

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues: ranks 1-3 take their ordinal
# suffix from `indicators`, every later rank takes 'th'. An explicit bounds check
# is used instead of a bare `except:`, which would also hide unrelated errors.
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    if ind < len(indicators):
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    else:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe with one row per neighbourhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = calgary_grouped['Neighborhood']

# fill the ranked-venue columns row by row
for ind in np.arange(calgary_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(calgary_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

In [None]:
# Import and run K-means to create 5 clusters


# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

# Keep only the category-frequency columns. `columns=` is used because the
# positional axis form (`drop('Neighborhood', 1)`) is rejected by pandas >= 2.0.
calgary_grouped_clustering = calgary_grouped.drop(columns='Neighborhood')

# run k-means clustering; n_init is spelled out so the number of restarts (and
# therefore the labels) does not depend on the scikit-learn version's default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(calgary_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

In [None]:
# Creating a new dataframe that contains the labels as well as the top 10 venues for each neighbourhood in Calgary

# add clustering labels; a column left over from a previous run of this cell is
# dropped first, because DataFrame.insert() raises ValueError if it already exists
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# join the cluster label and ranked venues onto calgary_df (which carries the
# latitude/longitude of each postal code) by neighborhood name
calgary_merged = calgary_df.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# dropna() removes rows containing any missing value (e.g. neighbourhoods with
# no match in neighborhoods_venues_sorted); the labels are then cast to integers
calgary_merged = calgary_merged.dropna()
calgary_merged = calgary_merged.astype({'Cluster Labels': 'int32'})



In [None]:
# Display the merged frame (coordinates, cluster label and ranked venues per row)
calgary_merged

In [None]:
# Visualizing the Resulting Clusters

# create map centred on the Calgary coordinates
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=8)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(calgary_merged['Latitude'], calgary_merged['Longitude'], calgary_merged['Neighborhood'], calgary_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster labels start at 0, so they index the palette directly
    # (cluster-1 would send label 0 to the last colour via negative indexing)
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

***After visualizing, each cluster will also be examined to determine the discriminating venue categories that distinguish it***

***Cluster 1***

In [None]:
# Rows with K-means label 0: the column at position 2 plus every column from position 6 onward
cluster_columns = [2] + list(range(6, calgary_merged.shape[1]))
calgary_merged.loc[calgary_merged['Cluster Labels'] == 0, calgary_merged.columns[cluster_columns]]

***Cluster 2***

In [None]:
# Rows with K-means label 1: the column at position 2 plus every column from position 6 onward
cluster_columns = [2] + list(range(6, calgary_merged.shape[1]))
calgary_merged.loc[calgary_merged['Cluster Labels'] == 1, calgary_merged.columns[cluster_columns]]

***Cluster 3***

In [None]:
# Rows with K-means label 2: the column at position 2 plus every column from position 6 onward
cluster_columns = [2] + list(range(6, calgary_merged.shape[1]))
calgary_merged.loc[calgary_merged['Cluster Labels'] == 2, calgary_merged.columns[cluster_columns]]

***Cluster 4***

In [None]:
# Rows with K-means label 3: the column at position 2 plus every column from position 6 onward
cluster_columns = [2] + list(range(6, calgary_merged.shape[1]))
calgary_merged.loc[calgary_merged['Cluster Labels'] == 3, calgary_merged.columns[cluster_columns]]

***Cluster 5***

In [None]:
# Rows with K-means label 4: the column at position 2 plus every column from position 6 onward
cluster_columns = [2] + list(range(6, calgary_merged.shape[1]))
calgary_merged.loc[calgary_merged['Cluster Labels'] == 4, calgary_merged.columns[cluster_columns]]