 # Final assignment: Applied Data Science capstone
 ## 'The Battle of the Neighbourhoods' - B Kim

## Part 1 of assignment: clustering districts according to venues

In [1]:
# Import all libraries needed for this Notebook.
import numpy as np
import pandas as pd 
import json 
import requests
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import folium
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
import math
print('Done importing libraries.')

Done importing libraries.


### There are two datasets we need to bring from the repository offered by Gobierno de Canarias:
#### 1. List of municipalities in Gran Canaria.
#### 2. List of districts for a given municipality in Gran Canaria, with some basic properties.
#### The resulting dataframe from the above will the core dataframe that will be used throughout the current work.

In [2]:
# This is the first dataset we'll download from Gobierno de Canarias.
# The table lists all municipilaties within Canary Islands using the url.
municipalityCanaries = pd.read_csv('https://datos.canarias.es/catalogos/estadisticas/dataset/6dd8baf4-14f4-43a3-88b2-984d034c965c/resource/74237bc8-1e5f-43f1-9f6b-24ef9e073903/download/municipios_20170101.csv')

# Filter to only obtain Gran Canara, i.e. 'cd_isla' needs to have a value of 'ES705'.
municipalityGC = municipalityCanaries.loc[municipalityCanaries['gcd_isla'] == 'ES705'].reset_index(drop=True)
print('There are {} municipalities listed for Gran Canaria.'.format(len(municipalityGC)))

There are 21 municipalities listed for Gran Canaria.


In [3]:
# We are only interested in keeping columns 'geocode' and 'etiqueta' from this table.
# 'Geocode' contains the municipality identifier code and 'etiqueta' is the label, i.e. the name of the municipality.
# Let's rename the columns and display the first rows.
municipalityGC = municipalityGC[['geocode', 'etiqueta']]
municipalityGC = municipalityGC.rename(columns={'geocode':'municipalityId', 'etiqueta':'municipalityName'})
municipalityGC.head()

Unnamed: 0,municipalityId,municipalityName
0,35001,Agaete
1,35002,Agüimes
2,35005,Artenara
3,35006,Arucas
4,35008,Firgas


In [4]:
# This table lists all districts and administrative sections for Canary Islands.
districtCanaries = pd.read_csv('https://datos.canarias.es/catalogos/estadisticas/dataset/1f8a16e1-11dc-4bb1-8d6b-3ce1c3328b48/resource/0eed83b9-8705-457c-8d49-d945f55792b7/download/distritos_20170101.csv')
districtCanaries

# Again, we need to set a filter so that we only get those districts within Gran Canaria.
districtGC = districtCanaries.loc[districtCanaries['gcd_isla'] == 'ES705'].reset_index(drop=True)
print('There are {} districts listed for Gran Canaria.'.format(len(districtGC)))

There are 53 districts listed for Gran Canaria.


In [5]:
# We are only interested in columns 'etiqueta', 'gcd_municipio', 'longitud' and 'latitud'.
# 'etiqueta' is the label for the district. Unfortunately, the actual name of the district is not given.
# 'gcd_municipio' is equivalent to 'municipalityId' we saw earlier.
# 'longitud' and 'latitud' are the geo-coordinates (longitude & latitude) for each district.
# Let's rename the columns too.
districtGC = districtGC[['gcd_municipio', 'etiqueta', 'longitud', 'latitud', 'superficie']]
districtGC = districtGC.rename(columns={'etiqueta': 'districtLabel', 'gcd_municipio':'municipalityId', 
                                        'longitud':'longitude', 'latitud':'latitude', 'superficie': 'surface'})
districtGC.head()

Unnamed: 0,municipalityId,districtLabel,longitude,latitude,surface
0,35001,Distrito 01 - Agaete,-15.689645,28.073135,4452.5952
1,35002,Distrito 01 - Agüimes,-15.453004,27.897893,7877.593
2,35005,Distrito 01 - Artenara,-15.682969,28.018081,6641.6517
3,35006,Distrito 01 - Arucas,-15.520911,28.119561,228.0795
4,35006,Distrito 02 - Arucas,-15.540187,28.115284,332.6582


In [6]:
# Merge the two dataframes so that column 'municipalityId' on 'districtGC' ....
geolocGC = pd.merge(municipalityGC, districtGC, on='municipalityId')
geolocGC.head()

Unnamed: 0,municipalityId,municipalityName,districtLabel,longitude,latitude,surface
0,35001,Agaete,Distrito 01 - Agaete,-15.689645,28.073135,4452.5952
1,35002,Agüimes,Distrito 01 - Agüimes,-15.453004,27.897893,7877.593
2,35005,Artenara,Distrito 01 - Artenara,-15.682969,28.018081,6641.6517
3,35006,Arucas,Distrito 01 - Arucas,-15.520911,28.119561,228.0795
4,35006,Arucas,Distrito 02 - Arucas,-15.540187,28.115284,332.6582


In [7]:
print('There are {} districts listed for the municipality of Las Palmas.'
      .format(len(geolocGC.loc[geolocGC['municipalityName'] == 'Las Palmas de Gran Canaria'])))

There are 5 districts listed for the municipality of Las Palmas.


In [8]:
# Let's illustrate the information on 'geolocGC' on a map using Folium.
# Markers are added for each of the districts at the respective location.
map_gc = folium.Map(location=[27.95, -15.6], tiles = 'Stamen Terrain', zoom_start=10)
for latitude, longitude, district in zip(geolocGC['latitude'], geolocGC['longitude'], geolocGC['districtLabel']):
    label = '{}'.format(district)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([latitude, longitude], radius=5, popup=label, color='blue', fill=True, fill_color='#3186cc',
                        fill_opacity=0.7, parse_html=False).add_to(map_gc)  
map_gc

# ---------------------------------------------------------------------------------------

### Obtaining venues within districts using Foursquare API

In [1]:
# Credentials required for placing API calls on Foursquare data.
# !! They will be deleted when uploading the Notebook !!
CLIENT_ID = '----'
CLIENT_SECRET = '----'
ACCESS_TOKEN = '---'
VERSION = '20180604'
LIMIT = 100

In [33]:
# Define function that will:
# (i) make an API call to Foursqaure for each of the districts' coordinates within a 5 km radius,
# (ii) desearialise JSON data,
# (iii) tabulate using Pandas.
def getNearbyVenues(names, latitudes, longitudes, radii):
    
    venues_list=[]
    for name, lat, lng, rad in zip(names, latitudes, longitudes, radii):            
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, rad, LIMIT)
        
        if requests.get(url).json()['response']['groups'][0]['items']:
            results = requests.get(url).json()['response']['groups'][0]['items']        
        else:
            pass
        venues_list.append([(name, lat, lng, v['venue']['name'], v['venue']['location']['lat'], 
            v['venue']['location']['lng'], v['venue']['categories'][0]['name']) for v in results])
    
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['district', 'districtLatitude', 'districtLongitude', 
                             'venue', 'venueLatitude', 'venueLongitude', 'venueCategory']
    
    return(nearby_venues)

In [37]:
# Before running function 'getNearbyVenues' the dataframe 'venuesGC' needs some further manipulation.
# The API call requires a radius value. Given that districts are different in size, an arbitrary value will cause
# inaccuracies: smaller neighbouring districts will have an overlap, whereas larger districts may not be completely
# covered. Given that we have the surface area of each district, we can estimate the shape of the circle.
# The radius we extract (equivRadius, units meters) can be used for the API call.
geolocGC['equivRadius'] = (((geolocGC['surface']/math.pi)**0.5)*1000).round(decimals=0)
geolocGC['equivRadius'] = geolocGC['equivRadius'].astype('int64')
geolocGC.head()

Unnamed: 0,municipalityId,municipalityName,districtLabel,longitude,latitude,surface,equivRadius
0,35001,Agaete,Distrito 01 - Agaete,-15.689645,28.073135,4452.5952,37647
1,35002,Agüimes,Distrito 01 - Agüimes,-15.453004,27.897893,7877.593,50075
2,35005,Artenara,Distrito 01 - Artenara,-15.682969,28.018081,6641.6517,45979
3,35006,Arucas,Distrito 01 - Arucas,-15.520911,28.119561,228.0795,8521
4,35006,Arucas,Distrito 02 - Arucas,-15.540187,28.115284,332.6582,10290


In [35]:
# Run the 'getNearbyVenues' function defined above.
venuesGC = getNearbyVenues(names=geolocGC['districtLabel'], latitudes=geolocGC['latitude'],
                           longitudes=geolocGC['longitude'], radii=geolocGC['equivRadius'])

In [38]:
# For example, let's print the top rows from District 02 in Las Palmas.
# This is where I personally like to hang out! :))
(venuesGC.loc[venuesGC['district'] == 'Distrito 02 - Las Palmas de Gran Canaria']).head()

Unnamed: 0,district,districtLatitude,districtLongitude,venue,venueLatitude,venueLongitude,venueCategory
2030,Distrito 02 - Las Palmas de Gran Canaria,28.11232,-15.421078,Pastelería Colomar,28.115799,-15.421875,Cupcake Shop
2031,Distrito 02 - Las Palmas de Gran Canaria,28.11232,-15.421078,Restaurante Allende,28.107074,-15.41796,Spanish Restaurant
2032,Distrito 02 - Las Palmas de Gran Canaria,28.11232,-15.421078,Regaliz Funwear,28.105516,-15.417707,Men's Store
2033,Distrito 02 - Las Palmas de Gran Canaria,28.11232,-15.421078,Teatro Pérez Galdós,28.103382,-15.414024,Theater
2034,Distrito 02 - Las Palmas de Gran Canaria,28.11232,-15.421078,La Azotea De Benito,28.102523,-15.415288,Beer Garden


In [39]:
# Create a Yes/No type matrix summarising venue and category by one hot encoding.
onehotGC = pd.get_dummies(venuesGC[['venueCategory']], prefix="", prefix_sep="")
onehotGC['district'] = venuesGC['district'] 
fixed_columns = [onehotGC.columns[-1]] + list(onehotGC.columns[:-1])
onehotGC = onehotGC[fixed_columns]

# Group the venues by district.
groupedGC = onehotGC.groupby('district').mean().reset_index()
groupedGC.head()

Unnamed: 0,district,Airport,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Argentinian Restaurant,Asian Restaurant,Athletics & Sports,BBQ Joint,...,Steakhouse,Supermarket,Surf Spot,Tapas Restaurant,Tea Room,Theater,Toy / Game Store,Vegetarian / Vegan Restaurant,Water Park,Zoo
0,Distrito 01 - Agaete,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,...,0.01,0.0,0.03,0.04,0.0,0.01,0.0,0.0,0.0,0.0
1,Distrito 01 - Agüimes,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,...,0.02,0.0,0.0,0.03,0.0,0.01,0.0,0.0,0.0,0.01
2,Distrito 01 - Artenara,0.0,0.0,0.0,0.0,0.0,0.02,0.01,0.0,0.0,...,0.01,0.0,0.03,0.04,0.0,0.01,0.0,0.0,0.0,0.0
3,Distrito 01 - Arucas,0.0,0.0,0.0,0.0,0.0,0.021277,0.0,0.0,0.042553,...,0.0,0.0,0.021277,0.085106,0.0,0.0,0.0,0.0,0.0,0.0
4,Distrito 01 - Firgas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050847,...,0.016949,0.0,0.016949,0.033898,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# Define function that will sort venues in descensing order.
def returnMostCommonVenues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [41]:
# Run function returnMostCommonVenues and display the ranked venue categories for each district.
num_top_venues = 10
indicators = ['st', 'nd', 'rd']
columns = ['district']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

districtsVenuesSorted = pd.DataFrame(columns=columns)
districtsVenuesSorted['district'] = groupedGC['district']

for ind in np.arange(groupedGC.shape[0]):
    districtsVenuesSorted.iloc[ind, 1:] = returnMostCommonVenues(groupedGC.iloc[ind, :], num_top_venues)

districtsVenuesSorted.head()

Unnamed: 0,district,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Distrito 01 - Agaete,Beach,Hotel,Restaurant,Scenic Lookout,Spanish Restaurant,Tapas Restaurant,Ice Cream Shop,Italian Restaurant,Surf Spot,Plaza
1,Distrito 01 - Agüimes,Beach,Hotel,Italian Restaurant,Bar,Café,Scenic Lookout,Spanish Restaurant,Restaurant,Tapas Restaurant,Ice Cream Shop
2,Distrito 01 - Artenara,Hotel,Beach,Spanish Restaurant,Scenic Lookout,Restaurant,Italian Restaurant,Resort,Tapas Restaurant,Ice Cream Shop,Surf Spot
3,Distrito 01 - Arucas,Restaurant,Tapas Restaurant,Spanish Restaurant,Beach,Scenic Lookout,Plaza,Shopping Mall,BBQ Joint,Italian Restaurant,Café
4,Distrito 01 - Firgas,Restaurant,Spanish Restaurant,Plaza,Scenic Lookout,Hotel,BBQ Joint,Beach,History Museum,Italian Restaurant,Tapas Restaurant


In [42]:
# Define the number of clusters to be used for the category of venues from the grouped dataframe.
kclusters = 4
clusterGroupedGC = groupedGC.drop('district', 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(clusterGroupedGC)

# Merge the results from the k-means clustering and the the 'geolocGC' dataframe.
districtsVenuesSorted.insert(0, 'labelCluster', kmeans.labels_)
mergedGC = geolocGC
mergedGC = mergedGC.join(districtsVenuesSorted.set_index('district'), on='districtLabel')
mergedGC.head()

Unnamed: 0,municipalityId,municipalityName,districtLabel,longitude,latitude,surface,equivRadius,labelCluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,35001,Agaete,Distrito 01 - Agaete,-15.689645,28.073135,4452.5952,37647,2,Beach,Hotel,Restaurant,Scenic Lookout,Spanish Restaurant,Tapas Restaurant,Ice Cream Shop,Italian Restaurant,Surf Spot,Plaza
1,35002,Agüimes,Distrito 01 - Agüimes,-15.453004,27.897893,7877.593,50075,1,Beach,Hotel,Italian Restaurant,Bar,Café,Scenic Lookout,Spanish Restaurant,Restaurant,Tapas Restaurant,Ice Cream Shop
2,35005,Artenara,Distrito 01 - Artenara,-15.682969,28.018081,6641.6517,45979,1,Hotel,Beach,Spanish Restaurant,Scenic Lookout,Restaurant,Italian Restaurant,Resort,Tapas Restaurant,Ice Cream Shop,Surf Spot
3,35006,Arucas,Distrito 01 - Arucas,-15.520911,28.119561,228.0795,8521,3,Restaurant,Tapas Restaurant,Spanish Restaurant,Beach,Scenic Lookout,Plaza,Shopping Mall,BBQ Joint,Italian Restaurant,Café
4,35006,Arucas,Distrito 02 - Arucas,-15.540187,28.115284,332.6582,10290,3,Restaurant,Plaza,Spanish Restaurant,Scenic Lookout,Tapas Restaurant,Beach,Shopping Mall,BBQ Joint,Italian Restaurant,Café


In [60]:
# Visualise the created clusters on the map of the island.
map_clusters = folium.Map(location=[27.95, -15.6], tiles = 'Stamen Terrain', zoom_start=10)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, district, cluster in zip(mergedGC['latitude'], mergedGC['longitude'], mergedGC['districtLabel'], mergedGC['labelCluster']):
    label = folium.Popup(str(district) + ' | cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=7,
        popup=label,
        color='black',
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=1).add_to(map_clusters)
       
map_clusters

In [45]:
# Examine first cluster (index 0).
firstCluster = mergedGC.loc[mergedGC['labelCluster'] == 0, mergedGC.columns[[2] + list(range(7, mergedGC.shape[1]))]]
firstCluster.head()

Unnamed: 0,districtLabel,labelCluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Distrito 04 - Arucas,0,Restaurant,Boat or Ferry,Spanish Restaurant,Plaza,Tapas Restaurant,Beach,Seafood Restaurant,Shopping Mall,Clothing Store,Park
7,Distrito 05 - Arucas,0,Restaurant,Spanish Restaurant,Seafood Restaurant,Tapas Restaurant,Beach,Plaza,Hotel,Ice Cream Shop,Burger Joint,Beer Garden
8,Distrito 06 - Arucas,0,Spanish Restaurant,Restaurant,Plaza,Beach,Hotel,Tapas Restaurant,Seafood Restaurant,Grocery Store,Scenic Lookout,Shopping Mall
9,Distrito 07 - Arucas,0,Restaurant,Spanish Restaurant,Beach,Tapas Restaurant,Plaza,Seafood Restaurant,Scenic Lookout,Shopping Mall,Ice Cream Shop,Hotel
25,Distrito 02 - Las Palmas de Gran Canaria,0,Restaurant,Beach,Spanish Restaurant,Tapas Restaurant,Clothing Store,Seafood Restaurant,Shopping Mall,Sporting Goods Shop,Plaza,Supermarket


In [48]:
# Examine second cluster (index 1).
secondCluster = mergedGC.loc[mergedGC['labelCluster'] == 1, mergedGC.columns[[2] + list(range(7, mergedGC.shape[1]))]]
secondCluster.head()

Unnamed: 0,districtLabel,labelCluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Distrito 01 - Agüimes,1,Beach,Hotel,Italian Restaurant,Bar,Café,Scenic Lookout,Spanish Restaurant,Restaurant,Tapas Restaurant,Ice Cream Shop
2,Distrito 01 - Artenara,1,Hotel,Beach,Spanish Restaurant,Scenic Lookout,Restaurant,Italian Restaurant,Resort,Tapas Restaurant,Ice Cream Shop,Surf Spot
18,Distrito 01 - Mogán,1,Hotel,Beach,Italian Restaurant,Scenic Lookout,Restaurant,Spanish Restaurant,Café,Resort,Bar,Steakhouse
19,Distrito 02 - Mogán,1,Hotel,Beach,Italian Restaurant,Restaurant,Resort,Spanish Restaurant,Café,Scenic Lookout,Gay Bar,Cocktail Bar
20,Distrito 03 - Mogán,1,Hotel,Restaurant,Beach,Spanish Restaurant,Resort,Seafood Restaurant,Paella Restaurant,Neighborhood,Pool,Pizza Place


In [50]:
# Examine third cluster (index 2).
thirdCluster = mergedGC.loc[mergedGC['labelCluster'] == 2, mergedGC.columns[[2] + list(range(7, mergedGC.shape[1]))]]
thirdCluster.head()

Unnamed: 0,districtLabel,labelCluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Distrito 01 - Agaete,2,Beach,Hotel,Restaurant,Scenic Lookout,Spanish Restaurant,Tapas Restaurant,Ice Cream Shop,Italian Restaurant,Surf Spot,Plaza
14,Distrito 01 - Gáldar,2,Beach,Hotel,Scenic Lookout,Restaurant,Spanish Restaurant,Tapas Restaurant,Plaza,Ice Cream Shop,Resort,Surf Spot
15,Distrito 01 - Ingenio,2,Beach,Restaurant,Spanish Restaurant,Tapas Restaurant,Seafood Restaurant,Hotel,Italian Restaurant,Scenic Lookout,Bar,Cocktail Bar
16,Distrito 02 - Ingenio,2,Beach,Coffee Shop,Airport Service,Rental Car Location,Bar,Grocery Store,Fast Food Restaurant,Duty-free Shop,Restaurant,Plaza
17,Distrito 03 - Ingenio,2,Beach,Restaurant,Seafood Restaurant,Scenic Lookout,Spanish Restaurant,Shopping Mall,Clothing Store,Supermarket,American Restaurant,Fast Food Restaurant


In [51]:
# Examine fourth cluster (index 3).
fourthCluster = mergedGC.loc[mergedGC['labelCluster'] == 3, mergedGC.columns[[2] + list(range(7, mergedGC.shape[1]))]]
fourthCluster.head()

Unnamed: 0,districtLabel,labelCluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Distrito 01 - Arucas,3,Restaurant,Tapas Restaurant,Spanish Restaurant,Beach,Scenic Lookout,Plaza,Shopping Mall,BBQ Joint,Italian Restaurant,Café
4,Distrito 02 - Arucas,3,Restaurant,Plaza,Spanish Restaurant,Scenic Lookout,Tapas Restaurant,Beach,Shopping Mall,BBQ Joint,Italian Restaurant,Café
5,Distrito 03 - Arucas,3,Restaurant,Spanish Restaurant,Plaza,Beach,Tapas Restaurant,Seafood Restaurant,Scenic Lookout,Italian Restaurant,Burger Joint,BBQ Joint
10,Distrito 01 - Firgas,3,Restaurant,Spanish Restaurant,Plaza,Scenic Lookout,Hotel,BBQ Joint,Beach,History Museum,Italian Restaurant,Tapas Restaurant
11,Distrito 02 - Firgas,3,Restaurant,Spanish Restaurant,Plaza,Scenic Lookout,Hotel,BBQ Joint,Tapas Restaurant,History Museum,Grocery Store,Beach


# ------------------------------------ End of Part 1 ------------------------------------