# Capstone project week 5

## Import libraries

In [109]:
import pandas as pd
import numpy as np
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import requests
from sklearn.cluster import KMeans

## Gather the regions

In [2]:
# Each Dutch-language Wikipedia page lists the municipalities of one Belgian region.
# read_html returns every table found on a page; the positional index selects the
# table used here and 'Gemeente' is the column holding the municipality names.
flemish_tables = pd.read_html("https://nl.wikipedia.org/wiki/Lijst_van_gemeenten_in_het_Vlaams_Gewest")
brussels_tables = pd.read_html("https://nl.wikipedia.org/wiki/Lijst_van_gemeenten_in_het_Brussels_Hoofdstedelijk_Gewest")
walloon_tables = pd.read_html("https://nl.wikipedia.org/wiki/Lijst_van_gemeenten_in_het_Waals_Gewest")

df_flemish = flemish_tables[2]['Gemeente']
df_brussels = brussels_tables[0]['Gemeente']
df_walloon = walloon_tables[1]['Gemeente']

Explore the data

In [3]:
df_flemish.head()

0         Aalst
1     Aalter[1]
2      Aarschot
3    Aartselaar
4      Affligem
Name: Gemeente, dtype: object

In [4]:
df_brussels.head()

0        Anderlecht
1    Brussel (stad)
2            Elsene
3         Etterbeek
4             Evere
Name: Gemeente, dtype: object

In [5]:
df_walloon.head()

0    's-Gravenbrakel (Braine-le-Comte)
1                       Aarlen (Arlon)
2                            Aat (Ath)
3                       Aiseau-Presles
4                                 Amay
Name: Gemeente, dtype: object

In [6]:
# Flatten the three regional Series of municipality names into one plain list,
# in the order Flemish, Brussels, Walloon.
mergedlist = [*df_flemish, *df_brussels, *df_walloon]

Note: due to limitations of the Foursquare API, only the municipalities of the Walloon Region are analysed below.

## Find coordinates for each region

In [7]:
from geopy.geocoders import Nominatim
def get_location(region):
    """Geocode a Belgian municipality name with Nominatim.

    Parameters
    ----------
    region : str
        Municipality name; ", Belgium" is appended before the lookup.

    Returns
    -------
    list
        ``[latitude, longitude]`` of the match returned by the geocoder, or
        ``[nan, nan]`` when the geocoder returns no match, so that a single
        unknown name does not abort a whole ``Series.apply`` run.
    """
    geolocator = Nominatim(user_agent="Coursera Capstone")
    location = geolocator.geocode(region + ", Belgium")
    if location is None:
        # No match: report it instead of failing on `None.latitude`.
        print('No coordinates found for  ' + region)
        return [float('nan'), float('nan')]
    print('Coordinates found of  '+ region)
    return [location.latitude, location.longitude]

Prepare data for coordinates extraction

In [8]:
# One-column frame holding the Walloon municipality names.
df = pd.DataFrame(df_walloon)
df.columns = ['Region']
# Drop rows whose name carries the Wikipedia footnote marker "[1]".
# regex=False is required: read as a regular expression, '[1]' is a character
# class that matches every name containing the digit 1, not the literal marker.
# na=False treats missing names as "no marker"; .copy() makes the result an
# independent frame so columns can be added to it later without warnings.
has_footnote = df['Region'].str.contains('[1]', regex=False, na=False)
df = df[~has_footnote].copy()

Use function to extract all coordinates

In [9]:
df['Coordinates'] = df['Region'].apply(get_location)

Coordinates found of  's-Gravenbrakel (Braine-le-Comte)
Coordinates found of  Aarlen (Arlon)
Coordinates found of  Aat (Ath)
Coordinates found of  Aiseau-Presles
Coordinates found of  Amay
Coordinates found of  Amel
Coordinates found of  Andenne
Coordinates found of  Anderlues
Coordinates found of  Anhée
Coordinates found of  Ans
Coordinates found of  Anthisnes
Coordinates found of  Antoing
Coordinates found of  Assesse
Coordinates found of  Attert
Coordinates found of  Aubange
Coordinates found of  Aubel
Coordinates found of  Awans
Coordinates found of  Aywaille
Coordinates found of  Baelen
Coordinates found of  Bastenaken (Bastogne)
Coordinates found of  Beaumont
Coordinates found of  Beauraing
Coordinates found of  Belœil
Coordinates found of  Bergen (Mons)
Coordinates found of  Berloz
Coordinates found of  Bernissart
Coordinates found of  Bertogne
Coordinates found of  Bertrix
Coordinates found of  Bevekom (Beauvechain)
Coordinates found of  Beyne-Heusay
Coordinates found of  Bièvr

Backup the coordinates in a csv

In [10]:
df.to_csv('regions.csv')

In [11]:
df.Coordinates[0]

[50.6057582, 4.1382245]

In [12]:
df_coord = df.apply(lambda x: x['Coordinates'],axis=1,  result_type='expand')

In [13]:
df['Latitude'],df['Longitude']=df_coord[0],df_coord[1]

In [14]:
df = df.drop(columns=['Coordinates'])

In [15]:
df.head(35)

Unnamed: 0,Region,Latitude,Longitude
0,'s-Gravenbrakel (Braine-le-Comte),50.605758,4.138224
1,Aarlen (Arlon),49.680415,5.809531
2,Aat (Ath),50.628868,3.785318
3,Aiseau-Presles,50.416358,4.571198
4,Amay,50.549776,5.324099
5,Amel,50.354306,6.170556
6,Andenne,50.489398,5.096547
7,Anderlues,50.407997,4.26961
8,Anhée,50.310151,4.877604
9,Ans,50.667986,5.507754


## Find venues for each region

In [16]:
import os

# Foursquare credentials are read from the environment so that the secret is
# never stored in the notebook source nor echoed into its saved output.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # sent as the `limit` query parameter of every venue request
if CLIENT_ID and CLIENT_SECRET:
    print('Foursquare credentials loaded from the environment.')
else:
    print('Foursquare credentials missing: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET.')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


In [17]:
def getNearbyVenues(names, latitudes, longitudes, radius=20000, max_retries=5, retry_delay=1.0):
    """Collect scenic lookouts and bed and breakfasts around each location.

    One Foursquare ``venues/explore`` request is made per (name, lat, lng)
    triple, using the notebook-level CLIENT_ID, CLIENT_SECRET, VERSION and
    LIMIT settings.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel sequences describing the locations to query.
    radius : int, optional
        Value sent as the ``radius`` query parameter.
    max_retries : int, optional
        Maximum number of attempts per location before giving up.
    retry_delay : float, optional
        Base pause in seconds between attempts; it grows linearly with the
        attempt number.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but has these columns) when no venue was returned.

    Raises
    ------
    RuntimeError
        If a location still has no usable response after ``max_retries``
        attempts (the original code retried forever in that situation).
    """
    import time  # local import keeps this cell self-contained

    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    # Search for scenic lookouts and bed and breakfasts
    cat = '4bf58dd8d48988d165941735,4bf58dd8d48988d1f8931735'
    url = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        # Let requests build and escape the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
            'categoryId': cat,
        }

        results = None
        last_error = None
        for attempt in range(1, max_retries + 1):
            try:
                payload = requests.get(url, params=params, timeout=30).json()
                results = payload['response']['groups'][0]['items']
                break
            except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as exc:
                # Network failure, non-JSON body, or a payload without the
                # expected structure (e.g. an API error response).
                last_error = exc
                print('Attempt {}/{} failed for {}: {!r}'.format(attempt, max_retries, name, exc))
                if attempt < max_retries:
                    time.sleep(retry_delay * attempt)
        if results is None:
            raise RuntimeError(
                'No usable Foursquare response for {!r} after {} attempts'.format(name, max_retries)
            ) from last_error

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            # A venue without any category gets None instead of raising.
            categories = venue.get('categories') or [{}]
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0].get('name')))

    # Passing the columns to the constructor also works for an empty result.
    nearby_venues = pd.DataFrame(rows, columns=columns)
    return(nearby_venues)

In [None]:
venues = getNearbyVenues(df['Region'],df['Latitude'],df['Longitude'])

Backup the venue list

In [20]:
venues.to_csv('venues.csv')

In [30]:
venues.head(20)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,'s-Gravenbrakel (Braine-le-Comte),50.605758,4.138224,hof te spieringen,50.754791,4.045691,Bed & Breakfast
1,'s-Gravenbrakel (Braine-le-Comte),50.605758,4.138224,Zevenbronnen - Sept Fontaines,50.734512,4.335727,Scenic Lookout
2,'s-Gravenbrakel (Braine-le-Comte),50.605758,4.138224,Petit Train du Bonheur,50.661057,4.134471,Scenic Lookout
3,'s-Gravenbrakel (Braine-le-Comte),50.605758,4.138224,Bed & Breakfast Wisteria,50.725606,4.019616,Bed & Breakfast
4,'s-Gravenbrakel (Braine-le-Comte),50.605758,4.138224,KUBUS,50.732236,4.036421,Scenic Lookout
5,'s-Gravenbrakel (Braine-le-Comte),50.605758,4.138224,B&B Kaai 16,50.733389,4.239344,Bed & Breakfast
6,'s-Gravenbrakel (Braine-le-Comte),50.605758,4.138224,Villa D'Elbeek,50.744133,4.20081,Bed & Breakfast
7,'s-Gravenbrakel (Braine-le-Comte),50.605758,4.138224,Boesmolen,50.735248,4.026112,Scenic Lookout
8,'s-Gravenbrakel (Braine-le-Comte),50.605758,4.138224,Shelter Studio’s,50.75091,4.258561,Bed & Breakfast
9,'s-Gravenbrakel (Braine-le-Comte),50.605758,4.138224,Belvédère des Bisons,50.583618,3.882122,Scenic Lookout


In [31]:
# Keep only the venues whose category is one of the two of interest.
venues_filtered = venues[venues['Venue Category'].isin(['Scenic Lookout', 'Bed & Breakfast'])]

Check what was returned for each region

In [33]:
venues_filtered.groupby(['Neighborhood','Venue Category']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude
Neighborhood,Venue Category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
's-Gravenbrakel (Braine-le-Comte),Bed & Breakfast,8,8,8,8,8
's-Gravenbrakel (Braine-le-Comte),Scenic Lookout,7,7,7,7,7
Aarlen (Arlon),Bed & Breakfast,1,1,1,1,1
Aarlen (Arlon),Scenic Lookout,1,1,1,1,1
Aat (Ath),Bed & Breakfast,15,15,15,15,15
...,...,...,...,...,...,...
Éghezée,Bed & Breakfast,4,4,4,4,4
Éghezée,Scenic Lookout,4,4,4,4,4
Érezée,Bed & Breakfast,22,22,22,22,22
Érezée,Scenic Lookout,10,10,10,10,10


One hot encoding

In [34]:
# One indicator column per venue category (empty prefix keeps the bare category name).
category_dummies = pd.get_dummies(venues_filtered[['Venue Category']], prefix="", prefix_sep="")

# Put the neighborhood name in front of the indicator columns.
fixed_columns = ['Neighborhood'] + list(category_dummies.columns)
onehot = category_dummies.assign(Neighborhood=venues_filtered['Neighborhood'])[fixed_columns]

onehot.head()

Unnamed: 0,Neighborhood,Bed & Breakfast,Scenic Lookout
0,'s-Gravenbrakel (Braine-le-Comte),1,0
1,'s-Gravenbrakel (Braine-le-Comte),0,1
2,'s-Gravenbrakel (Braine-le-Comte),0,1
3,'s-Gravenbrakel (Braine-le-Comte),1,0
4,'s-Gravenbrakel (Braine-le-Comte),0,1


Group them together

In [62]:
# Count the venues of each category per neighborhood. Only the key column is
# renamed, so the category columns keep the names produced by the one-hot
# encoding regardless of how many categories there are or in which order.
grouped = (
    onehot.groupby('Neighborhood').sum()
    .reset_index()
    .rename(columns={'Neighborhood': 'Region'})
)

In [68]:
# Show the region names that have at least one matching venue.
grouped['Region']

0      's-Gravenbrakel (Braine-le-Comte)
1                         Aarlen (Arlon)
2                              Aat (Ath)
3                         Aiseau-Presles
4                                   Amay
                     ...                
254                    Zinnik (Soignies)
255                          Écaussinnes
256                              Éghezée
257                               Érezée
258                               Étalle
Name: Region, Length: 259, dtype: object

In [146]:
merged = df.merge(grouped,on='Region')

## Analyze results

### DBSCAN clusters

In [43]:
df_clustered = df.copy()

In [60]:
from sklearn.cluster import DBSCAN
import sklearn.utils
from sklearn.preprocessing import StandardScaler
# NOTE(review): the value returned by check_random_state is discarded, so this
# call presumably has no effect on the clustering below — confirm and remove.
sklearn.utils.check_random_state(1000)
# Cluster on the geographic position only: NaN coordinates are replaced by 0
# and both columns are rescaled with StandardScaler before DBSCAN is fitted.
Clus_dataSet = df_clustered[['Latitude','Longitude']]
Clus_dataSet = np.nan_to_num(Clus_dataSet)
Clus_dataSet = StandardScaler().fit_transform(Clus_dataSet)

# Compute DBSCAN
# NOTE(review): the sample output of this cell shows label -1 for every
# displayed row; min_samples=100 may be too strict for this data set — confirm.
db = DBSCAN(eps=0.5, min_samples=100).fit(Clus_dataSet)
# Boolean mask marking the core samples reported by DBSCAN (not used again in this cell).
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
df_clustered["Clus_Db"]=labels

# Number of distinct labels without (realClusterNum) and with (clusterNum) the -1 label.
realClusterNum=len(set(labels)) - (1 if -1 in labels else 0)
clusterNum = len(set(labels)) 


# A sample of clusters
df_clustered[["Region","Latitude","Longitude","Clus_Db"]].head()

Unnamed: 0,Region,Latitude,Longitude,Clus_Db
0,'s-Gravenbrakel (Braine-le-Comte),50.605758,4.138224,-1
1,Aarlen (Arlon),49.680415,5.809531,-1
2,Aat (Ath),50.628868,3.785318,-1
3,Aiseau-Presles,50.416358,4.571198,-1
4,Amay,50.549776,5.324099,-1


DBSCAN on location combined with the Scenic Lookout and Bed & Breakfast counts

In [103]:
from sklearn.cluster import DBSCAN
import sklearn.utils
from sklearn.preprocessing import StandardScaler
# Work on a copy so the cluster labels do not end up in `merged` itself.
merged_clustered = merged.copy()
# NOTE(review): the value returned by check_random_state is discarded, so this
# call presumably has no effect on the clustering below — confirm and remove.
sklearn.utils.check_random_state(1000)
# Features: position plus the two venue counts; NaN values are replaced by 0
# and all four columns are rescaled with StandardScaler before DBSCAN is fitted.
Clus_dataSet = merged_clustered[['Latitude','Longitude','Scenic Lookout','Bed & Breakfast']]
Clus_dataSet = np.nan_to_num(Clus_dataSet)
Clus_dataSet = StandardScaler().fit_transform(Clus_dataSet)

# Compute DBSCAN
db = DBSCAN(eps=0.2, min_samples=5).fit(Clus_dataSet)
# Boolean mask marking the core samples reported by DBSCAN (not used again in this cell).
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
merged_clustered["Clus_Db"]=labels

# Number of distinct labels without (realClusterNum) and with (clusterNum) the -1 label.
realClusterNum=len(set(labels)) - (1 if -1 in labels else 0)
clusterNum = len(set(labels)) 


# A sample of clusters
merged_clustered[['Latitude','Longitude','Scenic Lookout','Bed & Breakfast',"Clus_Db"]].head(5)

Unnamed: 0,Latitude,Longitude,Scenic Lookout,Bed & Breakfast,Clus_Db
0,50.605758,4.138224,7,8,-1
1,49.680415,5.809531,1,1,-1
2,50.628868,3.785318,6,15,-1
3,50.416358,4.571198,0,1,-1
4,50.549776,5.324099,1,3,-1


In [104]:
set(labels)

{-1, 0, 1}

## K-means clustering

In [147]:
df_clustered = merged.copy()

In [148]:
# set number of clusters
kclusters = 100

# k-means needs numeric features only, so the region name is left out.
# The column is dropped by keyword: passing the axis positionally
# (`drop('Region', 1)`) is rejected by current pandas versions.
df_clustering = df_clustered.drop(columns='Region')

# run k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_clustering)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]

array([63, 96, 88, 44, 22, 34, 59,  1, 68,  8])

In [149]:
# add clustering labels as the first column
# DataFrame.insert raises when the column already exists, so a stale copy from
# an earlier run of this cell is dropped first; the cell can then be re-run.
if 'Cluster Labels' in merged.columns:
    merged = merged.drop(columns='Cluster Labels')
merged.insert(0, 'Cluster Labels', kmeans.labels_)

In [150]:
merged[merged['Cluster Labels']==47]

Unnamed: 0,Cluster Labels,Region,Latitude,Longitude,Bed & Breakfast,Scenic Lookout
131,47,Jalhay,50.559253,5.964753,16,15
222,47,Spa,50.492084,5.862623,17,16
224,47,Stavelot,50.394085,5.930836,17,15


In [218]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
# create map (centred on the coordinates found above for Aarlen (Arlon))
map_clusters = folium.Map(location=[49.680415, 5.809531], zoom_start=8)

# set color scheme for the clusters: one evenly spaced rainbow colour per k-means cluster
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add one marker per region, coloured by its cluster
for lat, lon, poi, cluster in zip(merged['Latitude'], merged['Longitude'], merged['Region'], merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Label 0 wraps around to the last palette entry through the negative
    # index, so every label still receives its own colour.
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

Get the clusters with the fewest bed and breakfasts and the highest number of lookout points

In [183]:
grouped = merged.drop(columns='Region')

In [184]:
grouped = grouped.groupby(by='Cluster Labels').agg({'Latitude': 'mean','Longitude': 'mean','Bed & Breakfast':'sum' , 'Scenic Lookout': 'sum'})

In [185]:
grouped.head()

Unnamed: 0_level_0,Latitude,Longitude,Bed & Breakfast,Scenic Lookout
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,50.505291,5.431396,91,28
1,50.424191,4.27063,14,28
2,50.697278,4.04713,19,14
3,50.191315,5.118261,45,26
4,50.745651,3.511054,38,13


We are interested in regions where there are more scenic lookouts than bed and breakfasts

In [186]:
#Define a ratio factor
# Scenic lookouts per bed and breakfast for each cluster. A cluster with zero
# bed and breakfasts gets an infinite ratio (cluster 52 in the output below).
grouped['Ratio']=grouped['Scenic Lookout']/grouped['Bed & Breakfast']

In [189]:
# Rank the clusters from the highest to the lowest ratio.
# NOTE(review): the name `sorted` shadows Python's built-in sorted() for the
# rest of the session; renaming it requires updating the later cell that reads it.
sorted = grouped.sort_values(by=['Ratio'], ascending=False, axis=0)
sorted.head(20)

Unnamed: 0_level_0,Latitude,Longitude,Bed & Breakfast,Scenic Lookout,Ratio
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
52,50.231717,4.313929,0,9,inf
48,50.472442,4.696927,4,18,4.5
85,50.394374,4.261633,4,13,3.25
21,50.590719,4.571203,5,14,2.8
19,50.67613,6.11084,7,19,2.714286
1,50.424191,4.27063,14,28,2.0
60,50.319766,5.293815,16,30,1.875
96,49.739695,5.473635,4,7,1.75
54,50.237363,4.217197,4,7,1.75
59,50.507538,4.606792,13,22,1.692308


In [203]:
# Keep the most interesting clusters: a ratio above 1.5 and more than 15 scenic
# lookouts. reset_index turns 'Cluster Labels' into a regular column and
# renumbers the rows from 0.
filtered = sorted[((sorted.Ratio > 1.5)) & (sorted['Scenic Lookout']>15)].reset_index()

In [206]:
filtered

Unnamed: 0,Cluster Labels,Latitude,Longitude,Bed & Breakfast,Scenic Lookout,Ratio
0,48,50.472442,4.696927,4,18,4.5
1,19,50.67613,6.11084,7,19,2.714286
2,1,50.424191,4.27063,14,28,2.0
3,60,50.319766,5.293815,16,30,1.875
4,59,50.507538,4.606792,13,22,1.692308
5,18,50.70415,5.858555,16,27,1.6875


Show the ideal locations on a map

In [217]:
# create map (centred on the coordinates found above for Aarlen (Arlon))
map_clusters = folium.Map(location=[49.680415, 5.809531], zoom_start=8)

# set color scheme: one evenly spaced rainbow colour per selected cluster
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, len(filtered)))]

# add one marker per selected cluster at its mean position
# The colour is chosen by row position (enumerate) rather than by the frame's
# index, so the lookup stays valid whatever index `filtered` carries.
selected = zip(filtered['Latitude'], filtered['Longitude'], filtered['Bed & Breakfast'], filtered['Scenic Lookout'])
for position, (lat, lon, bnb, scenic) in enumerate(selected):
    label = folium.Popup('BNB: ' + str(bnb) + ' Scenic: ' + str(scenic), parse_html=True)
    marker_color = rainbow[position]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [210]:
filtered.Ratio.count()

6