In [1]:
import pandas as pd
import driver
import googlemaps

import matplotlib
import matplotlib.pyplot as plt
import numpy as np

import config

Choose our destination and grab information.

In [2]:
# Fetch the attraction table for the chosen destination through the
# project's `driver` helper.
destination = 'Cancun'
df = driver.google_search(destination)

In [3]:
# Display the table returned by the search above.
df

Unnamed: 0,Attractions,Descriptions,City,Ratings,Reviews,Genres
0,Isla Mujeres,Playa Norte beach & sea turtle sanctuary,Cancun,4.7,"(2,122)",Outdoors
1,Xcaret Park,Eco-archaeological park with attractions,Cancun,4.8,"(60,994)",Outdoors
2,Xel-Há Park,Natural aquarium with many activities,Cancun,4.8,"(24,714)",Outdoors
3,Cancún Underwater Museum,Contemporary underwater sculpture museum,Cancun,4.5,(383),Outdoors
4,Dos Ojos Cenote,Natural caves & scenic swimming holes,Cancun,4.7,"(2,357)",Outdoors
...,...,...,...,...,...,...
143,Cancun Country Club,Golf and country club,Cancun,4.5,(293),Kid-friendly
144,Amigos de Isla Contoy,Island,Cancun,5.0,(1),Kid-friendly
145,Holy Family Parish Cancun,Catholic church,Cancun,4.7,(575),Kid-friendly
146,Marlín Beach,Beach,Cancun,4.6,(89),Kid-friendly


In [None]:
# Collapse the genre labels of each attraction into one '|'-separated string
# (one row per attraction, indexed by attraction name).
genres_by_attraction = df.groupby("Attractions")['Genres']
test = genres_by_attraction.apply('|'.join).to_frame()

In [None]:
# The per-row genre is no longer needed once the joined genres live in `test`;
# remove the column from df in place.
df.drop('Genres', axis=1, inplace=True)

In [None]:
# Keep only the first occurrence of every fully identical row.
df = df[~df.duplicated()]

In [None]:
# Attach the remaining attraction details to the joined-genre table,
# matching on the attraction name.
final_df = test.merge(df, how="left", on='Attractions')

In [None]:
# Keep the first row seen for each attraction and renumber the rows 0..n-1.
first_occurrence = ~final_df.duplicated(subset='Attractions', keep='first')
final_df1 = final_df[first_occurrence].reset_index(drop=True)

In [None]:
# Rebind `df` to the de-duplicated table. This is an alias, not a copy:
# later in-place changes to `df` are also visible through `final_df1`.
df = final_df1

From the table above, we see that the values in the 'Reviews' column are wrapped in parentheses. Let's use a regular expression to remove them so that the values are easier to read.

In [None]:
import re


def _strip_parentheses(review):
    """Return the text inside the first pair of parentheses in ``review``.

    Values that are not strings, or that contain no parenthesised text
    (for example 'N/A', or a count that was already cleaned), are returned
    unchanged. This keeps the cell safe to re-run: the previous version
    called ``.group(1)`` on a failed match and raised AttributeError.
    """
    if not isinstance(review, str):
        return review
    match = re.search(r'\((.*?)\)', review)
    return match.group(1) if match else review


# Iterate over the values themselves rather than integer labels, so the cell
# does not depend on df having a 0..n-1 index.
clear_lst = [_strip_parentheses(review) for review in df['Reviews']]
df['Reviews'] = clear_lst

In [None]:
# Preview the first rows to check the cleaned 'Reviews' values.
df.head(n=5)

Geocoding: get the coordinates of each location

In [None]:
# Result accumulators filled by geocode_address(); one entry per attraction.
list_of_lat = []
list_of_lon = []
list_of_addr = []

# Google Maps client, keyed from the local `config` module.
gmaps = googlemaps.Client(key=config.api_key)

def geocode_address(attraction, city):
    """Geocode one attraction and append the result to the module-level lists.

    Looks up "<attraction>, <city>" with the module-level ``gmaps`` client and
    appends the first hit's latitude, longitude and formatted address to
    ``list_of_lat``, ``list_of_lon`` and ``list_of_addr``. If the lookup fails
    or returns no result, the string 'NA' is appended to all three lists so
    they stay aligned with the rows being geocoded.

    Args:
        attraction: Name of the attraction.
        city: City the attraction is listed under.

    Returns:
        None; results are communicated through the three lists.
    """
    # Separate the two parts explicitly: plain concatenation produced queries
    # such as "Isla MujeresCancun".
    query = '{}, {}'.format(attraction, city)

    try:
        geom = gmaps.geocode(query)[0]
        location = geom["geometry"]["location"]
        lat = location["lat"]
        lon = location["lng"]
        addr = geom['formatted_address']
    except Exception:
        # Best-effort lookup: an empty result (IndexError), a malformed payload
        # (KeyError) or an API/transport error all become the 'NA' sentinel.
        # `Exception` rather than a bare `except` so that Ctrl-C still
        # interrupts a long geocoding run.
        lat = lon = addr = 'NA'

    list_of_lat.append(lat)
    list_of_lon.append(lon)
    list_of_addr.append(addr)

# Geocode every row; results accumulate in the list_of_* lists in row order.
for attraction, city in df[['Attractions', 'City']].itertuples(index=False, name=None):
    geocode_address(attraction, city)

In [None]:
# Attach the geocoding results as new columns (the lists are in row order).
geocoded_columns = (
    ('Latitude', list_of_lat),
    ('Longitude', list_of_lon),
    ('Address', list_of_addr),
)
for column_name, values in geocoded_columns:
    df[column_name] = values

In [None]:
# Preview the first rows to check the new coordinate and address columns.
df.head(n=5)

In [None]:
# Discard rows whose lookup failed (marked with the 'NA' sentinel).
was_geocoded = df['Latitude'] != 'NA'
df = df[was_geocoded]

Let's pick a random sample of attractions to work with.

In [None]:
# Draw a reproducible random sample. The fixed seed makes Restart & Run All
# give the same attractions (and therefore the same clusters) every time.
# n is capped so a table with fewer than 20 geocoded rows does not make
# DataFrame.sample raise.
sample_df = df.sample(n=min(20, len(df)), random_state=42)
sample_df.Attractions

In [None]:
# Renumber the sampled rows 0..n-1; the old row labels are discarded.
sample_df = sample_df.reset_index(drop = True)

Round the decimals to 4 digits for Latitude and Longitude

In [None]:
# Round both coordinate columns to 4 decimal places, value by value.
for coord_col in ('Latitude', 'Longitude'):
    sample_df[coord_col] = sample_df[coord_col].apply(lambda value: round(value, 4))

Let's first plot the sampled attractions to see where they are located.

In [None]:
# Scatter the sampled attractions: longitude on x, latitude on y.
# The axis labels were previously swapped. The unused cmap/norm arguments are
# dropped because matplotlib ignores them when no `c` values are given.
fig, ax = plt.subplots(figsize=(16, 8))
ax.scatter(sample_df['Longitude'], sample_df['Latitude'],
           s=150, edgecolor='none')
ax.set_xlabel('Longitude', fontsize=18)
ax.set_ylabel('Latitude', fontsize=18)
ax.grid()
plt.show()


**Creating clusters**

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Creating an instance of KMeans to find 10 clusters
kmeans_1 = KMeans(n_clusters=10)
# Using fit_predict to cluster the dataset
X = sample_df[['Longitude','Latitude']].values
predictions = kmeans_1.fit_predict(X)

In [None]:
# Copy of the sample with a fresh 0..n-1 index and the K-Means label of each
# row appended as the last column, 'Cluster'.
clustered = sample_df.reset_index(drop=True).assign(Cluster=predictions)

In [None]:
# Assign a vacation day to each of the 10 cluster ids, cycling through the
# week: 0 -> Monday ... 6 -> Sunday, 7 -> Monday, 8 -> Tuesday, 9 -> Wednesday.
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
            'Saturday', 'Sunday']
conditions = [clustered['Cluster'] == cluster_id for cluster_id in range(10)]
choices = [weekdays[cluster_id % len(weekdays)] for cluster_id in range(10)]
# Rows matching none of the conditions receive the default value.
clustered['Vacation Day'] = np.select(conditions, choices, default='black')

In [None]:
# Display the rows grouped by cluster id (returns a sorted copy; `clustered`
# itself is left in its original order).
clustered.sort_values(by = 'Cluster')

In [None]:
# Plot the K-Means result: one colour per cluster id, with the fitted cluster
# centres overlaid as translucent black dots.
fig, ax = plt.subplots(figsize=(16, 8))
cmap = plt.cm.rainbow
# Bin edges at -0.5, 0.5, ..., 9.5 give each of the 10 cluster ids its own
# colour bin. Edges 0..9 only form 9 bins, so ids 8 and 9 shared a colour.
norm = matplotlib.colors.BoundaryNorm(np.arange(-0.5, 10.5, 1), cmap.N)
points = ax.scatter(clustered['Longitude'], clustered['Latitude'],
                    c=clustered['Cluster'], cmap=cmap, norm=norm,
                    s=150, edgecolor='none')
fig.colorbar(points, ax=ax, ticks=np.arange(10))
# Centres are in the same (longitude, latitude) order as the training matrix.
centers = kmeans_1.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], c='black', s=100, alpha=0.3)
# x is longitude and y is latitude (the labels were previously swapped).
ax.set_xlabel('Longitude', fontsize=14)
ax.set_ylabel('Latitude', fontsize=14)
ax.set_title('k-means clustering results (n_clusters=10)', fontsize=14)
ax.grid()
plt.show()

In [None]:
import hdbscan

In [None]:
# The haversine metric expects each row as (latitude, longitude) in radians.
# X is in (longitude, latitude) order, so build the array from the sample in
# the order the metric needs rather than reusing X.
lat_lon = sample_df[['Latitude', 'Longitude']].values.astype(float)
rads = np.radians(lat_lon)
clusterer = hdbscan.HDBSCAN(min_cluster_size=2, metric='haversine')
# One label per row of sample_df; -1 marks points treated as noise.
predictions = clusterer.fit_predict(rads)

In [None]:
# Copy of the sample with the HDBSCAN label of each row appended as 'Cluster'.
clustered = sample_df.reset_index(drop=True).assign(Cluster=predictions)

# Map every label that actually occurs to a vacation day. Noise (-1) is
# marked 'NA/Noise'; real clusters cycle through the week (0 -> Monday, ...,
# 6 -> Sunday, 7 -> Monday, ...). Deriving the list from the data means a run
# that yields more than nine clusters no longer falls through to a
# meaningless default value.
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
            'Saturday', 'Sunday']
cluster_labels = sorted(clustered['Cluster'].unique())
conditions = [clustered['Cluster'] == label for label in cluster_labels]
choices = ['NA/Noise' if label < 0 else weekdays[label % len(weekdays)]
           for label in cluster_labels]
clustered['Vacation Day'] = np.select(conditions, choices, default='NA/Noise')
clustered.sort_values(by=['Cluster'])

In [None]:
# Plot the HDBSCAN result: one colour per label (-1 is noise).
fig, ax = plt.subplots(figsize=(16, 8))
cmap = plt.cm.gnuplot2
points = ax.scatter(clustered['Longitude'], clustered['Latitude'],
                    c=clustered['Cluster'], cmap=cmap, s=100, edgecolor='none')
fig.colorbar(points, ax=ax, ticks=sorted(clustered['Cluster'].unique()))
# HDBSCAN exposes no cluster centres, and the previous overlay drew the
# K-Means centres, which belong to a different model. Show the mean position
# of each HDBSCAN cluster instead, leaving out noise points.
members = clustered.loc[clustered['Cluster'] >= 0,
                        ['Cluster', 'Longitude', 'Latitude']].astype(float)
centers = members.groupby('Cluster')[['Longitude', 'Latitude']].mean().values
ax.scatter(centers[:, 0], centers[:, 1], c='black', s=100, alpha=0.3)
# x is longitude and y is latitude (the labels were previously swapped).
ax.set_xlabel('Longitude', fontsize=14)
ax.set_ylabel('Latitude', fontsize=14)
ax.set_title('HDBSCAN clustering results', fontsize=14)
ax.grid()
plt.show()

Although HDBSCAN is a great density-based spatial clustering algorithm, its drawback is that some locations will not be assigned to any cluster because they are deemed "noise."

Some attractions may be out of the way rather than in the commonly visited areas. These may still be attractions that one would want to visit, so HDBSCAN may not be the ideal algorithm to use.

To tackle this problem, we can use spatial clustering with DBSCAN instead: with `min_samples = 1`, every location is assigned to a cluster, so none is discarded as noise.


In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# Coordinate matrix in (latitude, longitude) order, as the haversine metric
# expects. DataFrame.as_matrix was removed in pandas 1.0; `.values` (already
# used for the K-Means features) works on old and new versions alike.
cols = ['Latitude','Longitude']
coords = sample_df[cols].values.astype(float)


In [None]:
# Kilometres of arc per radian (presumably Earth's mean radius in km).
kms_per_radian = 6371.0088
# Neighbourhood radius of 5 km, expressed in radians for the haversine metric.
epsilon = 5 / kms_per_radian
db = DBSCAN(
    eps=epsilon,
    min_samples=1,
    algorithm='ball_tree',
    metric='haversine',
)
# Fit on the coordinates converted from degrees to radians.
db.fit(np.radians(coords))

In [None]:
# One label per sample, plus a boolean mask that is True exactly at the
# positions DBSCAN reports as core samples.
labels = db.labels_
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

In [None]:
# Count the distinct labels, leaving out -1 (the noise label) if present.
n_clusters_ = len(set(labels) - {-1})


In [None]:
# Report how many clusters DBSCAN found.
print('Estimated number of clusters: %d' % (n_clusters_,))

In [None]:
# Plot the DBSCAN result: one colour per label, large markers for core
# samples and small markers for the remaining members of each cluster.
unique_labels = set(labels)
colors = [plt.cm.Spectral(fraction)
          for fraction in np.linspace(0, 1, len(unique_labels))]

for k, col in zip(unique_labels, colors):
    # Black used for noise.
    face_colour = (0, 0, 0, 1) if k == -1 else tuple(col)
    in_cluster = (labels == k)

    # Core samples first (size 14), then non-core members (size 6).
    for member_mask, marker_size in ((core_samples_mask, 14),
                                     (~core_samples_mask, 6)):
        xy = X[in_cluster & member_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=face_colour,
                 markeredgecolor='k', markersize=marker_size)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()