# Final project case: building a personalised tourism recommender -  NEW USER MODEL: CONTENT BASED RECOMMENDER 

# Loading the libraries

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Loading the data

In [2]:
df = pd.read_excel("data_final_not_encoded.xlsx")

In [4]:
df.head()

Unnamed: 0,id,name,average_sentiment,cat_detailed,cat_reduced,latitud,longitud,address,area,age,way_travel,rating
0,0,Teatro Flamenco Madrid,history_freak,theater,"experiences, cultural centers, theaters and music",40.423258,-3.704502,"C. del Pez, 10, 28004 Madrid, España",Madrid,adult,in couple,4.7
1,1,Urban Safari,adventurous,experience,"experiences, cultural centers, theaters and music",40.4043,-3.69467,"Calle de las Delicias, 9, 28045 Madrid, España",Madrid,old,in couple,4.9
2,2,Museo Gran Via 15,artsy,museum,"historic building, museums and archaeological ...",40.41975,-3.700222,"C/ Gran Vía, 15, Local, 28013 Madrid, España",Madrid,adult,in family,4.4
3,3,Parque Princesa Leonor,relax,park,"town, parks or lookouts",40.488108,-3.619405,"F9RJ+4F, 28055 Madrid, España",Madrid,young,in couple,4.4
4,4,Parque Juan Pablo II,relax,park,"town, parks or lookouts",40.454777,-3.626851,"Av. de Machupichu, 1, 28043 Madrid, España",Madrid,young,in couple,4.5


# Encoding values

In [5]:
df["average_sentiment"].value_counts()

average_sentiment
history_freak    1297
curious          1011
relax             859
artsy             442
adventurous       189
Name: count, dtype: int64

In [7]:
df["age"].value_counts()

age
adult    1685
young    1510
old       603
Name: count, dtype: int64

In [8]:
df["way_travel"].value_counts()

way_travel
in couple    1709
in family    1496
in group      492
alone         101
Name: count, dtype: int64

In [9]:
df["cat_reduced"].value_counts()

cat_reduced
historic building, museums and archaeological rests    1076
town, parks or lookouts                                 585
points of interest, like squares and sculptures         520
experiences, cultural centers, theaters and music       511
religious                                               367
contemporary buildings and art galleries                303
route and urban routes                                  271
food, music, seaside, sport and others                  165
Name: count, dtype: int64

In [10]:
df["area"].value_counts()

area
Euskadi      1272
Barcelona    1250
Madrid        875
Vigo          373
La Palma       28
Name: count, dtype: int64

In [13]:
# Encoding categoricals. We are going to use manual encoding because we need it for the model after.

sentiment = {
    'artsy': 0,
    'adventurous': 1,
    'relax': 2,
    'curious': 3,
    'history_freak': 4,
}


user_age = {
    'young': 0,
    'adult': 1,
    'old': 2
}

user_way = {
    'alone': 0,
    'in couple': 1,
    'in group': 2,
    'in family':3
}


destination_encoded = {
    "Madrid": 1,
    "Vigo": 2,
    "La Palma": 3,
    "Euskadi": 4,
    "Barcelona": 5,
}

cat_sites_reduced_encoded = {
    'historic building, museums and archaeological rests': 1,
    'town, parks or lookouts': 2,
    'points of interest, like squares and sculptures': 3,
    'experiences, cultural centers, theaters and music': 4,
    'religious': 5,
    'contemporary buildings and art galleries': 6,
    'route and urban routes': 7,
    'food, music, seaside, sport and others': 8
}


# No let's do the mapping
df['age_encoded'] = df['age'].map(user_age)
df['way_travel_encoded'] = df['way_travel'].map(user_way)
df['sentiment_encoded'] = df['average_sentiment'].map(sentiment)
df['destination_encoded'] = df['area'].map(destination_encoded)
df['cat_sites_reduced_encoded'] = df['cat_reduced'].map(cat_sites_reduced_encoded)

In [14]:
df.isna().sum()

id                           0
name                         0
average_sentiment            0
cat_detailed                 0
cat_reduced                  0
latitud                      0
longitud                     0
address                      0
area                         0
age                          0
way_travel                   0
rating                       0
age_encoded                  0
way_travel_encoded           0
sentiment_encoded            0
destination_encoded          0
cat_sites_encoded            0
cat_sites_reduced_encoded    0
dtype: int64

In [None]:
# df.to_excel("data_final_encoded.xlsx")

# Models: Option 1 - using age, way travel, sentiment, destination and rating

The model will work in such a way that will tell in the destination of the travel of the user, which category of sites is better for him to visit and then recommend three sites of this category

**_Random Forest_** ---> the one we will choose, because I understand it better than gradient boosting.

In [49]:
X_content = df[["age_encoded", "way_travel_encoded", "sentiment_encoded", "destination_encoded", "rating"]]
y_content = df["cat_sites_reduced_encoded"]

X_train_content, X_test_content, y_train_content, y_test_content = train_test_split(X_content, y_content, test_size=0.2, random_state=42)


In [51]:
rf_content = RandomForestClassifier(n_estimators=100, random_state=42)

rf_content.fit(X_train_content, y_train_content)

y_pred_rf = rf_content.predict(X_test_content)
accuracy_rf = accuracy_score(y_test_content, y_pred_rf)
print(f'Random Forest - Accuracy: {accuracy_rf:.3f}')

Random Forest - Accuracy: 0.778


In [36]:
from sklearn.metrics import classification_report

# Generating the classification report
classification_rep = classification_report(y_test_content, y_pred_rf)

print("Random Forest Classification Report:")
print(classification_rep)

Random Forest Classification Report:
              precision    recall  f1-score   support

           1       0.86      0.87      0.87       198
           2       0.82      0.85      0.83       125
           3       0.66      0.64      0.65        97
           4       0.75      0.82      0.78       111
           5       0.71      0.75      0.73        69
           6       0.64      0.59      0.61        63
           7       0.95      0.85      0.90        62
           8       0.61      0.49      0.54        35

    accuracy                           0.78       760
   macro avg       0.75      0.73      0.74       760
weighted avg       0.78      0.78      0.78       760



**_Logistic regression_**

In [31]:
from sklearn.linear_model import LogisticRegression


logreg_model = LogisticRegression(random_state=42)
logreg_model.fit(X_train_content, y_train_content)

y_pred_logreg = logreg_model.predict(X_test_content)

accuracy_logreg = accuracy_score(y_test_content, y_pred_logreg)
print("Logistic Regression Accuracy:", accuracy_logreg)

Logistic Regression Accuracy: 0.5815789473684211


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**_Decission Tree_**

In [32]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_content, y_train_content)

y_pred_dt = dt_model.predict(X_test_content)

accuracy_dt = accuracy_score(y_test_content, y_pred_dt)
print("Decision Tree Accuracy:", accuracy_dt)

Decision Tree Accuracy: 0.7671052631578947


**_Naïve Bayes_**

In [33]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(X_train_content, y_train_content)

y_pred_nb = nb_model.predict(X_test_content)

accuracy_nb = accuracy_score(y_test_content, y_pred_nb)
print("Naive Bayes Accuracy:", accuracy_nb)

Naive Bayes Accuracy: 0.6157894736842106


**_Gradient boosting_**

In [19]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

gradient_boosting_classifier = GradientBoostingClassifier()

gradient_boosting_classifier.fit(X_train_content, y_train_content)

y_pred_gb = gradient_boosting_classifier.predict(X_test_content)

accuracy_gb = accuracy_score(y_test_content, y_pred_gb)

print(f'Gradient Boosting - Accuracy: {accuracy_gb:.2f}')

Gradient Boosting - Accuracy: 0.81


**_KNN_**

In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

knn_classifier = KNeighborsClassifier()

knn_classifier.fit(X_train_content, y_train_content)

y_pred_knn = knn_classifier.predict(X_test_content)

accuracy_knn = accuracy_score(y_test_content, y_pred_knn)

print(f'K-Nearest Neighbors - Accuracy: {accuracy_knn:.2f}')

K-Nearest Neighbors - Accuracy: 0.76


**_Gaussian mixture_**

In [34]:
from sklearn.mixture import GaussianMixture

n_components = 4  # Set the number of components (clusters) in the GMM
gmm_model = GaussianMixture(n_components=n_components, random_state=42)
gmm_model.fit(X_train_content)

y_pred_gmm = gmm_model.predict(X_test_content)

# GMM is an unsupervised algorithm, so there's no accuracy score to calculate.
# Instead, GMM is used for clustering and density estimation tasks.
# You can use metrics like AIC, BIC, or silhouette score to evaluate the quality of the clustering.

# Example of AIC (Akaike Information Criterion) to evaluate the GMM
aic_score = gmm_model.aic(X_train_content)
print("AIC Score:", aic_score)

# Example of BIC (Bayesian Information Criterion) to evaluate the GMM
bic_score = gmm_model.bic(X_train_content)
print("BIC Score:", bic_score)

AIC Score: 9815.697261517987
BIC Score: 10315.270500219913




***

# Models: Option 2 - sentiment, destination and rating

**_Random Forest_**

In [21]:
X_content = df[["sentiment_encoded", "destination_encoded", "rating"]]
y_content = df["cat_sites_reduced_encoded"]

X_train_content, X_test_content, y_train_content, y_test_content = train_test_split(X_content, y_content, test_size=0.2, random_state=42)


In [22]:
rf_content = RandomForestClassifier(n_estimators=100, random_state=42)

rf_content.fit(X_train_content, y_train_content)

y_pred_rf = rf_content.predict(X_test_content)
accuracy_rf = accuracy_score(y_test_content, y_pred_rf)
print(f'Random Forest - Accuracy: {accuracy_rf:.2f}')

Random Forest - Accuracy: 0.61


With this second try we see that the columns age and way travel, that were synthetically created really influence the model. 

When creating this columns I encountered an "ethical" problem. The ones that were used here are imbalanced, but not so much so they look "real".

I used a dataset where they were really imbalanced and it gave me a result of 0.95 of accuracy. But if I chose this dataset I believe I will be doing it for the model, not for the product. In reality the real dataset will be varied and not so imbalanced, more closer to what we have in the Option 1. Eventhough the accuracy is not perfect in this option, I think is pretty decent taking into account that the dataset was entirely created by myself.  

***

# Models: Option 3 - using age, way travel, sentiment, destination and rating and deleting La Palma from destinations

In [37]:
df_reduced = df.copy()

In [38]:
# I realized the destinations were very imbalanced:
df_reduced["destination_encoded"].value_counts()

destination_encoded
4    1272
5    1250
1     875
2     373
3      28
Name: count, dtype: int64

In [41]:
df_reduced = df_reduced[df_reduced["destination_encoded"] != 3]

In [42]:
df_reduced["destination_encoded"].value_counts()

destination_encoded
4    1272
5    1250
1     875
2     373
Name: count, dtype: int64

**_Random Forest_**

In [45]:
X_content = df_reduced[["age_encoded", "way_travel_encoded", "sentiment_encoded", "destination_encoded", "rating"]]
y_content = df_reduced["cat_sites_reduced_encoded"]

X_train_content, X_test_content, y_train_content, y_test_content = train_test_split(X_content, y_content, test_size=0.2, random_state=42)


In [52]:
rf_content = RandomForestClassifier(n_estimators=100, random_state=42)

rf_content.fit(X_train_content, y_train_content)

y_pred_rf = rf_content.predict(X_test_content)
accuracy_rf = accuracy_score(y_test_content, y_pred_rf)
print(f'Random Forest - Accuracy: {accuracy_rf:.3f}') 

# Slightly better but not a lot of change, so let's keep La Palma. 

Random Forest - Accuracy: 0.778


In [None]:
# deleting La Palma didn't make much difference so I stayed with option 1.