# Final project case: building a personalised tourism recommender: Building the recommender

# Loading the libraries

In [4]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Loading the data

In [6]:
# Load the pre-encoded sites dataset from the Excel file into a DataFrame.
df = pd.read_excel("data_final_encoded.xlsx")

In [7]:
# Preview the first rows to inspect the available columns.
df.head()

Unnamed: 0.1,Unnamed: 0,name,average_sentiment,cat_detailed,cat_reduced,latitud,longitud,address,area,age,way_travel,rating,sentiment_encoded,destination_encoded,cat_sites_reduced_encoded,age_encoded,way_encoded
0,0,Teatro Flamenco Madrid,history_freak,theater,"experiences, cultural centers, theaters and music",40.423258,-3.704502,"C. del Pez, 10, 28004 Madrid, España",Madrid,adult,in couple,4.7,4,1,4,1,1
1,1,Urban Safari,adventurous,experience,"experiences, cultural centers, theaters and music",40.4043,-3.69467,"Calle de las Delicias, 9, 28045 Madrid, España",Madrid,old,in couple,4.9,1,1,4,2,1
2,2,Museo Gran Via 15,artsy,museum,"historic building, museums and archaeological ...",40.41975,-3.700222,"C/ Gran Vía, 15, Local, 28013 Madrid, España",Madrid,adult,in family,4.4,0,1,1,1,3
3,3,Parque Princesa Leonor,relax,park,"town, parks or lookouts",40.488108,-3.619405,"F9RJ+4F, 28055 Madrid, España",Madrid,young,in couple,4.4,2,1,2,0,1
4,4,Parque Juan Pablo II,relax,park,"town, parks or lookouts",40.454777,-3.626851,"Av. de Machupichu, 1, 28043 Madrid, España",Madrid,young,in couple,4.5,2,1,2,0,1


#### Encoding values

In [8]:
# Row counts per label in `average_sentiment`.
df["average_sentiment"].value_counts()

average_sentiment
history_freak    1297
curious          1011
relax             859
artsy             442
adventurous       189
Name: count, dtype: int64

In [9]:
# Row counts per label in `age`.
df["age"].value_counts()

age
adult    1685
young    1510
old       603
Name: count, dtype: int64

In [10]:
# Row counts per label in `way_travel`.
df["way_travel"].value_counts()

way_travel
in couple    1709
in family    1496
in group      492
alone         101
Name: count, dtype: int64

In [11]:
# Row counts per site category in `cat_reduced` (the label behind the model target).
df["cat_reduced"].value_counts()

cat_reduced
historic building, museums and archaeological rests    1076
town, parks or lookouts                                 585
points of interest, like squares and sculptures         520
experiences, cultural centers, theaters and music       511
religious                                               367
contemporary buildings and art galleries                303
route and urban routes                                  271
food, music, seaside, sport and others                  165
Name: count, dtype: int64

In [12]:
# Row counts per destination in `area`.
df["area"].value_counts()

area
Euskadi      1272
Barcelona    1250
Madrid        875
Vigo          373
La Palma       28
Name: count, dtype: int64

In [46]:
# This is how the categorical columns were previously encoded for the model.
# Each mapping goes from the text label to the integer stored in the
# corresponding `*_encoded` column of the dataset.
# NOTE: `user_age` and `user_way` are reused as plain integer variables in the
# example cell, which overwrites these dictionaries once that cell has run.

# Sentiment label -> code (0-based).
sentiment = {
    label: code
    for code, label in enumerate(
        ['artsy', 'adventurous', 'relax', 'curious', 'history_freak']
    )
}

# Age group -> code (0-based).
user_age = {
    label: code
    for code, label in enumerate(['young', 'adult', 'old'])
}

# Way of travelling -> code (0-based).
user_way = {
    label: code
    for code, label in enumerate(['alone', 'in couple', 'in group', 'in family'])
}

# Reduced site category -> code (1-based).
cat_sites_reduced_encoded = {
    label: code
    for code, label in enumerate(
        [
            'historic building, museums and archaeological rests',
            'town, parks or lookouts',
            'points of interest, like squares and sculptures',
            'experiences, cultural centers, theaters and music',
            'religious',
            'contemporary buildings and art galleries',
            'route and urban routes',
            'food, music, seaside, sport and others',
        ],
        start=1,
    )
}

# Destination -> code (1-based).
destination_encoded = {
    label: code
    for code, label in enumerate(
        ["Madrid", "Vigo", "La Palma", "Euskadi", "Barcelona"],
        start=1,
    )
}

In [13]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0                   0
name                         0
average_sentiment            0
cat_detailed                 0
cat_reduced                  0
latitud                      0
longitud                     0
address                      0
area                         0
age                          0
way_travel                   0
rating                       0
sentiment_encoded            0
destination_encoded          0
cat_sites_reduced_encoded    0
age_encoded                  0
way_encoded                  0
dtype: int64

The model works as follows: given the user's profile and travel destination, it predicts which category of sites suits the user best; the recommender then suggests the three best-rated sites of that category at that destination.

# THE MODEL I CHOSE

**_Random Forest_**

In [14]:
# Feature columns (encoded attributes plus rating) and the target column.
X_content = df.loc[:, ["age_encoded", "way_encoded", "sentiment_encoded", "destination_encoded", "rating"]]
y_content = df.loc[:, "cat_sites_reduced_encoded"]

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
(
    X_train_content,
    X_test_content,
    y_train_content,
    y_test_content,
) = train_test_split(
    X_content,
    y_content,
    test_size=0.2,
    random_state=42,
)

# Build and train the forest in one expression (fit() returns the estimator).
rf_content = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_content,
    y_train_content,
)

# Accuracy on the held-out rows.
y_pred_rf = rf_content.predict(X_test_content)
accuracy_rf = accuracy_score(y_test_content, y_pred_rf)
print(f'Random Forest - Accuracy: {accuracy_rf:.2f}')

***

# Recommender

In [18]:
# NOTE(review): this re-creates and re-fits the forest with the same
# hyperparameters, seed and training split as the model cell above, so it
# looks redundant — confirm before removing.
rf_content = RandomForestClassifier(n_estimators=100, random_state=42)
rf_content.fit(X_train_content, y_train_content)

# Inverse dictionaries: integer code -> text label.

# Age group (codes start at 0).
user_age_decoded = dict(enumerate(['young', 'adult', 'old']))

# Way of travelling (codes start at 0).
user_way_decoded = dict(enumerate(['alone', 'in couple', 'in group', 'in family']))

# Sentiment (codes start at 0).
sentiment_decoded = dict(
    enumerate(['artsy', 'adventurous', 'relax', 'curious', 'history_freak'])
)

# Reduced site category (codes start at 1).
cat_sites_decoded = dict(
    enumerate(
        [
            'historic building, museums and archaeological rests',
            'town, parks or lookouts',
            'points of interest, like squares and sculptures',
            'experiences, cultural centers, theaters and music',
            'religious',
            'contemporary buildings and art galleries',
            'route and urban routes',
            'food, music, seaside, sport and others',
        ],
        start=1,
    )
)

# Destination (codes start at 1).
destination_decoded = dict(
    enumerate(["Madrid", "Vigo", "La Palma", "Euskadi", "Barcelona"], start=1)
)

In [23]:
# Function to obtain the best category of the user
def get_user_category(user_age, user_way, user_sentiment, user_destination, rat):
    """Predict the site category label for one user profile.

    Uses the module-level ``rf_content`` model for the prediction and the
    module-level ``cat_sites_decoded`` dictionary to turn the predicted code
    into its text label.

    Parameters
    ----------
    user_age, user_way, user_sentiment, user_destination
        Values for the ``age_encoded``, ``way_encoded``, ``sentiment_encoded``
        and ``destination_encoded`` features.
    rat
        Value for the ``rating`` feature.

    Returns
    -------
    The entry of ``cat_sites_decoded`` for the predicted category code.
    """
    # One-row frame; keys are the feature column names, in the same order as
    # the columns selected for the training matrix.
    profile = {
        "age_encoded": user_age,
        "way_encoded": user_way,
        'sentiment_encoded': user_sentiment,
        'destination_encoded': user_destination,
        'rating': rat,
    }
    profile_row = pd.DataFrame([profile])

    predicted_code = rf_content.predict(profile_row)[0]
    return cat_sites_decoded[predicted_code]



In [28]:
def get_top_three_sites_by_category(df, user_category, user_destination):
    """Return the names of up to three best-rated sites for a category and destination.

    Parameters
    ----------
    df : pandas.DataFrame
        Sites table; must contain the columns ``cat_reduced``,
        ``destination_encoded``, ``rating`` and ``name``.
    user_category : str
        Category label compared against ``cat_reduced``.
    user_destination : int
        Destination code compared against ``destination_encoded``.

    Returns
    -------
    list
        Distinct site names ordered from best to worst rating. The list has
        fewer than three entries (possibly none) when fewer sites match.
    """
    # Keep only the sites of this category at this destination.
    matches = (df['cat_reduced'] == user_category) & (df['destination_encoded'] == user_destination)

    # Order sites by rating, from best to worst.
    ranked_sites = df.loc[matches].sort_values(by='rating', ascending=False)

    # A site can occupy several rows; keep only its best-rated row so the same
    # name is never recommended more than once.
    unique_sites = ranked_sites.drop_duplicates(subset='name', keep='first')

    # Take the three sites with the best rating.
    return unique_sites['name'].head(3).tolist()

In [33]:
# Example: a young, "relax" traveller visiting Barcelona with family.

user_sentiment = 2  # "relax"
user_age = 0 # "young"
user_way = 3     # Code of 'in family'
user_destination = 5 # Code of 'Barcelona' (Madrid would be 1)
rat = 4  # value passed to the model as the `rating` feature

user_category = get_user_category(user_age, user_way, user_sentiment, user_destination, rat)
print("Category more adequate to user:", user_category)

top_sites_names = get_top_three_sites_by_category(df, user_category, user_destination)
print("Best places to visit:")
print(top_sites_names)

Category more adequate to user: town, parks or lookouts
Best places to visit:
['Parc del Auditoris', 'Jardins de Ramon Margalef i López', "Torre vigilància Santa Creu D'Olorda"]


In [None]:
# This is all the code together to export it to Streamlit
# Load the pre-encoded sites dataset.
df = pd.read_excel("data_final_encoded.xlsx")

# Feature columns and target column for the category classifier.
X_content = df[["age_encoded", "way_encoded", "sentiment_encoded", "destination_encoded", "rating"]]
y_content = df["cat_sites_reduced_encoded"]

# Train test split (20% held out, fixed seed so the split is repeatable)
X_train_content, X_test_content, y_train_content, y_test_content = train_test_split(X_content, y_content, test_size=0.2, random_state=42)

rf_content = RandomForestClassifier(n_estimators=100, random_state=42)

# Model training. The forest is fitted once: with a fixed random_state, fitting
# a second identical estimator on the same split would only repeat the work.
rf_content.fit(X_train_content, y_train_content)

# Accuracy on the held-out rows.
y_pred_rf = rf_content.predict(X_test_content)
accuracy_rf = accuracy_score(y_test_content, y_pred_rf)
print(f'Random Forest - Accuracy: {accuracy_rf:.2f}')

# -----

# Inverse dictionaries: integer code -> text label.

# Age group (codes start at 0).
user_age_decoded = dict(enumerate(['young', 'adult', 'old']))

# Way of travelling (codes start at 0).
user_way_decoded = dict(enumerate(['alone', 'in couple', 'in group', 'in family']))

# Sentiment (codes start at 0).
sentiment_decoded = dict(
    enumerate(['artsy', 'adventurous', 'relax', 'curious', 'history_freak'])
)

# Reduced site category (codes start at 1).
cat_sites_decoded = dict(
    enumerate(
        [
            'historic building, museums and archaeological rests',
            'town, parks or lookouts',
            'points of interest, like squares and sculptures',
            'experiences, cultural centers, theaters and music',
            'religious',
            'contemporary buildings and art galleries',
            'route and urban routes',
            'food, music, seaside, sport and others',
        ],
        start=1,
    )
)

# Destination (codes start at 1).
destination_decoded = dict(
    enumerate(["Madrid", "Vigo", "La Palma", "Euskadi", "Barcelona"], start=1)
)

# -----

# Function to obtain the best category of the user
def get_user_category(user_age, user_way, user_sentiment, user_destination, rat):
    """Predict the site category label for one user profile.

    Uses the module-level ``rf_content`` model for the prediction and the
    module-level ``cat_sites_decoded`` dictionary to turn the predicted code
    into its text label.

    Parameters
    ----------
    user_age, user_way, user_sentiment, user_destination
        Values for the ``age_encoded``, ``way_encoded``, ``sentiment_encoded``
        and ``destination_encoded`` features.
    rat
        Value for the ``rating`` feature.

    Returns
    -------
    The entry of ``cat_sites_decoded`` for the predicted category code.
    """
    # One-row frame; keys are the feature column names, in the same order as
    # the columns selected for the training matrix.
    profile = {
        "age_encoded": user_age,
        "way_encoded": user_way,
        'sentiment_encoded': user_sentiment,
        'destination_encoded': user_destination,
        'rating': rat,
    }
    profile_row = pd.DataFrame([profile])

    predicted_code = rf_content.predict(profile_row)[0]
    return cat_sites_decoded[predicted_code]

# ------

def get_top_three_sites_by_category(df, user_category, user_destination):
    """Return the names of up to three best-rated sites for a category and destination.

    Parameters
    ----------
    df : pandas.DataFrame
        Sites table; must contain the columns ``cat_reduced``,
        ``destination_encoded``, ``rating`` and ``name``.
    user_category : str
        Category label compared against ``cat_reduced``.
    user_destination : int
        Destination code compared against ``destination_encoded``.

    Returns
    -------
    list
        Distinct site names ordered from best to worst rating. The list has
        fewer than three entries (possibly none) when fewer sites match.
    """
    # Keep only the sites of this category at this destination.
    matches = (df['cat_reduced'] == user_category) & (df['destination_encoded'] == user_destination)

    # Order sites by rating, from best to worst.
    ranked_sites = df.loc[matches].sort_values(by='rating', ascending=False)

    # A site can occupy several rows; keep only its best-rated row so the same
    # name is never recommended more than once.
    unique_sites = ranked_sites.drop_duplicates(subset='name', keep='first')

    # Take the three sites with the best rating.
    return unique_sites['name'].head(3).tolist()

# -----

# Example: a young, "relax" traveller visiting Barcelona with family.

user_sentiment = 2  # "relax"
user_age = 0 # "young"
user_way = 3     # Code of 'in family'
user_destination = 5 # Code of 'Barcelona' (Madrid would be 1)
rat = 4  # value passed to the model as the `rating` feature

user_category = get_user_category(user_age, user_way, user_sentiment, user_destination, rat)
print("Category more adequate to user:", user_category)

top_sites_names = get_top_three_sites_by_category(df, user_category, user_destination)
print("Best places to visit:")
print(top_sites_names)