## Imports

In [None]:
!pip install gensim

In [None]:
!pip install pyLDAvis

In [None]:
!pip install datasets

In [None]:
# Workaround for the odd gensim error: reinstall scipy from scratch (bypassing the pip cache)
!pip uninstall -y scipy
!pip install --no-cache-dir scipy

In [None]:
import os
import re
import spacy
import html
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import random
import pyLDAvis
import pyLDAvis.gensim
import gensim.downloader

from math import sqrt
from datasets import Dataset
from wordcloud import WordCloud
from transformers import pipeline
from tqdm import tqdm
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from transformers import BertTokenizer, BertModel
from datasets import Dataset
from transformers import pipeline
from gensim import corpora, models
from gensim.models import KeyedVectors, LdaModel
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

## Data loading

In [None]:
# Read the semicolon-separated board-game catalogue and preview its first rows.
filename = "boardgames_3000.csv"
games = pd.read_csv(filename, sep=";")
games.head()

## Task 1: Text Preprocessing and vectorization

### Text Preprocessing

In [None]:
# Load English spaCy model
nlp = spacy.load("en_core_web_sm")

# Extra stopwords on top of spaCy's list: generic board-game vocabulary, plus
# "quot" and "s" (presumably residue left when clean_text's character filter strips
# the punctuation from text such as &quot; or 's).
# NOTE(review): the hyphenated entries ("player-based", "turn-based", ...) can never
# match, because clean_text replaces every non-letter with a space and keeps only
# alphabetic tokens before comparing lemmas against this set.
custom_stopwords = {
    "game", "s", "quot", "play", "player", "move", "turn", "win", "lose", "score", "board",
    "counter", "pawns", "roll", "flip", "stack",
    "rule", "round", "phase", "turns", "player-based",
    "moveable", "drafting", "placement", "point", "setup", "victory", "gameplay",
    "scenario", "objectives", "teams", "player-controlled", "control", "goal", "objective",
    "interaction", "turn-based", "multiplayer", "theme", "mechanic", "design", "playstyle", "elements", "feature", "mode",
    "type", "system", "level", "expansion", "variant", "edition", "version", "add-on", "power-up",
    "ability", "progression", "ruleset", "collection",
    "choice", "replayability", "scoring", "movement"
}

def clean_text(text):
    """Normalize one raw game description into a space-joined string of lemmas.

    Steps: decode HTML entities, lowercase, strip HTML tags, replace every
    character that is not a-z or whitespace with a space, collapse whitespace,
    then run spaCy and keep the lemma of each alphabetic, non-stopword token
    whose lemma is not in ``custom_stopwords``.

    Args:
        text: Raw description. Missing values (None / NaN) are accepted.

    Returns:
        The cleaned description, or "" when ``text`` is missing.
    """
    if pd.isna(text):
        return ""

    # Decode entities (&quot;, &amp;, &#10;, ...) BEFORE filtering characters.
    # Otherwise only "&" and ";" are stripped and the entity name survives as a
    # junk token ("quot", "amp", "nbsp", ...). str() also tolerates non-string cells.
    text = html.unescape(str(text))

    # Lowercase
    text = text.lower()

    # Remove HTML tags
    text = re.sub(r"<.*?>", " ", text)

    # Remove special characters and numbers
    text = re.sub(r"[^a-z\s]", " ", text)

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # Apply spaCy NLP processing
    doc = nlp(text)

    # Keep lemmas of alphabetic tokens that are neither spaCy stopwords nor custom ones
    tokens = [
        token.lemma_ for token in doc
        if not token.is_stop and token.is_alpha and token.lemma_ not in custom_stopwords
    ]

    return " ".join(tokens)

In [None]:
# Build the "clean_description" column by running clean_text over every raw description
games["clean_description"] = games["description"].apply(clean_text)

In [None]:
# Sanity check: print the raw and the cleaned text of the row labelled 0
print("Original:\n", games["description"][0])
print("Cleaned:\n", games["clean_description"][0])

#### Save clean dataset (if desired)

In [None]:
# Persist the frame (including clean_description) as a semicolon-separated UTF-8 CSV
games.to_csv("boardgames_3000_clean.csv", index=False, encoding="utf-8", sep=";")

### Vectorization

#### BoW and TF-IDF

In [None]:
# From here on we mainly work with the "clean_description" column.
# First step: split every cleaned description into its whitespace-separated tokens.
tokenized_descr = [text.split() for text in games["clean_description"]]

In [None]:
# Vocabulary (token <-> integer id) built from the tokenized descriptions
dictionary = corpora.Dictionary(tokenized_descr)

# Bag-of-words corpus: one list of (token_id, count) pairs per description
bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_descr]

# TF-IDF model fitted on the BoW corpus, then applied lazily to that same corpus
tfidf = models.TfidfModel(bow_corpus)
tfidf_corpus = tfidf[bow_corpus]

Now we would like to obtain some insights from our data using the TF-IDF corpus. However, we cannot use it as it is: `tfidf_corpus` is a sparse corpus matrix, not a dense NumPy matrix.

Gensim is optimized for streaming large corpora and if we wanted to do some visualizations, we would need to convert it to dense form.

In [None]:
# Matrix dimensions: one row per document, one column per dictionary term
num_docs, num_terms = len(tfidf_corpus), len(dictionary)

# Dense documents x terms matrix, zero wherever a term is absent from a document
X_tfidf_gensim = np.zeros((num_docs, num_terms))

# Copy each (term_id, weight) pair of the sparse corpus into its cell
for row, weighted_terms in enumerate(tfidf_corpus):
    for col, weight in weighted_terms:
        X_tfidf_gensim[row, col] = weight

In [None]:
# Mean TF-IDF weight of every term over all documents
avg_tfidf_scores = X_tfidf_gensim.mean(axis=0)

# Column indices of the top_n best-scoring terms, highest first
top_n = 25
top_term_indices = np.argsort(avg_tfidf_scores)[::-1][:top_n]

# Human-readable terms for those indices
terms = [dictionary[term_id] for term_id in top_term_indices]

# Bar chart of the winners
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(range(top_n), avg_tfidf_scores[top_term_indices], tick_label=terms)
ax.set_xlabel('Term')
ax.set_ylabel('Average TF-IDF Score')
ax.set_title('Top Terms by TF-IDF Score')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Summed TF-IDF weight of each term over the whole corpus
tfidf_term_scores = X_tfidf_gensim.sum(axis=0)

# term -> total score, the "frequency" table the word cloud is drawn from
word_freq = {dictionary[term_id]: tfidf_term_scores[term_id] for term_id in range(num_terms)}

# Build the cloud from those weights
cloud_options = dict(
    width=800,
    height=400,
    background_color='white',
    colormap='viridis',
    max_words=200,
)
wordcloud = WordCloud(**cloud_options).generate_from_frequencies(word_freq)

# Render it without axes
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('TF-IDF Word Cloud')
plt.show()

In [None]:
# TSNE and PCA are already imported in the notebook's import cell.

# Dimensionality reduction: PCA to 50 components first (cheaper and less noisy for
# t-SNE), then t-SNE down to 2-D. Both steps are seeded: for a matrix of this shape
# PCA's automatic solver choice can be the randomized SVD, so without random_state
# the projection (and therefore the plot) changes from run to run.
X_reduced = PCA(n_components=50, random_state=42).fit_transform(X_tfidf_gensim)
X_embedded = TSNE(n_components=2, perplexity=30, random_state=42).fit_transform(X_reduced)

# Plot
plt.figure(figsize=(10, 8))
plt.scatter(X_embedded[:, 0], X_embedded[:, 1], alpha=0.6)
plt.title("t-SNE of Game Descriptions (TF-IDF)")
plt.xlabel("Dim 1")
plt.ylabel("Dim 2")
plt.grid(True)
plt.show()

#### GloVe and Doc2Vec

In [None]:
# Load pretrained GloVe vectors ("glove-wiki-gigaword-100") through gensim's downloader
glove_model = gensim.downloader.load("glove-wiki-gigaword-100")

def get_glove_avg_vector(doc):
    """Average the GloVe vectors of the whitespace-separated words in ``doc``.

    Words missing from the module-level ``glove_model`` are skipped. When no word
    is known, a zero vector of length ``glove_model.vector_size`` is returned.
    """
    known_vectors = [glove_model[token] for token in doc.split() if token in glove_model]
    if not known_vectors:
        return np.zeros(glove_model.vector_size)
    return np.mean(known_vectors, axis=0)

# Embed every cleaned description and stack the per-document vectors into one 2-D array
X_glove = np.vstack(games["clean_description"].apply(get_glove_avg_vector))

In [None]:
def load_glove_embeddings(filepath="glove.6B.100d.txt"):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-empty line is expected to hold a word followed by its
    whitespace-separated float components.

    Args:
        filepath: Path to the GloVe ``.txt`` file (UTF-8).

    Returns:
        dict mapping each word to a float32 NumPy vector.
    """
    embeddings = {}
    with open(filepath, "r", encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

def get_glove_vector(text, glove, dim=100):
    """Return the mean GloVe vector of the words in ``text``.

    Args:
        text: Whitespace-tokenizable string. Non-string values (None, or the NaN
            pandas yields for an empty CSV field) are treated as empty text.
        glove: Mapping from word to vector.
        dim: Length of the zero vector returned when no word is found.

    Returns:
        The element-wise mean of the known words' vectors, or ``np.zeros(dim)``
        when ``text`` is missing or contains no known word.
    """
    if not isinstance(text, str):
        # A missing description cannot be split; use the "no known words" result.
        return np.zeros(dim)
    vectors = [glove[word] for word in text.split() if word in glove]
    return np.mean(vectors, axis=0) if vectors else np.zeros(dim)

In [None]:
# Load the 100-d GloVe text file and store one averaged vector per game in "glove_vector"
glove = load_glove_embeddings("glove.6B.100d.txt")
games["glove_vector"] = games["clean_description"].apply(lambda x: get_glove_vector(x, glove))

In [None]:
# Dimensionality check: length of the vector stored for the row labelled 0
len(games["glove_vector"][0])

In [None]:
# Full PCA (all components kept) on the GloVe document vectors
pca_full = PCA()
X_glove_pca_full = pca_full.fit_transform(X_glove)

# Per-component and running-total share of explained variance
explained_variance = pca_full.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# Cumulative curve with the 90% threshold marked
fig, ax = plt.subplots(figsize=(10, 6))
component_counts = range(1, len(cumulative_variance) + 1)
ax.plot(component_counts, cumulative_variance, marker='o')
ax.set_xlabel("Number of Components")
ax.set_ylabel("Cumulative Explained Variance")
ax.set_title("PCA Explained Variance Curve (GloVe)")
ax.grid(True)
ax.axhline(y=0.90, color="red", linestyle="--", label="90% Variance")
ax.legend()
fig.tight_layout()
plt.show()

# First component count whose cumulative variance reaches 90% (argmax finds the first True)
n_components_90 = np.argmax(cumulative_variance >= 0.90) + 1
print(f"Number of components to explain 90% variance: {n_components_90}")

In [None]:
# Project the GloVe document vectors onto their first two principal components
pca = PCA(n_components=2)
X_glove_pca = pca.fit_transform(X_glove)

# Scatter of the 2-D projection
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_glove_pca[:, 0], X_glove_pca[:, 1], alpha=0.6)
ax.set_title("PCA of GloVe Document Embeddings")
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.grid(True)
fig.tight_layout()
plt.show()

# Share of the total variance kept by the two components
explained_var = pca.explained_variance_ratio_.sum()
print(f"Explained Variance (first 2 components): {explained_var:.2%}")

In [None]:
# One TaggedDocument per game; the tag is the row position rendered as a string
documents = [
    TaggedDocument(words=description.split(), tags=[str(position)])
    for position, description in enumerate(games["clean_description"])
]

# Fit Doc2Vec on the tagged corpus
doc2vec_model = Doc2Vec(documents, vector_size=100, window=5, min_count=2, workers=4, epochs=40)

# Collect the learned document vectors in corpus order
X_doc2vec = np.array([doc2vec_model.dv[str(position)] for position in range(len(documents))])

In [None]:
# Run t-SNE on the Doc2Vec vectors.
# The iteration count is left at the library default (1000, the value previously
# passed explicitly): newer scikit-learn releases rename `n_iter` to `max_iter`,
# so omitting the keyword keeps this cell working on both old and new versions.
tsne = TSNE(n_components=2, perplexity=30, learning_rate=200, random_state=42)
X_doc2vec_tsne = tsne.fit_transform(X_doc2vec)

# Plot
plt.figure(figsize=(10, 6))
plt.scatter(X_doc2vec_tsne[:, 0], X_doc2vec_tsne[:, 1], alpha=0.6)
plt.title("t-SNE of Doc2Vec Document Embeddings")
plt.xlabel("Dim 1")
plt.ylabel("Dim 2")
plt.grid(True)
plt.tight_layout()
plt.show()

#### LDA

In [None]:
# Cleaned descriptions as a plain list of strings.
# NOTE(review): the LDA cells that follow use bow_corpus / tokenized_descr, not this
# variable — confirm whether it is still needed.
corpus = games['clean_description'].tolist()

In [None]:
# Try different numbers of topics, keeping every fitted model and its c_v coherence.
# The list is named `lda_models` rather than `models` so that it does not overwrite
# the `gensim.models` module imported at the top of the notebook, which the TF-IDF
# cell (`models.TfidfModel`) needs when it is re-run.
topic_counts = range(5, 41, 5)
coherence_scores = []
lda_models = []
for k in topic_counts:
    model = LdaModel(corpus=bow_corpus, id2word=dictionary, num_topics=k, passes=15, random_state=42)
    coherence = CoherenceModel(model=model, texts=tokenized_descr, dictionary=dictionary, coherence='c_v').get_coherence()
    coherence_scores.append(coherence)
    lda_models.append(model)

# Plot coherence vs. topic count
plt.plot(topic_counts, coherence_scores)
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score (c_v)")
plt.title("Coherence vs Number of Topics (Gensim LDA)")
plt.grid(True)
plt.show()

# Select the model with the highest coherence and save it to disk
best_idx = int(np.argmax(coherence_scores))
best_k = topic_counts[best_idx]
best_model = lda_models[best_idx]
best_model.save("lda_best_model.gensim")

In [None]:
# Top 10 words of every topic in the selected model (probabilities dropped)
for topic_id, word_probs in best_model.show_topics(num_topics=best_k, num_words=10, formatted=False):
    top_words = [word for word, _ in word_probs]
    print(f"Topic {topic_id}: {top_words}")

In [None]:
# Topic-word matrix returned by the model's get_topics(), as a NumPy array
beta_matrix = np.array(best_model.get_topics())
print("Beta shape:", beta_matrix.shape)

In [None]:
# Vocabulary in id order, used as column labels
words = [dictionary[token_id] for token_id in range(len(dictionary))]

# Labelled view of the beta matrix: one row per topic, one column per word
topic_labels = [f"Topic {i}" for i in range(best_k)]
beta_df = pd.DataFrame(beta_matrix, columns=words, index=topic_labels)

# Ten most probable words of each topic, with their probabilities
for topic_id in range(best_k):
    ranked_words = beta_df.iloc[topic_id].sort_values(ascending=False)
    print(f"\nTopic {topic_id}:")
    print(ranked_words.head(10))

In [None]:
best_model.show_topics(num_topics=-1, num_words=10, log=False, formatted=True)

# maybe use this instead, since it is more compact

In [None]:
# Document-topic matrix: for each document, the probability of every topic
# (minimum_probability=0 keeps topics that would otherwise be filtered out)
theta_matrix = np.array([
    [prob for _, prob in best_model.get_document_topics(bow, minimum_probability=0)]
    for bow in bow_corpus
])
print("Theta shape:", theta_matrix.shape)

In [None]:
# Dataframe for the thetas: one row per document, one "Topic i" column per topic
theta_df = pd.DataFrame(theta_matrix, columns=[f"Topic {i}" for i in range(best_k)])

# We just display the most relevant topic (the one with the highest probability).
# idxmax(axis=1) returns, per row, the label of the column holding the maximum.
theta_df["Dominant Topic"] = theta_df.idxmax(axis=1)

print(theta_df[["Dominant Topic"]])

In [None]:
# Interactive topic visualisation, also exported as a standalone HTML page.
# NOTE(review): recent pyLDAvis releases expose this helper as
# `pyLDAvis.gensim_models` — confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(best_model, bow_corpus, dictionary)
# save_html does not create missing folders, so make sure the target directory exists
os.makedirs("assets", exist_ok=True)
pyLDAvis.save_html(vis, "assets/lda_vis.html")
vis

#### LDA with higher coherence values (+0.02)

In [None]:
def lemmatize_nouns_adjs(text):
    """Return the lemmas of the nouns and adjectives found in ``text``.

    Only alphabetic, non-stopword tokens tagged NOUN or ADJ by spaCy are kept.

    Args:
        text: Raw description. Missing values (None / NaN) are accepted.

    Returns:
        List of lemma strings; empty when ``text`` is missing.
    """
    if pd.isna(text):
        # spaCy cannot parse a missing value; treat it as an empty document.
        return []
    doc = nlp(text)
    return [
        token.lemma_ for token in doc
        if token.pos_ in {"NOUN", "ADJ"} and not token.is_stop and token.is_alpha
    ]

# Re-tokenize from the RAW descriptions (nouns/adjectives only); this replaces the
# tokenized_descr built earlier from clean_description.
tokenized_descr = games["description"].apply(lemmatize_nouns_adjs).tolist()

In [None]:
# Rebuild the vocabulary and BoW corpus from the noun/adjective tokens (these replace
# the earlier dictionary / bow_corpus).
dictionary = corpora.Dictionary(tokenized_descr)
# Prune the vocabulary: drop tokens found in fewer than 5 documents or in more than 50% of them
dictionary.filter_extremes(no_below=5, no_above=0.5)
bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_descr]

In [None]:
# Same topic-count sweep as before, on the filtered noun/adjective corpus, 20 passes.
# NOTE(review): `models` shadows the `gensim.models` module imported at the top of the
# notebook; it is kept here because the next cell reads the list under this name.
coherence_scores, models = [], []

for k in range(5, 41, 5):
    model = LdaModel(corpus=bow_corpus, id2word=dictionary, num_topics=k, passes=20, random_state=42)
    cm = CoherenceModel(model=model, texts=tokenized_descr, dictionary=dictionary, coherence='c_v')
    coherence_scores.append(cm.get_coherence())
    models.append(model)

# Coherence as a function of the number of topics
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(range(5, 41, 5), coherence_scores, marker="o")
ax.set_xlabel("Number of Topics")
ax.set_ylabel("Coherence Score (c_v)")
ax.set_title("Improved LDA: Coherence vs Number of Topics")
ax.grid(True)
plt.show()

In [None]:
# Pick the sweep entry with the highest coherence
best_index = np.argmax(coherence_scores)
best_k = range(5, 41, 5)[best_index]
best_model = models[best_index]

# Top 10 words per topic of the selected model
for idx, topic in best_model.show_topics(num_topics=best_k, num_words=10, formatted=False):
    top_words = [word for word, _ in topic]
    print(f"Topic {idx}: {top_words}")

In [None]:
# Topic-word matrix of the improved model, as a NumPy array
beta_matrix = np.array(best_model.get_topics())
print("Beta shape:", beta_matrix.shape)

In [None]:
# Vocabulary in id order, used as column labels
words = [dictionary[token_id] for token_id in range(len(dictionary))]

# Labelled view of the beta matrix: one row per topic, one column per word
topic_labels = [f"Topic {i}" for i in range(best_k)]
beta_df = pd.DataFrame(beta_matrix, columns=words, index=topic_labels)

# Ten most probable words of each topic, with their probabilities
for topic_id in range(best_k):
    ranked_words = beta_df.iloc[topic_id].sort_values(ascending=False)
    print(f"\nTopic {topic_id}:")
    print(ranked_words.head(10))

In [None]:
best_model.show_topics(num_topics=-1, num_words=10, log=False, formatted=True)

# maybe use this instead, since it is more compact

In [None]:
# Document-topic matrix of the improved model: per document, the probability of every
# topic (minimum_probability=0 keeps topics that would otherwise be filtered out)
theta_matrix = np.array([
    [prob for _, prob in best_model.get_document_topics(bow, minimum_probability=0)]
    for bow in bow_corpus
])
print("Theta shape:", theta_matrix.shape)

In [None]:
# Dataframe for the thetas: one row per document, one "Topic i" column per topic
theta_df = pd.DataFrame(theta_matrix, columns=[f"Topic {i}" for i in range(best_k)])

# We just display the most relevant topic (the one with the highest probability).
# idxmax(axis=1) returns, per row, the label of the column holding the maximum.
theta_df["Dominant Topic"] = theta_df.idxmax(axis=1)

print(theta_df[["Dominant Topic"]])

In [None]:
# Interactive topic visualisation of the improved model (displayed only, not saved)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(best_model, bow_corpus, dictionary)
vis

 ## Sentiment Analysis

In this case, we will be using a zero-shot model from Facebook. We need to be careful when using these models because:

*   They are trained on natural language, not preprocessed or lemmatized text.
*   They expect raw sentences to understand semantic tone.
*   They perform better when given more expressive and emotional content, which can get lost when lemmatizing everything.

Because of this, we need to preprocess again the game descriptions and use that new column for using the model that we have selected.

There is an important aspect to consider here, which is that zero-shot classification with large transformer models is computationally expensive and slow when done sequentially.

To optimize performance, we convert our data to a Hugging Face Dataset object, which allows us to apply classification in efficient batches. Batching not only reduces runtime significantly, but also allows the model to process multiple descriptions in parallel without affecting the accuracy of individual predictions.

As a result of the classification process, we introduced two new variables into our dataset: `predicted_emotion` and `predicted_vibe`. The former reflects the emotional or narrative mood of the game (e.g., dark, funny, nostalgic), while the latter characterizes the gameplay dynamics or player experience (e.g., strategic, cooperative, family-friendly).

In [None]:
# Wrap the description column in a Hugging Face Dataset so it can be mapped in batches.
# NOTE(review): the text above says the classifier needs raw, non-lemmatized sentences,
# yet this feeds it "clean_description" (the lemmatized, stopword-free column) —
# confirm which column is intended.
hf_dataset = Dataset.from_pandas(games[["clean_description"]])

# Load the zero-shot classifier on the first GPU when one is visible, otherwise on
# the CPU (device=-1), so the cell also runs on machines without CUDA
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0 if torch.cuda.is_available() else -1,
)

# Candidate labels for the two classifications
emotional_labels = ["funny", "dark", "happy", "serious", "nostalgic", "intense"]
vibe_labels = ["strategic", "cooperative", "competitive", "family-friendly", "chaotic", "educational"]

# Batched classification function
def classify_batch(batch):
    """Label one batch of descriptions with its top emotion and top vibe.

    ``batch`` is a mapping whose "clean_description" entry is a list of texts.
    Each text is scored by the module-level ``classifier`` against
    ``emotional_labels`` and against ``vibe_labels``; the first (highest-ranked)
    label of each result is kept.

    Returns a dict with two lists, "predicted_emotion" and "predicted_vibe",
    aligned with the input texts.
    """
    texts = batch["clean_description"]

    def top_labels(candidate_labels):
        # The classifier returns one dict per text when given a list of texts
        results = classifier(texts, candidate_labels, batch_size=8)
        return [result["labels"][0] for result in results]

    return {
        "predicted_emotion": top_labels(emotional_labels),
        "predicted_vibe": top_labels(vibe_labels),
    }

In [None]:
# Run classify_batch over the dataset, 8 descriptions per call
hf_dataset = hf_dataset.map(classify_batch, batched=True, batch_size=8)

# Copy the two predicted-label columns back into the pandas frame
games["predicted_emotion"] = hf_dataset["predicted_emotion"]
games["predicted_vibe"] = hf_dataset["predicted_vibe"]

In [None]:
# Persist the frame, now including the predicted emotion/vibe columns
games.to_csv("boardgames_3000_with_dual_tone.csv", sep=";", index=False)

In [None]:
# Print the first num_examples games with a description excerpt and both predicted labels
num_examples = 100

for i in range(num_examples):
    row = games.iloc[i]
    name = row["name"]
    description = row["description"][:500]  # first 500 characters only
    emotion = row["predicted_emotion"]
    vibe = row["predicted_vibe"]

    print(
        f"Game: {name}",
        f"Description:\n{description}...",
        f"Predicted Emotion: {emotion}",
        f"Predicted Vibe: {vibe}\n",
        sep="\n",
    )

In [None]:
# One horizontal count plot per predicted-label column, most frequent label first
for col in ["predicted_emotion", "predicted_vibe"]:
    label_order = games[col].value_counts().index
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(data=games, y=col, order=label_order, ax=ax)
    ax.set_title(f"Distribution of {col}")
    fig.tight_layout()
    plt.show()

## Task 2: Machine Learning (Recommender System)

#### Load dataset and reviews

In [None]:
# Reload the games saved at the end of Task 1 (with predicted emotion / vibe)
filename = "boardgames_3000_with_dual_tone.csv"
games = pd.read_csv(filename, sep=";")
# A description that was cleaned down to "" is written as an empty CSV field and is
# read back as NaN; restore those to empty strings so the text helpers below
# (str.split, tokenizers) always receive a str.
games["clean_description"] = games["clean_description"].fillna("")
games

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Load the comma-separated user reviews
filename = "reviews.csv"
reviews = pd.read_csv(filename, sep=",")
reviews

#### Games Similarity Exploration

##### GloVe

In order to explore the similarity between games, we chose to use GloVe embeddings because these capture the meaning of words such that similar words have similar representations.

In [None]:
def load_glove_embeddings(filepath="glove.6B.100d.txt"):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-empty line is expected to hold a word followed by its
    whitespace-separated float components.

    Args:
        filepath: Path to the GloVe ``.txt`` file (UTF-8).

    Returns:
        dict mapping each word to a float32 NumPy vector.
    """
    embeddings = {}
    with open(filepath, "r", encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

def get_glove_vector(text, glove, dim=100):
    """Return the mean GloVe vector of the words in ``text``.

    Args:
        text: Whitespace-tokenizable string. Non-string values (None, or the NaN
            pandas yields for an empty CSV field) are treated as empty text.
        glove: Mapping from word to vector.
        dim: Length of the zero vector returned when no word is found.

    Returns:
        The element-wise mean of the known words' vectors, or ``np.zeros(dim)``
        when ``text`` is missing or contains no known word.
    """
    if not isinstance(text, str):
        # A missing description cannot be split; use the "no known words" result.
        return np.zeros(dim)
    vectors = [glove[word] for word in text.split() if word in glove]
    return np.mean(vectors, axis=0) if vectors else np.zeros(dim)

In [None]:
# Load the 100-d GloVe text file and (re)compute one averaged vector per game
glove = load_glove_embeddings("glove.6B.100d.txt")
games["glove_vector"] = games["clean_description"].apply(lambda x: get_glove_vector(x, glove))

In [None]:
# Dimensionality check: length of the vector stored for the row labelled 0
len(games['glove_vector'][0])

Once we had vector representations of all our games, we computed the cosine similarity between every pair of games to see how closely aligned the vectors are.

Furthermore, we proceeded by displaying a histogram that plots the distribution of the similarity values. This might be useful because it can help us understand:

*   Are most games very similar?
*   Do we have a wide range of similarity?


In [None]:
# One row per game: stack the per-game GloVe vectors into a 2-D array
glove_vectors = np.vstack(games["glove_vector"].values)

# Pairwise cosine similarity between all games
sim_matrix = cosine_similarity(glove_vectors)

# Blank out the diagonal (every game compared with itself)
np.fill_diagonal(sim_matrix, np.nan)

# Keep the non-NaN entries as a flat array
off_diagonal = ~np.isnan(sim_matrix)
similarities = sim_matrix[off_diagonal]

# Histogram of the pairwise similarities
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(similarities, bins=50, color="skyblue", edgecolor="black")
ax.set_title("Distribution of Cosine Similarities Between GloVe Embeddings")
ax.set_xlabel("Cosine Similarity")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

From this histogram, we can learn different things, some along the lines of:

1.   Most pairs have high similarity scores, grouped between 0.8 and 0.95, which may be telling us that many game descriptions use similar language or concepts.
2.   Very few pairs are below 0.6, which means that strong dissimilarities between descriptions are rare.

These results can be interpreted in different ways: if our goal were to show that the descriptions are semantically consistent, these results would confirm it.

However, if our goal is to detect meaningful clusters or differentiate games precisely, since everything is mostly 85%-95% similar, we are not getting much variation.

Because of this, we will try and enhance GloVe by trying a contextual embedding such as BERT and combining text with some more metadata.

For now, let's take a deeper look at our embeddings.


In [None]:
def get_min_max_similarity(games_df):
    """Summarise pairwise cosine similarity between the games' GloVe vectors.

    Reads the "glove_vector" column of ``games_df`` and ignores each game's
    similarity with itself.

    Returns:
        Tuple ``(min_sim, max_sim, mean_sim)`` over all off-diagonal entries.
    """
    pairwise = cosine_similarity(np.vstack(games_df["glove_vector"].values))

    # NaN on the diagonal so the nan-aware reductions skip self-similarity
    np.fill_diagonal(pairwise, np.nan)

    return np.nanmin(pairwise), np.nanmax(pairwise), np.nanmean(pairwise)

In [None]:
# Extremes and mean of the pairwise GloVe similarities (self-similarity excluded)
min_similarity, max_similarity, mean = get_min_max_similarity(games)
print(f"Min similarity: {min_similarity:.4f}, Max similarity: {max_similarity:.4f}")
print(mean)

In [None]:
def get_min_max_similarity_with_ids(games_df):
    """Pairwise GloVe cosine-similarity summary plus the ids of the extreme pairs.

    Reads the "glove_vector" and "id" columns of ``games_df``; each game's
    similarity with itself is ignored.

    Returns:
        Tuple ``(min_sim, max_sim, mean_sim, min_pair, max_pair)`` where each
        pair is a 2-tuple with the "id" values of the two games involved.
    """
    pairwise = cosine_similarity(np.vstack(games_df["glove_vector"].values))

    # NaN on the diagonal so the nan-aware reductions skip self-similarity
    np.fill_diagonal(pairwise, np.nan)

    def ids_at(flat_position):
        # Flat argmin/argmax position -> (row, col) -> the two games' ids
        row, col = np.unravel_index(flat_position, pairwise.shape)
        return (games_df.iloc[row]["id"], games_df.iloc[col]["id"])

    min_sim, max_sim, mean_sim = np.nanmin(pairwise), np.nanmax(pairwise), np.nanmean(pairwise)
    min_pair = ids_at(np.nanargmin(pairwise))
    max_pair = ids_at(np.nanargmax(pairwise))

    return min_sim, max_sim, mean_sim, min_pair, max_pair

In [None]:
# Similarity summary together with the ids of the least / most similar pair
min_sim, max_sim, mean_sim, min_pair, max_pair = get_min_max_similarity_with_ids(games)
print(f"Min similarity: {min_sim:.4f} between games {min_pair}")
print(f"Max similarity: {max_sim:.4f} between games {max_pair}")
print(f"Mean similarity: {mean_sim:.4f}")

# Translate each pair of ids into the corresponding pair of game names
min_names = tuple(games[games["id"] == game_id]["name"].values[0] for game_id in min_pair)
max_names = tuple(games[games["id"] == game_id]["name"].values[0] for game_id in max_pair)
print(f"Min similarity between: {min_names}")
print(f"Max similarity between: {max_names}")

##### GloVe + Metadata

Now that we have explored similarity using only text (via GloVe), we can try and add  metadata features to enrich our similarity analysis.

We selected some additional features that might influence how games relate to each other, such as the number of players, game complexity...

In [None]:
# Numeric metadata columns to append to the text embedding
extra_features = ["year", "rating", "complexity", "minplayers", "maxplayers", "playingtime", "minage"]

# Initialize the scaler (rescales each column to the [0, 1] range)
scaler = MinMaxScaler()

# Fit and transform
games_scaled_features = scaler.fit_transform(games[extra_features])

# Save them into a new column (one scaled 1-D array per game)
games["extra_vector"] = list(games_scaled_features)

In [None]:
# Now we combine the textual (GloVe) vector and the extra features into a single vector.
# Each game is now represented by a vector that includes both semantic and numeric information.
# The GloVe components come first, followed by the scaled metadata values.
games["combined_vector"] = games.apply(
    lambda row: np.concatenate([row["glove_vector"], row["extra_vector"]]),
    axis=1)

In [None]:
# Dimensionality check: length of the combined vector stored for the row labelled 0
len(games['combined_vector'][0])

In [None]:
# One row per game: stack the combined (GloVe + metadata) vectors into a 2-D array
combined_vectors = np.vstack(games["combined_vector"].values)

# Pairwise cosine similarity between all games
sim_matrix = cosine_similarity(combined_vectors)

# Blank out the diagonal (every game compared with itself)
np.fill_diagonal(sim_matrix, np.nan)

# Keep the non-NaN entries as a flat array
off_diagonal = ~np.isnan(sim_matrix)
similarities = sim_matrix[off_diagonal]

# Histogram of the pairwise similarities
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(similarities, bins=50, color="skyblue", edgecolor="black")
ax.set_title("Distribution of Cosine Similarities Between Combined Embeddings")
ax.set_xlabel("Cosine Similarity")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

When comparing the similarity histograms, we find that using GloVe alone provides a broader range of similarity scores between games. It captures meaningful differences in game descriptions, making it more effective for differentiating games.

Adding extra features like year, rating, and player count reduces variability in similarity scores, making most games appear overly similar. This limits the model's ability to differentiate between games, making the combined approach less useful in our context.

In [None]:
# Look up the row(s) of the game named 'Siesta'
games[games['name']== 'Siesta']

In [None]:
# Look up the row(s) of the game named 'CATAN'
games[games['name']== 'CATAN']

In [None]:
def compare_games_combined_similarity(game_id_1, game_id_2, games_df):
    """Print the cosine similarity of two games' combined vectors and their descriptions.

    Each game is looked up by its "id" in ``games_df``; the first matching row
    supplies the "combined_vector" and "clean_description" values. Nothing is
    returned — the result is printed.
    """
    first_game = games_df.loc[games_df["id"] == game_id_1]
    second_game = games_df.loc[games_df["id"] == game_id_2]

    # Combined (GloVe + metadata) vectors
    vec1 = first_game["combined_vector"].values[0]
    vec2 = second_game["combined_vector"].values[0]

    # Cleaned descriptions
    desc1 = first_game["clean_description"].values[0]
    desc2 = second_game["clean_description"].values[0]

    # cosine_similarity works on 2-D inputs, so wrap each vector and read the single cell
    sim = cosine_similarity([vec1], [vec2])[0, 0]

    print(f"Cosine similarity between Game {game_id_1} and Game {game_id_2}: {sim:.4f}")
    print(f"\nDescription of Game {game_id_1}: {desc1}")
    print(f"\nDescription of Game {game_id_2}: {desc2}")

In [None]:
# Example usage: compare the games whose ids are 171 and 13
compare_games_combined_similarity(game_id_1=171, game_id_2=13, games_df=games)

##### BERT

Finally, we explore BERT to represent each game's description. Unlike GloVe, which uses static word vectors, BERT captures contextual meaning, allowing us to better differentiate between similar words used in different contexts.



In [None]:
# Load pre-trained BERT model and tokenizer ("bert-base-uncased" checkpoint)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set model to evaluation mode

We have defined a function that takes in a game description and returns a 768-dimensional embedding representing the entire sentence. We do this by extracting the embedding of the special CLS token, which BERT is trained to use as a summary of the sentence.

In [None]:
def get_bert_embeddings(texts, tokenizer, model, batch_size=16, device="cpu"):
    """Embed each text as the hidden state at position 0 of the model's last layer.

    Args:
        texts: Sequence of strings to embed.
        tokenizer: Tokenizer called on each batch (padding on, truncation to 128 tokens).
        model: Model whose output exposes ``last_hidden_state``; it is moved to
            ``device`` and put in eval mode.
        batch_size: Number of texts per forward pass.
        device: Device string the model and inputs are moved to.

    Returns:
        List with one NumPy vector per input text, in input order.
    """
    model.to(device)
    model.eval()

    collected = []
    with torch.no_grad():  # inference only, no gradients needed
        for text_batch in tqdm(DataLoader(texts, batch_size=batch_size)):
            encoded = tokenizer(
                text_batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=128,
            ).to(device)
            last_hidden = model(**encoded).last_hidden_state
            first_token = last_hidden[:, 0, :]  # position 0 of every sequence in the batch
            collected.extend(first_token.cpu().numpy())

    return collected

In [None]:
# Descriptions to embed. Missing values (an empty cleaned description comes back as
# NaN after the CSV round-trip) are replaced by "" so the tokenizer only ever
# receives strings.
texts = games["clean_description"].fillna("").astype(str).tolist()
bert_vectors = get_bert_embeddings(texts, tokenizer, model, batch_size=16, device="cuda" if torch.cuda.is_available() else "cpu")
games["bert_vector"] = bert_vectors

In [None]:
# One row per game: stack the per-game BERT vectors into a 2-D array
bert_vectors = np.vstack(games["bert_vector"].values)

# Pairwise cosine similarity between all games
sim_matrix = cosine_similarity(bert_vectors)

# Blank out the diagonal (every game compared with itself)
np.fill_diagonal(sim_matrix, np.nan)

# Keep the non-NaN entries as a flat array
off_diagonal = ~np.isnan(sim_matrix)
similarities = sim_matrix[off_diagonal]

# Histogram of the pairwise similarities
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(similarities, bins=50, color="skyblue", edgecolor="black")
ax.set_title("Distribution of Cosine Similarities Between BERT Embeddings")
ax.set_xlabel("Cosine Similarity")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

The histogram of cosine similarities using BERT shows a broader and more informative distribution than GloVe or GloVe+features.

Similarities range from 0.60 to 0.96, with a peak around 0.90, indicating that BERT captures contextual differences between game descriptions more effectively.


#### Reviews Dataset Exploration

Now we explore the dataset of user reviews to understand how many games have been reviewed, the distribution of ratings, and user activity.

In [None]:
# Number of distinct game names that appear in the reviews
unique_names = reviews["name"].unique()
print(len(unique_names))

In [None]:
# Distinct game ids (column "ID") that appear in the reviews
reviewed_ids = reviews["ID"].unique()

In [None]:
# Split the catalogue by whether the game's id appears among the reviewed ids
games_with_reviews = games[games["id"].isin(reviewed_ids)]
games_without_reviews = games[~games["id"].isin(reviewed_ids)]

In [None]:
# Coverage summary: catalogue size and how many games have / lack reviews
print(f"Total games: {len(games)}")
print(f"Games with reviews: {len(games_with_reviews)}")
print(f"Games without reviews: {len(games_without_reviews)}")

In [None]:
# Histogram of all individual review ratings (10 bins)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=reviews, x='rating', bins=10, color='skyblue', edgecolor='black', ax=ax)
ax.set_xlabel("Rating", fontsize=14)
ax.set_ylabel("Number of Ratings", fontsize=14)
ax.set_title("Distribution of Game Ratings", fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

This histogram shows the frequency of ratings given to games, on a scale from 0 to 10.

The distribution is left-skewed (negatively skewed): most ratings are above 6, with a long tail toward the low end. Also, the most common rating is between 7 and 8, suggesting that users tend to rate games positively, and very low ratings are rare.

In [None]:
# Number of ratings submitted by each user
Nrating_por_user = reviews["user"].value_counts()

# Histogram restricted to users with 1-20 ratings (axes clipped accordingly)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(Nrating_por_user, bins=range(1, 21), color='skyblue', edgecolor='black', ax=ax)
ax.set_xlim(1, 20)
ax.set_ylim(1, 60000)
ax.set_xlabel("Number of Ratings", fontsize=14)
ax.set_ylabel("Number of Users", fontsize=14)
ax.set_title("Distribution of Ratings per User", fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

# Largest number of ratings submitted by a single user
print("The user who has rated most games has rated:", Nrating_por_user.max())

This histogram shows how many users have given a certain number of ratings.

The majority of users have rated only 1 or 2 games, and very few users have rated more than 10 games, and even fewer above 15.

#### Filtering the reviews

In [None]:
# Keep only reviews for games that exist in the games dataset
valid_game_ids = set(games["id"].unique())
# .copy() gives an independent frame rather than a view of `reviews`
filtered_reviews = reviews[reviews["ID"].isin(valid_game_ids)].copy()
filtered_reviews

In [None]:
# Check the number of reviews per game and summarise that distribution
review_counts = filtered_reviews["ID"].value_counts()
review_counts.describe()

In [None]:
# Optional cache of the filtered reviews (left disabled): save to / reload from CSV
#filtered_reviews.to_csv("filtered_reviews.csv", sep=";", index=False)
#filtered_reviews = pd.read_csv("filtered_reviews.csv", sep=";")
filtered_reviews

In [None]:
# From here on `reviews` refers to the filtered table. This rebinds the name to
# the same object as `filtered_reviews` (no copy is made).
reviews = filtered_reviews

In [None]:
# Rebuild `filtered_reviews` as an independent copy of the reviews whose game ID
# exists in `games`.
# NOTE(review): if the cells above ran in order, `reviews` is already filtered,
# so this selects the same rows again — confirm whether this cell is still needed.
valid_game_ids = set(games["id"].unique())
filtered_reviews = reviews[reviews["ID"].isin(valid_game_ids)].copy()

#### Content-based system

In this section, we will be implementing different content-based recommendation systems that suggest games to users based on the content of games they have previously liked.

These systems rely on vector representations derived from game descriptions, along with optional metadata like player count and playtime, to build personalized user profiles.

We will explore several variants, starting with a basic similarity-based approach and going toward more diverse models using techniques like MMR, clustering, and even emotional tone.

##### Basic content-based system

The first method is a simple content-based recommender that suggests new games to a user based on the descriptions of games they liked.

We consider it 'basic' because it relies solely on semantic similarity between game descriptions, without incorporating diversity mechanisms, clustering, or collaborative data.

This works by using GloVe embeddings to convert each game's description into a numeric vector. Then, for a given user, it builds a preference profile by averaging the vectors of games the user liked, weighted by how highly they were rated. The system then compares this profile to the rest of the games (filtered by user constraints like age or playtime), and recommends the top k games with the highest cosine similarity to the user's profile.


In [None]:
def recommend_content_based_basic(user_name, games, reviews, top_k=10, testing=False, max_playingtime=None, min_age=None, minplayers=None, maxplayers=None, min_rating=None):
    """Recommend the ``top_k`` games closest to a user's content profile.

    The profile is the average of the ``glove_vector`` of every game the user
    rated 6 or higher, weighted by ``rating / max(rating)``. Candidate games
    are ranked by cosine similarity to that profile.

    Parameters
    ----------
    user_name : value matched against ``reviews["user"]``.
    games : DataFrame; columns used are ``id``, ``name``, ``rating``,
        ``glove_vector``, ``predicted_vibe``, ``predicted_emotion`` and, when
        the matching filter is set, ``playingtime``, ``minage``,
        ``minplayers``, ``maxplayers``.
    reviews : DataFrame with columns ``user``, ``ID`` and ``rating``.
    top_k : number of games to return.
    testing : when True, games the user already liked stay in the candidate
        pool (used by the evaluation code); when False they are removed.
    max_playingtime, min_age, minplayers, maxplayers, min_rating : optional
        filters; ``None`` disables the filter. A game is kept when
        ``playingtime <= max_playingtime``, ``minage >= min_age``,
        ``minplayers <= minplayers``, ``maxplayers >= maxplayers`` and
        ``rating >= min_rating``.

    Returns
    -------
    DataFrame with columns ``name``, ``rating``, ``id``, ``similarity`` and
    ``explanation`` sorted by decreasing similarity, or ``None`` (after
    printing a message) when no candidate or no liked game is available.
    """
    # Filter games based on user preferences
    filtered_games = games.copy()

    if max_playingtime is not None:
        filtered_games = filtered_games[filtered_games["playingtime"] <= max_playingtime]
    if min_age is not None:
        filtered_games = filtered_games[filtered_games["minage"] >= min_age]
    if minplayers is not None:
        filtered_games = filtered_games[filtered_games["minplayers"] <= minplayers]
    if maxplayers is not None:
        filtered_games = filtered_games[filtered_games["maxplayers"] >= maxplayers]
    if min_rating is not None:
        filtered_games = filtered_games[filtered_games["rating"] >= min_rating]

    if filtered_games.empty:
        print("No games match the provided filters.")
        return

    # Get games the user liked
    liked_game_ids = reviews[(reviews["user"] == user_name) & (reviews["rating"] >= 6)]
    # The inner merge drops reviews whose game has no row in `games`, so row
    # positions in `liked_vectors` can differ from those in `liked_game_ids`.
    liked_vectors = liked_game_ids.merge(games[["id", "glove_vector"]], left_on="ID", right_on="id")

    if liked_vectors.empty:
        print("Not enough liked games with vectors to build a profile.")
        return

    # Build user profile (weighted average)
    ratings = liked_vectors["rating"].values
    weights = ratings / ratings.max()
    liked_matrix = np.vstack(liked_vectors["glove_vector"].tolist())
    user_profile = np.average(liked_matrix, axis=0, weights=weights)

    if not testing:
        # Remove already liked games from recommendations
        unseen_mask = ~filtered_games["id"].isin(liked_game_ids["ID"])
        filtered_games = filtered_games[unseen_mask]

    if filtered_games.empty:
        print("No new games to recommend after filtering out liked ones")
        return

    # Calculate similarity
    game_vectors = np.vstack(filtered_games["glove_vector"].tolist())
    similarities = cosine_similarity(game_vectors, [user_profile]).flatten()

    # Select top_k most similar games
    filtered_games = filtered_games.copy()
    filtered_games["similarity"] = similarities
    recommended_games = filtered_games.sort_values(by="similarity", ascending=False).head(top_k).copy()

    # Generate personalized explanations
    explanations = []
    for game_id in recommended_games["id"]:
        game_row = games.loc[games["id"] == game_id]

        # Find most similar liked game. The argmax is a position in
        # `liked_matrix`, i.e. a row of `liked_vectors` (not of `liked_game_ids`).
        similarities_to_liked = cosine_similarity(
            [game_row["glove_vector"].values[0]],
            liked_matrix
        ).flatten()
        closest_liked = liked_vectors.iloc[int(np.argmax(similarities_to_liked))]
        most_similar_game_name = games.loc[games["id"] == closest_liked["ID"], "name"].values[0]
        most_similar_rating = closest_liked["rating"]

        # Vibe and emotion
        predicted_vibe = game_row["predicted_vibe"].values[0]
        predicted_emotion = game_row["predicted_emotion"].values[0]

        explanation = (
            f"I recommend this game because you liked '{most_similar_game_name}' "
            f"(you rated it {most_similar_rating}/10). "
            f"This game has a '{predicted_emotion}' feeling and is '{predicted_vibe}'."
        )
        explanations.append(explanation)

    recommended_games["explanation"] = explanations

    return recommended_games[["name", "rating", "id", "similarity", "explanation"]]

In [None]:
user_name = "jbuergel"

# Basic recommender for one user, restricted to short games for 2-4 players.
recommendations = recommend_content_based_basic(
    user_name=user_name,
    games=games,
    reviews=filtered_reviews,
    max_playingtime=60, min_age=12,
    minplayers=2, maxplayers=4,
    min_rating=6,
)

print(f"Recommended games for user '{user_name}'")
print(recommendations)

In [None]:
# One three-line entry per recommended game, followed by a blank line.
if recommendations is not None:
    for _, row in recommendations.iterrows():
        print(
            f"{row['name']} (Rating: {row['rating']:.2f})",
            f"Similarity: {row['similarity']:.2f}",
            f"{row['explanation']}\n",
            sep="\n",
        )

##### MMR diversity content-based system

In this method, we extend the basic content-based recommender by incorporating a diversity strategy known as Maximal Marginal Relevance (MMR).

While the basic model focuses only on recommending games that are most similar to the user's profile, it can result in suggestions that are too similar to each other.

MMR addresses this by balancing relevance (similarity to the user) with diversity (dissimilarity among recommended items). At each step, the algorithm selects the next best game that is both similar to the user's interests and sufficiently different from previously selected games.

This allows us to give recommendations that are not only aligned with the user's tastes but also more varied.

In [None]:
def mmr(user_profile, candidate_vectors, candidate_ids, lambda_param=0.7, top_k=10):
    """Select up to ``top_k`` candidates by Maximal Marginal Relevance.

    At every step the candidate maximising
    ``lambda_param * sim(candidate, user) - (1 - lambda_param) * max_sim_to_selected``
    is picked, where the penalty is 0 while nothing has been selected yet.

    Parameters
    ----------
    user_profile : 1-D vector compared against every candidate.
    candidate_vectors : sequence/array with one vector per candidate.
    candidate_ids : identifiers, parallel to ``candidate_vectors``.
    lambda_param : weight of relevance versus diversity.
    top_k : maximum number of ids to return.

    Returns
    -------
    list of ids in selection order. It is shorter than ``top_k`` when there
    are fewer candidates, and empty when there are none or ``top_k <= 0``.
    """
    n_candidates = len(candidate_vectors)
    if n_candidates == 0 or top_k <= 0:
        return []

    similarities_to_user = cosine_similarity(candidate_vectors, [user_profile]).flatten()
    similarity_matrix = cosine_similarity(candidate_vectors)

    selected = []
    selected_ids = []
    # Highest similarity of each candidate to any already-selected item;
    # updated incrementally instead of being recomputed for every candidate.
    diversity_penalty = np.zeros(n_candidates)

    # Never ask for more picks than there are candidates.
    for _ in range(min(top_k, n_candidates)):
        mmr_scores = lambda_param * similarities_to_user - (1 - lambda_param) * diversity_penalty
        # Already-selected candidates can never win again.
        mmr_scores[selected] = -np.inf

        # argmax returns the first maximum, i.e. the lowest index on ties.
        selected_idx = int(np.argmax(mmr_scores))

        if selected:
            diversity_penalty = np.maximum(diversity_penalty, similarity_matrix[selected_idx])
        else:
            diversity_penalty = np.array(similarity_matrix[selected_idx], dtype=float)

        selected.append(selected_idx)
        selected_ids.append(candidate_ids[selected_idx])

    return selected_ids

In [None]:
def visualize_mmr(user_vector, candidate_vectors, candidate_ids, game_names, lambda_param=0.7, top_k=10):
    """Scatter-plot the top-k games by plain similarity next to the top-k picked by MMR.

    The user vector and all candidate vectors are projected together onto two
    PCA components. ``game_names`` maps a candidate id to the label drawn next
    to its point.
    """
    # Ranking by cosine similarity to the user vector only
    relevance = cosine_similarity(candidate_vectors, [user_vector]).flatten()
    top_k_indices = relevance.argsort()[::-1][:top_k]

    # Ranking produced by MMR
    mmr_ids = mmr(user_vector, candidate_vectors, candidate_ids, lambda_param, top_k)

    # Project user + candidates into 2-D; row 0 is the user
    projected = PCA(n_components=2).fit_transform(np.vstack([user_vector, candidate_vectors]))
    user_p, candidates_p = projected[0], projected[1:]

    plt.figure(figsize=(10, 7))
    plt.scatter(candidates_p[:, 0], candidates_p[:, 1], c='lightgray', label="All Candidates", alpha=0.5)
    plt.scatter(user_p[0], user_p[1], c='blue', label="User Profile", marker='X', s=100)

    # Similarity-only picks (legend entry attached to the first one only)
    for idx in top_k_indices:
        px, py = candidates_p[idx, 0], candidates_p[idx, 1]
        legend_label = "Top-k Similarity" if idx == top_k_indices[0] else ""
        plt.scatter(px, py, c='green', label=legend_label, edgecolors='black')
        plt.text(px, py, game_names[candidate_ids[idx]], fontsize=8, color='darkgreen')

    # MMR picks (legend entry attached to the first one only)
    for mmr_id in mmr_ids:
        pos = candidate_ids.index(mmr_id)
        px, py = candidates_p[pos, 0], candidates_p[pos, 1]
        legend_label = "Top-k MMR" if mmr_id == mmr_ids[0] else ""
        plt.scatter(px, py, c='orange', label=legend_label, edgecolors='black')
        plt.text(px, py, game_names[mmr_id], fontsize=8, color='darkorange')

    plt.legend()
    plt.title(f"MMR vs Top-k Similarity (λ = {lambda_param})")
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.grid(True)
    plt.show()

In [None]:
def recommend_content_based_mmr(user_name, games, reviews, top_k=10, testing=False, max_playingtime=None, min_age=None, minplayers=None, maxplayers=None, min_rating=6, plot=False, lambda_param=0.7):
    """Content-based recommendations re-ranked for diversity with MMR.

    The user profile is the rating-weighted average ``glove_vector`` of the
    games the user rated 6 or higher. Candidates are then chosen by ``mmr``,
    which trades similarity to the profile against similarity to the games
    already chosen.

    Parameters
    ----------
    user_name : value matched against ``reviews["user"]``.
    games : DataFrame; columns used are ``id``, ``name``, ``rating``,
        ``glove_vector``, ``predicted_vibe``, ``predicted_emotion`` and, when
        the matching filter is set, ``playingtime``, ``minage``,
        ``minplayers``, ``maxplayers``.
    reviews : DataFrame with columns ``user``, ``ID`` and ``rating``.
    top_k : maximum number of games to return.
    testing : when True, already-liked games stay in the candidate pool.
    max_playingtime, min_age, minplayers, maxplayers, min_rating : optional
        filters (``None`` disables one). Note ``min_rating`` defaults to 6 here.
    plot : when True, also call ``visualize_mmr``.
    lambda_param : relevance/diversity weight forwarded to ``mmr``.

    Returns
    -------
    DataFrame with columns ``name``, ``rating``, ``id``, ``similarity`` and
    ``explanation``, with rows in MMR selection order, or ``None`` (after
    printing a message) when no candidate or no liked game is available.
    """
    # Filter games based on user preferences
    filtered_games = games.copy()

    if max_playingtime is not None:
        filtered_games = filtered_games[filtered_games["playingtime"] <= max_playingtime]
    if min_age is not None:
        filtered_games = filtered_games[filtered_games["minage"] >= min_age]
    if minplayers is not None:
        filtered_games = filtered_games[filtered_games["minplayers"] <= minplayers]
    if maxplayers is not None:
        filtered_games = filtered_games[filtered_games["maxplayers"] >= maxplayers]
    if min_rating is not None:
        filtered_games = filtered_games[filtered_games["rating"] >= min_rating]

    if filtered_games.empty:
        print("No games match the provided filters.")
        return

    # Get games the user liked
    liked_game_ids = reviews[(reviews["user"] == user_name) & (reviews["rating"] >= 6)]
    # The inner merge drops reviews whose game has no row in `games`, so row
    # positions in `liked_vectors` can differ from those in `liked_game_ids`.
    liked_vectors = liked_game_ids.merge(games[["id", "glove_vector"]], left_on="ID", right_on="id")

    if liked_vectors.empty:
        print("Not enough liked games with vectors to build a profile.")
        return

    # Build user profile (weighted average)
    ratings = liked_vectors["rating"].values
    weights = ratings / ratings.max()
    liked_matrix = np.vstack(liked_vectors["glove_vector"].tolist())
    user_profile = np.average(liked_matrix, axis=0, weights=weights)

    if not testing:
        # Remove already liked games from recommendations
        unseen_mask = ~filtered_games["id"].isin(liked_game_ids["ID"])
        filtered_games = filtered_games[unseen_mask]

    if filtered_games.empty:
        print("No new games to recommend after filtering out liked ones.")
        return

    # Prepare data for MMR
    candidate_ids = filtered_games["id"].tolist()
    game_vectors = np.vstack(filtered_games["glove_vector"].tolist())
    similarities = cosine_similarity(game_vectors, [user_profile]).flatten()

    # Apply MMR for diversity; never request more picks than there are candidates
    n_select = min(top_k, len(candidate_ids))
    selected_ids = mmr(user_profile, game_vectors, candidate_ids, lambda_param=lambda_param, top_k=n_select)

    # Row position of every candidate id (first occurrence, like list.index)
    position_of = {}
    for position, candidate_id in enumerate(candidate_ids):
        position_of.setdefault(candidate_id, position)
    selected_positions = [position_of[game_id] for game_id in selected_ids]

    # Take the rows in MMR order so that the similarity and explanation columns
    # assigned below line up with the game they describe.
    recommended_games = filtered_games.iloc[selected_positions].copy()

    # Add similarity score for interpretability
    recommended_games["similarity"] = similarities[selected_positions]

    # Generate personalized explanations with vibe and emotion
    explanations = []
    for game_id in selected_ids:
        game_row = games.loc[games["id"] == game_id]

        # Find the most similar liked game. The argmax is a position in
        # `liked_matrix`, i.e. a row of `liked_vectors` (not of `liked_game_ids`).
        similarities_to_liked_games = cosine_similarity([game_row["glove_vector"].values[0]], liked_matrix).flatten()
        closest_liked = liked_vectors.iloc[int(np.argmax(similarities_to_liked_games))]
        most_similar_game_name = games.loc[games["id"] == closest_liked["ID"], "name"].values[0]
        most_similar_rating = closest_liked["rating"]

        # Get predicted vibe and emotion for the recommended game
        predicted_vibe = game_row["predicted_vibe"].values[0]
        predicted_emotion = game_row["predicted_emotion"].values[0]

        # Generate the explanation
        explanation = (
            f"I recommend this game because you liked '{most_similar_game_name}' "
            f"(you rated it {most_similar_rating}/10). "
            f"This game has a '{predicted_emotion}' feeling and is '{predicted_vibe}'."
        )
        explanations.append(explanation)

    recommended_games["explanation"] = explanations

    if plot:
        visualize_mmr(user_profile, game_vectors, candidate_ids, dict(zip(games["id"], games["name"])), lambda_param=lambda_param, top_k=n_select)

    return recommended_games[["name", "rating", "id", "similarity", "explanation"]]

In [None]:
user_name = "jbuergel"

# MMR recommender with the same filters as the basic run, plus the PCA plot.
recommendations = recommend_content_based_mmr(
    user_name=user_name,
    games=games,
    reviews=filtered_reviews,
    max_playingtime=60, min_age=12,
    minplayers=2, maxplayers=4,
    min_rating=6,
    plot=True,
)

print(f"Recommended games for user '{user_name}'")
print(recommendations)

In [None]:
# One three-line entry per recommended game, followed by a blank line.
if recommendations is not None:
    for _, row in recommendations.iterrows():
        print(
            f"{row['name']} (Rating: {row['rating']:.2f})",
            f"Similarity: {row['similarity']:.2f}",
            f"{row['explanation']}\n",
            sep="\n",
        )

##### Cluster diversity content-based system

This other method introduces diversity by grouping candidate games into clusters before selecting recommendations. Instead of simply picking the most similar games overall, we first apply K-Means clustering to partition the games into different groups based on their GloVe embeddings.

Then, we select top candidates from multiple clusters, ensuring that the recommendations are not only relevant to the user's preferences but also drawn from different thematic areas within the dataset.

In [None]:
def visualize_clusters(user_profile, game_vectors, cluster_labels, candidate_ids, recommended_ids, id_to_name):
    """Draw the candidate games in a 2-D PCA projection, coloured by cluster.

    Recommended games are over-plotted as black dots labelled with their name
    (looked up in ``id_to_name``); the user profile is drawn as a red 'X'.
    """
    # Project the user profile (row 0) and all games together
    projection = PCA(n_components=2).fit_transform(np.vstack([user_profile, game_vectors]))
    user_pca, games_pca = projection[0], projection[1:]

    plt.figure(figsize=(10, 7))

    # One scatter series per cluster label
    for cluster in np.unique(cluster_labels):
        members = np.flatnonzero(cluster_labels == cluster)
        plt.scatter(games_pca[members, 0], games_pca[members, 1], label=f"Cluster {cluster}", alpha=0.6)

    # Mark each recommended game that is among the candidates
    for rec_id in recommended_ids:
        if rec_id not in candidate_ids:
            continue
        idx = candidate_ids.index(rec_id)
        x, y = games_pca[idx, 0], games_pca[idx, 1]
        plt.scatter(x, y, s=20, color='black')
        # Name slightly below the dot
        plt.text(x, y - 0.05, id_to_name[candidate_ids[idx]], fontsize=10, color='black', ha='center')

    # User profile marker
    plt.scatter(user_pca[0], user_pca[1], color='red', s=150, marker='X', label='User Profile')

    # Axis labels and title
    plt.xlabel("PCA 1", fontsize=14)
    plt.ylabel("PCA 2", fontsize=14)
    plt.title("PCA Projection of Game Clusters", fontsize=16)

    # Legend, tick sizes and grid
    plt.legend(loc="best", fontsize=12)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.7)

    plt.show()

In [None]:
def recommend_content_based_cluster(user_name, games, reviews, top_k=10, testing = False, max_playingtime=None, min_age=None, minplayers=None, maxplayers=None, min_rating=None, n_clusters=5, plot=False):
    """Content-based recommendations diversified by K-Means clustering.

    Candidate games are clustered on their ``glove_vector``. The
    ``top_k // n_clusters + 1`` games most similar to the user profile are
    taken from every cluster, and the ``top_k`` most similar of those picks
    are returned.

    Parameters
    ----------
    user_name : value matched against ``reviews["user"]``.
    games : DataFrame; columns used are ``id``, ``name``, ``rating``,
        ``glove_vector``, ``predicted_vibe``, ``predicted_emotion`` and, when
        the matching filter is set, ``playingtime``, ``minage``,
        ``minplayers``, ``maxplayers``.
    reviews : DataFrame with columns ``user``, ``ID`` and ``rating``.
    top_k : maximum number of games to return.
    testing : when True, already-liked games stay in the candidate pool.
    max_playingtime, min_age, minplayers, maxplayers, min_rating : optional
        filters; ``None`` disables the filter.
    n_clusters : requested number of clusters; reduced to the number of
        candidates when there are fewer candidates than clusters.
    plot : when True, also call ``visualize_clusters``.

    Returns
    -------
    DataFrame with columns ``name``, ``rating``, ``id``, ``similarity`` and
    ``explanation`` sorted by decreasing similarity, or ``None`` (after
    printing a message) when no candidate or no liked game is available.
    """
    # Filter games based on user preferences
    filtered_games = games.copy()

    if max_playingtime is not None:
        filtered_games = filtered_games[filtered_games["playingtime"] <= max_playingtime]
    if min_age is not None:
        filtered_games = filtered_games[filtered_games["minage"] >= min_age]
    if minplayers is not None:
        filtered_games = filtered_games[filtered_games["minplayers"] <= minplayers]
    if maxplayers is not None:
        filtered_games = filtered_games[filtered_games["maxplayers"] >= maxplayers]
    if min_rating is not None:
        filtered_games = filtered_games[filtered_games["rating"] >= min_rating]

    if filtered_games.empty:
        print("No games match the provided filters.")
        return

    # Get games the user liked
    liked_game_ids = reviews[(reviews["user"] == user_name) & (reviews["rating"] >= 6)]
    # The inner merge drops reviews whose game has no row in `games`, so row
    # positions in `liked_vectors` can differ from those in `liked_game_ids`.
    liked_vectors = liked_game_ids.merge(games[["id", "glove_vector"]], left_on="ID", right_on="id")

    if liked_vectors.empty:
        print("Not enough liked games with vectors to build a profile.")
        return

    # Build user profile (weighted average)
    ratings = liked_vectors["rating"].values
    weights = ratings / ratings.max()
    liked_matrix = np.vstack(liked_vectors["glove_vector"].tolist())
    user_profile = np.average(liked_matrix, axis=0, weights=weights)

    if not testing:
        # Remove already liked games from recommendations
        unseen_mask = ~filtered_games["id"].isin(liked_game_ids["ID"])
        filtered_games = filtered_games[unseen_mask]

    if filtered_games.empty:
        print("No new games to recommend after filtering out liked ones")
        return

    # Cluster candidate games
    game_vectors = np.vstack(filtered_games["glove_vector"].tolist())
    candidate_ids = filtered_games["id"].tolist()
    # K-Means cannot build more clusters than there are samples.
    effective_clusters = min(n_clusters, len(candidate_ids))
    kmeans = KMeans(n_clusters=effective_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(game_vectors)

    # Compute similarities
    similarities = cosine_similarity(game_vectors, [user_profile]).flatten()

    # Pick top games from each cluster, remembering their row position
    per_cluster = top_k // effective_clusters + 1
    picked = []  # (row position in filtered_games, similarity)
    for cluster_id in range(effective_clusters):
        indices_in_cluster = np.where(cluster_labels == cluster_id)[0]
        if len(indices_in_cluster) == 0:
            continue
        # Sort games in cluster by similarity, best first
        ranked = indices_in_cluster[np.argsort(similarities[indices_in_cluster])[::-1]]
        picked.extend((int(position), similarities[position]) for position in ranked[:per_cluster])

    # Sort all selected games by similarity and keep top_k
    picked = sorted(picked, key=lambda item: item[1], reverse=True)[:top_k]
    selected_positions = [position for position, _ in picked]
    selected_ids = [candidate_ids[position] for position in selected_positions]

    # Take the rows in ranked order so that the similarity and explanation
    # columns assigned below line up with the game they describe.
    recommended_games = filtered_games.iloc[selected_positions].copy()
    recommended_games["similarity"] = [similarity for _, similarity in picked]

    # Generate personalized explanations with vibe and emotion
    explanations = []
    for game_id in selected_ids:
        game_row = games.loc[games["id"] == game_id]

        # Find the most similar liked game. The argmax is a position in
        # `liked_matrix`, i.e. a row of `liked_vectors` (not of `liked_game_ids`).
        similarities_to_liked_games = cosine_similarity(
            [game_row["glove_vector"].values[0]],
            liked_matrix
        ).flatten()
        closest_liked = liked_vectors.iloc[int(np.argmax(similarities_to_liked_games))]
        most_similar_game_name = games.loc[games["id"] == closest_liked["ID"], "name"].values[0]
        most_similar_rating = closest_liked["rating"]

        # Get predicted vibe and emotion for the recommended game
        predicted_vibe = game_row["predicted_vibe"].values[0]
        predicted_emotion = game_row["predicted_emotion"].values[0]

        # Generate the explanation
        explanation = (
            f"I recommend this game because you liked '{most_similar_game_name}' "
            f"(you rated it {most_similar_rating}/10). "
            f"This game has a '{predicted_emotion}' feeling and is '{predicted_vibe}'."
        )
        explanations.append(explanation)

    recommended_games["explanation"] = explanations

    if plot:
        id_to_name = dict(zip(games["id"], games["name"]))
        visualize_clusters(user_profile, game_vectors, cluster_labels, candidate_ids, selected_ids, id_to_name)

    return recommended_games[["name", "rating", "id", "similarity", "explanation"]]

In [None]:
user_name = "Pyrogal"

# Cluster-diversified recommender, same filters as before, plus the PCA plot.
recommendations = recommend_content_based_cluster(
    user_name=user_name,
    games=games,
    reviews=filtered_reviews,
    max_playingtime=60, min_age=12,
    minplayers=2, maxplayers=4,
    min_rating=6,
    plot=True,
)
print(f"Recommended games for user '{user_name}'")
print(recommendations)

In [None]:
# One three-line entry per recommended game, followed by a blank line.
if recommendations is not None:
    for _, row in recommendations.iterrows():
        print(
            f"{row['name']} (Rating: {row['rating']:.2f})",
            f"Similarity: {row['similarity']:.2f}",
            f"{row['explanation']}\n",
            sep="\n",
        )

##### Diversity-based RecSys comparison

Now we need to evaluate and compare the performance of the three content-based recommendation strategies introduced earlier.

The goal is to assess how each method balances accuracy and diversity in the recommendations.

We will be using precision and recall at top-k as evaluation metrics, both for an individual user and averaged across multiple users. This allows us to quantify how well each system recovers games the user actually liked, and observe the trade-offs between relevance and variety.

In [None]:
# Reviews by 'jbuergel' with a rating of 6 or more — the same "liked" rule the
# recommenders above use to build the user profile.
reviews[(reviews["user"] == 'jbuergel') & (reviews["rating"] >= 6)]

In [None]:
def get_highly_rated_games(reviews_df, user_name, rating_threshold=6):
    """Return the rows of ``reviews_df`` in which ``user_name`` gave a rating of at least ``rating_threshold``."""
    written_by_user = reviews_df["user"] == user_name
    rated_high_enough = reviews_df["rating"] >= rating_threshold
    return reviews_df[written_by_user & rated_high_enough]

def precision_recall_manual(user_name, recommendations, highly_rated_games, k=10, threshold=6.0):
    """Compute precision@k and recall@k of a recommendation list.

    Parameters
    ----------
    user_name : unused; kept so existing callers keep working.
    recommendations : DataFrame with an ``id`` column, best recommendation
        first, or ``None`` when the recommender produced nothing.
    highly_rated_games : DataFrame with an ``ID`` column listing the games
        counted as relevant.
    k : number of leading recommendations to score.
    threshold : unused; kept so existing callers keep working.

    Returns
    -------
    (precision, recall): hits divided by the number of scored recommendations
    and by the number of distinct relevant games. Either value is 0 when its
    denominator is 0.
    """
    relevant_ids = set(highly_rated_games["ID"].unique())  # All games the user rated highly

    # Top-k recommended game IDs; the recommenders return None on failure
    if recommendations is None or recommendations.empty:
        rec_ids = []
    else:
        rec_ids = recommendations["id"].head(k).tolist()

    # True positives
    hits = len(set(rec_ids) & relevant_ids)

    precision = hits / len(rec_ids) if rec_ids else 0
    recall = hits / len(relevant_ids) if relevant_ids else 0
    return precision, recall

We generate recommendations for a user using each of the three methods, applying the same filtering preferences.

In [None]:
# User evaluated below, and the subset of their reviews rated 6 or higher.
user1 = "jbuergel"
highly_rated_games_user1 = get_highly_rated_games(filtered_reviews, user1, rating_threshold=6)

In [None]:
# Run each recommender on the same user and filters. testing=True keeps the
# user's liked games in the candidate pool so they can be counted as hits.
recs_basic = recommend_content_based_basic(
    user_name=user1,
    games=games,
    reviews=highly_rated_games_user1,
    testing=True,
    max_playingtime=60, min_age=12, minplayers=2, maxplayers=4,
)

recs_mmr = recommend_content_based_mmr(
    user_name=user1,
    games=games,
    reviews=highly_rated_games_user1,
    testing=True,
    max_playingtime=60, min_age=12, minplayers=2, maxplayers=4,
)

recs_cluster = recommend_content_based_cluster(
    user_name=user1,
    games=games,
    reviews=highly_rated_games_user1,
    testing=True,
    max_playingtime=60, min_age=12, minplayers=2, maxplayers=4,
)

In [None]:
# Precision@5 / recall@5 of each recommender for `user1`.
precision_basic, recall_basic = precision_recall_manual(
    user1, recs_basic, highly_rated_games_user1, k=5, threshold=6.0)
precision_mmr, recall_mmr = precision_recall_manual(
    user1, recs_mmr, highly_rated_games_user1, k=5, threshold=6.0)
precision_cluster, recall_cluster = precision_recall_manual(
    user1, recs_cluster, highly_rated_games_user1, k=5, threshold=6.0)

print(f"Precision@5 Basic: {precision_basic:.4f}")
print(f"Recall@5 Basic: {recall_basic:.4f}")

print(f"\nPrecision@5 MMR: {precision_mmr:.4f}")
# Label corrected: it previously read "Recall@5M MRR".
print(f"Recall@5 MMR: {recall_mmr:.4f}")

print(f"\nPrecision@5 Cluster: {precision_cluster:.4f}")
print(f"Recall@5 Cluster: {recall_cluster:.4f}")

Now we define a general evaluation function to assess how each method performs across multiple users, not just one.

In [None]:
def evaluate_multiple_users(reviews_df, games_df, recommend_funcs, k=5, n_users=30, rating_threshold=6.0, seed=None, filters=None):
    """
    Evaluate average precision and recall over multiple users who have rated at least one game above the threshold.

    Parameters
    ----------
    reviews_df : DataFrame of reviews with a ``user`` column.
    games_df : games table forwarded to every recommender.
    recommend_funcs : dict mapping a method name to a recommender called as
        ``func(user, games_df, highly_rated, testing=True, **filters)``.
    k : cut-off passed to ``precision_recall_manual``.
    n_users : evaluation stops once this many users with high ratings have
        been evaluated.
    rating_threshold : minimum rating for a game to count as highly rated.
    seed : optional seed for the user shuffle, making the sampled users
        reproducible; ``None`` uses the global ``random`` state.
    filters : optional dict of keyword filters forwarded to the recommenders;
        defaults to ``max_playingtime=60, min_age=12, minplayers=2, maxplayers=4``.

    Returns
    -------
    dict mapping each method name to ``{"avg_precision": ..., "avg_recall": ...}``.
    """
    users = list(reviews_df["user"].unique())
    # shuffle for randomness (seeded when requested)
    if seed is None:
        random.shuffle(users)
    else:
        random.Random(seed).shuffle(users)

    if filters is None:
        filters = {"max_playingtime": 60, "min_age": 12, "minplayers": 2, "maxplayers": 4}

    results = {method: {"precision": [], "recall": []} for method in recommend_funcs.keys()}
    evaluated_users = 0

    for user in users:
        highly_rated = get_highly_rated_games(reviews_df, user, rating_threshold)

        if highly_rated.empty:
            continue  # skip users with no high ratings

        for method_name, func in recommend_funcs.items():
            recs = func(user, games_df, highly_rated, testing=True, **filters)

            if recs is None:
                # The recommender had nothing to offer (e.g. no game passes
                # the filters): count it as a complete miss.
                precision, recall = 0, 0
            else:
                precision, recall = precision_recall_manual(user, recs, highly_rated, k=k, threshold=rating_threshold)
            results[method_name]["precision"].append(precision)
            results[method_name]["recall"].append(recall)

        evaluated_users += 1
        if evaluated_users >= n_users:
            break  # stop once we have enough valid users

    # Compute average metrics
    avg_results = {}
    for method, metrics in results.items():
        avg_precision = sum(metrics["precision"]) / len(metrics["precision"]) if metrics["precision"] else 0
        avg_recall = sum(metrics["recall"]) / len(metrics["recall"]) if metrics["recall"] else 0
        avg_results[method] = {"avg_precision": avg_precision, "avg_recall": avg_recall}

    return avg_results

In [None]:
# Method name -> recommender under evaluation
recommend_funcs = dict(
    basic=recommend_content_based_basic,
    mmr=recommend_content_based_mmr,
    cluster=recommend_content_based_cluster,
)

# Average precision@5 / recall@5 over 40 users with at least one high rating
avg_metrics = evaluate_multiple_users(
    reviews_df=filtered_reviews,
    games_df=games,
    recommend_funcs=recommend_funcs,
    k=5,
    n_users=40,
    rating_threshold=6.0,
)

for method, metrics in avg_metrics.items():
    print(f"{method.upper()} - Avg Precision@5: {metrics['avg_precision']:.4f}, Avg Recall@5: {metrics['avg_recall']:.4f}")

##### Hybrid content-based recommender system

Now we will try a hybrid recommender system, called hybrid because it combines elements from collaborative filtering and content-based filtering.

The content-based part is achieved using the GloVe vectors to represent each game, and the collaborative by using user rating behavior.

Note we only take the k most similar games. This is done in order to prevent bias towards the weighted mean of the user ratings. When a user has many rated games, the prediction of any of the rest of target games will converge to a number very close to the mean.

In [None]:
def predict_rating(user_name, target_game_id, games_df, reviews_df, k=10):
    """Predict the rating a user would give to one game.

    The prediction is the average of the user's ratings for the ``k`` rated
    games most similar to the target (cosine similarity of ``glove_vector``,
    negatives clipped to 0), weighted by that similarity.

    Returns the prediction rounded to 2 decimals, or an explanatory string
    when the user has no reviews, none of the rated games has a vector, the
    target id is unknown, or every similarity is 0.
    """
    rated_by_user = reviews_df[reviews_df["user"] == user_name]
    if rated_by_user.empty:
        return "No reviews found for this user."

    # Attach the GloVe vector of each rated game
    rated_with_vectors = rated_by_user.merge(games_df[["id", "glove_vector"]], left_on="ID", right_on="id")
    if rated_with_vectors.empty:
        return "User has rated games with no vector information."

    # Vector of the game to predict
    target_matches = games_df.loc[games_df["id"] == target_game_id, "glove_vector"].values
    if len(target_matches) == 0:
        return "Target game ID not found in games dataset."
    target_vector = target_matches[0]

    # Similarity of every rated game to the target; negatives become 0
    sims = cosine_similarity(np.vstack(rated_with_vectors["glove_vector"]), [target_vector]).flatten()
    sims = np.maximum(sims, 0)

    # Keep the k nearest rated games (or all of them if there are fewer)
    k = min(k, len(sims))
    nearest = np.argsort(sims)[-k:]
    nearest_sims = sims[nearest]
    nearest_ratings = rated_with_vectors["rating"].values[nearest]

    if nearest_sims.sum() == 0:
        return "No sufficiently similar games found."

    # Similarity-weighted average of the neighbours' ratings
    return round(np.average(nearest_ratings, weights=nearest_sims), 2)

In [None]:
def predict_rating_with_explanation(user_name, target_game_id, games_df, reviews_df, k=10):
    """Predict a user's rating for one game and explain the prediction.

    The prediction is the average of the user's ratings for the ``k`` rated
    games most similar to the target (cosine similarity of ``glove_vector``,
    negatives clipped to 0), weighted by that similarity. The explanation
    names the most similar rated games first.

    Returns
    -------
    ``(predicted_rating, explanation)``; ``predicted_rating`` is rounded to 2
    decimals. On failure the first element is ``None`` and the second is a
    message saying why no prediction could be made.
    """
    user_reviews = reviews_df[reviews_df["user"] == user_name]
    if user_reviews.empty:
        return None, "No reviews found for this user."

    user_games = user_reviews.merge(games_df[["id", "glove_vector", "name"]], left_on="ID", right_on="id")
    if user_games.empty:
        return None, "User has rated games with no vector information."
    try:
        target_vector = games_df.loc[games_df["id"] == target_game_id, "glove_vector"].values[0]
    except IndexError:
        return None, "Target game ID not found in games dataset."

    rated_vectors = np.vstack(user_games["glove_vector"].tolist())
    similarities = cosine_similarity(rated_vectors, [target_vector]).flatten()
    similarities = np.maximum(similarities, 0)

    ratings = user_games["rating"].values

    if len(similarities) < k:
        k = len(similarities)
    # argsort is ascending; reverse so the most similar game comes first and
    # the explanation cites the closest neighbours.
    top_k_idx = np.argsort(similarities)[-k:][::-1]

    top_similarities = similarities[top_k_idx]
    top_ratings = ratings[top_k_idx]
    # The merge only suffixes the games' name column ("name_y") when the
    # reviews table has its own "name" column.
    name_column = "name_y" if "name_y" in user_games.columns else "name"
    top_game_names = user_games.iloc[top_k_idx][name_column].tolist()

    if top_similarities.sum() == 0:
        return None, "No sufficiently similar games found."

    predicted_rating = round(np.average(top_ratings, weights=top_similarities), 2)

    # Build explanation
    if len(top_game_names) == 1:
        explanation = f"Because you liked **{top_game_names[0]}**, and this game is similar to it."
    elif len(top_game_names) == 2:
        explanation = f"Because you liked **{top_game_names[0]}** and **{top_game_names[1]}**, and this game is similar to both."
    else:
        explanation = f"Because you liked games like **{top_game_names[0]}**, **{top_game_names[1]}**, and others."

    return predicted_rating, explanation

The next function predicts a rating for every game the user has not rated yet, so that we can check the range of the predictions across all target games.

In [None]:
def predict_all_ratings(user_name, games_df, reviews_df, k=10):
    """Predict the user's rating for every game they have not rated.

    For each unseen game the prediction is the average of the user's ratings
    for the ``k`` most similar rated games (cosine similarity of
    ``glove_vector``, negatives clipped to 0), weighted by that similarity.

    Returns
    -------
    DataFrame with columns ``id``, ``name``, ``predicted_rating`` (rounded to
    2 decimals, missing when every similarity is 0) and ``explanation``,
    sorted by decreasing ``predicted_rating``; or an explanatory string when
    the user has no reviews, no rated game has a vector, or nothing is unseen.
    """
    # Get user-rated games and their vectors
    user_reviews = reviews_df[reviews_df["user"] == user_name]
    if user_reviews.empty:
        return "No reviews found for this user."

    user_games = user_reviews.merge(games_df[["id", "glove_vector"]], left_on="ID", right_on="id")
    if user_games.empty:
        return "User has rated games with no vector information."

    rated_vectors = np.vstack(user_games["glove_vector"].tolist())
    rated_ratings = user_games["rating"].values
    if "name" in user_games.columns:
        rated_names = user_games["name"].tolist()
    else:
        # The reviews table carries no game name: look it up in the games table.
        id_to_name = dict(zip(games_df["id"], games_df["name"]))
        rated_names = [id_to_name.get(game_id) for game_id in user_games["id"]]

    # Games not yet rated by user
    unseen_games = games_df[~games_df["id"].isin(user_games["id"])].copy()
    if unseen_games.empty:
        return "No unseen games to predict ratings for."

    # One similarity matrix for all unseen games (row = unseen game,
    # column = rated game) instead of one sklearn call per game.
    unseen_vectors = np.vstack(unseen_games["glove_vector"].tolist())
    similarity_matrix = np.maximum(cosine_similarity(unseen_vectors, rated_vectors), 0)

    # Select top-k most similar games (all of them when fewer than k are rated)
    k_adj = min(k, len(rated_ratings))

    predicted_ratings = []

    for game_id, game_name, similarities in zip(unseen_games["id"], unseen_games["name"], similarity_matrix):
        # argsort is ascending; reverse so the most similar rated game comes
        # first and the explanation cites the closest neighbours.
        top_k_idx = np.argsort(similarities)[-k_adj:][::-1]
        top_similarities = similarities[top_k_idx]
        top_ratings = rated_ratings[top_k_idx]
        top_game_names = [rated_names[i] for i in top_k_idx]

        if top_similarities.sum() == 0:
            pred_rating = None
            explanation = "No sufficiently similar games found."
        else:
            pred_rating = round(np.average(top_ratings, weights=top_similarities), 2)
            # Build explanation
            if len(top_game_names) == 1:
                explanation = f"Because you liked **{top_game_names[0]}**, and this game is similar to it."
            elif len(top_game_names) == 2:
                explanation = f"Because you liked **{top_game_names[0]}** and **{top_game_names[1]}**, and this game is similar to both."
            else:
                explanation = f"Because you liked games like **{top_game_names[0]}**, **{top_game_names[1]}**, and others."

        predicted_ratings.append((game_id, game_name, pred_rating, explanation))

    # Return as a DataFrame
    pred_df = pd.DataFrame(predicted_ratings, columns=["id", "name", "predicted_rating", "explanation"])
    pred_df = pred_df.sort_values(by="predicted_rating", ascending=False)

    return pred_df

In [None]:
# Predicted ratings (10 nearest rated games) for every game 'jbuergel' has not
# rated. On failure predict_all_ratings returns a message string instead of a
# DataFrame.
predictions_df = predict_all_ratings("jbuergel", games, reviews, k=10)
predictions_df

It is important, however, to test different values of k (number of similar items considered), since this can affect the predictions:

*   A small k uses the most similar items, hence it is more personalized but probably noisier.
*   A large k uses more items, hence smoother predictions but more biased towards the average.

In [None]:
def split_reviews_for_user(reviews_df, user_name, test_size=0.3, random_state=42):
    """Split the reviews written by one user into a train part and a test part.

    Parameters
    ----------
    reviews_df : DataFrame with a "user" column.
    user_name : value matched against the "user" column.
    test_size, random_state : forwarded unchanged to ``train_test_split``.

    Returns
    -------
    (train_reviews, test_reviews)

    Raises
    ------
    ValueError
        If the user has no rows in ``reviews_df``.
    """
    own_rows = reviews_df.loc[reviews_df["user"] == user_name]
    if own_rows.empty:
        raise ValueError(f"No reviews found for user {user_name}")

    # Only this user's rows are shuffled and split
    train_part, test_part = train_test_split(
        own_rows,
        test_size=test_size,
        random_state=random_state,
    )
    return train_part, test_part

In [None]:
def predict_all_ratings_k_values_with_rmse(user_name, games_df, reviews_df, test_df, k_values=(5, 10)):
    """Evaluate the similarity-weighted rating predictor for several values of k.

    For every held-out review of ``user_name`` the rating is predicted as the
    similarity-weighted average of the user's k most similar training games
    (cosine similarity between "glove_vector" entries, negatives clipped to 0).

    Parameters
    ----------
    user_name : value matched against the "user" column of both review frames.
    games_df : DataFrame with "id" and "glove_vector" columns.
    reviews_df : training reviews ("user", "ID", "rating"); builds the user profile.
    test_df : held-out reviews ("user", "ID", "rating"); used for evaluation.
    k_values : iterable of int, neighbourhood sizes to evaluate.

    Returns
    -------
    dict mapping k -> RMSE (a k is omitted when no test game could be predicted),
    or the string "Not enough data for user" when the user has no training
    reviews, no test reviews, or no training review that matches a game.
    """
    # Prepare user data
    user_train = reviews_df[reviews_df["user"] == user_name]
    test_data = test_df[test_df["user"] == user_name]

    if user_train.empty or test_data.empty:
        return "Not enough data for user"

    user_games = user_train.merge(games_df[["id", "glove_vector"]], left_on="ID", right_on="id")
    if user_games.empty:
        # Without this guard np.vstack would raise on an empty sequence
        return "Not enough data for user"

    rated_vectors = np.vstack(user_games["glove_vector"].tolist())
    rated_ratings = user_games["rating"].values

    # Similarities do not depend on k, so compute them once per test game
    evaluated = []  # (indices sorted by ascending similarity, similarities, true rating)
    for _, game in test_data.iterrows():
        matches = games_df.loc[games_df["id"] == game["ID"], "glove_vector"].values
        if len(matches) == 0:
            # Test game is missing from games_df: nothing to compare against
            continue

        sims = cosine_similarity(rated_vectors, [matches[0]]).flatten()
        sims = np.maximum(sims, 0)
        evaluated.append((np.argsort(sims), sims, game["rating"]))

    results = {}
    for k in k_values:
        predicted = []
        actual = []

        for order, sims, true_rating in evaluated:
            top_k_idx = order[-k:]
            top_sims = sims[top_k_idx]

            # All-zero weights would make the weighted average undefined
            if top_sims.sum() == 0:
                continue

            predicted.append(np.average(rated_ratings[top_k_idx], weights=top_sims))
            actual.append(true_rating)

        if predicted:
            results[k] = sqrt(mean_squared_error(actual, predicted))

    return results

In [None]:
# Hold out part of this user's reviews and measure the prediction RMSE for several k.
# NOTE(review): in this part of the notebook `filtered_reviews` is only assigned further down
# (collaborative-filtering data preparation). Unless it is also defined earlier, this cell
# fails on a fresh Restart & Run All — confirm where it should come from.
user = "jbuergel"
train_df, test_df = split_reviews_for_user(filtered_reviews, user)

rmse_by_k = predict_all_ratings_k_values_with_rmse(
    user_name=user,
    games_df=games,
    reviews_df=train_df,   # this is used for building the user profile
    test_df=test_df,       # this is used for evaluating predictions
    k_values=[3, 5, 10, 15, 20, 30, 40, 50]
)
rmse_by_k

In [None]:
# Plot RMSE against k. `rmse_by_k` must be the dict returned by the evaluation function;
# if it returned its "Not enough data for user" string, `.keys()` fails here.
ks = list(rmse_by_k.keys())
rmse_values = list(rmse_by_k.values())

plt.figure(figsize=(8, 5))
plt.plot(ks, rmse_values, marker='o')
plt.title("RMSE vs. k in Hybrid Content-Based Recommender")
plt.xlabel("k (Top-k similar games used)")
plt.ylabel("RMSE")
plt.grid(True)
plt.xticks(ks)  # one tick per evaluated k
plt.show()

In [None]:
# Single prediction for the example user (the second argument, 13, is presumably a game id — confirm).
# NOTE(review): this rebinds `predictions_df`, which an earlier cell used for the all-games table.
predictions_df = predict_rating("jbuergel", 13, games, reviews, k=10)
predictions_df

In [None]:
# Same prediction as above, but with a human-readable explanation.
predicted_score, explanation = predict_rating_with_explanation("jbuergel", 13, games, reviews, k=10)

# The score line is only shown when a prediction exists; the explanation is printed either way.
if predicted_score is not None:
    print(f"You’d probably rate this game {predicted_score}/10")
print(explanation)

##### Cold Start for Hybrid RecSys

For this project, it is important to highlight that we also considered so-called 'cold-start' scenarios. A cold start occurs when a user has no ratings at all, or only a few, which makes collaborative or hybrid recommendations unreliable.

Our original hybrid system depends on having enough rated games to compare against, so it doesn't work well when this data is missing.

Because of this, we offer two approaches:

*   Allow users to manually select a few games they like (simulating initial ratings).
*   Recommend games based on popularity and clustering, ensuring relevance and diversity.


**OPTION 1: Cold start with user choosing three games**

In [None]:
# Simulate a cold-start user by fabricating a tiny review history and feeding it
# to the regular recommender.
cold_start_user = "new_user"

# Pretend the user picked these game IDs as favourites
liked_game_ids = [12, 91, 170]  # These must exist in games["id"]

# Create a pseudo-review dataframe with the same columns the recommender reads
cold_reviews = pd.DataFrame({
    "user": [cold_start_user] * len(liked_game_ids),
    "ID": liked_game_ids,
    "rating": [9] * len(liked_game_ids)  # simulate high preference
})

# Run the recommender with this pseudo-profile
cold_recs = recommend_content_based_mmr(
    user_name=cold_start_user,
    games=games,
    reviews=cold_reviews,
    top_k=10,
    max_playingtime=60,
    min_age=12,
    minplayers=2,
    maxplayers=4,
    min_rating=6
)

print(cold_recs)

**OPTION 2: Cold start with popularity-based recommender**

Before implementing the recommender, we explored two things:

1.   Choosing the number of clusters using the Elbow method.
2.   Visualizing the game space with t-SNE to confirm meaningful clustering.



In [None]:
# Calculate the inertia (WCSS) for different values of k
inertia = []
k_range = range(1, 30)  # Try cluster counts 1..29

# The embedding matrix is the same for every k, so stack it once outside the loop
glove_matrix = np.vstack(games["glove_vector"])

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(glove_matrix)
    inertia.append(kmeans.inertia_)

# Plot the inertia values; the "elbow" suggests a reasonable number of clusters
plt.figure(figsize=(8, 6))
plt.plot(k_range, inertia, marker="o")
plt.title("Elbow Method for Optimal Number of Clusters")
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia (WCSS)")
plt.show()

In [None]:
# If a vector is stored as a string such as "[0.1, 0.2, ...]", strip the brackets and parse it
# into a NumPy array; values that are not strings are left untouched.
games["glove_vector"] = games["glove_vector"].apply(lambda x: np.fromstring(x[1:-1], sep=',') if isinstance(x, str) else x)
n_clusters = 20
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# Adds the K-Means label of every game as a new column on `games`
games["description_cluster"] = kmeans.fit_predict(np.vstack(games["glove_vector"]))

In [None]:
# Get the embeddings (glove vectors) from the 'glove_vector' column as one 2-D matrix
embeddings = np.vstack(games["glove_vector"])

# Perform t-SNE to reduce dimensions to 2D (seeded for a repeatable layout)
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(embeddings)

# Add t-SNE results to the games dataframe (adds columns to `games` in place)
games["tsne_1"] = tsne_results[:, 0]
games["tsne_2"] = tsne_results[:, 1]

# Plot t-SNE representation, coloured by the K-Means cluster label
plt.figure(figsize=(10, 8))
plt.scatter(games["tsne_1"], games["tsne_2"], c=games["description_cluster"], cmap="viridis", alpha=0.7)
plt.colorbar(label="Cluster")
plt.title("t-SNE Visualization of K-Means Clusters")
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")
plt.show()

In [None]:
# Compute average ratings and number of ratings per game
ratings = reviews.groupby("ID")["rating"].agg(["mean", "count"]).reset_index()
ratings.columns = ["id", "avg_rating", "num_ratings"]

# Popularity score: the average rating damped by review count, avg * n / (n + 10),
# so games with few ratings are pulled towards 0 and need volume to rank highly.
ratings["popularity_score"] = (ratings["avg_rating"] * ratings["num_ratings"]) / (ratings["num_ratings"] + 10)

# Left-merge the scores into `games`: games without reviews get NaN in the new columns.
# NOTE(review): this rebinds `games`, so re-running the cell merges again and the new
# columns receive the '_ratings' suffix instead of replacing the previous ones.
games = games.merge(ratings, left_on="id", right_on="id", how="left", suffixes=('', '_ratings'))

def get_diverse_popular_games(games, top_k=10):
    """Recommend popular games while taking at most one game per description cluster.

    Parameters
    ----------
    games : DataFrame with "name", "clean_description", "description_cluster",
        "avg_rating", "num_ratings" and "popularity_score" columns.
    top_k : int, maximum number of games to return.

    Returns
    -------
    DataFrame with columns name, clean_description, avg_rating, num_ratings,
    popularity_score and explanation, sorted by descending popularity_score.
    The input frame is not modified.
    """
    output_columns = ["name", "clean_description", "avg_rating", "num_ratings", "popularity_score", "explanation"]

    # Games with missing rating statistics cannot be ranked or explained
    # (int(NaN) would raise), so leave them out of the candidate pool.
    rated_games = games.dropna(subset=["popularity_score", "avg_rating", "num_ratings"])

    # Best game of every cluster label that actually occurs. Iterating over
    # range(nunique()) would miss labels when they are not exactly 0..n-1.
    recommended_games = (
        rated_games
        .sort_values("popularity_score", ascending=False, kind="stable")
        .drop_duplicates(subset="description_cluster", keep="first")
        .head(top_k)
        .copy()  # own the data before adding a column
    )

    recommended_games["explanation"] = [
        (
            f"This game is highly rated ({round(float(avg), 2)}/10 from {int(num)} users) "
            f"and was selected to represent a distinct style or theme (cluster {cluster})."
        )
        for avg, num, cluster in zip(
            recommended_games["avg_rating"],
            recommended_games["num_ratings"],
            recommended_games["description_cluster"],
        )
    ]

    return recommended_games[output_columns]

In [None]:
# Popularity-based cold-start recommendations (rebinds `cold_recs` from Option 1)
cold_recs = get_diverse_popular_games(games, top_k=10)
print(cold_recs)

##### Tone-enhanced recommendations (TODO: the explanations part is still missing; we are not fully convinced by this method in general)

For this, we will use one-hot encoding (OHE) to encode the predicted vibe and emotion of each game.

In [None]:
def encode_vibe_emotion(games_df):
    """Fit a one-hot encoder on the "predicted_vibe" and "predicted_emotion" columns.

    Returns
    -------
    (encoded, encoder) : the dense one-hot matrix for ``games_df`` and the fitted
    encoder. Categories unseen at fit time are ignored at transform time.
    """
    tone_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    tone_columns = games_df[["predicted_vibe", "predicted_emotion"]]
    return tone_encoder.fit_transform(tone_columns), tone_encoder

def build_content_vectors(games_df, encoder):
    """Build one content vector per game: GloVe embedding followed by the encoded tone.

    Parameters
    ----------
    games_df : DataFrame with "glove_vector", "predicted_vibe" and "predicted_emotion".
    encoder : fitted encoder exposing ``transform`` for the two tone columns.

    Returns
    -------
    2-D array; each row is the game's glove vector concatenated with its tone encoding.
    """
    embedding_part = np.vstack(games_df["glove_vector"])
    tone_part = encoder.transform(games_df[["predicted_vibe", "predicted_emotion"]])
    # Column-wise concatenation: [glove | tone]
    return np.hstack([embedding_part, tone_part])

In [None]:
def recommend_content_based_tone_with_input(user_name, games, reviews, target_vibe, target_emotion,
                                            top_k=10, max_playingtime=None, min_age=None, minplayers=None,
                                            maxplayers=None, min_rating=6, n_clusters=5, alpha=0.5):
    """Recommend games by blending the user's content profile with a requested vibe/emotion.

    The user profile is the rating-weighted average of the content vectors
    (GloVe + one-hot tone) of the games the user rated >= 6. It is mixed with a
    one-hot vector for ``target_vibe``/``target_emotion``; candidates are
    clustered with K-Means and the most similar games of every cluster are kept.

    Parameters
    ----------
    user_name : user whose liked games build the profile.
    games : DataFrame with "id", "name", "rating", "glove_vector", "predicted_vibe",
        "predicted_emotion" and the columns used by the active filters.
    reviews : DataFrame with "user", "ID" and "rating".
    target_vibe, target_emotion : requested tone categories.
    top_k : maximum number of recommendations.
    max_playingtime, min_age, minplayers, maxplayers, min_rating : optional filters;
        ``None`` disables a filter.
    n_clusters : number of K-Means clusters (capped at the number of candidates).
    alpha : weight of the user profile; ``1 - alpha`` is the weight of the tone vector.

    Returns
    -------
    DataFrame (name, rating, id, predicted_vibe, predicted_emotion, similarity)
    sorted by descending similarity, or ``None`` (after printing a message) when
    no candidate or no profile is available.
    """
    # Filter games based on user preferences
    filtered_games = games.copy()

    if max_playingtime is not None:
        filtered_games = filtered_games[filtered_games["playingtime"] <= max_playingtime]
    if min_age is not None:
        filtered_games = filtered_games[filtered_games["minage"] >= min_age]
    if minplayers is not None:
        filtered_games = filtered_games[filtered_games["minplayers"] <= minplayers]
    if maxplayers is not None:
        filtered_games = filtered_games[filtered_games["maxplayers"] >= maxplayers]
    if min_rating is not None:
        filtered_games = filtered_games[filtered_games["rating"] >= min_rating]

    if filtered_games.empty:
        print("No games match the provided filters.")
        return

    # Get liked games by the user
    liked_game_ids = reviews[(reviews["user"] == user_name) & (reviews["rating"] >= 6)]
    liked_vectors = liked_game_ids.merge(games, left_on="ID", right_on="id")

    if liked_vectors.empty:
        print("Not enough liked games with vectors to build a profile.")
        return

    # Encode tone
    _, encoder = encode_vibe_emotion(games)
    tone_df = pd.DataFrame([[target_vibe, target_emotion]], columns=["predicted_vibe", "predicted_emotion"])
    tone_vector = encoder.transform(tone_df)

    # Build user profile; "rating_x" is the review rating (both merged frames have "rating")
    ratings = liked_vectors["rating_x"].values
    weights = ratings / ratings.max()
    liked_combined_vectors = build_content_vectors(liked_vectors, encoder)
    user_profile = np.average(liked_combined_vectors, axis=0, weights=weights)

    # Pad tone vector with zeros in the GloVe positions to match dimensionality
    glove_dim = user_profile.shape[0] - tone_vector.shape[1]
    padded_tone_vector = np.hstack([np.zeros(glove_dim), tone_vector.flatten()])

    # Final profile: weighted combo of user and tone vector
    final_vector = alpha * user_profile + (1 - alpha) * padded_tone_vector

    # Remove already liked games
    filtered_games = filtered_games[~filtered_games["id"].isin(liked_game_ids["ID"])]

    if filtered_games.empty:
        print("No new games to recommend after filtering out liked ones")
        return

    # Compute candidate vectors
    game_vectors = build_content_vectors(filtered_games, encoder)
    candidate_ids = filtered_games["id"].tolist()

    # Clustering: K-Means cannot fit more clusters than there are candidates
    n_clusters = min(n_clusters, len(filtered_games))
    cluster_labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(game_vectors)

    # Similarities
    similarities = cosine_similarity(game_vectors, [final_vector]).flatten()

    # Cluster-based selection: keep the most similar games of every cluster
    cluster_recommendations = []
    per_cluster = top_k // n_clusters + 1
    for cluster_id in range(n_clusters):
        indices = np.where(cluster_labels == cluster_id)[0]
        if len(indices) == 0:
            continue
        sims = similarities[indices]
        sorted_idx = indices[np.argsort(sims)[::-1]]
        for i in sorted_idx[:per_cluster]:
            cluster_recommendations.append((candidate_ids[i], similarities[i]))

    sorted_final = sorted(cluster_recommendations, key=lambda x: x[1], reverse=True)[:top_k]

    # Attach every similarity to its own game by id. Assigning the sorted list
    # positionally would pair scores with the wrong rows, because the filtered
    # frame keeps catalogue order rather than similarity order.
    similarity_by_id = dict(sorted_final)
    recommended_games = filtered_games[filtered_games["id"].isin(list(similarity_by_id))].copy()
    recommended_games["similarity"] = recommended_games["id"].map(similarity_by_id)
    recommended_games = recommended_games.sort_values("similarity", ascending=False)

    return recommended_games[["name", "rating", "id", "predicted_vibe", "predicted_emotion", "similarity"]]

In [None]:
# Example call: alpha=0.1 gives the requested tone 90% of the weight and the user profile 10%.
# The function returns None on its early-exit paths, in which case print shows "None".
recs = recommend_content_based_tone_with_input(
    user_name="sidehacker",
    games=games,
    reviews=filtered_reviews,
    target_vibe="strategic",
    target_emotion="funny",
    max_playingtime=60,
    min_age=12,
    minplayers=2,
    maxplayers=4,
    min_rating=6,
    alpha=0.1
)

print(recs)

#### Collaborative filtering

In this section, we implement collaborative filtering approaches using the Surprise library. These methods only rely on ratings, without needing content features. We explore both neighborhood-based and latent factor models.

In [None]:
# Environment fix so that the Surprise library works: remove the installed builds, then
# reinstall with pinned numpy / numba / tensorflow versions (scikit-surprise itself is unpinned).
!pip uninstall -y numpy scikit-surprise tensorflow numba
!pip install numpy==1.26.4
!pip install numba==0.60.0
!pip install scikit-surprise
!pip install tensorflow==2.18.0

In [None]:
# NOTE(review): these imports rebind two names already imported at the top of the notebook:
#   - `Dataset` (from `datasets` / `torch.utils.data`) now refers to surprise.Dataset
#   - `train_test_split` (from sklearn.model_selection) now refers to Surprise's version
# Earlier cells that rely on the previous bindings (e.g. split_reviews_for_user) will
# behave differently if they are re-run after this cell.
from surprise import SVD, Dataset, Reader, accuracy, NMF, KNNBasic
from surprise import KNNWithMeans, KNNBaseline
from surprise.model_selection import GridSearchCV, train_test_split
from collections import defaultdict

##### Data preparation and filtering

We begin by filtering out users with too many ratings (above a threshold) to avoid bias from these users. We also randomly sample users to reduce computational cost, and format the data to work with Surprise's rating-based datasets.



In [None]:
# Load the reviews from filtered_reviews.csv (semicolon-separated); rebinds the global `reviews`.
reviews = pd.read_csv("filtered_reviews.csv", sep=";")

In [None]:
# Identify and remove users with too many reviews
review_counts = reviews["user"].value_counts()  # number of reviews per user

# Set a threshold: users with strictly more than 50 reviews are removed
threshold = 50
users_to_remove = review_counts[review_counts > threshold].index

# Keep only the reviews written by the remaining users
filtered_reviews = reviews[~reviews["user"].isin(users_to_remove)]

In [None]:
# Inspect the reviews that remain after removing heavy reviewers
filtered_reviews

In [None]:
# Sample 1000 users to reduce the number of reviews; the generator is seeded so that
# the same subset is drawn on every run.
# NOTE(review): users are drawn from `reviews`, not from `filtered_reviews` built above,
# so the review-count filter is not applied to this sample — confirm this is intended.
user_sampling_rng = np.random.default_rng(42)
sampled_users = user_sampling_rng.choice(reviews["user"].unique(), size=1000, replace=False)
sampled_reviews = reviews[reviews["user"].isin(sampled_users)]

In [None]:
# Inspect the reviews of the sampled users
sampled_reviews

In [None]:
# Load data
# Ratings are declared to lie on a 1-10 scale
reader = Reader(rating_scale=(1, 10))

# Build the Surprise dataset from exactly three columns, in the order user, item (game ID), rating.
data = Dataset.load_from_df(sampled_reviews[["user", "ID", "rating"]], reader)

In [None]:
# Train-test split (80/20, seeded). `train_test_split` here is the Surprise function
# imported in the collaborative-filtering import cell, not scikit-learn's.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
print(type(trainset))
print(type(testset))

In [None]:
# Peek at the first 10 training ratings
print(list(trainset.all_ratings())[:10])
# Note: the ids shown are the trainset's internal (re-indexed) ids, not the raw user names / game ids.

In [None]:
# First 10 entries of the test set
testset[:10]

##### Neighborhood-based methods


We will begin by exploring KNN-based collaborative filtering, where the rating for a game is predicted based on similar users or items.

We will tune the hyperparameters k (number of neighbors) and min_k (minimum number of neighbors), using RMSE as the performance metric. Grid search is then applied to test different similarity measures and models such as KNNBasic and KNNWithMeans.

To narrow down the grid search, we first explore these hyperparameters with the KNNBasic model to see where the performance stabilizes. This also helps us avoid an unnecessarily large grid during the grid search.

In [None]:
# Discovering the right value for k_max: sweep k = 10, 20, ..., 190 with KNNBasic defaults
range_K = range(10,200,10)
RMSE_K = []

for k in range_K:
    print(f"Trying k = {k}")
    algo = KNNBasic(k=k)
    algo.fit(trainset)
    predictions = algo.test(testset)
    # accuracy.rmse is called with its default verbosity here, so it also prints each RMSE
    RMSE_K.append(accuracy.rmse(predictions))

plt.plot(range_K, RMSE_K)
plt.xlabel('k')
plt.ylabel('RMSE')
plt.show()

In [None]:
# Discovering the right value for min_k: sweep min_k = 1..19 with k fixed at 20
range_minK = range(1, 20)
RMSE_minK = []

for mink in range_minK:
    print(f"Trying min_k = {mink}")
    algo = KNNBasic(k=20, min_k=mink)
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)  # silent: only collect the value
    RMSE_minK.append(rmse)

plt.plot(range_minK, RMSE_minK, marker='o')
plt.xlabel('min_k')
plt.ylabel('RMSE')
plt.title('RMSE vs. min_k in KNNBasic')
plt.grid(True)
plt.show()

In [None]:
# Define parameters
param_grids = {
    'KNNBasic': {
        'k': [5, 10, 20, 30, 50, 70, 90, 100, 120],
        'min_k': [3, 6, 7],
        'sim_options': {
            'name': ['cosine', 'pearson'],
            'user_based': [False],
        }
    },
    'KNNWithMeans': {
        'k': [5, 10, 20, 30, 50, 70, 90, 100, 120],
        'min_k': [3, 6, 7],
        'sim_options': {
            'name': ['cosine', 'pearson'],
            'user_based': [False],
        }
    }
}

# Explicit name -> class lookup (replaces eval() on the algorithm name)
algo_classes = {'KNNBasic': KNNBasic, 'KNNWithMeans': KNNWithMeans}

# Store results
results = []
best_models = {}  # To store the best model for each algorithm

# Loop through algorithms and perform grid search
for algo_name, param_grid in param_grids.items():
    algo_class = algo_classes[algo_name]
    gs = GridSearchCV(algo_class, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)
    gs.fit(data)  # Cross-validate on the full dataset to find best parameters

    # Store the best model (the estimator configured with the RMSE-optimal parameters)
    best_models[algo_name] = gs.best_estimator['rmse']

    # Store the result of the grid search
    results.append({
        'Algorithm': algo_name,
        'Best RMSE': gs.best_score['rmse'],
        'Best MAE': gs.best_score['mae'],
        'Best Params': gs.best_params['rmse']
    })

    # Plotting RMSE against k.
    # Every k appears in several parameter combinations (min_k x similarity), so
    # plot the best (lowest) mean RMSE reached for each k rather than whichever
    # combination happens to come first.
    k_values = param_grid['k']
    cv_params = gs.cv_results['params']
    cv_rmse = gs.cv_results['mean_test_rmse']
    rmse_scores = [
        min(score for params, score in zip(cv_params, cv_rmse) if params['k'] == k)
        for k in k_values
    ]

    plt.figure(figsize=(8, 6))
    plt.plot(k_values, rmse_scores, marker='o', linestyle='-', color='b')
    plt.title(f'RMSE vs. k for {algo_name}')
    plt.xlabel('k')
    plt.ylabel('RMSE')
    plt.grid(True)
    plt.show()

In [None]:
# Show results of the grid search: one row per algorithm with its best RMSE, MAE and parameters
results_df = pd.DataFrame(results)
results_df

In [None]:
# Print the best parameters in full (the DataFrame display may truncate the dict)
for index, row in results_df.iterrows():
    print(f"Algorithm: {row['Algorithm']}")
    print(f"Best Params: {row['Best Params']}")
    print("-" * 20)

In [None]:
# Choose the best algorithm based on the best RMSE from the grid search
# (idxmin returns the row label of the lowest 'Best RMSE')
best_algorithm_name = results_df.loc[results_df['Best RMSE'].idxmin()]['Algorithm']
best_model = best_models[best_algorithm_name]
print(f"Using the best model: {best_algorithm_name} with parameters: {results_df.loc[results_df['Best RMSE'].idxmin()]['Best Params']}")

Once we have found the best model, we can test its ability to recommend games to a specific user.

In [None]:
# NOTE(review): the hyperparameters are hard-coded here and replace the `best_model`
# selected from the grid search in the previous cell — confirm they match its result.
best_model = KNNWithMeans(k=50, min_k=5, sim_options={'name': 'cosine', 'user_based': False})
best_model.fit(trainset)

import pickle

# Save KNNWithMeans model
with open("knn_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

# Same example user as in the content-based sections (was misspelled 'jbourgel',
# which matches no reviews and therefore yields non-personalised predictions).
# NOTE(review): the model is trained on a random sample of users; presumably a user
# outside that sample only receives Surprise's default estimate — verify.
user = 'jbuergel'

# Games the user has already rated
rated_game_ids = reviews[reviews["user"] == user]["ID"].unique()

# All possible game IDs
all_game_ids = games["id"].unique()

# Filter out the ones already rated (set lookup instead of scanning the array per game)
rated_game_id_set = set(rated_game_ids)
unseen_game_ids = [gid for gid in all_game_ids if gid not in rated_game_id_set]

# Predict ratings for the unseen games
predictions_user = [(gid, best_model.predict(user, gid).est) for gid in unseen_game_ids]

# Sort by predicted rating
top_k = 10
top_predictions = sorted(predictions_user, key=lambda x: x[1], reverse=True)[:top_k]

# Get game info from the original games DataFrame.
# Merging onto the ranked predictions keeps the recommendation order and shows the
# predicted rating; filtering `games` with isin() would fall back to catalogue order.
top_game_ids = [pred[0] for pred in top_predictions]
recommended_games = (
    pd.DataFrame(top_predictions, columns=["id", "predicted_rating"])
    .merge(games[["id", "name", "rating"]], on="id", how="left")
    [["id", "name", "rating", "predicted_rating"]]
)

print("\nTop Recommended Games:")
print(recommended_games.reset_index(drop=True))

In [None]:
# Get predictions on test set
predictions = best_model.test(testset)

# Calculate performance metrics (accuracy.rmse / accuracy.mae print the value and return it)
print("Evaluation Metrics:")
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

In [None]:
# Precision@K and Recall@K (k = 10 by default)
def precision_recall_at_k(predictions, k=10, threshold=7.0):
    """Compute precision@k and recall@k, averaged over users.

    Parameters
    ----------
    predictions : iterable of 5-tuples ``(uid, iid, true_rating, est_rating, details)``.
    k : int, number of top-estimated items considered per user.
    threshold : float, an item is "relevant" when its true rating is >= threshold
        and "recommended" when its estimated rating is >= threshold.

    Returns
    -------
    (avg_precision, avg_recall). A user with no recommended (resp. relevant) item
    contributes 0 to precision (resp. recall). Returns ``(0.0, 0.0)`` when
    ``predictions`` is empty.
    """
    # Group predictions by user: uid -> list of (item_id, true_rating, est_rating)
    user_est_true = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_est_true[uid].append((iid, true_r, est))

    # No users at all: avoid dividing by zero when averaging below
    if not user_est_true:
        return 0.0, 0.0

    precisions = dict()
    recalls = dict()

    # For each user we simulate getting their top-k recommendations:
    for uid, user_ratings in user_est_true.items():
        # Sort by estimated rating, best first
        user_ratings.sort(key=lambda x: x[2], reverse=True)
        top_k = user_ratings[:k]

        # How many relevant items does the user have? (true_rating >= threshold)
        n_rel = sum((true_r >= threshold) for (_, true_r, _) in user_ratings)

        # How many of the top-k items are recommended? (est_rating >= threshold)
        n_rec_k = sum((est >= threshold) for (_, _, est) in top_k)

        # How many of the top-k items are both relevant and recommended?
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (_, true_r, est) in top_k)

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    # Average over all users
    avg_precision = sum(precisions.values()) / len(precisions)
    avg_recall = sum(recalls.values()) / len(recalls)
    return avg_precision, avg_recall

In [None]:
# Ranking quality of the current `predictions` (default relevance threshold of 7.0)
precision, recall = precision_recall_at_k(predictions, k=10)
print(f"Precision@10: {precision:.4f}")
print(f"Recall@10: {recall:.4f}")

##### Latent based methods

Before ending this section of collaborative filtering, we also wanted to test latent factor models. These try to compress user and item interactions into a smaller set of hidden features.

These methods try to uncover the underlying structure in rating data, capturing things like genre preference, complexity tolerance, or thematic interests.

We test two models from the Surprise library:

*   SVD (Singular Value Decomposition): Learns latent factors via matrix factorization using gradient descent.
*   NMF (Non-negative Matrix Factorization): Learns additive latent components, and it is often more interpretable.

In [None]:
from surprise import SVD, NMF
from surprise.model_selection import GridSearchCV

# Define models and corresponding grids
param_grids = {
    'SVD': {
        'n_factors': [10, 25, 50],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.02, 0.1]
    },
    'NMF': {
        'n_factors': [15, 30, 70],
        'reg_pu': [0.06, 0.1],
        'reg_qi': [0.06, 0.1]
    }
}

# Explicit name -> class lookup (replaces eval() on the algorithm name)
algo_classes = {'SVD': SVD, 'NMF': NMF}

# Legend label used for each hyperparameter that is varied alongside n_factors
legend_labels = {'lr_all': 'lr', 'reg_all': 'reg', 'reg_pu': 'reg_pu', 'reg_qi': 'reg_qi'}

# Store results
results = []
best_models = {}  # To store the best model for each algorithm

# Loop through algorithms and perform grid search
for algo_name, param_grid in param_grids.items():
    algo_class = algo_classes[algo_name]
    gs = GridSearchCV(algo_class, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)
    gs.fit(data)  # Cross-validate on the full dataset to find best parameters

    # Store the best model: the RMSE-optimal estimator, consistent with the KNN grid
    # search (gs.best_estimator itself is a dict keyed by measure)
    best_models[algo_name] = gs.best_estimator['rmse']

    # Store the result of the grid search
    results.append({
        'Algorithm': algo_name,
        'Best RMSE': gs.best_score['rmse'],
        'Best MAE': gs.best_score['mae'],
        'Best Params': gs.best_params['rmse']
    })

    # Plot RMSE vs n_factors, one line per combination of the two other
    # hyperparameters of this grid (lr_all/reg_all for SVD, reg_pu/reg_qi for NMF).
    n_factors = param_grid['n_factors']
    first_param, second_param = [name for name in param_grid if name != 'n_factors']
    cv_params = gs.cv_results['params']
    cv_rmse = gs.cv_results['mean_test_rmse']

    plt.figure(figsize=(8, 6))
    for first_value in param_grid[first_param]:
        for second_value in param_grid[second_param]:
            # Exactly one grid point matches each (n_factors, first, second) triple
            rmse_scores = [
                next(
                    score
                    for params, score in zip(cv_params, cv_rmse)
                    if params['n_factors'] == nf
                    and params[first_param] == first_value
                    and params[second_param] == second_value
                )
                for nf in n_factors
            ]
            plt.plot(
                n_factors,
                rmse_scores,
                marker='o',
                label=f'{legend_labels[first_param]}={first_value}, {legend_labels[second_param]}={second_value}',
            )

    plt.title(f'RMSE vs n_factors for {algo_name}')
    plt.xlabel('n_factors')
    plt.ylabel('RMSE')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
# Show results of the latent-factor grid search (rebinds `results_df` from the KNN section)
results_df = pd.DataFrame(results)
results_df

In [None]:
# Print the best parameters of each latent-factor model in full
for index, row in results_df.iterrows():
    print(f"Algorithm: {row['Algorithm']}")
    print(f"Best Params: {row['Best Params']}")
    print("-" * 20)

In [None]:
# NOTE(review): hyperparameters are hard-coded rather than read from the grid search
# results above — confirm they match its best parameters.
best_model = SVD(n_factors=10, lr_all=0.005, reg_all=0.02)
best_model.fit(trainset)

# Save SVD model (relies on `pickle` imported in the KNN recommendation cell)
with open("svd_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

# Same example user as in the content-based sections (was misspelled 'jbourgel',
# which matches no reviews and therefore yields non-personalised predictions).
user = 'jbuergel'

# Games user has already rated
rated_game_ids = reviews[reviews["user"] == user]["ID"].unique()

# All possible game ids
all_game_ids = games["id"].unique()

# Filter out the ones already rated (set lookup instead of scanning the array per game)
rated_game_id_set = set(rated_game_ids)
unseen_game_ids = [gid for gid in all_game_ids if gid not in rated_game_id_set]

# Predict ratings for the unseen games
predictions_user = [(gid, best_model.predict(user, gid).est) for gid in unseen_game_ids]

# Sort by predicted rating
top_k = 10
top_predictions = sorted(predictions_user, key=lambda x: x[1], reverse=True)[:top_k]

# Get game info from the original games DataFrame.
# Merging onto the ranked predictions keeps the recommendation order and shows the
# predicted rating; filtering `games` with isin() would fall back to catalogue order.
top_game_ids = [pred[0] for pred in top_predictions]
recommended_games = (
    pd.DataFrame(top_predictions, columns=["id", "predicted_rating"])
    .merge(games[["id", "name", "rating"]], on="id", how="left")
    [["id", "name", "rating", "predicted_rating"]]
)
print("\nTop Recommended Games:")
print(recommended_games.reset_index(drop=True))

In [None]:
# Get predictions on test set (now with the SVD model bound to `best_model`)
predictions = best_model.test(testset)

# Calculate performance metrics
print("Evaluation Metrics:")
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

In [None]:
# Ranking quality of the SVD predictions (default relevance threshold of 7.0)
precision, recall = precision_recall_at_k(predictions, k=10)
print(f"Precision@10: {precision:.4f}")
print(f"Recall@10: {recall:.4f}")

Now we represent the projection of the latent space of the games created with the SVD model. Only the 30 first games will have the id label to ensure readability. The games that are close to each other have similar latent factors.

In [None]:
# Get game factors: map each inner item id to its raw id and to its row of best_model.qi
item_ids = trainset.all_items()
item_inner_ids = list(item_ids)
item_raw_ids = [trainset.to_raw_iid(inner_id) for inner_id in item_inner_ids]
item_factors = np.array([best_model.qi[inner_id] for inner_id in item_inner_ids])

# Reduce to 2D (seeded t-SNE)
tsne = TSNE(n_components=2, random_state=42, perplexity = 15)
item_2d = tsne.fit_transform(item_factors)

# Plot
plt.figure(figsize=(12, 8))
plt.scatter(item_2d[:, 0], item_2d[:, 1], alpha=0.6)
# Only the first 30 items are labelled, to keep the figure readable
for i, item_id in enumerate(item_raw_ids[:30]):  # label a few
    plt.text(item_2d[i, 0], item_2d[i, 1], item_id, fontsize=9)
plt.title("t-SNE Visualization of Game Embeddings (SVD)")
plt.xlabel("Dim 1")
plt.ylabel("Dim 2")
plt.grid(True)
plt.show()

The following plot represents the user-item matrix of 10 users and 10 games. We can see that the games with predicted low ratings will propagate over users and the same with high ratings.

In [None]:
import seaborn as sns

# Pick a sample of users and items (the first 10 inner ids of each)
sample_users = list(trainset.all_users())[:10]
sample_items = list(trainset.all_items())[:10]

# Fill a users x items matrix with the model's estimated rating for every pair;
# predict() takes raw ids, hence the inner -> raw conversion.
matrix = np.zeros((len(sample_users), len(sample_items)))
for i, uid in enumerate(sample_users):
    for j, iid in enumerate(sample_items):
        matrix[i, j] = best_model.predict(trainset.to_raw_uid(uid), trainset.to_raw_iid(iid)).est

sns.heatmap(matrix, annot=True, fmt=".1f", cmap="coolwarm")
plt.xlabel("Games")
plt.ylabel("Users")
plt.title("Predicted Ratings Heatmap (Sample)")
plt.show()

## Dashboard

In [None]:
# Environment fix so that the Surprise library works (same commands as in the
# collaborative-filtering section): reinstall with pinned numpy / numba / tensorflow.
!pip uninstall -y numpy scikit-surprise tensorflow numba
!pip install numpy==1.26.4
!pip install numba==0.60.0
!pip install scikit-surprise
!pip install tensorflow==2.18.0

In [None]:
# Install jupyter-dash for the dashboard cells below (version not pinned)
!pip install jupyter-dash

### Old

In [None]:
# NOTE(review): `html` here is dash's component module and rebinds the standard-library
# `html` module imported at the top of the notebook; earlier cells that use the stdlib
# module will break if re-run after this cell.
from dash import Dash, dcc, html, Input, Output, State
import dash

# THE SIMPLEST VERSION

# === Recommendation Function ===
def recommend_content_based(user_name, games, reviews, top_k=10):
    """Rank unseen games by cosine similarity to the user's rating-weighted GloVe profile.

    Parameters
    ----------
    user_name : user whose reviews with rating >= 6 define the profile.
    games : DataFrame with "id", "name" and "glove_vector".
    reviews : DataFrame with "user", "ID" and "rating".
    top_k : number of games to return.

    Returns
    -------
    DataFrame with "name" and "similarity" (most similar first), or an empty
    DataFrame when the user has no liked game that can be matched to ``games``.
    """
    is_liked = (reviews["user"] == user_name) & (reviews["rating"] >= 6)
    liked_reviews = reviews[is_liked]
    if liked_reviews.empty:
        return pd.DataFrame()

    liked_with_vectors = liked_reviews.merge(games[["id", "glove_vector"]], left_on="ID", right_on="id")
    if liked_with_vectors.empty:
        return pd.DataFrame()

    # Profile = average of liked vectors, weighted by rating relative to the best rating
    liked_ratings = liked_with_vectors["rating"].values
    user_profile = np.average(
        np.vstack(liked_with_vectors["glove_vector"]),
        axis=0,
        weights=liked_ratings / liked_ratings.max(),
    )

    # Score every game the user has not liked yet
    unseen_games = games[~games["id"].isin(liked_reviews["ID"])]
    scores = cosine_similarity(np.vstack(unseen_games["glove_vector"]), [user_profile]).flatten()

    ranked = unseen_games.assign(similarity=scores).sort_values("similarity", ascending=False)
    return ranked.head(top_k)[["name", "similarity"]]

# === Create Dash App ===
app = Dash(__name__)
app.title = "Game Recommender"

# Layout: a username text box, a submit button, and a container that the callback fills
app.layout = html.Div([
    html.H2("Game Recommender Dashboard"),
    html.Div([
        dcc.Input(id="username-input", type="text", placeholder="Enter your username..."),
        html.Button("Get Recommendations", id="submit-btn", n_clicks=0),
    ], style={"margin-bottom": "20px"}),

    html.Div(id="recommendation-output")  # target of update_recommendations
])

@app.callback(
    Output("recommendation-output", "children"),
    Input("submit-btn", "n_clicks"),
    State("username-input", "value")
)
def update_recommendations(n_clicks, user_input):
    """Dash callback: render a recommendation table for the entered username.

    Triggered by the submit button; the username is read as State, so typing
    alone does not trigger it. Returns a message Div when no username was
    entered or no recommendation could be computed.
    """
    if not user_input:
        return html.Div("Please enter a username.", style={"color": "white"})

    results = recommend_content_based(user_input, games, reviews)
    if results.empty:
        return html.Div("No recommendations found. Check if the user exists and has rated any games.", style={"color": "red"})

    heading = html.H4(f"Top Recommendations for '{user_input}':")
    table_head = html.Thead(html.Tr([html.Th("Game"), html.Th("Similarity")]))
    # One table row per recommended game, similarity shown with three decimals
    table_rows = [
        html.Tr([html.Td(row["name"]), html.Td(f"{row['similarity']:.3f}")])
        for _, row in results.iterrows()
    ]
    return html.Div([
        heading,
        html.Table([table_head, html.Tbody(table_rows)])
    ])

# === Run it in notebook with the new built-in method ===
# jupyter_mode="inline" renders the app inside the cell output; debug=True enables Dash's dev tools
app.run(jupyter_mode="inline", debug=True)

In [None]:
import dash
from dash import dcc, html, Input, Output, State
# ------------------------------
# Recommendation Functions
# ------------------------------

def mmr(user_profile, candidate_vectors, candidate_ids, lambda_param=0.7, top_k=10):
    """Pick up to ``top_k`` candidates with Maximal Marginal Relevance (MMR).

    Each round selects the candidate maximising
    ``lambda_param * sim(candidate, user) - (1 - lambda_param) * max sim(candidate, already selected)``,
    trading relevance to the user profile against redundancy with earlier picks.

    Args:
        user_profile: 1-D vector describing the user.
        candidate_vectors: Sequence/array with one vector per candidate.
        candidate_ids: Identifiers aligned position-by-position with
            ``candidate_vectors``.
        lambda_param: Weight of relevance versus diversity (1.0 = relevance only).
        top_k: Maximum number of candidates to select.

    Returns:
        List of selected ids in selection order. It holds fewer than ``top_k``
        ids when there are not enough candidates, and is empty when there are
        no candidates at all.
    """
    if len(candidate_vectors) == 0:
        return []

    selected = []
    selected_ids = []
    candidate_indices = list(range(len(candidate_vectors)))
    similarities_to_user = cosine_similarity(candidate_vectors, [user_profile]).flatten()
    similarity_matrix = cosine_similarity(candidate_vectors)

    # Never run more rounds than there are candidates: once the pool is empty
    # there is nothing left to score and max() would raise on an empty list.
    for _ in range(min(top_k, len(candidate_indices))):
        best_idx = None
        best_score = None
        for idx in candidate_indices:
            # No penalty for the first pick; afterwards penalise by the closest
            # already-selected item.
            diversity_penalty = 0 if not selected else max(similarity_matrix[idx][j] for j in selected)
            mmr_score = lambda_param * similarities_to_user[idx] - (1 - lambda_param) * diversity_penalty
            # Strict ">" keeps the earliest candidate on ties.
            if best_score is None or mmr_score > best_score:
                best_idx, best_score = idx, mmr_score

        selected.append(best_idx)
        selected_ids.append(candidate_ids[best_idx])
        candidate_indices.remove(best_idx)

    return selected_ids

def recommend_mmr(user_profile, filtered_games, liked_ids, top_k=10, lambda_param=0.7):
    """Recommend games via MMR among the filtered games the user has not liked yet.

    Args:
        user_profile: 1-D profile vector of the user.
        filtered_games: DataFrame with at least ``id`` and ``glove_vector`` columns.
        liked_ids: Ids of games to exclude from the candidates.
        top_k: Maximum number of recommendations.
        lambda_param: Relevance/diversity trade-off forwarded to ``mmr``.

    Returns:
        DataFrame of the selected games with an added ``similarity`` column,
        sorted by similarity (descending). Empty (but still carrying the
        ``similarity`` column) when no candidate remains.
    """
    # Work on an explicit copy so adding a column never writes into a slice of
    # the caller's frame.
    candidates = filtered_games[~filtered_games["id"].isin(liked_ids)].copy()
    if candidates.empty:
        # np.vstack cannot stack zero vectors; return an empty result instead.
        return candidates.assign(similarity=np.empty(0))

    game_vectors = np.vstack(candidates["glove_vector"].values)
    candidate_ids = candidates["id"].tolist()
    # Asking for more picks than there are candidates is clamped, not an error.
    top_k = min(top_k, len(candidate_ids))
    selected_ids = mmr(user_profile, game_vectors, candidate_ids, lambda_param, top_k)
    candidates["similarity"] = cosine_similarity(game_vectors, [user_profile]).flatten()
    return candidates[candidates["id"].isin(selected_ids)].sort_values(by="similarity", ascending=False)

def recommend_cluster(user_profile, filtered_games, liked_ids, top_k=10, n_clusters=5):
    """Recommend games by taking the most similar games from each KMeans cluster.

    Candidates (filtered games minus liked ones) are clustered on their GloVe
    vectors; from every cluster the ``top_k // n_clusters + 1`` games most
    similar to the user profile are shortlisted, and the overall ``top_k`` of
    that shortlist are returned.

    Args:
        user_profile: 1-D profile vector of the user.
        filtered_games: DataFrame with at least ``id`` and ``glove_vector`` columns.
        liked_ids: Ids of games to exclude from the candidates.
        top_k: Maximum number of recommendations.
        n_clusters: Requested number of clusters (clamped to the candidate count).

    Returns:
        DataFrame of recommended games with a ``similarity`` column that belongs
        to the game on the same row, ordered by similarity (descending). Empty
        when no candidate remains.
    """
    candidates = filtered_games[~filtered_games["id"].isin(liked_ids)]
    if candidates.empty:
        # Nothing to cluster; np.vstack / KMeans would raise on empty input.
        return candidates.assign(similarity=np.empty(0))

    game_vectors = np.vstack(candidates["glove_vector"].values)
    # KMeans cannot fit more clusters than there are samples.
    n_clusters = max(1, min(n_clusters, len(candidates)))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(game_vectors)
    similarities = cosine_similarity(game_vectors, [user_profile]).flatten()

    per_cluster = top_k // n_clusters + 1
    shortlisted = []  # row positions within `candidates`
    for cluster_id in range(n_clusters):
        members = np.where(cluster_labels == cluster_id)[0]
        if len(members) == 0:
            continue
        ranked = members[np.argsort(similarities[members])[::-1]]
        shortlisted.extend(ranked[:per_cluster])

    shortlisted.sort(key=lambda pos: similarities[pos], reverse=True)
    top_positions = shortlisted[:top_k]

    # Select rows by position so each similarity value is attached to the game
    # it was computed for (an isin() filter would keep frame order while the
    # scores are in ranked order).
    return candidates.iloc[top_positions].assign(similarity=similarities[top_positions])


# ------------------------------
# User Profile and Filtering
# ------------------------------

def build_user_profile(user_name, games, reviews):
    """Build a rating-weighted average GloVe vector from the games a user liked.

    A review counts as "liked" when its rating is at least 6. Liked reviews are
    joined to ``games`` (reviews ``ID`` -> games ``id``) to fetch the vectors.

    Args:
        user_name: Value matched against the ``user`` column of ``reviews``.
        games: DataFrame with ``id`` and ``glove_vector`` columns.
        reviews: DataFrame with ``user``, ``ID`` and ``rating`` columns.

    Returns:
        ``(profile, liked_ids)`` where ``profile`` is the weighted mean vector
        (weights are ratings divided by the user's highest liked rating) and
        ``liked_ids`` the list of liked game ids; ``(None, [])`` when the user
        has no liked game with a vector.
    """
    is_liked_by_user = (reviews["user"] == user_name) & (reviews["rating"] >= 6)
    liked = reviews[is_liked_by_user].merge(
        games[["id", "glove_vector"]], left_on="ID", right_on="id"
    )
    if liked.empty:
        return None, []

    scores = liked["rating"].values
    stacked = np.vstack(liked["glove_vector"])
    profile = np.average(stacked, axis=0, weights=scores / scores.max())
    return profile, liked["ID"].tolist()

def filter_games(games, max_time, min_age, min_p, max_p):
    """Return the games that satisfy all four dashboard filters.

    A game is kept when its ``playingtime`` is at most ``max_time``, its
    ``minage`` is at least ``min_age``, its ``minplayers`` is at most ``min_p``
    and its ``maxplayers`` is at least ``max_p``. The input frame is not
    modified.
    """
    snapshot = games.copy()
    keep = (
        (snapshot["playingtime"] <= max_time)
        & (snapshot["minage"] >= min_age)
        & (snapshot["minplayers"] <= min_p)
        & (snapshot["maxplayers"] >= max_p)
    )
    return snapshot[keep]


# ------------------------------
# Dash App
# ------------------------------

app = dash.Dash(__name__)


def _slider_block(label, slider_id, **slider_props):
    """Return the ``[Br, Label, Slider]`` trio used for every numeric control."""
    return [html.Br(), html.Label(label), dcc.Slider(id=slider_id, **slider_props)]


# Single-page form: user name, strategy choice, numeric filters, the
# strategy-specific option panels, the submit button and the results container.
app.layout = html.Div([
    html.H2("Board Game Recommender"),

    html.Label("Enter your name"),
    dcc.Input(id='user-input', type='text', value='', debounce=True),  # User input for name

    html.Br(),
    html.Label("Recommendation Strategy"),
    dcc.RadioItems(
        id='recommender',
        options=[
            {'label': 'MMR', 'value': 'mmr'},
            {'label': 'Cluster-based', 'value': 'cluster'}
        ],
        value='mmr',
        inline=True,
    ),

    *_slider_block("Number of Recommendations", 'top-k', min=1, max=20, step=1, value=10),
    *_slider_block("Max Playing Time", 'max-time', min=10, max=300, step=10, value=60),
    *_slider_block("Min Age", 'min-age', min=0, max=21, step=1, value=12),
    *_slider_block("Min Players", 'min-p', min=1, max=10, step=1, value=2),
    *_slider_block("Max Players", 'max-p', min=1, max=10, step=1, value=4),

    # Strategy-specific panels; their visibility is switched by a callback on
    # the 'recommender' radio items.
    html.Div(
        id='mmr-options',
        children=_slider_block("Lambda (for MMR)", 'lambda', min=0.0, max=1.0, step=0.05, value=0.7),
    ),
    html.Div(
        id='cluster-options',
        style={'display': 'none'},
        children=_slider_block("Number of Clusters", 'n-clusters', min=2, max=10, step=1, value=5),
    ),

    html.Br(),
    html.Button("Generate Recommendations", id="submit", n_clicks=0),
    html.Br(), html.Br(),

    html.Div(id='results')
])


@app.callback(
    Output('mmr-options', 'style'),
    Output('cluster-options', 'style'),
    Input('recommender', 'value')
)
def toggle_options(value):
    """Show the option panel of the chosen strategy and hide the other one.

    Returns a pair of style dicts ``(mmr_panel_style, cluster_panel_style)``.
    """
    shown, hidden = {'display': 'block'}, {'display': 'none'}
    return (shown, hidden) if value == 'mmr' else (hidden, shown)

@app.callback(
    Output('results', 'children'),
    Input('submit', 'n_clicks'),
    State('user-input', 'value'),  # User input for name
    State('recommender', 'value'),
    State('top-k', 'value'),
    State('max-time', 'value'),
    State('min-age', 'value'),
    State('min-p', 'value'),
    State('max-p', 'value'),
    State('lambda', 'value'),
    State('n-clusters', 'value')
)
def recommend(n_clicks, user_name, method, top_k, max_time, min_age, min_p, max_p, lambda_param, n_clusters):
    """Produce the recommendation table for the current form values.

    Runs when the submit button is clicked; every other control is read as
    state. Builds the user's profile, applies the game filters and delegates to
    the MMR or the cluster-based recommender.

    Returns:
        A message (string or ``html.Div``) when no recommendation can be made,
        otherwise an ``html.Table`` with name, similarity, play time and
        minimum age of each recommended game.
    """
    if n_clicks == 0 or not user_name:
        return "Please enter your name to get recommendations."

    profile, liked_ids = build_user_profile(user_name, games, reviews)
    if profile is None:
        return html.Div("Not enough liked games.")

    filtered = filter_games(games, max_time, min_age, min_p, max_p)
    if filtered.empty:
        return html.Div("No games matched the filters.")

    # The recommenders drop already-liked games before ranking. Make sure
    # something is left and never request more items/clusters than remain,
    # otherwise the vector stacking, MMR loop or KMeans fit would raise.
    n_candidates = int((~filtered["id"].isin(liked_ids)).sum())
    if n_candidates == 0:
        return html.Div("No new games matched the filters.")
    top_k = min(top_k, n_candidates)

    if method == "mmr":
        recs = recommend_mmr(profile, filtered, liked_ids, top_k=top_k, lambda_param=lambda_param)
    else:
        recs = recommend_cluster(profile, filtered, liked_ids, top_k=top_k, n_clusters=min(n_clusters, n_candidates))

    header = html.Tr([html.Th("Game"), html.Th("Similarity"), html.Th("Play Time"), html.Th("Min Age")])
    body = [
        html.Tr([
            html.Td(r["name"]),
            html.Td(f"{r['similarity']:.2f}"),
            html.Td(r["playingtime"]),
            html.Td(r["minage"])
        ]) for _, r in recs.iterrows()
    ]

    return html.Table([header] + body)


# Start the Dash development server for the app defined above, but only when
# this code runs as the main module.
# NOTE(review): inside a notebook kernel `__name__` is presumably "__main__",
# so this would start the server on cell execution - confirm.
if __name__ == '__main__':
    app.run(debug=True)

### New: multi-page recommender dashboard

In [None]:
import base64
import pickle
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from gensim import corpora, models
from dash import Dash, html, dcc, Input, Output, callback_context, State
from dash.exceptions import PreventUpdate
from dash.dependencies import ALL
from wordcloud import WordCloud
from io import BytesIO
from surprise import KNNWithMeans, Dataset, Reader
from surprise import SVD

# ------------------------------
# Dataset definition
# ------------------------------
filename = "boardgames_3000_clean_dual_tone_glove_cluster_popularity.csv"


def _parse_glove_vector(raw):
    """Turn a stored vector string such as ``"[0.1 0.2 ...]"`` into a NumPy array."""
    return np.fromstring(str(raw).strip("[]"), sep=" ")


# Games table: the CSV stores each GloVe vector as bracketed, space-separated
# text, so the column is parsed back into arrays right after loading.
games = pd.read_csv(filename, sep=";")
games["glove_vector"] = games["glove_vector"].apply(_parse_glove_vector)

# User reviews used by the recommenders defined in this cell.
reviews = pd.read_csv("filtered_reviews.csv", sep=";")

# ------------------------------
# TF-IDF
# ------------------------------
def prepare_tfidf_matrix(games):
    """Build the gensim TF-IDF representation of the game descriptions.

    Each ``clean_description`` is split on whitespace, turned into a
    bag-of-words corpus and weighted with a gensim ``TfidfModel``.

    Args:
        games: DataFrame with a ``clean_description`` text column.

    Returns:
        ``(dictionary, bow_corpus, tfidf_corpus, X_tfidf)`` where ``X_tfidf``
        is a dense ``(n_documents, n_terms)`` array of TF-IDF weights.
    """
    token_lists = [text.split() for text in games["clean_description"]]

    dictionary = corpora.Dictionary(token_lists)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]

    tfidf_model = models.TfidfModel(bow_corpus)
    tfidf_corpus = tfidf_model[bow_corpus]

    # Densify: one row per document, one column per dictionary term.
    dense = np.zeros((len(tfidf_corpus), len(dictionary)))
    for row, weighted_terms in enumerate(tfidf_corpus):
        for col, weight in weighted_terms:
            dense[row, col] = weight

    return dictionary, bow_corpus, tfidf_corpus, dense

def plot_tfidf_summary(dictionary, X_tfidf, top_n=25):
    """Summarise a TF-IDF matrix as a bar chart and a word cloud.

    Args:
        dictionary: Mapping from term id to term (indexable by integer id).
        X_tfidf: Dense ``(n_documents, n_terms)`` TF-IDF array.
        top_n: Number of terms shown in the bar chart.

    Returns:
        ``(bar_fig, encoded_image)``: a Plotly bar figure of the ``top_n`` terms
        by mean TF-IDF, and a base64-encoded PNG of a word cloud weighted by
        each term's summed TF-IDF.
    """
    mean_by_term = X_tfidf.mean(axis=0)
    total_by_term = X_tfidf.sum(axis=0)

    # Bar chart of the terms with the highest average weight.
    ranked = mean_by_term.argsort()[::-1][:top_n]
    bar_fig = go.Figure(go.Bar(
        x=[dictionary[term_id] for term_id in ranked],
        y=mean_by_term[ranked],
        marker_color='indigo'
    ))
    bar_fig.update_layout(
        xaxis_title="Term",
        yaxis_title="Average TF-IDF",
        xaxis_tickangle=-45
    )

    # Word cloud over all terms, sized by total weight.
    frequencies = {dictionary[term_id]: total_by_term[term_id] for term_id in range(len(dictionary))}
    cloud = WordCloud(
        width=800, height=400,
        background_color='white',
        colormap='viridis',
        max_words=200
    ).generate_from_frequencies(frequencies)

    # Serialise the image as base64 so it can be embedded in an <img> src.
    png_buffer = BytesIO()
    cloud.to_image().save(png_buffer, format='PNG')
    return bar_fig, base64.b64encode(png_buffer.getvalue()).decode()

# Build the TF-IDF artefacts once, when this cell runs. Only the word-cloud
# image from the default summary is kept here (the bar figure is discarded);
# `encoded_image` is embedded in the exploration page layout further down.
dictionary, bow_corpus, tfidf_corpus, X_tfidf = prepare_tfidf_matrix(games)
_, encoded_image = plot_tfidf_summary(dictionary, X_tfidf)


# ------------------------------
# Popular Games
# ------------------------------
def get_diverse_popular_games(games, top_k=10):
    """Return the most popular game of each description cluster.

    For every distinct value of ``description_cluster20`` the game with the
    highest ``popularity_score`` is taken; the winners are then ranked by
    popularity and cut to ``top_k``.

    Args:
        games: DataFrame with ``name``, ``clean_description``, ``avg_rating``,
            ``num_ratings``, ``popularity_score`` and ``description_cluster20``.
        top_k: Maximum number of games to return.

    Returns:
        DataFrame with columns ``name``, ``clean_description``, ``avg_rating``,
        ``num_ratings``, ``popularity_score`` and ``explanation``; empty when
        ``games`` has no clustered rows.
    """
    output_columns = ["name", "clean_description", "avg_rating", "num_ratings", "popularity_score", "explanation"]

    # Iterate over the labels that actually occur instead of assuming they are
    # exactly 0..n-1; otherwise non-contiguous labels are silently skipped.
    cluster_labels = sorted(games["description_cluster20"].dropna().unique())
    top_per_cluster = [
        games[games["description_cluster20"] == label]
        .sort_values("popularity_score", ascending=False)
        .head(1)
        for label in cluster_labels
    ]

    if not top_per_cluster:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return games.iloc[0:0].assign(explanation=pd.Series(dtype=object))[output_columns]

    recommended_games = (
        pd.concat(top_per_cluster)
        .sort_values("popularity_score", ascending=False)
        .head(top_k)
        .copy()  # explicit copy: a column is added below
    )

    recommended_games["explanation"] = [
        f"This game is highly rated ({round(avg, 2)}/10 from {int(num)} users)"
        for avg, num in zip(recommended_games["avg_rating"], recommended_games["num_ratings"])
    ]

    return recommended_games[output_columns]

# ------------------------------
# Content-Based System (MMR)
# ------------------------------
def mmr(user_profile, candidate_vectors, candidate_ids, lambda_param=0.7, top_k=10):
    """Pick up to ``top_k`` candidates with Maximal Marginal Relevance (MMR).

    Each round selects the not-yet-selected candidate maximising
    ``lambda_param * sim(candidate, user) - (1 - lambda_param) * max sim(candidate, already selected)``.
    The first pick carries no diversity penalty. Ties go to the candidate with
    the lowest position.

    Args:
        user_profile: 1-D vector describing the user.
        candidate_vectors: Sequence/array with one vector per candidate.
        candidate_ids: Identifiers aligned position-by-position with
            ``candidate_vectors``.
        lambda_param: Weight of relevance versus diversity (1.0 = relevance only).
        top_k: Maximum number of candidates to select.

    Returns:
        List of selected ids in selection order. It holds fewer than ``top_k``
        ids when there are not enough candidates, and is empty when there are
        no candidates at all.
    """
    n_candidates = len(candidate_vectors)
    if n_candidates == 0:
        return []

    similarities_to_user = cosine_similarity(candidate_vectors, [user_profile]).flatten()
    similarity_matrix = cosine_similarity(candidate_vectors)

    selected_ids = []
    available = np.ones(n_candidates, dtype=bool)
    # Highest similarity of each candidate to anything selected so far
    # (zero, i.e. no penalty, until the first pick is made).
    diversity_penalty = np.zeros(n_candidates)

    # Clamp the number of rounds: with an exhausted pool there is nothing left
    # to pick and the selection step would fail.
    for _ in range(min(top_k, n_candidates)):
        mmr_scores = lambda_param * similarities_to_user - (1 - lambda_param) * diversity_penalty
        mmr_scores[~available] = -np.inf  # already-selected items cannot win again
        best = int(np.argmax(mmr_scores))

        selected_ids.append(candidate_ids[best])
        available[best] = False

        similarity_to_best = similarity_matrix[:, best]
        if len(selected_ids) == 1:
            diversity_penalty = similarity_to_best.copy()
        else:
            diversity_penalty = np.maximum(diversity_penalty, similarity_to_best)

    return selected_ids

def visualize_mmr(user_vector, candidate_vectors, candidate_ids, game_names, lambda_param=0.7, top_k=10):
    """Plot plain top-k similarity picks against MMR picks in a 2-D PCA projection.

    Args:
        user_vector: 1-D user profile vector.
        candidate_vectors: 2-D array with one row per candidate game.
        candidate_ids: List of ids aligned with ``candidate_vectors``.
        game_names: Mapping from game id to display name.
        lambda_param: MMR trade-off forwarded to ``mmr``.
        top_k: Number of games highlighted for each strategy.

    Returns:
        The rendered figure as a base64-encoded PNG string.
    """
    # The two selections being compared.
    relevance = cosine_similarity(candidate_vectors, [user_vector]).flatten()
    top_k_indices = relevance.argsort()[::-1][:top_k]
    mmr_ids = mmr(user_vector, candidate_vectors, candidate_ids, lambda_param, top_k)

    # Project the user (row 0) and all candidates onto two PCA components.
    projected = PCA(n_components=2).fit_transform(np.vstack([user_vector, candidate_vectors]))
    user_xy, candidates_xy = projected[0], projected[1:]

    fig, ax = plt.subplots(figsize=(10, 7))
    ax.scatter(candidates_xy[:, 0], candidates_xy[:, 1], c='lightgray', label="All Candidates", alpha=0.5)
    ax.scatter(user_xy[0], user_xy[1], c='blue', label="User Profile", marker='X', s=100)

    # Only the first point of each group carries a label so the legend shows
    # one entry per group.
    for rank, idx in enumerate(top_k_indices):
        x, y = candidates_xy[idx, 0], candidates_xy[idx, 1]
        ax.scatter(x, y, c='green', label="Top-k Similarity" if rank == 0 else "", edgecolors='black')
        ax.text(x, y, game_names[candidate_ids[idx]], fontsize=8, color='darkgreen')

    for mmr_id in mmr_ids:
        pos = candidate_ids.index(mmr_id)
        x, y = candidates_xy[pos, 0], candidates_xy[pos, 1]
        ax.scatter(x, y, c='orange', label="Top-k MMR" if mmr_id == mmr_ids[0] else "", edgecolors='black')
        ax.text(x, y, game_names[mmr_id], fontsize=8, color='darkorange')

    ax.legend()
    ax.set_title(f"MMR vs Top-k Similarity (λ = {lambda_param})")
    ax.set_xlabel("PCA Component 1")
    ax.set_ylabel("PCA Component 2")
    ax.grid(True)

    # Render to an in-memory PNG instead of showing the figure.
    png_buffer = BytesIO()
    fig.savefig(png_buffer, format='png', bbox_inches='tight')
    plt.close(fig)
    return base64.b64encode(png_buffer.getvalue()).decode()

def recommend_content_based_mmr(user_name, games, reviews, top_k=10, testing=False, max_playingtime=None, min_age=None, minplayers=None, maxplayers=None, min_rating=6, plot=False, lambda_param=0.7):
    """Content-based recommendations with MMR diversification and explanations.

    The user profile is the rating-weighted average GloVe vector of the games
    the user rated 6 or higher. Candidates are the games passing the optional
    filters (minus the liked games unless ``testing`` is set) and are selected
    with ``mmr``.

    Args:
        user_name: Value matched against the ``user`` column of ``reviews``.
        games: DataFrame with ``id``, ``name``, ``rating``, ``glove_vector``,
            ``predicted_vibe``, ``predicted_emotion`` and the filter columns.
        reviews: DataFrame with ``user``, ``ID`` and ``rating`` columns.
        top_k: Maximum number of recommendations.
        testing: When True, liked games are kept among the candidates.
        max_playingtime: Keep games with ``playingtime`` <= this value.
        min_age: Keep games with ``minage`` >= this value.
        minplayers: Keep games with ``minplayers`` <= this value.
        maxplayers: Keep games with ``maxplayers`` >= this value.
        min_rating: Keep games whose ``rating`` is >= this value.
        plot: When True, also render the MMR-vs-similarity plot.
        lambda_param: Relevance/diversity trade-off passed to ``mmr``.

    Returns:
        ``(recommendations, plot_image)``: a DataFrame with columns ``name``,
        ``rating``, ``id``, ``similarity`` and ``explanation`` whose rows are in
        MMR selection order, plus a base64 PNG string (or None when ``plot`` is
        False). When no recommendation can be made, a message is printed and a
        bare ``None`` is returned instead of a tuple.
    """
    # Filter games based on user preferences
    filtered_games = games.copy()

    if max_playingtime is not None:
        filtered_games = filtered_games[filtered_games["playingtime"] <= max_playingtime]
    if min_age is not None:
        filtered_games = filtered_games[filtered_games["minage"] >= min_age]
    if minplayers is not None:
        filtered_games = filtered_games[filtered_games["minplayers"] <= minplayers]
    if maxplayers is not None:
        filtered_games = filtered_games[filtered_games["maxplayers"] >= maxplayers]
    if min_rating is not None:
        filtered_games = filtered_games[filtered_games["rating"] >= min_rating]

    if filtered_games.empty:
        print("No games match the provided filters.")
        return

    # Get games the user liked, joined to their vectors
    liked_game_ids = reviews[(reviews["user"] == user_name) & (reviews["rating"] >= 6)]
    liked_vectors = liked_game_ids.merge(games[["id", "glove_vector"]], left_on="ID", right_on="id")

    if liked_vectors.empty:
        print("Not enough liked games with vectors to build a profile.")
        return

    # Build user profile (weighted average)
    ratings = liked_vectors["rating"].values
    weights = ratings / ratings.max()
    liked_matrix = np.vstack(liked_vectors["glove_vector"])
    user_profile = np.average(liked_matrix, axis=0, weights=weights)

    if not testing:
        # Remove already liked games from recommendations
        filtered_games = filtered_games[~filtered_games["id"].isin(liked_game_ids["ID"])]

    if filtered_games.empty:
        print("No new games to recommend after filtering out liked ones.")
        return

    # Prepare data for MMR
    candidate_ids = filtered_games["id"].tolist()
    game_vectors = np.vstack(filtered_games["glove_vector"].values)
    similarities = cosine_similarity(game_vectors, [user_profile]).flatten()

    # Apply MMR for diversity; never request more picks than candidates exist.
    top_k = min(top_k, len(candidate_ids))
    selected_ids = mmr(user_profile, game_vectors, candidate_ids, lambda_param=lambda_param, top_k=top_k)

    # Map each selected id to its row position (first occurrence) so that rows,
    # similarity scores and explanations all share the MMR order. Filtering with
    # isin() would keep the frame's order and misalign the per-row values.
    position_of = {}
    for pos, gid in enumerate(candidate_ids):
        position_of.setdefault(gid, pos)
    selected_positions = [position_of[gid] for gid in selected_ids]

    recommended_games = filtered_games.iloc[selected_positions].copy()
    recommended_games["similarity"] = similarities[selected_positions]

    # The liked-game metadata must come from the merged frame: its rows are the
    # ones aligned with liked_matrix (the merge can drop reviews whose game has
    # no vector, so positions in the raw review slice may not match).
    liked_ids_aligned = liked_vectors["ID"].tolist()
    liked_ratings_aligned = liked_vectors["rating"].tolist()

    # Generate personalized explanations with vibe and emotion
    explanations = []
    for pos, predicted_vibe, predicted_emotion in zip(
        selected_positions,
        recommended_games["predicted_vibe"],
        recommended_games["predicted_emotion"],
    ):
        # Find the liked game most similar to this recommendation
        similarities_to_liked_games = cosine_similarity([game_vectors[pos]], liked_matrix).flatten()
        most_similar_game_idx = int(np.argmax(similarities_to_liked_games))
        most_similar_game_id = liked_ids_aligned[most_similar_game_idx]
        most_similar_game_name = games.loc[games["id"] == most_similar_game_id, "name"].values[0]
        most_similar_rating = liked_ratings_aligned[most_similar_game_idx]

        explanations.append(
            f"I recommend this game because you liked '{most_similar_game_name}' "
            f"(you rated it {most_similar_rating}/10). "
            f"This game has a '{predicted_emotion}' feeling and is '{predicted_vibe}'."
        )

    recommended_games["explanation"] = explanations

    mmr_plot_encoded = None
    if plot:
        mmr_plot_encoded = visualize_mmr(user_profile, game_vectors, candidate_ids, dict(zip(games["id"], games["name"])), lambda_param=lambda_param, top_k=top_k)

    return recommended_games[["name", "rating", "id", "similarity", "explanation"]], mmr_plot_encoded

# ------------------------------
# Hybrid Content-Based
# ------------------------------
def predict_all_ratings(user_name, games_df, reviews_df, k=10):
    """Predict the user's rating for every unrated game from similar rated games.

    For each unseen game the prediction is the similarity-weighted mean of the
    user's ratings for the ``k`` rated games whose GloVe vectors are most
    similar (negative cosine similarities are clipped to 0).

    Args:
        user_name: Value matched against the ``user`` column of ``reviews_df``.
        games_df: DataFrame with ``id``, ``name`` and ``glove_vector`` columns.
        reviews_df: DataFrame with ``user``, ``ID``, ``rating`` and ``name``
            columns (the rated games' names are read from the reviews, because
            only ``id`` and ``glove_vector`` are merged in from ``games_df``).
        k: Number of most similar rated games used per prediction.

    Returns:
        DataFrame with columns ``id``, ``name``, ``predicted_rating`` and
        ``explanation`` sorted by predicted rating (descending), or an
        explanatory string when no prediction can be made.
    """
    # Get user-rated games and their vectors
    user_reviews = reviews_df[reviews_df["user"] == user_name]
    if user_reviews.empty:
        return "No reviews found for this user."

    user_games = user_reviews.merge(games_df[["id", "glove_vector"]], left_on="ID", right_on="id")
    if user_games.empty:
        return "User has rated games with no vector information."

    rated_vectors = np.vstack(user_games["glove_vector"])
    rated_ratings = user_games["rating"].values
    rated_names = user_games["name"].tolist()

    # Games not yet rated by user
    unseen_games = games_df[~games_df["id"].isin(user_games["id"])]
    if unseen_games.empty:
        return "No unseen games to predict ratings for."

    # One similarity matrix (unseen x rated) instead of one call per unseen game.
    unseen_vectors = np.vstack(unseen_games["glove_vector"])
    similarity_matrix = np.maximum(cosine_similarity(unseen_vectors, rated_vectors), 0)

    # Cannot use more neighbours than the user has rated games.
    k_adj = max(0, min(k, len(rated_ratings)))

    predicted_ratings = []
    for game_id, game_name, similarities in zip(unseen_games["id"], unseen_games["name"], similarity_matrix):
        # Most similar first, so the explanation names the closest games rather
        # than the weakest of the top-k.
        top_k_idx = np.argsort(similarities)[::-1][:k_adj]
        top_similarities = similarities[top_k_idx]
        top_ratings = rated_ratings[top_k_idx]
        top_game_names = [rated_names[i] for i in top_k_idx]

        if top_similarities.sum() == 0:
            pred_rating = None
            explanation = "No sufficiently similar games found."
        else:
            pred_rating = round(np.average(top_ratings, weights=top_similarities), 2)
            # Build explanation
            if len(top_game_names) == 1:
                explanation = f"Because you liked **{top_game_names[0]}**, and this game is similar to it."
            elif len(top_game_names) == 2:
                explanation = f"Because you liked **{top_game_names[0]}** and **{top_game_names[1]}**, and this game is similar to both."
            else:
                explanation = f"Because you liked games like **{top_game_names[0]}**, **{top_game_names[1]}**, and others."

        predicted_ratings.append((game_id, game_name, pred_rating, explanation))

    # Return as a DataFrame
    pred_df = pd.DataFrame(predicted_ratings, columns=["id", "name", "predicted_rating", "explanation"])
    pred_df = pred_df.sort_values(by="predicted_rating", ascending=False)

    return pred_df

# ------------------------------
# Collaborative functions
# ------------------------------
def _recommend_with_pickled_model(model_path, user_name, games_df, reviews_df, top_k):
    """Rank the user's unrated games with a pickled rating-prediction model.

    The unpickled model must offer ``predict(user, item)`` returning an object
    with an ``est`` attribute (the estimated rating).
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load model files that were produced by this project.
    with open(model_path, "rb") as f:
        model = pickle.load(f)

    # Games the user has not rated (set membership instead of scanning an array)
    rated_ids = set(reviews_df.loc[reviews_df["user"] == user_name, "ID"])
    unseen_ids = [gid for gid in games_df["id"].unique() if gid not in rated_ids]

    # Predict ratings and keep the best top_k
    predictions = [(gid, model.predict(user_name, gid).est) for gid in unseen_ids]
    top_preds = sorted(predictions, key=lambda x: x[1], reverse=True)[:top_k]
    predicted_by_id = {gid: round(est, 2) for gid, est in top_preds}

    # Attach each prediction through the game id. Assigning the ranked list
    # positionally would pair predictions with the wrong rows, because the
    # isin() filter keeps the rows in games_df order.
    recommended = games_df[games_df["id"].isin(list(predicted_by_id))][["name", "rating", "id"]].copy()
    recommended["predicted_rating"] = recommended["id"].map(predicted_by_id)

    return recommended.sort_values("predicted_rating", ascending=False)


def recommend_with_knn(user_name, games_df, reviews_df, top_k=10):
    """Recommend unrated games using the model stored in ``knn_model.pkl``.

    Args:
        user_name: User id passed to the model and matched against
            ``reviews_df["user"]``.
        games_df: DataFrame with ``id``, ``name`` and ``rating`` columns.
        reviews_df: DataFrame with ``user`` and ``ID`` columns.
        top_k: Maximum number of recommendations.

    Returns:
        DataFrame with ``name``, ``rating``, ``id`` and ``predicted_rating``,
        sorted by predicted rating (descending).
    """
    return _recommend_with_pickled_model("knn_model.pkl", user_name, games_df, reviews_df, top_k)


def recommend_with_svd(user_name, games_df, reviews_df, top_k=10):
    """Recommend unrated games using the model stored in ``svd_model.pkl``.

    Args:
        user_name: User id passed to the model and matched against
            ``reviews_df["user"]``.
        games_df: DataFrame with ``id``, ``name`` and ``rating`` columns.
        reviews_df: DataFrame with ``user`` and ``ID`` columns.
        top_k: Maximum number of recommendations.

    Returns:
        DataFrame with ``name``, ``rating``, ``id`` and ``predicted_rating``,
        sorted by predicted rating (descending).
    """
    return _recommend_with_pickled_model("svd_model.pkl", user_name, games_df, reviews_df, top_k)

# ------------------------------
# Dash App
# ------------------------------
# Application object for the multi-page dashboard. Callback exceptions are
# suppressed, which lets callbacks reference component ids that are not part of
# the layout at start-up (the page layouts below are separate objects).
app = Dash(__name__, suppress_callback_exceptions=True)

# --- Global Styles ---
# Custom HTML shell for the app: it loads the Inter web font and defines the CSS
# classes used by the layouts below (landing-container, button-row,
# start-button, back-button-container, back-button). The {%...%} placeholders
# are filled in by Dash when the page is served.
app.index_string = '''
<!DOCTYPE html>
<html>
    <head>
        {%metas%}
        <title>Board Game Recommender</title>
        {%favicon%}
        {%css%}
        <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600&display=swap" rel="stylesheet">
        <style>
            body {
                font-family: 'Inter', sans-serif;
                background-color: #f8f9fa;
                margin: 0;
                padding: 0;
            }
            .landing-container {
                display: flex;
                flex-direction: column;
                align-items: center;
                justify-content: center;
                height: 100vh;
                text-align: center;
            }
            .landing-container h1 {
                font-size: 3rem;
                margin-bottom: 0.5em;
            }
            .landing-container p {
                font-size: 1.2rem;
                color: #555;
                margin-bottom: 2em;
            }
            .button-row {
                display: flex;
                gap: 1.5em;
                flex-wrap: wrap;
                justify-content: center;
            }
            .start-button {
                font-size: 1rem;
                padding: 0.75em 1.5em;
                border: none;
                border-radius: 8px;
                background-color: #007BFF;
                color: white;
                cursor: pointer;
                transition: background-color 0.3s ease;
                min-width: 180px;
            }
            .start-button:hover {
                background-color: #0056b3;
            }
            .back-button-container {
                display: flex;
                justify-content: center;
                margin-top: 3em;
            }
            .back-button {
                font-size: 1rem;
                padding: 0.75em 1.5em;
                border: none;
                border-radius: 8px;
                background-color: #6c757d;
                color: white;
                cursor: pointer;
                transition: background-color 0.3s ease;
                min-width: 180px;
            }
            .back-button:hover {
                background-color: #5a6268;
            }
        </style>
    </head>
    <body>
        {%app_entry%}
        <footer>
            {%config%}
            {%scripts%}
            {%renderer%}
        </footer>
    </body>
</html>
'''

# --- Landing Layout ---
# Landing page: title, tagline and two link-buttons leading to the exploration
# and recommender pages.
landing_layout = html.Div(
    [
        html.H1("🎲 Welcome to the Board Game Recommender"),
        html.P("Discover games tailored to your preferences."),
        html.Div(
            [
                html.A(html.Button("Explore Dataset", className="start-button"), href="/exploration"),
                html.A(html.Button("Start Recommending", className="start-button"), href="/recommender"),
            ],
            className="button-row",
        ),
    ],
    className="landing-container",
)

# --- Recommender Layout ---
# Recommender entry page: asks for the username, then offers "Go Back" and
# "Continue"; the 'user-error' container is reserved for validation messages.
recommender_layout = html.Div(
    [
        html.Div("🕵️‍♂️", style=dict(fontSize='5rem', marginBottom='0.5rem')),
        html.H2("Who are you?"),
        html.P("Enter your username to continue"),
        dcc.Input(
            id='user-name-input',
            type='text',
            placeholder='e.g. boardgamefan99',
            debounce=True,
            style=dict(
                padding='0.6rem 1rem',
                fontSize='1rem',
                borderRadius='8px',
                border='1px solid #ccc',
                width='300px',
                marginTop='1rem',
            ),
        ),

        html.Div(id='game-rating-inputs', style=dict(width='100%', maxWidth='600px')),
        html.Div(
            [
                html.A(html.Button("← Go Back", className="back-button"), href="/"),
                html.Button(
                    "Continue",
                    id='user-continue-button',
                    n_clicks=0,
                    style=dict(
                        padding='0.6rem 1.5rem',
                        fontSize='1rem',
                        border='none',
                        borderRadius='8px',
                        backgroundColor='#007BFF',
                        color='white',
                        cursor='pointer',
                    ),
                ),
            ],
            style=dict(display='flex', gap='1rem', marginTop='1.5rem'),
        ),
        html.Div(id='user-error', style=dict(color='red', marginTop='1rem')),
    ],
    style=dict(
        display='flex',
        flexDirection='column',
        alignItems='center',
        justifyContent='center',
        padding='5vh 2rem',
        textAlign='center',
    ),
)

# --- Data Exploration Layout ---
def _exploration_card(children):
    """Wrap ``children`` in the light-blue rounded panel used on the exploration page."""
    return html.Div(children, style={
        'backgroundColor': '#e7f1fb',
        'border': '1px solid #c3d9ec',
        'borderRadius': '12px',
        'padding': '1.5rem',
        'marginTop': '1rem',
        'marginBottom': '2rem',
        'boxShadow': '0 2px 5px rgba(0,0,0,0.05)'
    })


# Dataset exploration page: intro text, the TF-IDF panel (slider, bar chart and
# pre-rendered word cloud), the embedded LDA visualisation and a back button.
exploration_layout = html.Div([
    html.H2("📊 Dataset Overview"),

    html.H4("ℹ️ TMI but..."),
    html.P([
        "Our dataset is self-collected and based on data from ",
        html.A("BoardGameGeek", href="https://boardgamegeek.com/", target="_blank"),
        ", the world’s largest online board game database and community. It contains",
        f" {len(games):,} board games including ratings, player requirements, age guidelines, and more."
    ]),
    html.P("BoardGameGeek hosts ratings, reviews, descriptions, and play guides for thousands of games, powered by contributions from over 2 million registered users. We could say it’s a go-to hub for board game enthusiasts around the world."),

    html.Br(),
    html.H4("🔍 TF-IDF Summary"),
    html.P("Ever wondered which words make board games truly stand out? The chart and cloud below show the most distinctive terms across all game descriptions, calculated using a method called TF-IDF. The higher the score, the more unique and important a word is to describing those games. Try adjusting the slider to explore the top keywords!"),

    _exploration_card([
        html.Label("Select number of top terms:"),
        dcc.Slider(
            id='top-k-slider',
            min=5,
            max=50,
            step=1,
            value=25,
            marks={i: str(i) for i in range(5, 51, 5)},
            tooltip=dict(placement="bottom", always_visible=True)
        ),
        html.Br(),

        html.Div(
            [
                # Left: bar chart
                html.Div(
                    [dcc.Graph(id='tfidf-bar-chart', style={'height': '230px'})],
                    style={'flex': '1'},
                ),

                # Right: word cloud
                html.Div(
                    [
                        html.Img(
                            id='tfidf-wordcloud',
                            src=f'data:image/png;base64,{encoded_image}',
                            style={'width': '100%', 'maxWidth': '100%'},
                        )
                    ],
                    style={'flex': '1', 'textAlign': 'center'},
                ),
            ],
            style={'display': 'flex', 'justifyContent': 'space-between', 'gap': '2em'},
        ),
    ]),

    html.Br(),
    html.H4("🧠 Topic Distribution (LDA)"),
    html.P("Explore how topics are distributed across the board game descriptions. This interactive map lets you hover over topics, see top keywords, and better understand how the model organizes the data."),
    _exploration_card([
        html.Iframe(
            src="/assets/lda_vis.html",
            style={
                "width": "100%",
                "height": "600px",
                "border": "none",
                "borderRadius": "12px"
            }
        )
    ]),

    html.Div(
        [html.A(html.Button("← Go Back", className="back-button"), href="/")],
        className="back-button-container",
    )
])

# --- Register User Layout ---
# Registration page for unknown users: username field, a multi-select dropdown
# listing every game by name, a container for per-game rating inputs, the
# Back/Register buttons and a container for error messages.
register_user_layout = html.Div(
    [
        html.H2("Oops! Looks like you are new here...please register!"),

        # Username row
        html.Div(
            [
                html.Label("Username:", style=dict(fontWeight='bold', whiteSpace='nowrap')),
                dcc.Input(
                    id='new-username-input',
                    type='text',
                    placeholder='e.g. boardgamefan123',
                    style=dict(
                        padding='0.6rem 1rem',
                        fontSize='1rem',
                        borderRadius='8px',
                        border='1px solid #ccc',
                        width='300px',
                    ),
                ),
            ],
            style=dict(display='flex', alignItems='center', gap='1rem', marginBottom='1.5rem'),
        ),

        # Game selection row
        html.Div(
            [
                html.Label("Search for 5 games you like:", style=dict(fontWeight='bold', whiteSpace='nowrap')),
                dcc.Dropdown(
                    id='game-selection-dropdown',
                    options=[
                        {'label': row['name'], 'value': row['id']}
                        for _, row in games.sort_values('name').iterrows()
                    ],
                    multi=True,
                    placeholder="Start typing to find games…",
                    style=dict(width='500px'),
                ),
            ],
            style=dict(display='flex', alignItems='center', gap='1rem', marginBottom='1.5rem'),
        ),

        # Rating sliders will appear here
        html.Div(
            id='game-rating-inputs',
            style=dict(width='100%', maxWidth='600px', marginBottom='2rem'),
        ),

        # Buttons
        html.Div(
            [
                html.A(html.Button("← Go Back", className="back-button"), href="/"),
                html.Button(
                    "Register",
                    id="register-button",
                    n_clicks=0,
                    style=dict(
                        padding='0.6rem 1.5rem',
                        fontSize='1rem',
                        border='none',
                        borderRadius='8px',
                        backgroundColor='#007BFF',
                        color='white',
                        cursor='pointer',
                    ),
                ),
            ],
            style=dict(display='flex', gap='1rem'),
        ),

        html.Div(id='register-error', style=dict(color='red', marginTop='1rem')),
    ],
    style=dict(
        display='flex',
        flexDirection='column',
        alignItems='center',
        padding='5vh 2rem',
        textAlign='center',
    ),
)

# --- Recommendation Options Layout ---
# Strategy picker page: one button per recommender, listed as
# (caption, component id) pairs and rendered in this order.
_algo_buttons = (
    ("🔍 Content-Based Filtering (MMR)", 'algo-content'),
    ("🤝 Collaborative Filtering", 'algo-collab'),
    ("🧪 Hybrid Approach", 'algo-hybrid'),
)

recommendation_options_layout = html.Div(
    style=dict(
        display='flex',
        flexDirection='column',
        alignItems='center',
        padding='5vh 2rem',
        textAlign='center',
    ),
    children=[
        html.H2("🧠 Choose a Recommendation Strategy"),
        html.P("Select an algorithm below to get your personalized board game recommendations."),
        html.Div(
            style=dict(display='flex', gap='1.5rem', marginTop='2rem'),
            children=[
                html.Button(caption, id=button_id, className='start-button')
                for caption, button_id in _algo_buttons
            ],
        ),
        html.Div(
            className="back-button-container",
            children=[
                html.A(html.Button("← Go Back", className="back-button"), href="/"),
            ],
        ),
    ],
)

# --- New User Choice Layout ---
def _new_user_choice_button(caption, button_id, background):
    """Return one of the large call-to-action buttons of the first-visit page.

    The two buttons share every style property except the background colour.
    """
    return html.Button(
        caption,
        id=button_id,
        n_clicks=0,
        style=dict(
            padding='1rem 2rem',
            fontSize='1.1rem',
            border='none',
            borderRadius='10px',
            backgroundColor=background,
            color='white',
            cursor='pointer',
            minWidth='220px',
        ),
    )


# First-visit page: offer to create a profile or to browse popular games.
new_user_choice_layout = html.Div(
    style=dict(
        display='flex',
        flexDirection='column',
        alignItems='center',
        justifyContent='center',
        padding='8vh 2rem',
        textAlign='center',
    ),
    children=[
        html.Div("✨", style=dict(fontSize='4rem', marginBottom='0.5rem')),
        html.H2("Looks like it's your first time here!"),
        html.P("Would you like to create a profile for personalized recommendations or just see what's popular?"),
        html.Div(
            style=dict(display='flex', gap='2rem', marginTop='2rem'),
            children=[
                _new_user_choice_button("🎯 Create a Profile", "go-to-register", '#007BFF'),
                _new_user_choice_button("🔥 Show Popular Games", "go-to-popular", '#28a745'),
            ],
        ),
        html.Div(
            className="back-button-container",
            children=[
                html.A(html.Button("← Go Back", className="back-button"), href="/"),
            ],
        ),
    ],
)

# --- Popular Games Layout ---
# Slider (5 to 15, default 10) that controls how many popular games are listed.
_popular_count_selector = html.Div(
    children=[
        html.Label("Select how many games to show:", style=dict(fontWeight='bold')),
        dcc.Slider(
            id="top-n-slider",
            min=5,
            max=15,
            step=1,
            value=10,
            marks={count: str(count) for count in range(5, 16)},
            tooltip=dict(placement="bottom", always_visible=True),
            updatemode="drag",
        ),
    ],
    style=dict(width='100%', padding='1rem 3rem', marginBottom='2rem'),
)

# Popular-games page: heading, count selector, result container, back link.
popular_games_layout = html.Div(
    children=[
        html.H2("🔥 Popular Board Games"),
        _popular_count_selector,
        html.Div(id="popular-games-output"),
        html.Div(
            className="back-button-container",
            children=[
                html.A(html.Button("← Go Back", className="back-button"), href="/"),
            ],
        ),
    ]
)

# --- Content-Based MMR Layout ---
def _mmr_slider_row(label, slider_id, lo, hi, step, value, marks, bottom_margin="1.5rem"):
    """Build one labelled slider row of the MMR parameter form.

    Args:
        label: Text shown above the slider.
        slider_id: Component id given to the ``dcc.Slider``.
        lo: Slider minimum.
        hi: Slider maximum.
        step: Slider increment.
        value: Initial slider value.
        marks: Mapping of slider position to tick label.
        bottom_margin: CSS ``marginBottom`` of the wrapping ``html.Div``.

    Returns:
        An ``html.Div`` containing the label followed by the slider.
    """
    return html.Div(
        [
            html.Label(label),
            dcc.Slider(
                id=slider_id,
                min=lo, max=hi, step=step, value=value,
                marks=marks,
                tooltip={"placement": "bottom", "always_visible": True},
            ),
        ],
        style={"width": "100%", "maxWidth": "600px", "marginBottom": bottom_margin},
    )


def _mmr_int_marks(positions):
    """Return slider marks that label every position with its own number."""
    return {position: str(position) for position in positions}


# Marks for the lambda slider at 0.0, 0.1, ..., 1.0. Whole-number positions
# use int keys: a float key such as 1.0 is serialized as "1.0", which the
# slider may fail to match to a position (assumption about the rendering
# library -- int keys are valid either way). Labels keep the "0.0" / "1.0" text.
_mmr_lambda_marks = {
    (tenth // 10 if tenth % 10 == 0 else tenth / 10): str(tenth / 10)
    for tenth in range(0, 11)
}

# Content-based MMR page: parameter sliders, run button, result containers.
content_mmr_layout = html.Div(style={
    'display': 'flex',
    'flexDirection': 'column',
    'alignItems': 'center',
    'padding': '5vh 2rem',
    'textAlign': 'center'
}, children=[

    html.H2("🎯 Content-Based MMR Recommender"),
    html.P("Customize the parameters below to get personalized and diverse game recommendations."),

    html.Div([
        _mmr_slider_row("🎲 Max Playing Time (minutes):", "input-max-playingtime",
                        10, 300, 10, 60, _mmr_int_marks(range(30, 301, 30))),
        _mmr_slider_row("👶 Minimum Age:", "input-min-age",
                        3, 18, 1, 10, _mmr_int_marks(range(3, 19, 3))),
        _mmr_slider_row("👥 Minimum Players:", "input-min-players",
                        1, 10, 1, 2, _mmr_int_marks(range(1, 11))),
        _mmr_slider_row("👥 Maximum Players:", "input-max-players",
                        1, 20, 1, 4, _mmr_int_marks(range(2, 21, 2))),
        _mmr_slider_row("⭐ Minimum Rating:", "input-min-rating",
                        1, 10, 1, 6, _mmr_int_marks(range(1, 11))),
        _mmr_slider_row("🔝 Number of Recommendations (Top-K):", "input-top-k",
                        3, 20, 1, 10, _mmr_int_marks(range(3, 21))),
        _mmr_slider_row("⚖️ MMR Lambda (Diversity vs Relevance):", "input-lambda-param",
                        0.0, 1.0, 0.05, 0.7, _mmr_lambda_marks, bottom_margin="2rem"),

        html.Button("🚀 Run Recommender", id="run-mmr-recommender", n_clicks=0, className="start-button")

    ], style={"width": "100%", "maxWidth": "650px"}),

    html.Hr(style={"margin": "3rem 0", "width": "100%"}),

    # Filled by the callback attached to the run button.
    html.Div(id="cb-mmr-recommendation-output", style={"width": "100%", "maxWidth": "800px"}),
    html.Img(id="cb-mmr-plot-output", style={"marginTop": "2rem", "maxWidth": "100%"}),

    html.Div(className="back-button-container", children=[
        html.A(html.Button("← Go Back", className="back-button"), href="/recommendation-options")
    ])
])

# --- Hybrid Content-Based Layout ---
# Slider (1 to 20, default 10) for the number of similar games, k.
_hybrid_k_selector = html.Div(
    style=dict(width="100%", maxWidth="650px", marginBottom="2rem"),
    children=[
        html.Label("🔍 Number of Similar Games to Consider (k):", style=dict(fontWeight='bold')),
        dcc.Slider(
            id="input-k",
            min=1, max=20, step=1, value=10,
            marks={neighbours: str(neighbours) for neighbours in range(1, 21)},
            tooltip=dict(placement="bottom", always_visible=True),
        ),
    ],
)

# Hybrid recommender page: k selector, run button, result container, back link.
hybrid_layout = html.Div(
    style=dict(
        display='flex',
        flexDirection='column',
        alignItems='center',
        padding='5vh 2rem',
        textAlign='center',
    ),
    children=[
        html.H2("🧪 Hybrid Recommender"),
        html.P("This approach combines content-based filtering with rating prediction using similarity."),
        _hybrid_k_selector,
        html.Button(
            "🚀 Run Hybrid Recommender",
            id="run-hybrid-recommender",
            n_clicks=0,
            className="start-button",
        ),
        html.Hr(style=dict(margin="3rem 0", width="100%")),
        html.Div(id="hybrid-recommendation-output", style=dict(width="100%", maxWidth="800px")),
        html.Div(
            className="back-button-container",
            children=[
                html.A(
                    html.Button("← Go Back", className="back-button"),
                    href="/recommendation-options",
                ),
            ],
        ),
    ],
)

# --- Collaborative Filtering Layout ---
# Radio buttons selecting the collaborative method ("knn" is preselected).
_collab_method_picker = html.Div(
    children=[
        html.Label("📌 Choose a method:"),
        dcc.RadioItems(
            id="collab-method",
            options=[
                dict(label="KNN With Means", value="knn"),
                dict(label="SVD", value="svd"),
            ],
            value="knn",
            labelStyle=dict(display="inline-block", marginRight="1rem"),
        ),
    ],
    style=dict(marginBottom="2rem"),
)

# Slider (3 to 20, default 10) for the number of recommendations.
_collab_top_k_picker = html.Div(
    children=[
        html.Label("🔝 Number of Recommendations (Top-K):"),
        dcc.Slider(
            id="collab-top-k",
            min=3, max=20, step=1, value=10,
            marks={count: str(count) for count in range(3, 21)},
            tooltip=dict(placement="bottom", always_visible=True),
        ),
    ],
    style=dict(width="100%", maxWidth="600px", marginBottom="2rem"),
)

# Collaborative filtering page: method, top-k, run button, results, back link.
collaborative_layout = html.Div(
    style=dict(
        display='flex',
        flexDirection='column',
        alignItems='center',
        padding='5vh 2rem',
        textAlign='center',
    ),
    children=[
        html.H2("🤝 Collaborative Filtering"),
        html.P("Select the method and number of recommendations."),
        _collab_method_picker,
        _collab_top_k_picker,
        html.Button(
            "🚀 Run Collaborative Recommender",
            id="run-collab-recommender",
            n_clicks=0,
            className="start-button",
        ),
        html.Hr(style=dict(margin="3rem 0", width="100%")),
        html.Div(id="collab-recommendation-output", style=dict(width="100%", maxWidth="800px")),
        html.Div(
            className="back-button-container",
            children=[
                html.A(
                    html.Button("← Go Back", className="back-button"),
                    href="/recommendation-options",
                ),
            ],
        ),
    ],
)

# --- App Layout ---
# Root layout: URL tracker, session-scoped username store, and the container
# whose children are swapped by the page router.
app.layout = html.Div(
    children=[
        dcc.Location(id='url', refresh=False),
        dcc.Store(id='stored-username', storage_type='session'),
        html.Div(id='page-content'),
    ]
)

# --- Page Router ---
@app.callback(Output('page-content', 'children'),
              Input('url', 'pathname'))
def display_page(pathname):
    """Return the page layout registered for ``pathname``.

    Any path that is not listed below falls back to the landing page.
    """
    routes = {
        "/recommender": recommender_layout,
        "/exploration": exploration_layout,
        "/register-user": register_user_layout,
        "/new-user-choice": new_user_choice_layout,
        "/recommendation-options": recommendation_options_layout,
        "/popular": popular_games_layout,
        "/content-based-mmr": content_mmr_layout,
        "/hybrid": hybrid_layout,
        "/collaborative-filtering": collaborative_layout,
    }
    return routes.get(pathname, landing_layout)

@app.callback(
    Output('tfidf-bar-chart', 'figure'),
    Input('top-k-slider', 'value')
)
def update_tfidf_bar_chart(top_k):
    """Redraw the TF-IDF bar chart for the number of terms chosen on the slider.

    ``plot_tfidf_summary`` returns a pair; only its first element (the figure)
    is sent to the chart.
    """
    figure, _unused = plot_tfidf_summary(dictionary, X_tfidf, top_n=top_k)
    return figure

@app.callback(
    Output('game-rating-inputs', 'children'),
    Input('game-selection-dropdown', 'value'),
    State({'type': 'rating-slider', 'index': ALL}, 'value'),
    State({'type': 'rating-slider', 'index': ALL}, 'id'),
)
def show_rating_sliders(selected_games, current_ratings=None, current_ids=None):
    """Render one 1-10 rating slider per game selected in the dropdown.

    The whole slider list is rebuilt on every selection change. To avoid
    resetting ratings the user has already set, the values of the sliders
    currently on the page are read as state and carried over to the rebuilt
    sliders of games that are still selected.

    Args:
        selected_games: Game ids chosen in the dropdown (``None`` or empty
            when nothing is selected).
        current_ratings: Values of the rating sliders currently rendered.
        current_ids: Component ids (dicts with ``type`` and ``index``) of
            those sliders, in the same order as ``current_ratings``.

    Returns:
        A list of ``html.Div`` rows, one per selected game; empty when no
        game is selected.
    """
    if not selected_games:
        return []

    default_rating = 8

    # Slider ids store the game id as a string under "index".
    previous_ratings = {
        slider_id['index']: rating
        for slider_id, rating in zip(current_ids or [], current_ratings or [])
        if rating is not None
    }

    sliders = []
    for game_id in selected_games:
        game_name = games.loc[games["id"] == game_id, "name"].values[0]
        slider_key = str(game_id)
        sliders.append(
            html.Div(style={'marginBottom': '1rem'}, children=[
                html.Label(f"Rate '{game_name}':", style={'fontWeight': 'bold'}),
                dcc.Slider(
                    id={'type': 'rating-slider', 'index': slider_key},
                    min=1, max=10, step=1,
                    value=previous_ratings.get(slider_key, default_rating),
                    marks={i: str(i) for i in range(1, 11)},
                    tooltip={"placement": "bottom", "always_visible": True}
                )
            ])
        )
    return sliders

@app.callback(
    Output('url', 'pathname'),
    Output('user-error', 'children'),
    Output('stored-username', 'data'),
    Input('user-continue-button', 'n_clicks'),
    State('user-name-input', 'value'),
    prevent_initial_call=True
)
def route_existing_user(n_clicks, username):
    """Send the visitor to the right page after they submit a username.

    Args:
        n_clicks: Click count of the continue button.
        username: Raw text of the username input (may be ``None``).

    Returns:
        A ``(pathname, error message, stored username)`` tuple. Usernames
        found in ``reviews["user"]`` go to the recommendation options; all
        others go to the new-user choice page. A blank username leaves the
        URL untouched, reports an error and stores ``None``.

    Raises:
        PreventUpdate: If the button has not been clicked yet.
    """
    # The button is inserted into the page after the `url` output already
    # exists, so this callback may run on insertion despite
    # prevent_initial_call; n_clicks can then be 0 or None.
    if not n_clicks:
        raise PreventUpdate

    username = (username or "").strip()
    if not username:
        return dash.no_update, "Please enter a valid username.", None

    if username in reviews["user"].values:
        return "/recommendation-options", "", username
    return "/new-user-choice", "", username

@app.callback(
    Output('url', 'pathname', allow_duplicate=True),
    Output('register-error', 'children'),
    Output('stored-username', 'data', allow_duplicate=True),
    Input('register-button', 'n_clicks'),
    State('new-username-input', 'value'),
    State('game-selection-dropdown', 'value'),
    State({'type': 'rating-slider', 'index': ALL}, 'value'),
    prevent_initial_call=True
)
def register_user(n_clicks, new_username, selected_games, ratings):
    """Register a new user from the sign-up form and open the recommender menu.

    On success the chosen games and their ratings are appended to the
    module-level ``reviews`` frame, and the session's stored username is set
    to the registered name so the recommender pages act on the new profile
    (the name typed here can differ from the one entered on the landing page).

    Args:
        n_clicks: Click count of the Register button.
        new_username: Raw text of the username input (may be ``None``).
        selected_games: Game ids chosen in the dropdown.
        ratings: Values of the rating sliders, in page order.

    Returns:
        A ``(pathname, error message, stored username)`` tuple. On a
        validation failure only the error message is updated.

    Raises:
        PreventUpdate: If the button has not been clicked yet.
    """
    # NOTE: `reviews` is process-wide state shared by every session.
    global reviews

    if not n_clicks:
        raise PreventUpdate

    username = (new_username or "").strip()
    if not username:
        return dash.no_update, "Please enter a username.", dash.no_update
    # Without this check the new ratings would be appended to an existing
    # user's review history.
    if username in reviews["user"].values:
        return dash.no_update, "That username is already taken. Please choose another one.", dash.no_update
    if not selected_games or len(selected_games) < 5:
        return dash.no_update, "Please select at least 5 games.", dash.no_update
    # Sliders are re-rendered by another callback; a mismatch means the
    # page had not caught up with the dropdown when Register was clicked.
    if len(selected_games) != len(ratings):
        return dash.no_update, "Something went wrong with rating capture.", dash.no_update

    new_entries = pd.DataFrame({
        "user": [username] * len(selected_games),
        "ID": selected_games,
        "rating": ratings,
        "comment": [np.nan] * len(selected_games),
        "name": [games.loc[games["id"] == gid, "name"].values[0] for gid in selected_games]
    })

    reviews = pd.concat([reviews, new_entries], ignore_index=True)

    return "/recommendation-options", "", username

@app.callback(
    Output('url', 'pathname', allow_duplicate=True),
    Input('go-to-register', 'n_clicks'),
    Input('go-to-popular', 'n_clicks'),
    prevent_initial_call=True
)
def route_from_choice(n_register, n_popular):
    """Navigate to the page that belongs to whichever choice button fired.

    Raises:
        PreventUpdate: If neither of the two buttons triggered the callback.
    """
    destinations = {
        "go-to-register": "/register-user",
        "go-to-popular": "/popular",
    }
    target = destinations.get(callback_context.triggered_id)
    if target is None:
        raise PreventUpdate
    return target

@app.callback(
    Output("popular-games-output", "children"),
    Input("url", "pathname"),
    Input("top-n-slider", "value"),
    prevent_initial_call=True
)
def display_popular_games(pathname, top_k):
    """Render one card per game returned by ``get_diverse_popular_games``.

    Args:
        pathname: Current URL path; anything but ``/popular`` is ignored.
        top_k: Number of games requested on the slider.

    Returns:
        A list of ``html.Div`` cards showing each game's name, average
        rating and rating count.

    Raises:
        PreventUpdate: If the current page is not ``/popular``.
    """
    if pathname != "/popular":
        raise PreventUpdate

    card_style = {
        'border': '1px solid #ccc',
        'borderRadius': '10px',
        'padding': '1rem',
        'marginBottom': '1rem',
        'backgroundColor': 'white',
        'boxShadow': '0 2px 6px rgba(0,0,0,0.05)'
    }

    popular = get_diverse_popular_games(games, top_k=top_k)

    cards = []
    for _, game in popular.iterrows():
        rating_line = f"⭐ Average Rating: {game['avg_rating']:.2f} from {int(game['num_ratings'])} ratings"
        cards.append(
            html.Div(style=card_style, children=[
                html.H4(game["name"], style={'marginBottom': '0.5rem'}),
                html.Small(rating_line, style={'color': '#777'}),
            ])
        )
    return cards

@app.callback(
    Output("cb-mmr-recommendation-output", "children"),
    Output("cb-mmr-plot-output", "src"),
    Input("run-mmr-recommender", "n_clicks"),
    State("stored-username", "data"),
    State("input-max-playingtime", "value"),
    State("input-min-age", "value"),
    State("input-min-players", "value"),
    State("input-max-players", "value"),
    State("input-min-rating", "value"),
    State("input-top-k", "value"),
    State("input-lambda-param", "value"),
    prevent_initial_call=True
)
def generate_cb_mmr_output(n_clicks, username, max_playingtime, min_age, minplayers, maxplayers, min_rating, top_k, lambda_param):
    """Run the content-based MMR recommender and render its results.

    Args:
        n_clicks: Click count of the run button (only used as the trigger).
        username: Username kept in the session store.
        max_playingtime, min_age, minplayers, maxplayers, min_rating, top_k:
            Slider values forwarded to ``recommend_content_based_mmr``.
        lambda_param: Value of the MMR lambda slider.

    Returns:
        A ``(children, image src)`` pair: recommendation cards plus a
        base64 PNG data URI, or a message and ``None`` when there is no
        username or no result.
    """
    # NOTE(review): `lambda_param` is received from the slider but is not
    # forwarded to recommend_content_based_mmr, so the slider currently has
    # no effect here. Confirm whether that function accepts such an argument.
    if not username:
        return html.Div("⚠️ Please enter a username.", style={"color": "red"}), None

    filters = dict(
        top_k=top_k,
        max_playingtime=max_playingtime,
        min_age=min_age,
        minplayers=minplayers,
        maxplayers=maxplayers,
        min_rating=min_rating,
    )
    results, plot_base64 = recommend_content_based_mmr(
        user_name=username, games=games, reviews=reviews, plot=True, **filters
    )

    if results is None or results.empty:
        return html.Div("😕 No recommendations found. Try relaxing the filters.", style={"color": "orange"}), None

    card_style = {
        "border": "1px solid #ccc",
        "borderRadius": "10px",
        "padding": "1rem",
        "marginBottom": "1rem",
        "backgroundColor": "#fefefe",
        "boxShadow": "0 2px 5px rgba(0,0,0,0.05)"
    }
    cards = [
        html.Div(
            [
                html.H4(rec["name"]),
                html.P(f"Similarity Score: {rec['similarity']:.2f}"),
                html.P(rec["explanation"]),
            ],
            style=card_style,
        )
        for _, rec in results.iterrows()
    ]

    if plot_base64:
        img_src = f"data:image/png;base64,{plot_base64}"
    else:
        img_src = None
    return cards, img_src

@app.callback(
    Output("hybrid-recommendation-output", "children"),
    Input("run-hybrid-recommender", "n_clicks"),
    State("stored-username", "data"),
    State("input-k", "value"),
    prevent_initial_call=True
)
def run_hybrid_recommender(n_clicks, username, k):
    """Run the hybrid recommender and show its first ten predictions as cards.

    Args:
        n_clicks: Click count of the run button (only used as the trigger).
        username: Username kept in the session store.
        k: Value of the "similar games" slider, forwarded to
            ``predict_all_ratings``.

    Returns:
        A list of cards, or a single message ``html.Div`` when no username
        is stored or ``predict_all_ratings`` returns a string.
    """
    if not username:
        return html.Div("⚠️ Username not found. Please go back and enter your name.", style={"color": "red"})

    predictions = predict_all_ratings(user_name=username, games_df=games, reviews_df=reviews, k=k)

    # A string result is treated as an error message rather than data.
    if isinstance(predictions, str):
        return html.Div(f"⚠️ {predictions}", style={"color": "orange"})

    card_style = {
        "border": "1px solid #ccc",
        "borderRadius": "10px",
        "padding": "1rem",
        "marginBottom": "1rem",
        "backgroundColor": "#fefefe",
        "boxShadow": "0 2px 5px rgba(0,0,0,0.05)"
    }
    # Only the first 10 rows are displayed.
    return [
        html.Div(
            [
                html.H4(rec["name"]),
                html.P(f"Predicted Rating: {rec['predicted_rating']}"),
                html.P(rec["explanation"]),
            ],
            style=card_style,
        )
        for _, rec in predictions.head(10).iterrows()
    ]

@app.callback(
    Output('url', 'pathname', allow_duplicate=True),
    Input('algo-content', 'n_clicks'),
    Input('algo-collab', 'n_clicks'),
    Input('algo-hybrid', 'n_clicks'),
    prevent_initial_call=True
)
def go_to_algo(n1, n2, n3):
    """Navigate to the page of the recommender whose button was clicked.

    Raises:
        PreventUpdate: If none of the three strategy buttons triggered the
            callback.
    """
    pages = {
        "algo-content": "/content-based-mmr",
        "algo-collab": "/collaborative-filtering",
        "algo-hybrid": "/hybrid",
    }
    target = pages.get(callback_context.triggered_id)
    if target is None:
        raise PreventUpdate
    return target

@app.callback(
    Output("collab-recommendation-output", "children"),
    Input("run-collab-recommender", "n_clicks"),
    State("collab-method", "value"),
    State("collab-top-k", "value"),
    State("stored-username", "data"),
    prevent_initial_call=True
)
def run_collaborative_recommender(n_clicks, method, top_k, username):
    """Run the selected collaborative recommender and render result cards.

    Args:
        n_clicks: Click count of the run button (only used as the trigger).
        method: ``"knn"`` or ``"svd"``, from the radio buttons.
        top_k: Number of recommendations requested on the slider.
        username: Username kept in the session store.

    Returns:
        A list of cards with name and predicted rating, or a single message
        ``html.Div`` when the username is missing, the method is unknown or
        nothing was recommended.
    """
    if not username:
        return html.Div("⚠️ Username not found. Please go back and enter your name.", style={"color": "red"})

    if method not in ("knn", "svd"):
        return html.Div("⚠️ Unknown method selected.", style={"color": "red"})

    recommend = recommend_with_knn if method == "knn" else recommend_with_svd
    results = recommend(username, games, reviews, top_k)

    if results is None or results.empty:
        return html.Div("😕 No recommendations found.", style={"color": "orange"})

    card_style = {
        "border": "1px solid #ccc",
        "borderRadius": "10px",
        "padding": "1rem",
        "marginBottom": "1rem",
        "backgroundColor": "#fefefe",
        "boxShadow": "0 2px 5px rgba(0,0,0,0.05)"
    }
    return [
        html.Div(
            [
                html.H4(rec["name"]),
                html.P(f"Predicted Rating: {rec['predicted_rating']}"),
            ],
            style=card_style,
        )
        for _, rec in results.iterrows()
    ]

# Start the Dash server (debug mode on) only when this code is executed as
# the main program.
if __name__ == "__main__":
    app.run(debug=True)