# Content Based Recommender system

In [206]:
# In this recommender system the content of the movie 
# (overview, cast, crew, keyword, tagline etc) is used to find its similarity with other movies. 
# Then the movies that are most likely to be similar are recommended.

In [242]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [208]:
animes = pd.read_csv('animes.csv')

In [209]:
animes.head()

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83,https://cdn.myanimelist.net/images/anime/6/867...,https://myanimelist.net/anime/34599/Made_in_Abyss
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084,4,1.0,9.23,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']","Jan 6, 2017",1.0,214621,502,22.0,8.83,https://cdn.myanimelist.net/images/anime/3/815...,https://myanimelist.net/anime/31758/Kizumonoga...


In [210]:
# drop duplicate titles
animes = animes.drop_duplicates(subset=['title'])

### Plot description based Recommender

In [211]:
# We will compute pairwise similarity scores for all animes based on their plot descriptions and 
# recommend animes based on that similarity score. 
# The plot description is given in the overview feature of our dataset. Let's take a look at the data. ..

In [212]:
animes.synopsis.head()

0    Following their participation at the Inter-Hig...
1    Music accompanies the path of the human metron...
2    The Abyss—a gaping chasm stretching down into ...
3    "In order for something to be obtained, someth...
4    After helping revive the legendary vampire Kis...
Name: synopsis, dtype: object

In [213]:
# we need to convert the word vector for each description therefore we will TF-IDF VEctorizer

In [214]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
animes['synopsis'] = animes['synopsis'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(animes['synopsis'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(16214, 43912)

In [215]:
# Since we have used the TF-IDF vectorizer, 
# calculating the dot product will directly give us the cosine similarity score. 
# Therefore, we will use sklearn's linear_kernel() instead of cosine_similarities() since it is faster.

In [216]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [217]:
# We are going to define a function that takes in animes title as an input and outputs 
# a list of the 10 most similar animes. Firstly, for this, 
# we need a reverse mapping of anime titles and DataFrame indices. In other words, 
# we need a mechanism to identify the index of an anime in our metadata DataFrame, given its title

In [218]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(animes.index, index=animes['title']).drop_duplicates()

In [219]:
# Function that takes in anime title as input and outputs most similar animes
def get_recommendations(name, cosine_sim=cosine_sim):
    # Get the index of the anime that matches the title
    idx = indices[name]

    # Get the pairwsie similarity scores of all animes with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the animes based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar animes
    sim_scores = sim_scores[1:11]

    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar animes
    return animes['title'].iloc[anime_indices]

In [220]:
get_recommendations('Made in Abyss')

905                   Pandora Hearts
383                    Tsumiki no Ie
3347            Mahoutsukai Precure!
13846                 Whistle! (ONA)
229         Jigoku Shoujo Futakomori
12985            Gundam Build Divers
6600     Gundam Build Divers Re:Rise
6247                       Plunderer
7770        Yasashii Fue, Tori, Ishi
9504                        Kokuhaku
Name: title, dtype: object

# Metricas

In [221]:
df_train = pd.read_csv(
    "train", sep=",", names=["userid", "itemid", "rating"], header=None
)

df_train.rating = [1 if x >= 5 else 0 for x in df_train.rating]

df_train.head()

Unnamed: 0,userid,itemid,rating
0,2052,12445.0,1
1,5141,34599.0,1
2,3340,37510.0,1
3,588,853.0,1
4,4822,27775.0,1


In [222]:
df_test = pd.read_csv(
    "test", sep=",", names=["userid", "itemid", "rating"], header=None
)

df_test.head()

Unnamed: 0,userid,itemid,rating
0,18887,37450.0,10
1,8831,32379.0,2
2,37283,36882.0,9
3,35602,10490.0,9
4,39042,,6


In [223]:
item_interaction_counts = df_train['itemid'].value_counts()
user_count = df_train['userid'].nunique()
item_popularity = (item_interaction_counts / user_count).to_dict()
metadata = animes[['uid', 'genre']]
item_categories: dict[int, set[str | None]] = {}
for row in metadata.itertuples():
    item_categories[int(row[1]) if row[1].is_integer() else row[1]] = set(map(lambda i: i.strip(), row[2].split(','))) if isinstance(row[2], str) else set()

In [None]:
from metrics import precision_at_k, ndcg_at_k, novelty, diversity, recall_at_k, average_precision_at_k

In [225]:
user_items = {}
itemset = set()

for row in df_train.itertuples():
    if row[1] not in user_items:
        user_items[row[1]] = []

    user_items[row[1]].append(row[2])
    itemset.add(row[2])

itemset = np.sort(list(itemset))

In [226]:
user2row = {user_id: matrix_row for matrix_row, user_id in enumerate(user_items.keys())}
row2user = {matrix_row: user_id for user_id, matrix_row in user2row.items()}

uid_to_idx = pd.Series(range(len(animes)), index=animes['uid'])

In [227]:
print(uid_to_idx)

uid
28891        0
23273        1
34599        2
5114         3
31758        4
         ...  
10075    16209
35828    16210
10378    16211
33082    16212
16934    16213
Length: 16214, dtype: int64


In [None]:
def recommend(user_id, n=10):
    """Recommend top-n anime titles for a user based on content similarity."""
    seen_items = user_items.get(user_id, [])
    
    N = cosine_sim.shape[0]
    scores = np.zeros(N, dtype=float)

    for item in seen_items:
        try:
            idx = uid_to_idx.get(item)
        except KeyError:
            continue

        row = cosine_sim[idx]
        try:
            import scipy.sparse as _sps
            if _sps.issparse(row):
                row = row.toarray()
        except Exception:
            pass

        row = np.asarray(row)
        if row.ndim == 2:
            row = row.sum(axis=0)
        row = row.ravel()
        if row.shape[0] == N:
            scores += row
        else:
            continue

    # Exclude items the user has already seen
    seen_idxs = []
    for item in seen_items:
        if item in uid_to_idx.index:
            idx = uid_to_idx.get(item)
            seen_idxs.append(idx)
    if seen_idxs:
        scores[seen_idxs] = -np.inf

    top_idxs = np.argsort(scores)[::-1][:n]

    return animes["uid"].iloc[top_idxs].values

In [229]:
user_items_test = {}

for row in df_test.itertuples():
    if row[1] not in user_items_test:
        user_items_test[row[1]] = []

    user_items_test[row[1]].append(row[2])

In [None]:
def evaluate_model(n, user_groups, delta=0.05):
    """
    Evalúa el modelo calculando métricas globales y de fairness.
    
    Args:
        n (int): El 'k' para las métricas @k.
        user_groups (dict): {user_id: 'group_label'}
        delta (float): Umbral de disparidad para considerar "sesgado".
    """
    
    group_scores = {
        'recall': {},  
        'precision': {} 
    }
    
    all_recalls = []
    all_aps = []
    all_ndcgs = []
    all_novelties = []
    all_diversities = []
    all_precisions = []

    for user_id in user_items_test.keys():
        
        # 1. Obtener datos del usuario
        if user_id not in user_groups:
            continue
            
        group = user_groups[user_id]
        truth_items = user_items_test.get(user_id, set())
        
        # 2. Obtener recomendaciones y vector de relevancia
        rec = recommend(user_id, n) # n es k
        
        rel_vector = np.isin(rec, truth_items, assume_unique=True).astype(int)

        # 3. Calcular métricas individuales
        user_recall = recall_at_k(rel_vector, n)
        user_precision = precision_at_k(rel_vector, n)
        user_ap = average_precision_at_k(rel_vector, n)
        user_ndcg = ndcg_at_k(rel_vector, n)
        user_novelty = novelty(rec, item_popularity)
        user_diversity = diversity(rec, n, item_categories)

        # 4. Almacenar para métricas globales
        all_recalls.append(user_recall)
        all_precisions.append(user_precision)
        all_aps.append(user_ap)
        all_ndcgs.append(user_ndcg)
        all_novelties.append(user_novelty)
        all_diversities.append(user_diversity)

        # 5. Almacenar métricas de fairness por grupo
        if group not in group_scores['recall']:
            group_scores['recall'][group] = []
            group_scores['precision'][group] = []
            
        group_scores['recall'][group].append(user_recall)
        group_scores['precision'][group].append(user_precision)

    # --- 6. Calcular Métricas Promedio (Globales) ---
    
    metrics_global = {
        'mean_recall': np.mean(all_recalls) if all_recalls else 0.0,
        'mean_precision': np.mean(all_precisions) if all_precisions else 0.0,
        'mean_ap (MAP)': np.mean(all_aps) if all_aps else 0.0,
        'mean_ndcg': np.mean(all_ndcgs) if all_ndcgs else 0.0,
        'mean_novelty': np.mean(all_novelties) if all_novelties else 0.0,
        'mean_diversity': np.mean(all_diversities) if all_diversities else 0.0,
        'num_users_evaluated': len(all_recalls)
    }
    
    # --- 7. Calcular Métricas de Fairness (Disparidad) ---
    
    fairness_report = {
        'delta_threshold': delta,
        'is_biased_recall': 0,
        'is_biased_precision': 0,
        'group_averages': {},
        'disparity_reports': []
    }

    avg_recall_group = {g: np.mean(s) for g, s in group_scores['recall'].items() if s}
    avg_precision_group = {g: np.mean(s) for g, s in group_scores['precision'].items() if s}
    
    fairness_report['group_averages'] = {
        g: {
            'recall (Cobertura)': avg_recall_group.get(g, 0.0),
            'precision (Tasa Aceptación)': avg_precision_group.get(g, 0.0),
            'count': len(group_scores['recall'].get(g, []))
        } for g in avg_recall_group.keys()
    }

    # Comparar pares de grupos
    group_names = list(avg_recall_group.keys())
    for i in range(len(group_names)):
        for j in range(i + 1, len(group_names)):
            g_a = group_names[i]
            g_b = group_names[j]
            
            r_a, r_b = avg_recall_group[g_a], avg_recall_group[g_b]
            abs_disp_recall = abs(r_a - r_b)
            
            p_a, p_b = avg_precision_group[g_a], avg_precision_group[g_b]
            abs_disp_precision = abs(p_a - p_b)

            pair_report = {
                'pair': (g_a, g_b),
                'recall_disparity': abs_disp_recall,
                'precision_disparity': abs_disp_precision,
                'biased_recall (Cobertura)': 1 if abs_disp_recall > delta else 0,
                'biased_precision (Tasa Aceptación)': 1 if abs_disp_precision > delta else 0
            }
            fairness_report['disparity_reports'].append(pair_report)
            
            if abs_disp_recall > delta:
                fairness_report['is_biased_recall'] = 1
            if abs_disp_precision > delta:
                fairness_report['is_biased_precision'] = 1
    
    return metrics_global, fairness_report

In [None]:
profiles = pd.read_csv('data/profiles.csv')
profiles.head()

Unnamed: 0,profile,gender,birthday,favorites_anime,link
0,DesolatePsyche,Male,"Oct 2, 1994","['33352', '25013', '5530', '33674', '1482', '2...",https://myanimelist.net/profile/DesolatePsyche
1,baekbeans,Female,"Nov 10, 2000","['11061', '31964', '853', '20583', '918', '925...",https://myanimelist.net/profile/baekbeans
2,skrn,,,"['918', '2904', '11741', '17074', '23273', '32...",https://myanimelist.net/profile/skrn
3,edgewalker00,Male,Sep 5,"['5680', '849', '2904', '3588', '37349']",https://myanimelist.net/profile/edgewalker00
4,aManOfCulture99,Male,"Oct 30, 1999","['4181', '7791', '9617', '5680', '2167', '4382...",https://myanimelist.net/profile/aManOfCulture99


In [None]:
reviews = pd.read_csv("data/reviews.csv")

# Assign a unique id to each username
unique_users = reviews["username"].unique()
user_mapping = {user: i for i, user in enumerate(unique_users)}
reviews["user_id"] = reviews["username"].map(user_mapping)

In [256]:
profiles["user_id"] = profiles["profile"].map(user_mapping)

In [None]:
import json

user_groups_map = {}
# user_groups_map id a sexo
for row in profiles.itertuples():
    user_id = row.user_id
    user_groups_map[user_id] = row.gender

K = 10
DELTA_FAIRNESS = 0.05

global_metrics, fairness_results = evaluate_model(K, user_groups_map, DELTA_FAIRNESS)

print("--- Métricas Globales de Evaluación ---")
print(json.dumps(global_metrics, indent=2))

print("\n--- Reporte de Fairness (Disparidad de Grupo) ---")
print(json.dumps(fairness_results, indent=2))

print(f"\nRecall Global: {global_metrics['mean_recall']:.4f}")
print(f"MAP Global: {global_metrics['mean_ap (MAP)']:.4f}")
print(f"¿Es sesgado (Recall)?: {fairness_results['is_biased_recall']}")
print(f"¿Es sesgado (Precision)?: {fairness_results['is_biased_precision']}")

--- Métricas Globales de Evaluación ---
{
  "mean_recall": 0.02015145680913875,
  "mean_precision": 0.00215633423180593,
  "mean_ap (MAP)": 0.008987425458353453,
  "mean_ndcg": 0.011666327192415053,
  "mean_novelty": 7.8902495469825515,
  "mean_diversity": 0.8685184573651092,
  "num_users_evaluated": 15582
}

--- Reporte de Fairness (Disparidad de Grupo) ---
{
  "delta_threshold": 0.05,
  "is_biased_recall": 0,
  "is_biased_precision": 0,
  "group_averages": {
    "NaN": {
      "recall (Cobertura)": 0.018451664661050943,
      "precision (Tasa Aceptaci\u00f3n)": 0.001965503409546731,
      "count": 4986
    },
    "Female": {
      "recall (Cobertura)": 0.015105740181268883,
      "precision (Tasa Aceptaci\u00f3n)": 0.0015105740181268882,
      "count": 2979
    },
    "Male": {
      "recall (Cobertura)": 0.023526266541906163,
      "precision (Tasa Aceptaci\u00f3n)": 0.0025665018045715813,
      "count": 7481
    },
    "Non-Binary": {
      "recall (Cobertura)": 0.00735294117647058