In [1]:
import pandas as pd
import numpy as np
import implicit
import scipy.sparse as sp

  from .autonotebook import tqdm as notebook_tqdm


# Metricas

In [2]:
animes = pd.read_csv('clean_data/animes.csv')

In [3]:
df_train = pd.read_csv(
    "train", sep=",", names=["userid", "itemid", "rating"], header=None
)

df_train.rating = [1 if x >= 5 else 0 for x in df_train.rating]

df_train.head()

Unnamed: 0,userid,itemid,rating
0,2052,12445.0,1
1,5141,34599.0,1
2,3340,37510.0,1
3,588,853.0,1
4,4822,27775.0,1


In [4]:
df_test = pd.read_csv(
    "test", sep=",", names=["userid", "itemid", "rating"], header=None
)

df_test.head()

Unnamed: 0,userid,itemid,rating
0,18887,37450.0,10
1,8831,32379.0,2
2,37283,36882.0,9
3,35602,10490.0,9
4,39042,,6


In [5]:
item_interaction_counts = df_train['itemid'].value_counts()
user_count = df_train['userid'].nunique()
item_popularity = (item_interaction_counts / user_count).to_dict()
metadata = animes[['uid', 'genre']]
item_categories: dict[int, set[str | None]] = {}
for row in metadata.itertuples():
    if isinstance(row.genre, str):
        genre_list = set(g.strip().replace("'", "") for g in row.genre[1:-1].split(','))
    else:
        genre_list: set = set()

    if isinstance(row.uid, int):
        item_categories[row.uid] = genre_list
    else:
        raise ValueError("Unexpected non-integer uid")

In [6]:
# drop nan items
df_train = df_train.dropna(subset=['itemid'])

In [7]:
user_items = {}
itemset = set()

for row in df_train.itertuples():
    if row[1] not in user_items:
        user_items[row[1]] = []

    user_items[row[1]].append(row[2])
    itemset.add(row[2])

itemset = np.sort(list(itemset))

sparse_matrix = np.zeros((len(user_items), len(itemset)))

for i, items in enumerate(user_items.values()):
    sparse_matrix[i] = np.isin(itemset, items, assume_unique=True).astype(int)

matrix = sp.csr_matrix(sparse_matrix.T)

user_item_matrix = matrix.T.tocsr()

In [8]:
user2row = {user_id: matrix_row for matrix_row, user_id in enumerate(user_items.keys())}
row2user = {matrix_row: user_id for user_id, matrix_row in user2row.items()}

item2col = {item_id: matrix_col for matrix_col, item_id in enumerate(itemset)}
col2item = {matrix_col: item_id for item_id, matrix_col in item2col.items()}

In [9]:
user_items_test = {}

for row in df_test.itertuples():
    if row[1] not in user_items_test:
        user_items_test[row[1]] = []

    user_items_test[row[1]].append(row[2])

# Drop all users that are not in the training set
user_items_test = {user: items for user, items in user_items_test.items() if user in user2row}

In [10]:
model_als = implicit.als.AlternatingLeastSquares(factors=300)
model_als.fit(user_item_matrix, show_progress=False)

  check_blas_config()


In [11]:
def get_recommendations(user_id, n):
    # if user_id not in user2row:
    #     continue
        
    user_row = user2row[user_id]
    
    rec = model_als.recommend(user_row, user_item_matrix[user_row], n)[0]
    return np.array([col2item[col] for col in rec])

In [12]:
from evaluate import get_metrics

get_metrics(user_items_test, item_popularity, item_categories, get_recommendations, k=10, delta=0.2)

Evaluando usuarios: 100%|██████████| 15419/15419 [00:09<00:00, 1625.92it/s]

--- Métricas Globales de Evaluación ---
{
  "mean_recall": 0.027152887233127955,
  "mean_precision": 0.002987534030663419,
  "mean_ap (MAP)": 0.012284287670267038,
  "mean_ndcg": 0.015861270266657807,
  "mean_novelty": 8.494715920776278,
  "mean_diversity": 0.8390062703377051,
  "num_users_evaluated": 13958
}

--- Reporte de Fairness (Disparidad de Grupo) ---
{
  "delta_threshold": 0.2,
  "group_averages": {
    "NaN": {
      "recall (Cobertura)": 0.025711857203569912,
      "precision (Tasa Aceptaci\u00f3n)": 0.002826179345516362,
      "MAP": 0.010915649727804424,
      "NDCG": 0.014471401082798244,
      "novelty": 8.515348831176267,
      "diversity": 0.8397234974235965,
      "count": 4706
    },
    "Female": {
      "recall (Cobertura)": 0.021241830065359478,
      "precision (Tasa Aceptaci\u00f3n)": 0.0022467320261437907,
      "MAP": 0.009115961199294533,
      "NDCG": 0.012036175317187857,
      "novelty": 8.453714853198301,
      "diversity": 0.8377316424875657,
      "coun


