In [25]:
import pandas as pd
import numpy as np
import implicit
import scipy.sparse as sp

# Métricas

In [26]:
# Anime metadata table; its `uid` and `genre` columns feed the per-item
# category lookup built later in the notebook.
animes = pd.read_csv("clean_data/animes.csv")

In [27]:
# Ratings at or above this value are treated as positive feedback (1),
# anything lower as negative (0).
POSITIVE_RATING_THRESHOLD = 5

# The file is read with header=None, so the column names are supplied here.
df_train = pd.read_csv(
    "train", sep=",", names=["userid", "itemid", "rating"], header=None
)

# Binarise the ratings in a single vectorised pass instead of looping over
# the column in Python; a missing rating compares False and therefore maps to 0.
df_train["rating"] = (df_train["rating"] >= POSITIVE_RATING_THRESHOLD).astype("int64")

df_train.head()

Unnamed: 0,userid,itemid,rating
0,14179,13601.0,1
1,37548,34300.0,1
2,796,2592.0,0
3,3041,949.0,1
4,2493,5114.0,1


In [28]:
# Held-out interactions. Unlike the training frame, the rating column is
# left untouched in this cell.
df_test = pd.read_csv("test", header=None, names=["userid", "itemid", "rating"], sep=",")

df_test.head()

Unnamed: 0,userid,itemid,rating
0,506,37517.0,10
1,16392,7311.0,9
2,553,12471.0,5
3,13348,8937.0,6
4,276,35997.0,3


In [29]:
def _normalize_item_id(raw_id):
    """Return integral float ids as ``int``; leave every other value untouched.

    Checking for ``float`` first avoids calling ``is_integer`` on plain ``int``
    ids, which only have that method on Python 3.12+.
    """
    if isinstance(raw_id, float) and raw_id.is_integer():
        return int(raw_id)
    return raw_id


def _parse_genres(raw_genres):
    """Split a comma-separated genre string into a set of stripped labels.

    Anything that is not a string (e.g. a missing value) yields an empty set.
    """
    if not isinstance(raw_genres, str):
        return set()
    return {label.strip() for label in raw_genres.split(",")}


# Popularity of an item: number of training rows that mention it divided by
# the number of distinct training users.
item_interaction_counts = df_train["itemid"].value_counts()
user_count = df_train["userid"].nunique()
item_popularity = (item_interaction_counts / user_count).to_dict()

# Item id -> set of genre labels, taken from the anime metadata table.
# If the same uid appears more than once, the last row wins.
metadata = animes[["uid", "genre"]]
item_categories: dict[int, set[str | None]] = {
    _normalize_item_id(uid): _parse_genres(genre)
    for uid, genre in metadata.itertuples(index=False, name=None)
}

In [30]:
# Keep only the interactions whose item id is present (rows with a NaN
# item id are discarded).
df_train = df_train[df_train["itemid"].notna()]

In [31]:
# Group the training interactions by user. Dict insertion order (first
# appearance of each user) defines the row order of the matrices below.
# NOTE(review): every interaction becomes a 1 in the matrix regardless of the
# value in the `rating` column - confirm that this is intended.
user_items = {}
for user_id, item_id in zip(df_train["userid"], df_train["itemid"]):
    user_items.setdefault(user_id, []).append(item_id)

# Sorted array of the distinct item ids seen in training; an item's position
# in this array is its column index.
itemset = np.sort(
    list({item_id for items in user_items.values() for item_id in items})
)

# Build the user x item matrix directly in CSR form. This avoids allocating a
# dense (n_users, n_items) float array and scanning the whole item set once
# per user. Every id in `items` is present in the sorted `itemset`, so
# searchsorted returns its exact column; np.unique sorts the columns and
# collapses repeated interactions with the same item.
column_blocks = [
    np.unique(np.searchsorted(itemset, items)) for items in user_items.values()
]
row_pointers = np.concatenate(
    ([0], np.cumsum([len(block) for block in column_blocks], dtype=np.int64))
)
column_indices = (
    np.concatenate(column_blocks) if column_blocks else np.empty(0, dtype=np.int64)
)
interaction_values = np.ones(len(column_indices), dtype=np.float64)

user_item_matrix = sp.csr_matrix(
    (interaction_values, column_indices, row_pointers),
    shape=(len(user_items), len(itemset)),
)

# Item x user view of the same interactions.
matrix = user_item_matrix.T.tocsr()

In [32]:
# Users: the row index follows the iteration order of `user_items`.
row2user = dict(enumerate(user_items))
user2row = {user_id: matrix_row for matrix_row, user_id in row2user.items()}

# Items: the column index follows the order of `itemset`.
col2item = dict(enumerate(itemset))
item2col = {item_id: matrix_col for matrix_col, item_id in col2item.items()}

In [33]:
# Collect, per test user, the list of items they interacted with (file order).
# NOTE(review): rows with a NaN item id are not removed from the test frame in
# this cell - confirm whether they should be.
user_items_test = {}

for row in df_test.itertuples():
    user_items_test.setdefault(row[1], []).append(row[2])

# Keep only the users that also have a row in the training matrix.
user_items_test = {
    user: user_items_test[user] for user in user_items_test if user in user2row
}

In [34]:
# Model hyperparameters, named so they can be tuned in one place.
ALS_FACTORS = 300
ALS_RANDOM_STATE = 42

# ALS starts from randomly initialised factors; fixing the seed makes repeated
# runs start from the same initial state, so that the evaluation numbers do
# not drift between executions of the notebook.
model_als = implicit.als.AlternatingLeastSquares(
    factors=ALS_FACTORS, random_state=ALS_RANDOM_STATE
)
model_als.fit(user_item_matrix, show_progress=False)

In [35]:
def get_recommendations(user_id, n):
    """Return the item ids recommended by the ALS model for one user.

    Args:
        user_id: Original user id; it must be a key of ``user2row``.
        n: Number of recommendations requested from the model.

    Returns:
        ``np.ndarray`` with the recommended item ids, obtained by translating
        the column indices returned by the model through ``col2item``.

    Raises:
        KeyError: If ``user_id`` has no row in the training matrix.
    """
    row_index = user2row[user_id]

    # The first element of the model's result holds the recommended columns.
    recommended_cols = model_als.recommend(
        row_index, user_item_matrix[row_index], N=n
    )[0]
    return np.array([col2item[col_index] for col_index in recommended_cols])

In [36]:
from evaluate import get_metrics

# Evaluate the recommender on the held-out users; the report is rendered as
# this cell's output.
get_metrics(
    user_items_test,
    item_popularity,
    item_categories,
    get_recommendations,
    k=10,
    delta=0.05,
)

--- Métricas Globales de Evaluación ---
{
  "mean_recall": 0.0227966253814396,
  "mean_precision": 0.0024591635253993897,
  "mean_ap (MAP)": 0.010818246702737818,
  "mean_ndcg": 0.013692820278637644,
  "mean_novelty": 8.478027005045533,
  "mean_diversity": 0.9075535178500433,
  "num_users_evaluated": 16713
}

--- Reporte de Fairness (Disparidad de Grupo) ---
{
  "delta_threshold": 0.05,
  "is_biased_recall": 0,
  "is_biased_precision": 0,
  "group_averages": {
    "Male": {
      "recall (Cobertura)": 0.023999005222581447,
      "precision (Tasa Aceptaci\u00f3n)": 0.002611290723700572,
      "count": 8042
    },
    "NaN": {
      "recall (Cobertura)": 0.02135036496350365,
      "precision (Tasa Aceptaci\u00f3n)": 0.0023357664233576644,
      "count": 5480
    },
    "Non-Binary": {
      "recall (Cobertura)": 0.029197080291970802,
      "precision (Tasa Aceptaci\u00f3n)": 0.0029197080291970805,
      "count": 137
    },
    "Female": {
      "recall (Cobertura)": 0.021938441388343156,