In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.inputs import SparseFeat, DenseFeat, VarLenSparseFeat, get_feature_names
from deepctr_torch.models import DeepFM
from tensorflow.keras.preprocessing.sequence import pad_sequences

# DeepFM with Side Information (Genres, Score, Popularity)

In [2]:
# 1. Load Data
animes = pd.read_csv('clean_data/animes.csv')
df_train = pd.read_csv("train", sep=",", names=["userid", "itemid", "rating"], header=None)
df_test = pd.read_csv("test", sep=",", names=["userid", "itemid", "rating"], header=None)

# Binarize ratings (Target)
df_train['target'] = [1 if x >= 5 else 0 for x in df_train['rating']]
df_train = df_train.dropna(subset=['itemid'])

In [3]:
# 2. Feature Engineering on Animes Metadata
import ast  # <--- NEW IMPORT

# > Clean Numerical Features
# Handle 'Unknown' in episodes
animes['episodes'] = pd.to_numeric(animes['episodes'], errors='coerce').fillna(1)
animes['score'] = pd.to_numeric(animes['score'], errors='coerce').fillna(animes['score'].mean())
animes['members'] = pd.to_numeric(animes['members'], errors='coerce').fillna(0)

# > Extract Year from 'aired'
# Format is usually 'Month Day, Year' or 'Year'. We extract the first 4 digits.
animes['year'] = animes['aired'].str.extract(r'(\d{4})').fillna('2000')
animes['year'] = animes['year'].astype(int)

# > Process Genres (Multi-Label) <--- CHANGED SECTION
# The csv has genres like: "['Comedy', 'Sports']" (Stringified list)
# We use ast.literal_eval to safely convert that string into a real python list.
def parse_genre(x):
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return []

animes['genre'] = animes['genre'].fillna('[]').apply(parse_genre)

# Create Genre Vocabulary
genre_list = set()
for genres in animes['genre']:
    genre_list.update(genres)
    
genre_to_idx = {genre: i+1 for i, genre in enumerate(genre_list)} # 0 is padding
genre_vocab_size = len(genre_to_idx) + 1

# Encode Genres into integer lists
def encode_genres(genre_list):
    return [genre_to_idx.get(g, 0) for g in genre_list]

animes['genre_encoded'] = animes['genre'].apply(encode_genres)

# Pad genres to fixed length (e.g., max 4 genres per anime)
MAX_GENRE_LEN = 4
genre_padded = pad_sequences(animes['genre_encoded'], maxlen=MAX_GENRE_LEN, padding='post', truncating='post')

In [4]:
# 3. Merge Metadata into Interaction Data
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# Merge side info
data = data.merge(animes[['uid', 'score', 'members', 'episodes', 'year']], left_on='itemid', right_on='uid', how='left')

# Fill missing metadata (for items in train/test but not in animes.csv)
data['score'] = data['score'].fillna(data['score'].mean())
data['members'] = data['members'].fillna(0)
data['episodes'] = data['episodes'].fillna(1)
data['year'] = data['year'].fillna(2000)

# 4. Encode ID Features
lbe_user = LabelEncoder()
data['userid_enc'] = lbe_user.fit_transform(data['userid'])

lbe_item = LabelEncoder()
data['itemid_enc'] = lbe_item.fit_transform(data['itemid'])

lbe_year = LabelEncoder()
data['year_enc'] = lbe_year.fit_transform(data['year'])

# 5. Normalize Dense Features
mms = MinMaxScaler(feature_range=(0, 1))
data[['score', 'members', 'episodes']] = mms.fit_transform(data[['score', 'members', 'episodes']])

# 6. Prepare Genre Input Column
# We need to map the merged data back to the padded genre array
# Create a mapping from itemid -> padded_genre_row
item_id_to_genre = dict(zip(animes['uid'], genre_padded))

# Look up genres for every row in data
# Use a default [0,0,0,0] for missing items
default_genre = np.zeros(MAX_GENRE_LEN, dtype=int)
genre_feature_data = np.array([item_id_to_genre.get(iid, default_genre) for iid in data['itemid']])

In [8]:
# 7. Define Feature Columns for DeepFM
# FIX: All embedding_dim must be equal (e.g., 16) for DeepFM interaction layers.

fixlen_feature_columns = [
    SparseFeat("userid_enc", vocabulary_size=data['userid_enc'].max() + 1, embedding_dim=16),
    SparseFeat("itemid_enc", vocabulary_size=data['itemid_enc'].max() + 1, embedding_dim=16),
    # CHANGED: 8 -> 16
    SparseFeat("year_enc", vocabulary_size=data['year_enc'].max() + 1, embedding_dim=16), 
    DenseFeat("score", 1),
    DenseFeat("members", 1),
    DenseFeat("episodes", 1)
]

# Variable length feature for Genres
varlen_feature_columns = [
    # CHANGED: 8 -> 16
    VarLenSparseFeat(SparseFeat('genre', vocabulary_size=genre_vocab_size, embedding_dim=16),
                     maxlen=MAX_GENRE_LEN, combiner='mean')
]

linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
feature_names = get_feature_names(linear_feature_columns)

In [9]:
# 8. Split Train/Test and Generate Dictionaries
train_idx = slice(0, len(df_train))
test_idx = slice(len(df_train), len(data))

def make_input_dict(idx):
    input_dict = {name: data.iloc[idx][name].values for name in feature_names if 'genre' not in name}
    # Add the genre array separately
    input_dict['genre'] = genre_feature_data[idx]
    return input_dict

train_model_input = make_input_dict(train_idx)
test_model_input = make_input_dict(test_idx)

In [10]:
# 9. Train Model
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary', device=device)
model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'])

# Increased epochs to 5 for better convergence with new features
history = model.fit(train_model_input, df_train['target'].values, batch_size=256, epochs=5, verbose=1)

cpu
Train on 139689 samples, validate on 0 samples, 546 steps per epoch


546it [00:03, 164.91it/s]


Epoch 1/5
3s - loss:  0.3492 - binary_crossentropy:  0.3493


546it [00:02, 192.38it/s]


Epoch 2/5
2s - loss:  0.3328 - binary_crossentropy:  0.3328


546it [00:02, 183.69it/s]


Epoch 3/5
2s - loss:  0.3034 - binary_crossentropy:  0.3033


546it [00:02, 193.22it/s]


Epoch 4/5
2s - loss:  0.2784 - binary_crossentropy:  0.2783


546it [00:02, 189.94it/s]

Epoch 5/5
2s - loss:  0.2640 - binary_crossentropy:  0.2638





In [11]:
# --- Inference Logic with Side Info ---

# Pre-calculate unique item features to avoid re-processing in the loop
unique_items = data[['itemid_enc', 'itemid', 'score', 'members', 'episodes', 'year_enc']].drop_duplicates('itemid_enc')
unique_items = unique_items.sort_values('itemid_enc').reset_index(drop=True)

# Helper arrays for fast lookup
all_items_enc = unique_items['itemid_enc'].values
all_item_scores = unique_items['score'].values
all_item_members = unique_items['members'].values
all_item_episodes = unique_items['episodes'].values
all_item_years = unique_items['year_enc'].values

# Genre lookup by encoded item id
# unique_items['itemid'] maps to original ID. We use that to get genre array.
all_item_genres = np.array([item_id_to_genre.get(iid, default_genre) for iid in unique_items['itemid']])

enc_to_raw_item = {enc: raw for enc, raw in zip(unique_items['itemid_enc'], unique_items['itemid'])}

def get_recommendations(user_id, n):
    try:
        user_enc = lbe_user.transform([user_id])[0]
    except ValueError:
        return np.array([])

    # Broadcast User ID
    count = len(all_items_enc)
    user_enc_col = np.full(count, user_enc)
    
    # Construct Input with ALL features
    pred_input = {
        "userid_enc": user_enc_col,
        "itemid_enc": all_items_enc,
        "year_enc": all_item_years,
        "score": all_item_scores,
        "members": all_item_members,
        "episodes": all_item_episodes,
        "genre": all_item_genres
    }
    
    # Predict
    preds = model.predict(pred_input, batch_size=4096).flatten()
    
    # Rank
    top_indices = preds.argsort()[-n:][::-1]
    top_enc_items = all_items_enc[top_indices]
    
    return np.array([enc_to_raw_item[i] for i in top_enc_items])

In [12]:
from evaluate import get_metrics

# Metrics setup (same as before)
item_interaction_counts = df_train['itemid'].value_counts()
user_count = df_train['userid'].nunique()
item_popularity = (item_interaction_counts / user_count).to_dict()
metadata = animes[['uid', 'genre']]
item_categories: dict[int, set[str | None]] = {}
for row in metadata.itertuples():
    item_categories[int(row[1]) if hasattr(row[1], 'is_integer') and row[1].is_integer() else row[1]] = set(map(lambda i: i.strip(), row[2].split(','))) if isinstance(row[2], str) else set()

user_items_test = {}
for row in df_test.itertuples():
    if row.userid not in user_items_test:
        user_items_test[row.userid] = []
    user_items_test[row.userid].append(row.itemid)

get_metrics(user_items_test, item_popularity, item_categories, get_recommendations, k=10, delta=0.05)

Evaluando usuarios: 100%|██████████| 18591/18591 [08:32<00:00, 36.25it/s]

--- Métricas Globales de Evaluación ---
{
  "mean_recall": 0.004525695230155422,
  "mean_precision": 0.0004525695230155422,
  "mean_ap (MAP)": 0.0015170011314238076,
  "mean_ndcg": 0.0022015454557601315,
  "mean_novelty": 9.765866344968858,
  "mean_diversity": 0.0,
  "num_users_evaluated": 16793
}

--- Reporte de Fairness (Disparidad de Grupo) ---
{
  "delta_threshold": 0.05,
  "is_biased_recall": 0,
  "is_biased_precision": 0,
  "group_averages": {
    "Male": {
      "recall (Cobertura)": 0.0039657950179700086,
      "precision (Tasa Aceptaci\u00f3n)": 0.0003965795017970009,
      "count": 8069
    },
    "NaN": {
      "recall (Cobertura)": 0.004893077201884741,
      "precision (Tasa Aceptaci\u00f3n)": 0.0004893077201884741,
      "count": 5518
    },
    "Non-Binary": {
      "recall (Cobertura)": 0.0,
      "precision (Tasa Aceptaci\u00f3n)": 0.0,
      "count": 137
    },
    "Female": {
      "recall (Cobertura)": 0.005539263603779733,
      "precision (Tasa Aceptaci\u00f3n)": 


