In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load train and test CSV files into DataFrames
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

train_df.sample(10)

Unnamed: 0,user,item,rating
51559,6236,129400,7.0
70953,9196,163891,7.0
24716,2684,79945,10.0
254963,48311,88661,6.0
241717,45223,21163,8.0
286172,54670,643,7.0
66679,8512,81893,5.0
63640,7906,151324,8.0
5992,534,39933,8.0
366665,72282,177443,2.0


In [5]:
test_df.sample(10)

Unnamed: 0,ID,user,item,rating
16683,16683,839,4644,5.0
11930,11930,4201,13354,10.0
17823,17823,534,1413,10.0
33574,33574,6740,17917,7.0
32946,32946,10548,24539,7.0
15224,15224,1959,7458,5.0
24856,24856,1357,2004,8.0
4366,4366,3711,2295,7.0
14121,14121,6010,16711,6.0
21023,21023,2639,9061,10.0


In [3]:
rating_distribution = train_df['rating'].value_counts().sort_index()
print(rating_distribution)

rating
1.0      1592
2.0      2478
3.0      5423
4.0      8042
5.0     45231
6.0     33489
7.0     69102
8.0     93366
9.0     60507
10.0    71121
Name: count, dtype: int64


## 

## BNMF

In [5]:
import pandas as pd
import numpy as np
import random
from collections import defaultdict
from scipy.special import digamma
import csv

# Configuración
NUM_FACTORS = 5
ALPHA = 0.8
BETA = 5
R = 4
MIN_RATING = 0
MAX_RATING = 10
NUM_ITERATIONS = 50

# Leer datasets
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

# Indexado consecutivo
user_ids = sorted(set(df_train['user']).union(df_test['user']))
item_ids = sorted(set(df_train['item']).union(df_test['item']))

user_to_index = {uid: idx for idx, uid in enumerate(user_ids)}
item_to_index = {iid: idx for idx, iid in enumerate(item_ids)}
index_to_user = {idx: uid for uid, idx in user_to_index.items()}
index_to_item = {idx: iid for iid, idx in item_to_index.items()}

NUM_USERS = len(user_ids)
NUM_ITEMS = len(item_ids)

# Diccionarios dispersos
ratings = {}
user_items = defaultdict(set)
item_users = defaultdict(set)

for _, row in df_train.iterrows():
    u = user_to_index[row['user']]
    i = item_to_index[row['item']]
    r = row['rating']
    ratings[(u, i)] = r
    user_items[u].add(i)
    item_users[i].add(u)

# Inicialización aleatoria de parámetros
gamma = np.random.rand(NUM_USERS, NUM_FACTORS)
ep = np.random.rand(NUM_ITEMS, NUM_FACTORS)
em = np.random.rand(NUM_ITEMS, NUM_FACTORS)

# Entrenamiento BNMF
for it in range(NUM_ITERATIONS):
    print(f"Iteración {it+1} de {NUM_ITERATIONS}")

    lmbda = dict()

    for (u, i), rating in ratings.items():
        r_norm = (rating - MIN_RATING) / (MAX_RATING - MIN_RATING)
        rp = R * r_norm
        rm = R * (1 - r_norm)

        lambda_prime = np.exp(
            digamma(gamma[u]) +
            rp * digamma(ep[i]) +
            rm * digamma(em[i]) -
            R * digamma(ep[i] + em[i])
        )

        lmbda[(u, i)] = lambda_prime / np.sum(lambda_prime)

    # Reinicializar gamma, ep, em con ALPHA y BETA
    gamma = np.full((NUM_USERS, NUM_FACTORS), ALPHA, dtype=np.float64)
    ep = np.full((NUM_ITEMS, NUM_FACTORS), BETA, dtype=np.float64)
    em = np.full((NUM_ITEMS, NUM_FACTORS), BETA, dtype=np.float64)

    # Actualizar parámetros
    for (u, i), rating in ratings.items():
        r_norm = (rating - MIN_RATING) / (MAX_RATING - MIN_RATING)
        rp = R * r_norm
        rm = R * (1 - r_norm)
        lam = lmbda[(u, i)]

        gamma[u] += lam
        ep[i] += lam * rp
        em[i] += lam * rm

# Calcular matrices a y b
a = gamma / gamma.sum(axis=1, keepdims=True)
b = ep / (ep + em)

# Predicción normalizada
def predict(u, i):
    return np.dot(a[u], b[i]) * (MAX_RATING - MIN_RATING) + MIN_RATING

# Predicción para test y guardado en CSV
output_rows = []

for _, row in df_test.iterrows():
    test_id = row['ID']
    u_raw = row['user']
    i_raw = row['item']

    u = user_to_index.get(u_raw)
    i = item_to_index.get(i_raw)

    if u is not None and i is not None:
        pred = predict(u, i)
        pred = round(max(MIN_RATING, min(MAX_RATING, pred)), 3)
    else:
        pred = 8.0  # Valor por defecto si faltan datos

    output_rows.append((test_id, pred))

# Guardar archivo
with open("predicciones_BNMF_50_iter.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["ID", "rating"])
    writer.writerows(output_rows)

print("Archivo generado correctamente.")


Iteración 1 de 50
Iteración 2 de 50
Iteración 3 de 50
Iteración 4 de 50
Iteración 5 de 50
Iteración 6 de 50
Iteración 7 de 50
Iteración 8 de 50
Iteración 9 de 50
Iteración 10 de 50
Iteración 11 de 50
Iteración 12 de 50
Iteración 13 de 50
Iteración 14 de 50
Iteración 15 de 50
Iteración 16 de 50
Iteración 17 de 50
Iteración 18 de 50
Iteración 19 de 50
Iteración 20 de 50
Iteración 21 de 50
Iteración 22 de 50
Iteración 23 de 50
Iteración 24 de 50
Iteración 25 de 50
Iteración 26 de 50
Iteración 27 de 50
Iteración 28 de 50
Iteración 29 de 50
Iteración 30 de 50
Iteración 31 de 50
Iteración 32 de 50
Iteración 33 de 50
Iteración 34 de 50
Iteración 35 de 50
Iteración 36 de 50
Iteración 37 de 50
Iteración 38 de 50
Iteración 39 de 50
Iteración 40 de 50
Iteración 41 de 50
Iteración 42 de 50
Iteración 43 de 50
Iteración 44 de 50
Iteración 45 de 50
Iteración 46 de 50
Iteración 47 de 50
Iteración 48 de 50
Iteración 49 de 50
Iteración 50 de 50
Archivo generado correctamente.


### Hiperparámetros

In [None]:
def train_and_evaluate_bnmf(train_df, val_df, num_factors, alpha, beta, R, iterations):
    # Preparar datos e índices
    user_ids = sorted(set(train_df['user']).union(val_df['user']))
    item_ids = sorted(set(train_df['item']).union(val_df['item']))
    user_to_index = {uid: idx for idx, uid in enumerate(user_ids)}
    item_to_index = {iid: idx for idx, iid in enumerate(item_ids)}
    num_users = len(user_ids)
    num_items = len(item_ids)

    ratings = {}
    for _, row in train_df.iterrows():
        u = user_to_index[row['user']]
        i = item_to_index[row['item']]
        ratings[(u, i)] = row['rating']

    # Inicializar parámetros
    gamma = np.random.rand(num_users, num_factors)
    ep = np.random.rand(num_items, num_factors)
    em = np.random.rand(num_items, num_factors)

    for _ in range(iterations):
        lmbda = {}
        for (u, i), rating in ratings.items():
            r_norm = (rating - MIN_RATING) / (MAX_RATING - MIN_RATING)
            rp = R * r_norm
            rm = R * (1 - r_norm)

            lambda_prime = np.exp(
                digamma(gamma[u]) +
                rp * digamma(ep[i]) +
                rm * digamma(em[i]) -
                R * digamma(ep[i] + em[i])
            )
            lmbda[(u, i)] = lambda_prime / np.sum(lambda_prime)

        gamma = np.full((num_users, num_factors), alpha, dtype=np.float64)
        ep = np.full((num_items, num_factors), beta, dtype=np.float64)
        em = np.full((num_items, num_factors), beta, dtype=np.float64)

        for (u, i), rating in ratings.items():
            r_norm = (rating - MIN_RATING) / (MAX_RATING - MIN_RATING)
            rp = R * r_norm
            rm = R * (1 - r_norm)
            lam = lmbda[(u, i)]
            gamma[u] += lam
            ep[i] += lam * rp
            em[i] += lam * rm

    # Calcular a y b
    a = gamma / gamma.sum(axis=1, keepdims=True)
    b = ep / (ep + em)

    # Validación MAE
    errors = []
    for _, row in val_df.iterrows():
        u = user_to_index.get(row['user'])
        i = item_to_index.get(row['item'])
        if u is not None and i is not None:
            pred = np.dot(a[u], b[i]) * (MAX_RATING - MIN_RATING) + MIN_RATING
            errors.append(abs(row['rating'] - pred))
    return np.mean(errors) if errors else None



# División train/validación
train_data, val_data = train_test_split(df_train, test_size=0.2, random_state=42)

# Rejilla de hiperparámetros
param_grid = {
    "num_factors": [5, 10, 15],
    "alpha": [0.3, 0.5, 0.8],
    "beta": [2, 3, 5],
    "R": [2, 4],
    "iterations": [10, 15, 20, 30]
}

best_mae = float("inf")
best_config = None

for k in param_grid["num_factors"]:
    for alpha in param_grid["alpha"]:
        for beta in param_grid["beta"]:
            for R in param_grid["R"]:
                for iters in param_grid["iterations"]:
                    print(f"Entrenando BNMF con k={k}, alpha={alpha}, beta={beta}, R={R}, iters={iters}")
                    mae = train_and_evaluate_bnmf(train_data, val_data, k, alpha, beta, R, iters)
                    print(f"MAE = {mae:.4f}")
                    if mae < best_mae:
                        best_mae = mae
                        best_config = (k, alpha, beta, R, iters)

print("\nMejores hiperparámetros encontrados:")
print(f"Factores: {best_config[0]}, ALPHA: {best_config[1]}, BETA: {best_config[2]}, R: {best_config[3]}, Iter: {best_config[4]}")
print(f"MAE validación: {best_mae:.4f}")

Entrenando BNMF con k=3, alpha=0.3, beta=2, R=2, iters=10
MAE = 2.1849
Entrenando BNMF con k=3, alpha=0.3, beta=2, R=2, iters=15
MAE = 2.1946
Entrenando BNMF con k=3, alpha=0.3, beta=2, R=2, iters=20
MAE = 2.2006
Entrenando BNMF con k=3, alpha=0.3, beta=2, R=2, iters=30
MAE = 2.2072
Entrenando BNMF con k=3, alpha=0.3, beta=2, R=4, iters=10
MAE = 2.1202
Entrenando BNMF con k=3, alpha=0.3, beta=2, R=4, iters=15
MAE = 2.1347
Entrenando BNMF con k=3, alpha=0.3, beta=2, R=4, iters=20
MAE = 2.1436
Entrenando BNMF con k=3, alpha=0.3, beta=2, R=4, iters=30
MAE = 2.1514
Entrenando BNMF con k=3, alpha=0.3, beta=3, R=2, iters=10
MAE = 2.2464
Entrenando BNMF con k=3, alpha=0.3, beta=3, R=2, iters=15
MAE = 2.2503
Entrenando BNMF con k=3, alpha=0.3, beta=3, R=2, iters=20
MAE = 2.2552
Entrenando BNMF con k=3, alpha=0.3, beta=3, R=2, iters=30
MAE = 2.2596
Entrenando BNMF con k=3, alpha=0.3, beta=3, R=4, iters=10


  lmbda[(u, i)] = lambda_prime / np.sum(lambda_prime)


MAE = nan
Entrenando BNMF con k=3, alpha=0.3, beta=3, R=4, iters=15
MAE = 2.1693
Entrenando BNMF con k=3, alpha=0.3, beta=3, R=4, iters=20


KeyboardInterrupt: 