In [None]:
import pandas as pd
import pandas as pd
import numpy as np
import random
import csv
from collections import defaultdict

In [None]:
# Load train and test CSV files into DataFrames
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

df_train.sample(10)

Unnamed: 0,user,item,rating
51559,6236,129400,7.0
70953,9196,163891,7.0
24716,2684,79945,10.0
254963,48311,88661,6.0
241717,45223,21163,8.0
286172,54670,643,7.0
66679,8512,81893,5.0
63640,7906,151324,8.0
5992,534,39933,8.0
366665,72282,177443,2.0


In [None]:
df_test.sample(10)

Unnamed: 0,ID,user,item,rating
16683,16683,839,4644,5.0
11930,11930,4201,13354,10.0
17823,17823,534,1413,10.0
33574,33574,6740,17917,7.0
32946,32946,10548,24539,7.0
15224,15224,1959,7458,5.0
24856,24856,1357,2004,8.0
4366,4366,3711,2295,7.0
14121,14121,6010,16711,6.0
21023,21023,2639,9061,10.0


In [None]:
rating_distribution = df_train['rating'].value_counts().sort_index()
print(rating_distribution)

rating
1.0      1592
2.0      2478
3.0      5423
4.0      8042
5.0     45231
6.0     33489
7.0     69102
8.0     93366
9.0     60507
10.0    71121
Name: count, dtype: int64


## PMF

In [None]:
# Parámetros del modelo
NUM_FACTORS = 7
LEARNING_RATE = 0.001
REGULARIZATION = 0.1
NUM_ITERATIONS = 10
MIN_RATING = 0
MAX_RATING = 10

### Hiperparámetros

In [3]:
from sklearn.model_selection import train_test_split

# Dividir datos de entrenamiento en train + validación
train_data, val_data = train_test_split(df_train, test_size=0.2, random_state=42)

In [7]:
def train_and_evaluate(train_df, val_df, num_factors, lr, reg, iterations):
    # Indexado
    user_ids = sorted(set(train_df['user']).union(val_df['user']))
    item_ids = sorted(set(train_df['item']).union(val_df['item']))
    user_to_index = {uid: idx for idx, uid in enumerate(user_ids)}
    item_to_index = {iid: idx for idx, iid in enumerate(item_ids)}
    num_users = len(user_ids)
    num_items = len(item_ids)

    ratings = {}
    for _, row in train_df.iterrows():
        u = user_to_index[row['user']]
        i = item_to_index[row['item']]
        ratings[(u, i)] = row['rating']

    rating_avg = sum(ratings.values()) / len(ratings)

    # Inicialización
    p = np.random.rand(num_users, num_factors)
    q = np.random.rand(num_items, num_factors)
    bu = np.random.rand(num_users)
    bi = np.random.rand(num_items)

    # Entrenamiento
    for _ in range(iterations):
        for (u, i), r in ratings.items():
            pred = rating_avg + bu[u] + bi[i] + np.dot(p[u], q[i])
            error = r - pred
            bu[u] += lr * (error - reg * bu[u])
            bi[i] += lr * (error - reg * bi[i])
            p[u] += lr * (error * q[i] - reg * p[u])
            q[i] += lr * (error * p[u] - reg * q[i])

    # Validación
    errors = []
    for _, row in val_df.iterrows():
        u = user_to_index.get(row['user'])
        i = item_to_index.get(row['item'])
        if u is not None and i is not None:
            pred = rating_avg + bu[u] + bi[i] + np.dot(p[u], q[i])
            pred = max(MIN_RATING, min(MAX_RATING, pred))
            errors.append(abs(row['rating'] - pred))
    return np.mean(errors)

In [None]:
param_grid = {
    "num_factors": [5, 10, 15, 20],
    "lr": [0.0001, 0.001, 0.01],
    "reg": [0.001, 0.05, 0.1, 0.5],
    "iterations": [10, 15, 20, 25, 30]
}

best_mae = float('inf')
best_config = None

for f in param_grid["num_factors"]:
    for lr in param_grid["lr"]:
        for reg in param_grid["reg"]:
            for iters in param_grid["iterations"]:
                print(f"Entrenando con factores={f}, lr={lr}, reg={reg}, iters={iters}")
                mae = train_and_evaluate(train_data, val_data, f, lr, reg, iters)
                print(f"MAE: {mae:.4f}")
                if mae < best_mae:
                    best_mae = mae
                    best_config = (f, lr, reg, iters)
    
print("\nMejores hiperparámetros encontrados:")
print(f"Factores: {best_config[0]}, LR: {best_config[1]}, Reg: {best_config[2]}, Iteraciones: {best_config[3]}")
print(f"MAE en validación: {best_mae:.4f}")

Entrenando con factores=5, lr=0.0001, reg=0.001, iters=10
MAE: 2.0154
Entrenando con factores=5, lr=0.0001, reg=0.001, iters=15
MAE: 1.9691
Entrenando con factores=5, lr=0.0001, reg=0.001, iters=20
MAE: 1.9297
Entrenando con factores=5, lr=0.0001, reg=0.001, iters=25
MAE: 1.8936
Entrenando con factores=5, lr=0.0001, reg=0.001, iters=30
MAE: 1.8702
Entrenando con factores=5, lr=0.0001, reg=0.05, iters=10
MAE: 2.0164
Entrenando con factores=5, lr=0.0001, reg=0.05, iters=15
MAE: 1.9668
Entrenando con factores=5, lr=0.0001, reg=0.05, iters=20
MAE: 1.9236
Entrenando con factores=5, lr=0.0001, reg=0.05, iters=25
MAE: 1.8947
Entrenando con factores=5, lr=0.0001, reg=0.05, iters=30
MAE: 1.8640
Entrenando con factores=5, lr=0.0001, reg=0.1, iters=10
MAE: 2.0092
Entrenando con factores=5, lr=0.0001, reg=0.1, iters=15
MAE: 1.9637
Entrenando con factores=5, lr=0.0001, reg=0.1, iters=20
MAE: 1.9251
Entrenando con factores=5, lr=0.0001, reg=0.1, iters=25
MAE: 1.8921
Entrenando con factores=5, lr=0.0

In [None]:
# Indexado de usuarios e items
user_ids = sorted(set(df_train['user']).union(df_test['user']))
item_ids = sorted(set(df_train['item']).union(df_test['item']))

user_to_index = {uid: idx for idx, uid in enumerate(user_ids)}
item_to_index = {iid: idx for idx, iid in enumerate(item_ids)}

NUM_USERS = len(user_ids)
NUM_ITEMS = len(item_ids)

# Diccionarios para ratings
ratings = {}
user_items = defaultdict(set)

for _, row in df_train.iterrows():
    u = user_to_index[row['user']]
    i = item_to_index[row['item']]
    r = row['rating']
    ratings[(u, i)] = r
    user_items[u].add(i)

# Media de las valoraciones
rating_average = sum(ratings.values()) / len(ratings)

# Inicialización aleatoria
p = np.random.rand(NUM_USERS, NUM_FACTORS)
q = np.random.rand(NUM_ITEMS, NUM_FACTORS)
bu = np.random.rand(NUM_USERS)
bi = np.random.rand(NUM_ITEMS)

# Predicción con sesgos
def compute_biased_prediction(avg, b_u, b_i, p_u, q_i):
    return avg + b_u + b_i + np.dot(p_u, q_i)

# Entrenamiento
for it in range(NUM_ITERATIONS):
    print(f"Iteración {it+1} de {NUM_ITERATIONS}")

    for (u, i), rating in ratings.items():
        pred = compute_biased_prediction(rating_average, bu[u], bi[i], p[u], q[i])
        error = rating - pred

        # Actualización de parámetros
        bu[u] += LEARNING_RATE * (error - REGULARIZATION * bu[u])
        bi[i] += LEARNING_RATE * (error - REGULARIZATION * bi[i])

        p[u] += LEARNING_RATE * (error * q[i] - REGULARIZATION * p[u])
        q[i] += LEARNING_RATE * (error * p[u] - REGULARIZATION * q[i])

# Generar predicciones para test
output_rows = []

for _, row in df_test.iterrows():
    test_id = row['ID']
    u_raw = row['user']
    i_raw = row['item']

    u = user_to_index.get(u_raw)
    i = item_to_index.get(i_raw)

    if u is not None and i is not None:
        pred = compute_biased_prediction(rating_average, bu[u], bi[i], p[u], q[i])
        pred = round(max(MIN_RATING, min(MAX_RATING, pred)), 3)
    else:
        pred = 8.0  # Valor neutro si no hay datos

    output_rows.append((test_id, pred))

# Guardar CSV
with open("predicciones_PMF_10.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["ID", "rating"])
    writer.writerows(output_rows)

print("Archivo generado correctamente.")


Iteración 1 de 10
Iteración 2 de 10
Iteración 3 de 10
Iteración 4 de 10
Iteración 5 de 10
Iteración 6 de 10
Iteración 7 de 10
Iteración 8 de 10
Iteración 9 de 10
Iteración 10 de 10
Archivo 'predicciones.csv' generado correctamente.
