In [None]:
import pandas as pd
import pandas as pd
import numpy as np
import random
import csv
from collections import defaultdict

In [None]:
# Load train and test CSV files into DataFrames
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

df_train.sample(10)

Unnamed: 0,user,item,rating
51559,6236,129400,7.0
70953,9196,163891,7.0
24716,2684,79945,10.0
254963,48311,88661,6.0
241717,45223,21163,8.0
286172,54670,643,7.0
66679,8512,81893,5.0
63640,7906,151324,8.0
5992,534,39933,8.0
366665,72282,177443,2.0


In [None]:
df_test.sample(10)

Unnamed: 0,ID,user,item,rating
16683,16683,839,4644,5.0
11930,11930,4201,13354,10.0
17823,17823,534,1413,10.0
33574,33574,6740,17917,7.0
32946,32946,10548,24539,7.0
15224,15224,1959,7458,5.0
24856,24856,1357,2004,8.0
4366,4366,3711,2295,7.0
14121,14121,6010,16711,6.0
21023,21023,2639,9061,10.0


In [None]:
rating_distribution = df_train['rating'].value_counts().sort_index()
print(rating_distribution)

rating
1.0      1592
2.0      2478
3.0      5423
4.0      8042
5.0     45231
6.0     33489
7.0     69102
8.0     93366
9.0     60507
10.0    71121
Name: count, dtype: int64


## PMF

In [None]:
# Parámetros del modelo
NUM_FACTORS = 7
LEARNING_RATE = 0.001
REGULARIZATION = 0.1
NUM_ITERATIONS = 10
MIN_RATING = 0
MAX_RATING = 10

### Hiperparámetros

In [3]:
from sklearn.model_selection import train_test_split

# Dividir datos de entrenamiento en train + validación
train_data, val_data = train_test_split(df_train, test_size=0.2, random_state=42)

In [7]:
def train_and_evaluate(train_df, val_df, num_factors, lr, reg, iterations):
    # Indexado
    user_ids = sorted(set(train_df['user']).union(val_df['user']))
    item_ids = sorted(set(train_df['item']).union(val_df['item']))
    user_to_index = {uid: idx for idx, uid in enumerate(user_ids)}
    item_to_index = {iid: idx for idx, iid in enumerate(item_ids)}
    num_users = len(user_ids)
    num_items = len(item_ids)

    ratings = {}
    for _, row in train_df.iterrows():
        u = user_to_index[row['user']]
        i = item_to_index[row['item']]
        ratings[(u, i)] = row['rating']

    rating_avg = sum(ratings.values()) / len(ratings)

    # Inicialización
    p = np.random.rand(num_users, num_factors)
    q = np.random.rand(num_items, num_factors)
    bu = np.random.rand(num_users)
    bi = np.random.rand(num_items)

    # Entrenamiento
    for _ in range(iterations):
        for (u, i), r in ratings.items():
            pred = rating_avg + bu[u] + bi[i] + np.dot(p[u], q[i])
            error = r - pred
            bu[u] += lr * (error - reg * bu[u])
            bi[i] += lr * (error - reg * bi[i])
            p[u] += lr * (error * q[i] - reg * p[u])
            q[i] += lr * (error * p[u] - reg * q[i])

    # Validación
    errors = []
    for _, row in val_df.iterrows():
        u = user_to_index.get(row['user'])
        i = item_to_index.get(row['item'])
        if u is not None and i is not None:
            pred = rating_avg + bu[u] + bi[i] + np.dot(p[u], q[i])
            pred = max(MIN_RATING, min(MAX_RATING, pred))
            errors.append(abs(row['rating'] - pred))
    return np.mean(errors)

In [None]:
param_grid = {
    "num_factors": [5, 10, 15, 20],
    "lr": [0.0001, 0.001, 0.01],
    "reg": [0.001, 0.05, 0.1, 0.5],
    "iterations": [10, 15, 20, 25, 30]
}

best_mae = float('inf')
best_config = None

for f in param_grid["num_factors"]:
    for lr in param_grid["lr"]:
        for reg in param_grid["reg"]:
            for iters in param_grid["iterations"]:
                print(f"Entrenando con factores={f}, lr={lr}, reg={reg}, iters={iters}")
                mae = train_and_evaluate(train_data, val_data, f, lr, reg, iters)
                print(f"MAE: {mae:.4f}")
                if mae < best_mae:
                    best_mae = mae
                    best_config = (f, lr, reg, iters)
    
print("\nMejores hiperparámetros encontrados:")
print(f"Factores: {best_config[0]}, LR: {best_config[1]}, Reg: {best_config[2]}, Iteraciones: {best_config[3]}")
print(f"MAE en validación: {best_mae:.4f}")

Entrenando con factores=5, lr=0.0001, reg=0.001, iters=10
MAE: 2.0154
Entrenando con factores=5, lr=0.0001, reg=0.001, iters=15
MAE: 1.9691
Entrenando con factores=5, lr=0.0001, reg=0.001, iters=20
MAE: 1.9297
Entrenando con factores=5, lr=0.0001, reg=0.001, iters=25
MAE: 1.8936
Entrenando con factores=5, lr=0.0001, reg=0.001, iters=30
MAE: 1.8702
Entrenando con factores=5, lr=0.0001, reg=0.05, iters=10
MAE: 2.0164
Entrenando con factores=5, lr=0.0001, reg=0.05, iters=15
MAE: 1.9668
Entrenando con factores=5, lr=0.0001, reg=0.05, iters=20
MAE: 1.9236
Entrenando con factores=5, lr=0.0001, reg=0.05, iters=25
MAE: 1.8947
Entrenando con factores=5, lr=0.0001, reg=0.05, iters=30
MAE: 1.8640
Entrenando con factores=5, lr=0.0001, reg=0.1, iters=10
MAE: 2.0092
Entrenando con factores=5, lr=0.0001, reg=0.1, iters=15
MAE: 1.9637
Entrenando con factores=5, lr=0.0001, reg=0.1, iters=20
MAE: 1.9251
Entrenando con factores=5, lr=0.0001, reg=0.1, iters=25
MAE: 1.8921
Entrenando con factores=5, lr=0.0

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ParameterGrid
from joblib import Parallel, delayed
import csv
from collections import defaultdict

# Cargar datos completos
df = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

# Dividir en entrenamiento y validación
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

# Indexado de usuarios e ítems
user_ids = sorted(set(df_train['user']).union(df_val['user']).union(df_test['user']))
item_ids = sorted(set(df_train['item']).union(df_val['item']).union(df_test['item']))

user_to_index = {uid: idx for idx, uid in enumerate(user_ids)}
item_to_index = {iid: idx for idx, iid in enumerate(item_ids)}

NUM_USERS = len(user_ids)
NUM_ITEMS = len(item_ids)

# Diccionarios para ratings
ratings = {}
user_items = defaultdict(set)

for _, row in df_train.iterrows():
    u = user_to_index[row['user']]
    i = item_to_index[row['item']]
    r = row['rating']
    ratings[(u, i)] = r
    user_items[u].add(i)

# Media de las valoraciones
rating_average = sum(ratings.values()) / len(ratings)

# Búsqueda de hiperparámetros
param_grid = {
    'NUM_FACTORS': [5, 10, 15, 20],
    'NUM_ITERATIONS': [10, 20, 30, 50],
    'LEARNING_RATE': [0.001, 0.005, 0.01],
    'REGULARIZATION': [0.01, 0.1, 0.5]
}

# Función para entrenar el modelo y calcular MAE
def train_and_evaluate(params):
    p = np.random.rand(NUM_USERS, params['NUM_FACTORS'])
    q = np.random.rand(NUM_ITEMS, params['NUM_FACTORS'])
    bu = np.random.rand(NUM_USERS)
    bi = np.random.rand(NUM_ITEMS)

    def compute_biased_prediction(avg, b_u, b_i, p_u, q_i):
        return avg + b_u + b_i + np.dot(p_u, q_i)

    # Entrenamiento
    for it in range(params['NUM_ITERATIONS']):
        for (u, i), rating in ratings.items():
            pred = compute_biased_prediction(rating_average, bu[u], bi[i], p[u], q[i])
            error = rating - pred

            bu[u] += params['LEARNING_RATE'] * (error - params['REGULARIZATION'] * bu[u])
            bi[i] += params['LEARNING_RATE'] * (error - params['REGULARIZATION'] * bi[i])

            p[u] += params['LEARNING_RATE'] * (error * q[i] - params['REGULARIZATION'] * p[u])
            q[i] += params['LEARNING_RATE'] * (error * p[u] - params['REGULARIZATION'] * q[i])

    # Evaluación con el conjunto de validación
    val_mae = []
    for _, row in df_val.iterrows():
        u_raw, i_raw, actual_rating = row['user'], row['item'], row['rating']
        u = user_to_index.get(u_raw)
        i = item_to_index.get(i_raw)
        if u is not None and i is not None:
            pred = compute_biased_prediction(rating_average, bu[u], bi[i], p[u], q[i])
            val_mae.append(abs(actual_rating - pred))

    mae = np.mean(val_mae) if val_mae else float('inf')
    print(f"Probando parámetros: {params} -> MAE: {mae}")
    return params, mae

# Paralelización de la búsqueda de hiperparámetros
results = Parallel(n_jobs=12, backend="loky")(delayed(train_and_evaluate)(params) for params in ParameterGrid(param_grid))

# Obtener los mejores parámetros
best_params, best_mae = min(results, key=lambda x: x[1])
print("Mejores hiperparámetros:", best_params)
print(f"Mejor MAE: {best_mae:.4f}")

Probando parámetros: {'LEARNING_RATE': 0.001, 'NUM_FACTORS': 5, 'NUM_ITERATIONS': 10, 'REGULARIZATION': 0.01} -> MAE: 1.7210371625398713
Probando parámetros: {'LEARNING_RATE': 0.001, 'NUM_FACTORS': 5, 'NUM_ITERATIONS': 10, 'REGULARIZATION': 0.1} -> MAE: 1.716158631243736
Probando parámetros: {'LEARNING_RATE': 0.001, 'NUM_FACTORS': 5, 'NUM_ITERATIONS': 10, 'REGULARIZATION': 0.5} -> MAE: 1.7020375498454894
Probando parámetros: {'LEARNING_RATE': 0.001, 'NUM_FACTORS': 5, 'NUM_ITERATIONS': 20, 'REGULARIZATION': 0.1} -> MAE: 1.6115365068247864
Probando parámetros: {'LEARNING_RATE': 0.001, 'NUM_FACTORS': 5, 'NUM_ITERATIONS': 20, 'REGULARIZATION': 0.01} -> MAE: 1.617581444700255
Probando parámetros: {'LEARNING_RATE': 0.001, 'NUM_FACTORS': 10, 'NUM_ITERATIONS': 10, 'REGULARIZATION': 0.01} -> MAE: 2.0442177182798154
Probando parámetros: {'LEARNING_RATE': 0.001, 'NUM_FACTORS': 5, 'NUM_ITERATIONS': 20, 'REGULARIZATION': 0.5} -> MAE: 1.599012222544092
Probando parámetros: {'LEARNING_RATE': 0.001, '

In [None]:
import pandas as pd
import numpy as np
import csv
from ray import tune
from ray.tune.search.bayesopt import BayesOptSearch
from collections import defaultdict

# Parámetros del modelo
# Parámetros del modelo predeterminados
NUM_FACTORS = 20
LEARNING_RATE = 0.005
REGULARIZATION = 0.01
NUM_ITERATIONS = 50
MIN_RATING = 0
MAX_RATING = 10

# Cargar datos
from sklearn.model_selection import train_test_split

df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

# División del conjunto de entrenamiento en train y validación
df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=42)

# Indexado de usuarios e ítems
user_ids = sorted(set(df_train['user']).union(df_test['user']))
item_ids = sorted(set(df_train['item']).union(df_test['item']))

user_to_index = {uid: idx for idx, uid in enumerate(user_ids)}
item_to_index = {iid: idx for idx, iid in enumerate(item_ids)}

NUM_USERS = len(user_ids)
NUM_ITEMS = len(item_ids)

# Diccionario de ratings
train_ratings = {}
user_items = defaultdict(set)

# Rellenar el diccionario de ratings
for _, row in df_train.iterrows():
    u = user_to_index[row['user']]
    i = item_to_index[row['item']]
    r = row['rating']
    train_ratings[(u, i)] = r
    user_items[u].add(i)

# Cálculo de la media de valoraciones
rating_average = sum(train_ratings.values()) / len(train_ratings)

# Inicialización aleatoria
p = np.random.rand(NUM_USERS, NUM_FACTORS)
q = np.random.rand(NUM_ITEMS, NUM_FACTORS)
bu = np.random.rand(NUM_USERS)
bi = np.random.rand(NUM_ITEMS)

# Predicción con sesgos
def compute_biased_prediction(avg, b_u, b_i, p_u, q_i):
    return avg + b_u + b_i + np.dot(p_u, q_i)


# Función de entrenamiento
def train_pmf(config):
    for it in range(NUM_ITERATIONS):
        for (u, i), rating in train_ratings.items():
            pred = compute_biased_prediction(rating_average, bu[u], bi[i], p[u], q[i])
            error = rating - pred

            # Actualización de parámetros
            bu[u] += config['learning_rate'] * (error - config['regularization'] * bu[u])
            bi[i] += config['learning_rate'] * (error - config['regularization'] * bi[i])
            p[u] += config['learning_rate'] * (error * q[i] - config['regularization'] * p[u])
            q[i] += config['learning_rate'] * (error * p[u] - config['regularization'] * q[i])

    # Evaluación en el conjunto de validación
    val_errors = []
    for _, row in df_val.iterrows():
        u = user_to_index.get(row['user'])
        i = item_to_index.get(row['item'])
        if u is not None and i is not None:
            pred = compute_biased_prediction(rating_average, bu[u], bi[i], p[u], q[i])
            val_errors.append(abs(row['rating'] - pred))

    mean_val_mae = np.mean(val_errors)
    tune.report({"score": mean_val_mae})

# Configuración de búsqueda
search_space = {
    'learning_rate': tune.loguniform(0.001, 0.01),
    'regularization': tune.loguniform(0.001, 0.1),
    'num_factors': tune.uniform(5, 20),
    'num_iterations': tune.uniform(20, 100)
}

# Ejecución de la búsqueda con Tuner
tuner = tune.Tuner(
    train_pmf,
    param_space=search_space,
    tune_config=tune.TuneConfig(
        metric="score",
        mode="min",
        num_samples=50,
        max_concurrent_trials=1,
        search_alg=BayesOptSearch()
    ),
    run_config=tune.RunConfig(
        storage_path="/home/corti/RECSYS/Practica 1/tune"
    )
)
results = tuner.fit()

# Guardar mejores resultados
print("Mejores hiperparámetros encontrados:")
best_result = results.get_best_result()
print(best_result.config)

# Generación de predicciones para el conjunto de prueba
output_rows = []
for _, row in df_test.iterrows():
    test_id = row['ID']
    u_raw = row['user']
    i_raw = row['item']
    u = user_to_index.get(u_raw)
    i = item_to_index.get(i_raw)
    if u is not None and i is not None:
        pred = compute_biased_prediction(rating_average, bu[u], bi[i], p[u], q[i])
        pred = round(max(MIN_RATING, min(MAX_RATING, pred)), 3)
    else:
        pred = 8.0
    output_rows.append((test_id, pred))

# Guardar predicciones en CSV
filename = "/home/corti/RECSYS/Practica 1/tune/predicciones_PMF_raytune.csv"
with open(filename, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["ID", "rating"])
    writer.writerows(output_rows)

print(f"Archivo '{filename}' generado correctamente.")

0,1
Current time:,2025-03-29 13:57:09
Running for:,00:01:17.46
Memory:,9.8/15.6 GiB

Trial name,status,loc,learning_rate,num_factors,num_iterations,regularization
train_pmf_85559104,RUNNING,172.30.202.84:215805,0.00437086,19.2607,78.5595,0.0602672


### Ejecución

In [None]:
#'LEARNING_RATE': 0.005, 'NUM_FACTORS': 20, 'NUM_ITERATIONS': 50, 'REGULARIZATION': 0.01

# Parámetros del modelo
NUM_FACTORS = 20
LEARNING_RATE = 0.005
REGULARIZATION = 0.01
NUM_ITERATIONS = 50
MIN_RATING = 0
MAX_RATING = 10

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Indexado de usuarios e items
user_ids = sorted(set(df_train['user']).union(df_test['user']))
item_ids = sorted(set(df_train['item']).union(df_test['item']))

user_to_index = {uid: idx for idx, uid in enumerate(user_ids)}
item_to_index = {iid: idx for idx, iid in enumerate(item_ids)}

NUM_USERS = len(user_ids)
NUM_ITEMS = len(item_ids)

# Diccionarios para ratings
ratings = {}
user_items = defaultdict(set)

for _, row in df_train.iterrows():
    u = user_to_index[row['user']]
    i = item_to_index[row['item']]
    r = row['rating']
    ratings[(u, i)] = r
    user_items[u].add(i)

# Media de las valoraciones
rating_average = sum(ratings.values()) / len(ratings)

# Inicialización aleatoria
p = np.random.rand(NUM_USERS, NUM_FACTORS)
q = np.random.rand(NUM_ITEMS, NUM_FACTORS)
bu = np.random.rand(NUM_USERS)
bi = np.random.rand(NUM_ITEMS)

# Predicción con sesgos
def compute_biased_prediction(avg, b_u, b_i, p_u, q_i):
    return avg + b_u + b_i + np.dot(p_u, q_i)

# Entrenamiento
for it in range(NUM_ITERATIONS):
    print(f"Iteración {it+1} de {NUM_ITERATIONS}")

    for (u, i), rating in ratings.items():
        pred = compute_biased_prediction(rating_average, bu[u], bi[i], p[u], q[i])
        error = rating - pred

        # Actualización de parámetros
        bu[u] += LEARNING_RATE * (error - REGULARIZATION * bu[u])
        bi[i] += LEARNING_RATE * (error - REGULARIZATION * bi[i])

        p[u] += LEARNING_RATE * (error * q[i] - REGULARIZATION * p[u])
        q[i] += LEARNING_RATE * (error * p[u] - REGULARIZATION * q[i])

# Generar predicciones para test
output_rows = []

for _, row in df_test.iterrows():
    test_id = row['ID']
    u_raw = row['user']
    i_raw = row['item']

    u = user_to_index.get(u_raw)
    i = item_to_index.get(i_raw)

    if u is not None and i is not None:
        pred = compute_biased_prediction(rating_average, bu[u], bi[i], p[u], q[i])
        pred = round(max(MIN_RATING, min(MAX_RATING, pred)), 3)
    else:
        pred = 8.0  # Valor neutro si no hay datos

    output_rows.append((test_id, pred))

# Guardar CSV
with open("predicciones_PMF_20_0.005_0.01_50.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["ID", "rating"])
    writer.writerows(output_rows)

print("Archivo generado correctamente.")