In [None]:
import pandas as pd
import numpy as np
import random
from collections import defaultdict
from scipy.special import digamma
import csv

import pandas as pd
from sklearn.model_selection import train_test_split

# Configuration
# NOTE(review): none of the constants below are referenced by the cells visible
# in this section; presumably they parameterise a factorization model defined
# elsewhere in the notebook -- confirm before changing or removing them.
NUM_FACTORS = 5
ALPHA = 0.8
BETA = 5
R = 4
# Same bounds as the Reader(rating_scale=(0, 10)) used by the Surprise cells.
MIN_RATING = 0
MAX_RATING = 10
NUM_ITERATIONS = 50

# Load the train/test CSV files, resolved relative to the current working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')


In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.trainset import Trainset
import csv

# https://surpriselib.com/

# Load the training ratings and wrap them in a Surprise dataset.
df_train = pd.read_csv("train.csv")
reader = Reader(rating_scale=(0, 10))  # must match the rating scale of train.csv
data = Dataset.load_from_df(df_train[['user', 'item', 'rating']], reader)
trainset: Trainset = data.build_full_trainset()

# Train the SVD model on every available rating.
algo = SVD(n_factors=50, n_epochs=20, reg_all=0.02, verbose=True)
algo.fit(trainset)

# Load the (ID, user, item) rows that need a prediction.
df_test = pd.read_csv("test.csv")

# Raw-id -> inner-id lookup tables. These are private Surprise attributes and
# are not used below; they are kept only so the names stay defined for any
# other cell that may rely on them.
user_inner_id = trainset._raw2inner_id_users
item_inner_id = trainset._raw2inner_id_items

# Predict one rating per test row.
# The raw ids are passed exactly as pandas parsed them. Dataset.load_from_df
# stores ids with the type they have in the DataFrame, so casting them to str
# here would make every numeric id look unknown to the model and the
# prediction would silently fall back to the global mean.
output_rows = []
for test_id, user, item in zip(df_test['ID'], df_test['user'], df_test['item']):
    pred = algo.predict(user, item).est
    # Surprise handles unseen users/items itself; clamp and round for the CSV.
    pred = round(max(0, min(10, pred)), 3)
    output_rows.append((test_id, pred))

# Write the predictions.
with open("predicciones_SVD_50_20_0.02.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["ID", "rating"])
    writer.writerows(output_rows)

print("Archivo generado correctamente.")


Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Archivo generado correctamente.


In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV
import csv

# Load the training ratings.
df_train = pd.read_csv("train.csv")

# Wrap them in a Surprise dataset.
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(df_train[['user', 'item', 'rating']], reader)

# Hyper-parameter grid (3 * 2 * 3 = 18 combinations).
param_grid = {
    'n_factors': [20, 50, 100],
    'n_epochs': [10, 20],
    'reg_all': [0.02, 0.05, 0.1]
}

# Exhaustive search with 3-fold cross-validation, scored by MAE.
gs = GridSearchCV(SVD, param_grid, measures=['mae'], cv=3, joblib_verbose=1, n_jobs=-1)
gs.fit(data)

# Report the winning combination.
print("Mejores hiperparámetros:")
print(gs.best_params['mae'])
print(f"Mejor MAE: {gs.best_score['mae']:.4f}")

# Take the estimator configured with the best parameters and fit it on the
# full training set before predicting.
best_algo = gs.best_estimator['mae']
trainset = data.build_full_trainset()
best_algo.fit(trainset)

# Load the (ID, user, item) rows that need a prediction.
df_test = pd.read_csv("test.csv")

# Predict one rating per test row.
# The raw ids are passed exactly as pandas parsed them. Dataset.load_from_df
# stores ids with the type they have in the DataFrame, so casting them to str
# here would make every numeric id look unknown to the model and the
# prediction would silently fall back to the global mean.
output_rows = []
for test_id, user, item in zip(df_test['ID'], df_test['user'], df_test['item']):
    pred = best_algo.predict(user, item).est
    pred = round(max(0, min(10, pred)), 3)
    output_rows.append((test_id, pred))

# Write the predictions.
filename = "predicciones_SVD_gridsearch.csv"
with open(filename, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["ID", "rating"])
    writer.writerows(output_rows)

print(f"Archivo '{filename}' generado correctamente.")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  4.7min finished


Mejores hiperparámetros:
{'n_factors': 20, 'n_epochs': 20, 'reg_all': 0.02}
Mejor MAE: 1.2706
Archivo 'predicciones_SVD_gridsearch.csv' generado correctamente.


In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
from ray.tune.search.bayesopt import BayesOptSearch

from ray import tune
import csv
import ray
from math import ceil
from ray._private.utils import get_system_memory, get_used_memory

# Helper that estimates how much more memory would have to be allocated for
# system memory usage to reach a target fraction.
def get_additional_bytes_to_reach_memory_usage_pct(pct: float) -> int:
    """Return the amount of memory missing to reach ``pct`` of system memory.

    The value is ``int(total * pct) - used``, where ``used`` and ``total`` come
    from Ray's private ``get_used_memory`` / ``get_system_memory`` helpers
    (presumably both expressed in bytes -- confirm against the Ray version in use).

    Args:
        pct: Target usage as a fraction of total system memory (0.6 means 60%).

    Returns:
        The strictly positive difference between the target and current usage.

    Raises:
        AssertionError: If current usage already meets or exceeds the target.
    """
    currently_used = get_used_memory()
    target = int(get_system_memory() * pct)
    missing = target - currently_used
    assert (
        missing > 0
    ), "memory usage is already above the target. Increase the target percentage."
    return missing

@ray.remote
class MemoryHogger:
    """Ray actor whose only job is to allocate memory and keep it alive."""

    def __init__(self):
        # Every list created by allocate() is stored here, so the memory stays
        # referenced for as long as the actor exists.
        self.allocations = []

    def allocate(self, bytes_to_allocate: float) -> None:
        """Allocate and retain a list of roughly ``bytes_to_allocate`` bytes.

        The list length is ``ceil(bytes_to_allocate / 8)``, i.e. one slot per
        8 bytes (presumably the size of a list slot on a 64-bit build -- the
        real footprint is only approximate).
        """
        slot_count = ceil(bytes_to_allocate / 8)
        self.allocations.append([0] * slot_count)

# No memory is pre-allocated before tuning. Spawning a MemoryHogger actor and
# filling RAM up to 60% does not protect against overload: it only takes memory
# away from the concurrent trials (making Ray's out-of-memory killer more likely
# to fire) and aborts the cell with an AssertionError whenever usage is already
# above the target.

# Load the training ratings.
df_train = pd.read_csv('../data/train.csv')

# Wrap them in a Surprise dataset.
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(df_train[['user', 'item', 'rating']], reader)

# Trainable evaluated by Ray Tune for each sampled configuration.
def train_svd(config):
    """Score one SVD configuration with 3-fold cross-validation and report it.

    Args:
        config: Mapping with the sampled hyper-parameters ``n_factors``,
            ``n_epochs`` and ``reg_all``. The first two are sampled as floats
            and truncated with ``int()`` because SVD expects integers.

    Reports:
        ``score``: mean test MAE over the 3 folds (lower is better).
        ``iter``: always 1; the key is kept so the result columns stay the same.
    """
    algo = SVD(
        n_factors=int(config['n_factors']),
        n_epochs=int(config['n_epochs']),
        reg_all=config['reg_all'],
    )

    # cross_validate fits the algorithm on each training fold itself, so no
    # separate fit on the full trainset is needed before scoring.
    cv_results = cross_validate(algo, data, measures=['mae'], cv=3, verbose=False)
    mean_mae = cv_results['test_mae'].mean()

    # The score is a single number per configuration, so report it once
    # instead of repeating the same value over several fake iterations.
    tune.report({"score": mean_mae, "iter": 1})

# Directory used both for Ray Tune's experiment state and for the output CSV.
TUNE_DIR = "/home/corti/RECSYS/Practica 1/tune"

# Search space. BayesOptSearch works on continuous ranges, so n_factors and
# n_epochs are sampled as floats and truncated with int() where they are used.
search_space = {
    'n_factors': tune.uniform(20, 100),
    'n_epochs': tune.uniform(10, 20),
    'reg_all': tune.loguniform(0.01, 0.1)
}

# Run the Bayesian-optimisation search, minimising the reported "score" (MAE).
tuner = tune.Tuner(
    train_svd,
    param_space=search_space,
    tune_config=tune.TuneConfig(
        metric="score",
        mode="min",
        num_samples=50,
        max_concurrent_trials=10,
        search_alg=BayesOptSearch()
    ),
    run_config=tune.RunConfig(
        storage_path=TUNE_DIR
    )
)
results = tuner.fit()

print("Mejores hiperparámetros encontrados:")
best_result = results.get_best_result()
print(best_result.config)

# Train the final model with the best configuration, using the same int()
# truncation as train_svd so the model matches what was actually evaluated.
best_config = best_result.config
best_algo = SVD(
    n_factors=int(best_config['n_factors']),
    n_epochs=int(best_config['n_epochs']),
    reg_all=best_config['reg_all'],
)
trainset = data.build_full_trainset()
best_algo.fit(trainset)

# Load the (ID, user, item) rows that need a prediction.
df_test = pd.read_csv('../data/test.csv')

# Predict one rating per test row.
# The raw ids are passed exactly as pandas parsed them. Dataset.load_from_df
# stores ids with the type they have in the DataFrame, so casting them to str
# here would make every numeric id look unknown to the model and the
# prediction would silently fall back to the global mean.
output_rows = []
for test_id, user, item in zip(df_test['ID'], df_test['user'], df_test['item']):
    pred = best_algo.predict(user, item).est
    pred = round(max(0, min(10, pred)), 3)
    output_rows.append((test_id, pred))

# Write the predictions next to the tuning results.
filename = f"{TUNE_DIR}/predicciones_SVD_raytune.csv"
with open(filename, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["ID", "rating"])
    writer.writerows(output_rows)

print(f"Archivo '{filename}' generado correctamente.")

0,1
Current time:,2025-03-29 10:33:38
Running for:,00:03:34.59
Memory:,8.1/15.6 GiB

Trial name,status,loc,n_epochs,n_factors,reg_all,iter,total time (s),score,iter.1
train_svd_5a964a0b,TERMINATED,172.30.202.84:47269,13.7454,96.0571,0.0758795,10,28.7146,1.28743,10
train_svd_79aaf149,TERMINATED,172.30.202.84:47331,15.9866,32.4815,0.0240395,10,23.9558,1.27799,10
train_svd_a33a6f9a,TERMINATED,172.30.202.84:47415,10.5808,89.2941,0.0641004,10,25.5586,1.29386,10
train_svd_eee2839c,TERMINATED,172.30.202.84:47490,17.0807,21.6468,0.0972919,10,25.7394,1.27863,10
train_svd_b5e9ea54,TERMINATED,172.30.202.84:47561,18.3244,36.9871,0.0263642,10,28.7697,1.27462,10
train_svd_791cd651,TERMINATED,172.30.202.84:47647,11.834,44.3394,0.0572281,10,24.098,1.28895,10
train_svd_075ff1db,TERMINATED,172.30.202.84:47719,14.3195,43.2983,0.0650668,10,26.226,1.28236,10
train_svd_5b6d004f,TERMINATED,172.30.202.84:47807,11.3949,43.3716,0.0429726,10,23.5327,1.28743,10
train_svd_7ddbe451,TERMINATED,172.30.202.84:47885,14.5607,82.8141,0.0279706,10,27.851,1.28247,10
train_svd_d470785e,TERMINATED,172.30.202.84:47957,15.1423,67.3932,0.0141805,10,26.1871,1.27974,10


2025-03-29 10:33:38,313	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/corti/RECSYS/Practica 1/tune/train_svd_2025-03-29_10-30-03' in 0.0123s.
2025-03-29 10:33:38,326	INFO tune.py:1041 -- Total run time: 214.82 seconds (214.57 seconds for the tuning loop).


Mejores hiperparámetros encontrados:
{'n_factors': 40.86358647049009, 'n_epochs': 19.931167118593613, 'reg_all': 0.016259415607418847}
Archivo '/home/corti/RECSYS/Practica 1/tune/predicciones_SVD_raytune.csv' generado correctamente.
