In [3]:
import pandas as pd
import pandas as pd
import numpy as np
import csv
from collections import defaultdict

In [4]:
# Load train and test CSV files into DataFrames
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

train_df.sample(10)

Unnamed: 0,user,item,rating
55256,6872,56791,7.0
240566,45021,125336,10.0
351376,69193,106285,7.0
223671,41541,27402,10.0
194524,34943,5003,7.0
253246,47854,130651,6.0
935,234,31787,6.0
205520,37353,31684,2.0
210982,38502,87862,6.0
309271,59837,2272,6.0


In [6]:
test_df.sample(10)

Unnamed: 0,ID,user,item
27267,27267,11280,25497
1884,1884,1777,7223
23175,23175,2777,9534
4972,4972,534,2799
18607,18607,6256,17175
7151,7151,1595,1111
30277,30277,10056,23647
17123,17123,6804,4710
20846,20846,10848,24872
29035,29035,7815,1009


In [7]:
rating_distribution = train_df['rating'].value_counts().sort_index()
print(rating_distribution)

rating
1.0      1592
2.0      2478
3.0      5423
4.0      8042
5.0     45231
6.0     33489
7.0     69102
8.0     93366
9.0     60507
10.0    71121
Name: count, dtype: int64


## KNN

In [3]:
# Leer CSVs
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [4]:
import pandas as pd
import numpy as np
import math
import csv
from collections import defaultdict

# Configuración
K = 15
MIN_RATING = 0
MAX_RATING = 10

# Mapeo de IDs
user_ids = sorted(set(df_train['user']).union(df_test['user']))
item_ids = sorted(set(df_train['item']).union(df_test['item']))
user_to_index = {uid: idx for idx, uid in enumerate(user_ids)}
item_to_index = {iid: idx for idx, iid in enumerate(item_ids)}

# Número total
NUM_USERS = len(user_ids)
NUM_ITEMS = len(item_ids)

# Matriz dispersa: (u, i) -> rating
ratings = {}
user_items = defaultdict(set)
item_users = defaultdict(set)

for _, row in df_train.iterrows():
    u = user_to_index[row['user']]
    i = item_to_index[row['item']]
    r = row['rating']
    ratings[(u, i)] = r
    user_items[u].add(i)
    item_users[i].add(u)

# Media de un usuario
def rating_average(u):
    votos = [ratings[(u, i)] for i in user_items[u]]
    return sum(votos) / len(votos) if votos else 0

# Similaridad JMSD
def jmsd_similarity(u, v):
    items_u = user_items[u]
    items_v = user_items[v]
    intersection = items_u & items_v
    union = items_u | items_v

    if not intersection:
        return None

    diff = 0
    for i in intersection:
        ru = (ratings[(u, i)] - MIN_RATING) / (MAX_RATING - MIN_RATING)
        rv = (ratings[(v, i)] - MIN_RATING) / (MAX_RATING - MIN_RATING)
        diff += (ru - rv) ** 2

    jaccard = len(intersection) / len(union)
    msd = diff / len(intersection)
    return jaccard * (1 - msd)

# Obtener K vecinos
def get_neighbors(u, i):
    neighbors = []
    for v in item_users[i]:  # solo usuarios que han votado el ítem
        if u == v:
            continue
        sim = jmsd_similarity(u, v)
        if sim is not None:
            neighbors.append((v, sim))
    neighbors.sort(key=lambda x: x[1], reverse=True)
    return neighbors[:K]

# Agregación: media simple
def average_prediction(u, i, neighbors):
    ratings_sum = 0
    count = 0
    for v, _ in neighbors:
        if (v, i) in ratings:
            ratings_sum += ratings[(v, i)]
            count += 1
    return ratings_sum / count if count > 0 else None

# Predicciones para test
output_rows = []

for _, row in df_test.iterrows():
    test_id = row['ID']
    u_raw = row['user']
    i_raw = row['item']

    u = user_to_index.get(u_raw)
    i = item_to_index.get(i_raw)

    if u is not None and i is not None:
        neighbors = get_neighbors(u, i)
        pred = average_prediction(u, i, neighbors)
        pred = round(max(MIN_RATING, min(MAX_RATING, pred)), 3) if pred is not None else 8.0
    else:
        pred = 8.0  # Valor por defecto si no se puede calcular

    output_rows.append((test_id, pred))

# Guardar resultados
with open("predicciones_KNN_sparse.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["ID", "rating"])
    writer.writerows(output_rows)

print("Archivo 'predicciones_KNN_sparse.csv' generado correctamente.")

Archivo 'predicciones_KNN_sparse.csv' generado correctamente.


### Hiperparámetros

In [1]:
import pandas as pd
import numpy as np
import math
from collections import defaultdict
from sklearn.model_selection import train_test_split

# Configuración
MIN_RATING = 0
MAX_RATING = 10
K_values = [5, 10, 15, 20, 25, 30, 35, 40]  # valores de K a probar

# Leer dataset original
df = pd.read_csv("train.csv")
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

# Preprocesado común
user_ids = sorted(set(df_train['user']).union(df_val['user']))
item_ids = sorted(set(df_train['item']).union(df_val['item']))
user_to_index = {uid: idx for idx, uid in enumerate(user_ids)}
item_to_index = {iid: idx for idx, iid in enumerate(item_ids)}
NUM_USERS = len(user_ids)
NUM_ITEMS = len(item_ids)

# Convertir a estructuras dispersas
ratings = {}
user_items = defaultdict(set)
item_users = defaultdict(set)

for _, row in df_train.iterrows():
    u = user_to_index[row['user']]
    i = item_to_index[row['item']]
    r = row['rating']
    ratings[(u, i)] = r
    user_items[u].add(i)
    item_users[i].add(u)

# Función: media de un usuario
def rating_average(u):
    votos = [ratings[(u, i)] for i in user_items[u]]
    return sum(votos) / len(votos) if votos else 0

# Similaridad JMSD
def jmsd_similarity(u, v):
    items_u = user_items[u]
    items_v = user_items[v]
    intersection = items_u & items_v
    union = items_u | items_v
    if not intersection:
        return None
    diff = 0
    for i in intersection:
        ru = (ratings[(u, i)] - MIN_RATING) / (MAX_RATING - MIN_RATING)
        rv = (ratings[(v, i)] - MIN_RATING) / (MAX_RATING - MIN_RATING)
        diff += (ru - rv) ** 2
    jaccard = len(intersection) / len(union)
    msd = diff / len(intersection)
    return jaccard * (1 - msd)

# Obtener K vecinos
def get_neighbors(u, i, K):
    neighbors = []
    for v in item_users[i]:
        if u == v:
            continue
        sim = jmsd_similarity(u, v)
        if sim is not None:
            neighbors.append((v, sim))
    neighbors.sort(key=lambda x: x[1], reverse=True)
    return neighbors[:K]

# Agregación: media simple
def average_prediction(u, i, neighbors):
    ratings_sum = 0
    count = 0
    for v, _ in neighbors:
        if (v, i) in ratings:
            ratings_sum += ratings[(v, i)]
            count += 1
    return ratings_sum / count if count > 0 else None

# Evaluación MAE en validación
def evaluate_mae(K):
    errors = []
    for _, row in df_val.iterrows():
        u = user_to_index.get(row['user'])
        i = item_to_index.get(row['item'])
        true_rating = row['rating']
        if u is not None and i is not None:
            neighbors = get_neighbors(u, i, K)
            pred = average_prediction(u, i, neighbors)
            if pred is not None:
                pred = max(MIN_RATING, min(MAX_RATING, pred))
                errors.append(abs(true_rating - pred))
    return np.mean(errors) if errors else None

# Búsqueda de hiperparámetros
best_k = None
best_mae = float("inf")

print("Iniciando búsqueda de hiperparámetros...\n")
for k in K_values:
    print(f"Evaluando K = {k}")
    mae = evaluate_mae(k)
    print(f"MAE con K = {k}: {mae:.4f}")
    if mae < best_mae:
        best_mae = mae
        best_k = k

print("\nMejor valor encontrado:")
print(f"K = {best_k} con MAE = {best_mae:.4f}")

Iniciando búsqueda de hiperparámetros...

Evaluando K = 5
MAE con K = 5: 1.5940
Evaluando K = 10
MAE con K = 10: 1.5844
Evaluando K = 15
MAE con K = 15: 1.5837
Evaluando K = 20
MAE con K = 20: 1.5839
Evaluando K = 25
MAE con K = 25: 1.5845
Evaluando K = 30
MAE con K = 30: 1.5848
Evaluando K = 35
MAE con K = 35: 1.5851
Evaluando K = 40
MAE con K = 40: 1.5852

Mejor valor encontrado:
K = 15 con MAE = 1.5837


**KNNBaseline** librería

In [None]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.prediction_algorithms.knns import KNNBaseline
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split

import csv

# Cargar datos
df_train = pd.read_csv("train.csv")

# Convertir a formato surprise
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(df_train[['user', 'item', 'rating']], reader)

# Dividir en entrenamiento y validación (80/20)
trainset, valset = train_test_split(data, test_size=0.2, random_state=42)

# Definir rejilla de hiperparámetros
param_grid = {
    'k': [20, 25, 30, 35],                      # Número de vecinos
    'min_k': [5, 10],                           # Número mínimo de vecinos
    'sim_options': {
        'name': ['pearson_baseline', 'cosine'],   # Similaridad
        'user_based': [True, False]             # Comparar usuarios o ítems
    }
}

# Búsqueda de hiperparámetros con GridSearchCV
print("Iniciando búsqueda de hiperparámetros...")
gs = GridSearchCV(KNNBaseline, param_grid, measures=['mae'], cv=3, joblib_verbose=1, n_jobs=10)
gs.fit(data)

# Mostrar mejores hiperparámetros
print("\nMejores hiperparámetros encontrados:")
print(gs.best_params['mae'])
print(f"Mejor MAE: {gs.best_score['mae']:.4f}")

# Entrenar modelo con mejores parámetros
print("\nEntrenando el modelo final con los mejores parámetros...")
best_algo = gs.best_estimator['mae']
trainset = data.build_full_trainset()
best_algo.fit(trainset)

# Cargar datos de test
df_test = pd.read_csv("test.csv")

# Guardar MAE por iteración
mae_per_iteration = []

# Generar predicciones y calcular MAE
print("\nGenerando predicciones y calculando MAE...")
output_rows = []
for _, row in df_test.iterrows():
    test_id = row['ID']
    user = str(row['user'])
    item = str(row['item'])

    # Obtener predicción
    pred = best_algo.predict(user, item)
    est = round(max(0, min(10, pred.est)), 3)

    # Calcular MAE acumulado
    mae_per_iteration.append(abs(pred.r_ui - est) if pred.r_ui is not None else 0)

    output_rows.append((test_id, est))

    # Mostrar progreso cada 1000 predicciones
    if len(output_rows) % 1000 == 0:
        current_mae = sum(mae_per_iteration) / len(mae_per_iteration)
        print(f"Predicción #{len(output_rows)} - MAE acumulado: {current_mae:.4f}")

# Guardar predicciones en CSV
filename = "predicciones_KNNBaseline.csv"
with open(filename, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["ID", "rating"])
    writer.writerows(output_rows)

print(f"\nArchivo '{filename}' generado correctamente.")

# Calcular MAE final
final_mae = sum(mae_per_iteration) / len(mae_per_iteration)
print(f"\nMAE final en test: {final_mae:.4f}")

Iniciando búsqueda de hiperparámetros...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}