In [7]:
import pandas as pd
from sklearn.decomposition import NMF

In [None]:
# Load train and test CSV files into DataFrames
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

# Preview 10 random training rows. The fixed random_state makes the preview
# identical after a kernel restart instead of changing on every run.
train_df.sample(10, random_state=42)

Unnamed: 0,user,item,rating
51559,6236,129400,7.0
70953,9196,163891,7.0
24716,2684,79945,10.0
254963,48311,88661,6.0
241717,45223,21163,8.0
286172,54670,643,7.0
66679,8512,81893,5.0
63640,7906,151324,8.0
5992,534,39933,8.0
366665,72282,177443,2.0


In [5]:
# Preview 10 random test rows; seeded so the displayed sample is reproducible.
test_df.sample(10, random_state=42)

Unnamed: 0,ID,user,item,rating
16683,16683,839,4644,5.0
11930,11930,4201,13354,10.0
17823,17823,534,1413,10.0
33574,33574,6740,17917,7.0
32946,32946,10548,24539,7.0
15224,15224,1959,7458,5.0
24856,24856,1357,2004,8.0
4366,4366,3711,2295,7.0
14121,14121,6010,16711,6.0
21023,21023,2639,9061,10.0


In [3]:
# Number of training rows per rating value, ordered by the rating itself
# (value_counts() alone would order by frequency).
rating_distribution = (
    train_df['rating']
    .value_counts()
    .sort_index()
)
print(rating_distribution)

rating
1.0      1592
2.0      2478
3.0      5423
4.0      8042
5.0     45231
6.0     33489
7.0     69102
8.0     93366
9.0     60507
10.0    71121
Name: count, dtype: int64


## NMF

In [20]:
import pandas as pd
import numpy as np
import csv
from collections import defaultdict

# Model hyperparameters
NUM_FACTORS = 5
NUM_ITERATIONS = 50
SEED = 42  # fixes the random factor initialisation so re-runs give the same model

# Read the CSVs
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

# Map raw user/item ids -> consecutive indices.
# Only ids that occur in TRAIN are indexed. A user/item that appears only in
# test has no ratings to learn from: the multiplicative update would multiply
# its factors by 0 / 1e-10 == 0 and every prediction for it would be 0.
# Leaving such ids out of the maps makes `.get()` return None for them, so the
# prediction step can fall back to its neutral default instead.
user_ids = sorted(set(df_train['user']))
item_ids = sorted(set(df_train['item']))

user_to_index = {uid: idx for idx, uid in enumerate(user_ids)}
item_to_index = {iid: idx for idx, iid in enumerate(item_ids)}

NUM_USERS = len(user_ids)
NUM_ITEMS = len(item_ids)

# Sparse rating store: (u_idx, i_idx) -> rating, plus adjacency sets
ratings = {}
user_items = defaultdict(set)   # u_idx -> item indices rated by that user
item_users = defaultdict(set)   # i_idx -> user indices that rated that item

# Iterate the columns directly: much cheaper than DataFrame.iterrows() and
# keeps the integer ids as integers (iterrows() does not preserve dtypes).
for user, item, rating in zip(df_train['user'], df_train['item'], df_train['rating']):
    u = user_to_index[user]
    i = item_to_index[item]
    ratings[(u, i)] = rating    # a repeated (user, item) pair keeps its last rating
    user_items[u].add(i)
    item_users[i].add(u)

# Random non-negative initialisation of the factor matrices (values in [0, 1))
np.random.seed(SEED)
w = np.random.rand(NUM_USERS, NUM_FACTORS)
h = np.random.rand(NUM_ITEMS, NUM_FACTORS)

# Predicted rating r_ui = inner product of the factor rows w[u] and h[i]
def compute_prediction(u, i):
    """Return the dot product of user row ``w[u]`` and item row ``h[i]``.

    ``u`` and ``i`` are row indices into the module-level factor matrices
    ``w`` and ``h``; the current contents of those matrices are used.
    """
    user_factors = w[u]
    item_factors = h[i]
    return np.dot(user_factors, item_factors)

# NMF training: alternating multiplicative updates of w (users) and h (items),
# computed only over the (user, item) pairs present in `ratings`.
for it in range(NUM_ITERATIONS):
    print(f"Iteración {it+1} de {NUM_ITERATIONS}")

    # Update W with H held fixed
    for u in range(NUM_USERS):
        for k in range(NUM_FACTORS):
            # numer = sum_i h[i][k] * r_ui, denom = sum_i h[i][k] * pred_ui,
            # both over the items rated by user u.
            numer = 0
            # Small starting value keeps the division defined when the sum is 0.
            denom = 1e-10
            for i in user_items[u]:
                r_ui = ratings[(u, i)]
                # Uses the current w[u], so factor k already sees the updated
                # values of factors 0..k-1 from this same pass.
                pred_ui = compute_prediction(u, i)
                numer += h[i][k] * r_ui
                denom += h[i][k] * pred_ui
            # Multiplicative step. NOTE: if user_items[u] is empty, numer stays
            # 0 and w[u][k] becomes 0.
            w[u][k] *= numer / denom

    # Update H with W held fixed (mirror image of the step above)
    for i in range(NUM_ITEMS):
        for k in range(NUM_FACTORS):
            # Same sums, taken over the users who rated item i.
            numer = 0
            denom = 1e-10
            for u in item_users[i]:
                r_ui = ratings[(u, i)]
                pred_ui = compute_prediction(u, i)
                numer += w[u][k] * r_ui
                denom += w[u][k] * pred_ui
            # NOTE: if item_users[i] is empty, h[i][k] becomes 0.
            h[i][k] *= numer / denom

# Predict every test row and collect (ID, rating) pairs for the output CSV
output_rows = []

# Iterate the needed columns directly instead of df_test.iterrows().
# iterrows() packs each row into a single Series, which does not preserve
# per-column dtypes: with a float column in the row, the integer ID is upcast
# and would be written to the file as e.g. "16683.0" instead of "16683".
for test_id, u_raw, i_raw in zip(df_test['ID'], df_test['user'], df_test['item']):
    u = user_to_index.get(u_raw)
    i = item_to_index.get(i_raw)

    if u is not None and i is not None:
        pred = compute_prediction(u, i)
        pred = max(0, min(10, pred))   # clamp to the [0, 10] range
        pred = round(pred, 3)
    else:
        pred = 8.0  # neutral default when the user or item has no index

    output_rows.append((test_id, pred))

# Write the CSV (newline="" lets the csv module control line endings)
with open("predicciones_NMF_50_iter.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["ID", "rating"])
    writer.writerows(output_rows)

print("Archivo generado correctamente.")

Iteración 1 de 50
Iteración 2 de 50
Iteración 3 de 50
Iteración 4 de 50
Iteración 5 de 50
Iteración 6 de 50
Iteración 7 de 50
Iteración 8 de 50
Iteración 9 de 50
Iteración 10 de 50
Iteración 11 de 50
Iteración 12 de 50
Iteración 13 de 50
Iteración 14 de 50
Iteración 15 de 50
Iteración 16 de 50
Iteración 17 de 50
Iteración 18 de 50
Iteración 19 de 50
Iteración 20 de 50
Iteración 21 de 50
Iteración 22 de 50
Iteración 23 de 50
Iteración 24 de 50
Iteración 25 de 50
Iteración 26 de 50
Iteración 27 de 50
Iteración 28 de 50
Iteración 29 de 50
Iteración 30 de 50
Iteración 31 de 50
Iteración 32 de 50
Iteración 33 de 50
Iteración 34 de 50
Iteración 35 de 50
Iteración 36 de 50
Iteración 37 de 50
Iteración 38 de 50
Iteración 39 de 50
Iteración 40 de 50
Iteración 41 de 50
Iteración 42 de 50
Iteración 43 de 50
Iteración 44 de 50
Iteración 45 de 50
Iteración 46 de 50
Iteración 47 de 50
Iteración 48 de 50
Iteración 49 de 50
Iteración 50 de 50
Archivo 'predicciones.csv' generado correctamente.


### Hiperparámetros

In [None]:
def train_and_evaluate_nmf(train_df, val_df, num_factors, num_iterations, seed=None):
    """Fit NMF with multiplicative updates on ``train_df`` and return validation MAE.

    Parameters
    ----------
    train_df, val_df : DataFrame
        Must contain the columns ``'user'``, ``'item'`` and ``'rating'``.
    num_factors : int
        Number of latent factors per user/item.
    num_iterations : int
        Number of alternating (users, then items) update passes.
    seed : int or None, optional
        When given, the factor matrices are initialised from a private
        ``np.random.RandomState(seed)`` so the result is reproducible.
        When ``None`` (default) the global ``np.random`` state is used,
        exactly as before.

    Returns
    -------
    float or None
        Mean absolute error over the validation rows whose user AND item both
        occur in ``train_df`` (predictions clamped to [0, 10]); ``None`` when
        no validation row can be scored.
    """
    eps = 1e-10  # keeps the update denominators strictly positive

    # Index only ids seen in training. An id that occurs only in val_df has no
    # ratings to learn from; if it were given a factor row, the multiplicative
    # update would scale that row by 0 / eps == 0 and the row would be
    # "predicted" as 0, polluting the MAE. Leaving it out makes `.get()` below
    # return None so the row is skipped.
    user_ids = sorted(set(train_df['user']))
    item_ids = sorted(set(train_df['item']))
    user_to_index = {uid: idx for idx, uid in enumerate(user_ids)}
    item_to_index = {iid: idx for idx, iid in enumerate(item_ids)}
    num_users = len(user_ids)
    num_items = len(item_ids)

    # Sparse ratings keyed by (u_idx, i_idx); a repeated pair keeps its last
    # rating. Columns are iterated directly (iterrows() is slow and upcasts
    # the integer ids to float).
    ratings = {}
    for user, item, rating in zip(train_df['user'], train_df['item'], train_df['rating']):
        ratings[(user_to_index[user], item_to_index[item])] = rating

    user_items = defaultdict(list)
    item_users = defaultdict(list)
    for (u, i) in ratings:
        user_items[u].append(i)
        item_users[i].append(u)

    # Per-user / per-item (neighbour indices, ratings) arrays, built once so
    # the training loop can use vectorised dot products.
    user_obs = {
        u: (np.array(items), np.array([ratings[(u, i)] for i in items], dtype=float))
        for u, items in user_items.items()
    }
    item_obs = {
        i: (np.array(users), np.array([ratings[(u, i)] for u in users], dtype=float))
        for i, users in item_users.items()
    }

    # Random non-negative initialisation in [0, 1)
    rng = np.random if seed is None else np.random.RandomState(seed)
    w = rng.rand(num_users, num_factors)
    h = rng.rand(num_items, num_factors)

    # Training: factors are updated one at a time, and the prediction is
    # recomputed before each factor so it reflects the factors already
    # updated in this pass.
    for _ in range(num_iterations):
        # Update user factors with item factors fixed
        for u, (items, r_u) in user_obs.items():
            h_u = h[items]                      # rows of h for the items u rated
            for k in range(num_factors):
                pred = h_u @ w[u]
                w[u, k] *= (h_u[:, k] @ r_u) / (h_u[:, k] @ pred + eps)

        # Update item factors with user factors fixed
        for i, (users, r_i) in item_obs.items():
            w_i = w[users]                      # rows of w for the users who rated i
            for k in range(num_factors):
                pred = w_i @ h[i]
                h[i, k] *= (w_i[:, k] @ r_i) / (w_i[:, k] @ pred + eps)

    # Evaluation (MAE) over validation rows with a known user and item
    errors = []
    for user, item, rating in zip(val_df['user'], val_df['item'], val_df['rating']):
        u = user_to_index.get(user)
        i = item_to_index.get(item)
        if u is None or i is None:
            continue
        pred = max(0, min(10, np.dot(w[u], h[i])))
        errors.append(abs(rating - pred))
    return np.mean(errors) if errors else None


from sklearn.model_selection import train_test_split

# Hold out 20% of the training ratings as a validation set
train_data, val_data = train_test_split(df_train, test_size=0.2, random_state=42)

# Hyperparameter grid to explore
factors_list = [5, 10, 20]
iterations_list = [20, 50]

best_mae = float('inf')
best_config = None

# The trainer initialises its factors with np.random.rand; seeding the global
# state once here makes the whole search repeatable when the cell is re-run.
np.random.seed(42)

for k in factors_list:
    for iters in iterations_list:
        print(f"Entrenando con {k} factores, {iters} iteraciones...")
        mae = train_and_evaluate_nmf(train_data, val_data, k, iters)
        if mae is None:
            # The trainer returns None when no validation row could be scored;
            # formatting None with ':.4f' or comparing it with '<' would raise.
            print("MAE validación: no disponible")
            continue
        print(f"MAE validación: {mae:.4f}")
        if mae < best_mae:
            best_mae = mae
            best_config = (k, iters)

if best_config is None:
    # Nothing was evaluated successfully; avoid indexing into None below.
    print("\nNo se pudo evaluar ninguna configuración.")
else:
    print("\nMejor configuración encontrada:")
    print(f"Factores: {best_config[0]}, Iteraciones: {best_config[1]}")
    print(f"MAE validación: {best_mae:.4f}")
