# Notebook 3: k-NN Baseline

Este notebook construye un baseline colaborativo utilizando:

- **User-based k-NN**
- **Item-based k-NN**

Se evalúa con las métricas:

- HitRate@K
- Recall@K
- NDCG@K

Este baseline servirá como punto de referencia para comparar, en calidad de recomendación y en eficiencia, los métodos de reducción de datos.

In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy
from tqdm import tqdm
from pathlib import Path

# Carga de datos procesados

In [2]:
data_path = Path("../data/processed")

# Read the three processed CSV files from data_path, in the same order as the
# names on the left-hand side.
train, test, movies = (
    pd.read_csv(data_path / filename)
    for filename in ("train.csv", "test.csv", "movies.csv")
)

# Preparación del dataset para Surprise

In [3]:
# Ratings are declared to Surprise as living on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# Wrap the (user, item, rating) columns of the training frame and turn the
# whole thing into a trainset, i.e. nothing is held out at this stage.
train_data = Dataset.load_from_df(
    train.loc[:, ['userId', 'movieId', 'rating']],
    reader,
).build_full_trainset()

# Test interactions as a list of (userId, movieId, rating) tuples.
# NOTE(review): `testset` is not referenced again in the visible cells.
testset = list(zip(*(test[col] for col in ('userId', 'movieId', 'rating'))))

# Configuración de k-NN

- Usaremos similitud coseno
- User-based y Item-based
- Número de vecinos: k = 40 (no confundir con el K de las métricas top-K)

In [4]:
# Cosine similarity for both models; only the `user_based` flag differs
# (True -> user-user neighbourhoods, False -> item-item neighbourhoods).
sim_options_user, sim_options_item = (
    {'name': 'cosine', 'user_based': flag} for flag in (True, False)
)

# One KNNBasic estimator per similarity setup, each using 40 neighbours.
knn_user, knn_item = (
    KNNBasic(k=40, sim_options=options)
    for options in (sim_options_user, sim_options_item)
)

# Entrenamiento

In [5]:
# Fit both neighbourhood models on the full training set.
print("Entrenando UserKNN...")
knn_user.fit(train_data)
print("Entrenando ItemKNN...")
# Trailing semicolon: as the last expression of the cell, the value returned by
# fit() would otherwise be echoed as the cell output, and that repr embeds a
# memory address that changes on every run.
knn_item.fit(train_data);

Entrenando UserKNN...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Entrenando ItemKNN...
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x16c4c0a45e0>

# Evaluación Leave-One-Out
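Para cada fila de test se estima la puntuación de todos los ítems que el usuario no ha visto en entrenamiento, se ordenan de mayor a menor puntuación estimada y se comprueba si el ítem reservado en test aparece entre los K primeros. Como en cada evaluación hay un único ítem relevante, Recall@K coincide con HitRate@K.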

In [6]:
def hit_rate(ranklist, gt_item):
    """Return 1.0 when the ground-truth item appears in the ranked list, else 0.0.

    Args:
        ranklist: Recommended items (any container supporting ``in``).
        gt_item: The held-out item to look for.
    """
    return float(gt_item in ranklist)

def ndcg(ranklist, gt_item):
    """NDCG of a ranking with a single relevant item.

    Args:
        ranklist: Recommended items as a list, best first.
        gt_item: The held-out item to look for.

    Returns:
        ``1 / log2(position + 2)`` for the 0-based position of ``gt_item`` in
        ``ranklist`` (so 1.0 at the top), or 0.0 when the item is absent.
    """
    if gt_item not in ranklist:
        return 0.0
    position = ranklist.index(gt_item)
    return 1 / np.log2(position + 2)

def recall(ranklist, gt_item):
    """Recall for a single relevant item: 1.0 if it was retrieved, 0.0 otherwise.

    Args:
        ranklist: Recommended items (any container supporting ``in``).
        gt_item: The held-out item to look for.
    """
    if gt_item in ranklist:
        return 1.0
    return 0.0

# Cut-offs K at which the top-K metrics are computed (each ranking is sliced to its first K items).
Ks = [5, 10, 20, 50]


In [7]:
# userId -> set of movieIds that user has in the training data.
user_train_items = (
    train
    .groupby('userId')['movieId']
    .apply(set)
    .to_dict()
)

# Every distinct movieId listed in the movies table; this is the candidate pool.
# NOTE(review): presumably this can include movies that never occur in train - confirm.
all_items = {*movies['movieId'].unique()}

# Evaluación top-K

In [8]:
# Per-cut-off lists of per-row metric values for the user-based model.
results_user = {k: {'hit': [], 'ndcg': [], 'recall': []} for k in Ks}

# One evaluation per test row: rank every unseen item and look for the held-out one.
# The id columns are iterated directly instead of through `test.iterrows()`:
# iterrows builds a Series per row and does not preserve column dtypes, so the
# ids could reach `predict` as floats; it also rebinds `_` at notebook level.
for user, gt_item in tqdm(zip(test['userId'], test['movieId']), total=len(test)):
    seen = user_train_items.get(user, set())

    # Score all items the user has not interacted with in train. The set
    # difference yields the same candidates as an item-by-item filter; their
    # order does not matter because the ranking below sorts them.
    candidates = list(all_items - seen)
    predictions = [knn_user.predict(user, i).est for i in candidates]

    # Best estimate first. (estimate, movieId) tuples are compared, so equal
    # estimates are ordered by the larger movieId.
    top_items = [x for _, x in sorted(zip(predictions, candidates), reverse=True)]

    for k in Ks:
        top_k = top_items[:k]
        results_user[k]['hit'].append(hit_rate(top_k, gt_item))
        results_user[k]['recall'].append(recall(top_k, gt_item))
        results_user[k]['ndcg'].append(ndcg(top_k, gt_item))

100%|██████████| 6040/6040 [1:14:09<00:00,  1.36it/s]


In [9]:
# Per-cut-off lists of per-row metric values for the item-based model.
results_item = {k: {'hit': [], 'ndcg': [], 'recall': []} for k in Ks}

# One evaluation per test row: rank every unseen item and look for the held-out one.
# The id columns are iterated directly instead of through `test.iterrows()`:
# iterrows builds a Series per row and does not preserve column dtypes, so the
# ids could reach `predict` as floats; it also rebinds `_` at notebook level.
for user, gt_item in tqdm(zip(test['userId'], test['movieId']), total=len(test)):
    seen = user_train_items.get(user, set())

    # Score all items the user has not interacted with in train. The set
    # difference yields the same candidates as an item-by-item filter; their
    # order does not matter because the ranking below sorts them.
    candidates = list(all_items - seen)
    predictions = [knn_item.predict(user, i).est for i in candidates]

    # Best estimate first. (estimate, movieId) tuples are compared, so equal
    # estimates are ordered by the larger movieId.
    top_items = [x for _, x in sorted(zip(predictions, candidates), reverse=True)]

    for k in Ks:
        top_k = top_items[:k]
        results_item[k]['hit'].append(hit_rate(top_k, gt_item))
        results_item[k]['recall'].append(recall(top_k, gt_item))
        results_item[k]['ndcg'].append(ndcg(top_k, gt_item))

100%|██████████| 6040/6040 [38:05<00:00,  2.64it/s]  


In [10]:
# Average each per-row metric list into one row per cut-off K, for both models.
summary_user = [
    {
        'K': k,
        'HitRate': np.mean(results_user[k]['hit']),
        'Recall': np.mean(results_user[k]['recall']),
        'NDCG': np.mean(results_user[k]['ndcg']),
    }
    for k in Ks
]
summary_item = [
    {
        'K': k,
        'HitRate': np.mean(results_item[k]['hit']),
        'Recall': np.mean(results_item[k]['recall']),
        'NDCG': np.mean(results_item[k]['ndcg']),
    }
    for k in Ks
]

df_user = pd.DataFrame(summary_user)
df_item = pd.DataFrame(summary_item)

# Show the two tables one after the other, each under its own caption.
print("UserKNN Results")
display(df_user)

print("ItemKNN Results")
display(df_item)

UserKNN Results


Unnamed: 0,K,HitRate,Recall,NDCG
0,5,0.0,0.0,0.0
1,10,0.000497,0.000497,0.000146
2,20,0.011258,0.011258,0.002792
3,50,0.031623,0.031623,0.006761


ItemKNN Results


Unnamed: 0,K,HitRate,Recall,NDCG
0,5,0.000828,0.000828,0.00042
1,10,0.004139,0.004139,0.001466
2,20,0.012086,0.012086,0.003436
3,50,0.034603,0.034603,0.007847


In [11]:
# Write each summary table to its own CSV under data_path, without the index column.
for filename, frame in {
    "userknn_results.csv": df_user,
    "itemknn_results.csv": df_item,
}.items():
    frame.to_csv(data_path / filename, index=False)

print("Resultados guardados en data/processed/")

Resultados guardados en data/processed/
