# Propuesta Proyecto RecSys 2025

Integrantes: Felipe Abarca, Nicolas Estevez y Alfredo Enrione

## Setup Datos

In [13]:
from IPython.display import clear_output
!pip install datasets scipy
clear_output()

In [14]:
from datasets import load_dataset, load_from_disk

# Reuse the local copy saved under "reviews" when it exists; otherwise download
# the Books reviews once and store them on disk for later runs.
try:
  reviews = load_from_disk("reviews")
except FileNotFoundError:
  # Only a missing local copy triggers the download. Any other failure
  # (including a manual interrupt) is surfaced instead of silently starting
  # a full re-download.
  reviews = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Books", trust_remote_code=True)
  reviews.save_to_disk("reviews")
  reviews.cleanup_cache_files()

Loading dataset from disk:   0%|          | 0/33 [00:00<?, ?it/s]

In [15]:
# List the keys (split names) of the loaded dataset dictionary.
print(reviews.keys())

dict_keys(['full'])


In [16]:
# Show which fields the first record of the "full" split carries.
print("Ejemplo de reseña:")
print(reviews["full"][0].keys())

Ejemplo de reseña:
dict_keys(['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'])


## Procesamiento de Datos

In [17]:
from datasets import DatasetDict
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Seed passed to every shuffle / train_test_split call in this notebook.
SEED = 42
# Fraction of the full data held out from the training split.
TEST_SIZE = 0.2
# Passed as test_size of the second split: fraction of the held-out part that
# becomes the test split (the remainder becomes validation).
VAL_SIZE = 0.5
# Fraction (0-1, not a percentage) of each split kept when subsampling.
REDUCE_PERCENT = 0.5
# Review columns removed from the dataset before any further processing.
IGNORED_COLUMNS_REVIEWS = ['text', 'images', 'parent_asin', 'verified_purchase', 'timestamp']

In [18]:
# Drop only the ignored columns that are still present so this cell can be
# re-run: `reviews` is rebound here, and asking remove_columns to drop a column
# that is already gone raises an error.
columns_to_drop = [col for col in IGNORED_COLUMNS_REVIEWS if col in reviews["full"].column_names]
if columns_to_drop:
    reviews = reviews.remove_columns(columns_to_drop)
print("Ejemplo de reseña:")
print(reviews["full"][0])

Ejemplo de reseña:
{'rating': 1.0, 'title': 'Not a watercolor book! Seems like copies imo.', 'asin': 'B09BGPFTDB', 'user_id': 'AFKZENTNBQ7A7V7UXW5JJI6UGRYQ', 'helpful_vote': 0}


In [19]:
# Shuffle once with a fixed seed so the splits below are reproducible.
reviews = reviews.shuffle(seed=SEED)

# First split: hold out TEST_SIZE of the data; the rest is the training split.
train_test_split = reviews['full'].train_test_split(test_size=TEST_SIZE, seed=SEED)
train_set = train_test_split["train"]
temp_set = train_test_split["test"]

# Second split: divide the held-out part into validation and test. VAL_SIZE is
# passed as test_size, so it is the share of the held-out part that goes to test.
val_test_split = temp_set.train_test_split(test_size=VAL_SIZE, seed=SEED)
val_set = val_test_split["train"]
test_set = val_test_split["test"]

# Bundle the three splits under conventional names.
reviews_dict = DatasetDict({
    "train": train_set,
    "validation": val_set,
    "test": test_set
})

### Manejo de Reseñas

In [20]:
# Report the number of reviews per split, using "." as the thousands separator.
print("Reviews (shape):")
for label, split_name in (("train", "train"), ("val", "validation"), ("test", "test")):
    print(f"{label}: {len(reviews_dict[split_name]):,}".replace(",", "."))

Reviews (shape):
train: 23.580.362
val: 2.947.545
test: 2.947.546


In [21]:
def _subsample(split, fraction=REDUCE_PERCENT, seed=SEED):
    """Return a seeded random subset of ``split`` and the number of rows kept.

    Args:
        split: dataset split to reduce.
        fraction: share of rows to keep (0-1).
        seed: seed for the shuffle that precedes the selection.

    Returns:
        Tuple ``(reduced_split, n_rows_kept)``.
    """
    n_rows = int(len(split) * fraction)
    return split.shuffle(seed=seed).select(range(n_rows)), n_rows


# Always subsample from the unreduced splits (train_set / val_set / test_set)
# rather than from reviews_dict itself, so re-running this cell yields the same
# result instead of shrinking the data again on every execution.
reviews_dict["train"], train_size = _subsample(train_set)
df_train = reviews_dict["train"].to_pandas()

reviews_dict["validation"], val_size = _subsample(val_set)
df_val = reviews_dict["validation"].to_pandas()

reviews_dict["test"], test_size = _subsample(test_set)
df_test = reviews_dict["test"].to_pandas()

### Reducción de Metadatos

In [22]:
# Compute the summary figures once, then print them with "." as the thousands separator.
num_books = len(df_train['asin'].unique())
num_users = len(df_train['user_id'].unique())
num_reviews = len(df_train)
num_voted = len(df_train[df_train['helpful_vote'] > 0])

print("Resumen general de datos de entrenamiento:")
print("----------------------------------------")
print(f"Cantidad de libros: {num_books:,}".replace(",", "."))
print(f"Cantidad de usuarios: {num_users:,}".replace(",", "."))
print(f"Cantidad de reseñas: {num_reviews:,}".replace(",", "."))
print(f"Cantidad de reseñas con votos de utilidad: {num_voted:,}".replace(",", "."))
print(f"Porcentaje de reseñas con votos: {num_voted/num_reviews*100:.2f}%")

Resumen general de datos de entrenamiento:
----------------------------------------
Cantidad de libros: 2.819.890
Cantidad de usuarios: 5.803.743
Cantidad de reseñas: 11.790.181
Cantidad de reseñas con votos de utilidad: 4.267.088
Porcentaje de reseñas con votos: 36.19%


## DAN y Sampling

In [23]:
import numpy as np
import pandas as pd
from scipy import sparse
import torch

from model import LAE_DAN
import similarities as sml
import utils

# Hyper-parameters handed to LAE_DAN when the model is built in compute_dan_scores.
# NOTE(review): the meaning of each key is defined in model.LAE_DAN, which is not
# visible from this notebook — confirm there before tuning.
DAN_CONFIG = {
    'reg_p': 0.1,
    'alpha': 0.2,
    'beta': 0.5,
    'drop_p': 0.3
}
# Metric name passed to similarities.similarity in compute_sampling_scores.
SIMILARITY_METRIC = 'sapling'
# NOTE(review): GAMMA is not referenced by any code visible in this notebook section.
GAMMA = 0.5
# Weight of the DAN scores in merge_scores; the similarity scores get 1 - LAMBDA.
LAMBDA = 0.5


def build_mappings(df_train):
    """Assign a contiguous integer index to every user and item of the training frame.

    Indices follow the order of first appearance in ``df_train``.

    Args:
        df_train: DataFrame with ``user_id`` and ``asin`` columns.

    Returns:
        Tuple ``(user2idx, item2idx)`` of dicts mapping raw id -> 0-based index.
    """
    user_ids = df_train['user_id'].unique()
    item_ids = df_train['asin'].unique()
    user2idx = dict(zip(user_ids, range(len(user_ids))))
    item2idx = dict(zip(item_ids, range(len(item_ids))))
    return user2idx, item2idx


def build_sparse_matrix(df, user2idx, item2idx):
    """Build a users x items CSR rating matrix from a review frame.

    Rows whose user or item is missing from the mappings are dropped, and a
    message with the number of dropped rows is printed.

    Args:
        df: DataFrame with ``user_id``, ``asin`` and ``rating`` columns.
        user2idx: dict mapping user id -> row index.
        item2idx: dict mapping item id -> column index.

    Returns:
        ``scipy.sparse.csr_matrix`` of float32 ratings with shape
        ``(len(user2idx), len(item2idx))``.

    Note:
        The CSR construction sums entries that share the same (row, column),
        so repeated (user, item) reviews add their ratings together.
    """
    rows = df['user_id'].map(user2idx)
    cols = df['asin'].map(item2idx)

    # Keep only the reviews whose user AND item both have an index.
    known = rows.notna() & cols.notna()
    n_known = known.sum()
    if n_known < len(df):
        print(f"Dropping {len(df) - n_known} rows with unknown user/item in mapping")

    ratings = df.loc[known, 'rating'].astype(np.float32).to_numpy()
    coords = (rows[known].astype(int).to_numpy(), cols[known].astype(int).to_numpy())
    shape = (len(user2idx), len(item2idx))
    return sparse.csr_matrix((ratings, coords), shape=shape)


def compute_dan_scores(train_mat, test_mat):
    """Score every user against every item with the LAE_DAN model.

    A small adapter object wraps the sparse matrices in the interface handed to
    ``LAE_DAN``. ``test_mat`` is used for both the validation and test slots.

    Args:
        train_mat: sparse users x items training matrix.
        test_mat: sparse users x items held-out matrix with the same shape.

    Returns:
        NumPy array with the result of ``model.getUsersRating`` for all users.
        NOTE(review): presumably shaped (n_users, n_items) — confirm in model.LAE_DAN.
    """
    class DummyDataset:
        """Adapter exposing the matrices through attributes and accessor methods."""

        def __init__(self, train_mat, valid_mat, test_mat):
            self.UserItemNet = train_mat
            self.validUserItemNet = valid_mat
            self.testUserItemNet = test_mat
            self.n_users, self.m_items = train_mat.shape
            # Per-user list of item indices present in the validation matrix.
            valid_rows = valid_mat.tocsr()
            self.validDict = {}
            for u in range(self.n_users):
                self.validDict[u] = valid_rows[u].indices.tolist()
            # Number of users that have at least one validation item.
            self.num_valid_user = sum(len(items) > 0 for items in self.validDict.values())

        def getUserPosItems(self, users):
            """Return, per user, the column indices of non-zero training entries."""
            return [self.UserItemNet[user].nonzero()[1] for user in users]

        def getValidUserPosItems(self, users):
            """Return, per user, the column indices of non-zero validation entries."""
            return [self.validUserItemNet[user].nonzero()[1] for user in users]

        def getTestUserPosItems(self, users):
            """Return, per user, the column indices of non-zero test entries."""
            return [self.testUserItemNet[user].nonzero()[1] for user in users]

    # The same held-out matrix fills both the validation and the test slot.
    dataset = DummyDataset(train_mat, test_mat, test_mat)
    model = LAE_DAN(DAN_CONFIG, dataset)
    user_ids = torch.arange(dataset.n_users)
    # Inference only: no gradients are needed for scoring.
    with torch.no_grad():
        ratings = model.getUsersRating(user_ids)
    return ratings.numpy()


def compute_sampling_scores(train_mat):
    """Compute item-similarity based scores for every user/item pair.

    The dense rating matrix is multiplied by the item similarity matrix and each
    column is divided by the sum of absolute similarities of that column.
    NaN results of the division are replaced through ``np.nan_to_num``.

    Args:
        train_mat: sparse users x items training matrix.

    Returns:
        Dense NumPy array of scores.
    """
    ratings = train_mat.astype(np.float32).toarray()
    item_sim = sml.similarity(ratings, SIMILARITY_METRIC, projection=1)
    weighted = np.dot(ratings, item_sim)
    # Per-column normaliser: total absolute similarity.
    norm = np.sum(np.abs(item_sim), axis=0)
    return np.nan_to_num(weighted / norm)



def merge_scores(dan_scores, sampling_scores):
    """Blend the two score matrices as a convex combination weighted by LAMBDA.

    Args:
        dan_scores: scores from the DAN model.
        sampling_scores: scores from the similarity-based model.

    Returns:
        ``LAMBDA * dan_scores + (1 - LAMBDA) * sampling_scores``.
    """
    dan_part = LAMBDA * dan_scores
    sampling_part = (1 - LAMBDA) * sampling_scores
    return dan_part + sampling_part


def evaluate(train_mat, test_mat, score_matrix, K=20):
    """Compute the top-K ranking metrics of a score matrix via ``utils.scores``.

    Args:
        train_mat: sparse users x items training matrix; its shape supplies
            the user and item counts passed on.
        test_mat: sparse users x items held-out matrix.
        score_matrix: score matrix to evaluate.
        K: cut-off of the ranking metrics.

    Returns:
        Tuple ``(precision, recall, ndcg)`` as returned by ``utils.scores``.
    """
    num_users, num_items = train_mat.shape
    precision, recall, ndcg = utils.scores(
        train_mat, test_mat, score_matrix, num_users, num_items, K=K
    )
    return precision, recall, ndcg


def main(df_train, df_test, K=20):
    """Run the DAN / similarity / hybrid pipeline and return top-K recommendations.

    Builds the index mappings and sparse matrices, scores all users with both
    models, blends the scores, prints Precision/Recall/NDCG at K for the three
    variants and finally collects the best-scored items per user.

    Args:
        df_train: training reviews (``user_id``, ``asin``, ``rating``).
        df_test: held-out reviews with the same columns.
        K: cut-off used for the metrics and for the number of recommended items.

    Returns:
        DataFrame indexed by user id with columns ``top_1`` ... ``top_n``, where
        ``n = min(K, number of items)``.
    """
    user2idx, item2idx = build_mappings(df_train)
    train_mat = build_sparse_matrix(df_train, user2idx, item2idx)
    test_mat = build_sparse_matrix(df_test, user2idx, item2idx)

    dan_scores = compute_dan_scores(train_mat, test_mat)
    sampling_scores = compute_sampling_scores(train_mat)
    final_scores = merge_scores(dan_scores, sampling_scores)

    p_d, r_d, n_d = evaluate(train_mat, test_mat, dan_scores, K)
    p_s, r_s, n_s = evaluate(train_mat, test_mat, sampling_scores, K)
    p_h, r_h, n_h = evaluate(train_mat, test_mat, final_scores, K)
    print(f"[DAN only] Precision@{K}: {p_d:.4f}, Recall@{K}: {r_d:.4f}, NDCG@{K}: {n_d:.4f}")
    print(f"[Sampling only] Precision@{K}: {p_s:.4f}, Recall@{K}: {r_s:.4f}, NDCG@{K}: {n_s:.4f}")
    print(f"[Hybrid] Precision@{K}: {p_h:.4f}, Recall@{K}: {r_h:.4f}, NDCG@{K}: {n_h:.4f}")

    # Dict insertion order matches the indices assigned by build_mappings, so
    # position i in these lists is the id with index i.
    users = list(user2idx.keys())
    items = list(item2idx.keys())

    # A catalogue smaller than K yields fewer than K columns; cap the width so
    # the column labels always match the data instead of raising a length mismatch.
    n_recs = min(K, len(items))

    # NOTE(review): items the user already rated in training are not masked
    # here — confirm whether they should be excluded from the recommendations.
    recs = {}
    for u_idx, u in enumerate(users):
        top_items = np.argsort(-final_scores[u_idx])[:n_recs]
        recs[u] = [items[i] for i in top_items]
    rec_df = pd.DataFrame.from_dict(
        recs, orient='index', columns=[f'top_{i+1}' for i in range(n_recs)]
    )
    return rec_df

# Default number of most-reviewed items kept by filter_top_items.
MAX_ITEMS = 500

def filter_top_items(df_train, df_test, max_items=MAX_ITEMS):
    """Restrict both frames to the most-reviewed items of the training frame.

    Args:
        df_train: training reviews with an ``asin`` column; popularity is
            measured on this frame only.
        df_test: held-out reviews with an ``asin`` column.
        max_items: number of most frequent items to keep.

    Returns:
        Tuple ``(train_subset, test_subset)`` containing only the rows whose
        ``asin`` is among the ``max_items`` most frequent training items.
    """
    review_counts = df_train['asin'].value_counts()
    popular_asins = review_counts.nlargest(max_items).index.to_numpy()
    train_subset = df_train[df_train['asin'].isin(popular_asins)]
    test_subset = df_test[df_test['asin'].isin(popular_asins)]
    return train_subset, test_subset

In [24]:
# Keep only the most-reviewed items, then run the full pipeline on the reduced frames.
df_train_f, df_test_f = filter_top_items(df_train, df_test)
print(df_train_f.shape, df_test_f.shape)
recommendations = main(df_train_f, df_test_f)
print(recommendations.head())

(625042, 5) (78014, 5)
Dropping 58134 rows with unknown user/item in mapping


  recall = np.sum(right_pred/recall_n)


[DAN only] Precision@20: 0.0000, Recall@20: 0.0004, NDCG@20: 0.0002
[Sampling only] Precision@20: 0.0006, Recall@20: 0.0116, NDCG@20: 0.0059
[Hybrid] Precision@20: 0.0002, Recall@20: 0.0040, NDCG@20: 0.0021
                                   top_1       top_2       top_3       top_4  \
AHQ4L5J3TEBGICBKX3PEFX7SXC5Q  B00Y21VGYW  B015AL6H3C  B01COJUEZ0  B00YTXTIDO   
AF4A6XI2IXPMPIESZ6AJP5U6F3KA  0553418025  0307887448  1594633665  0375842209   
AE2M4TIFSAZNKEACFICFSFC3YXNQ  0399255370  0811879542  0803736800  0803741715   
AEZYTER7IGZ5VIOEH6HM2TKAG4GQ  0374300216  1589255518  0545392551  0671449028   
AHENSRSRTWMW7GDQPSQ25P3VWE5A  B00IRIR7K8  B00L9AXVPQ  B00BAXFAVK  B00U6DNYLI   

                                   top_5       top_6       top_7       top_8  \
AHQ4L5J3TEBGICBKX3PEFX7SXC5Q  B01AWO93CY  B019BILG2W  B01EN506CO  B01B1OGQH4   
AF4A6XI2IXPMPIESZ6AJP5U6F3KA  0812550706  1476746583  0307588378  1476738025   
AE2M4TIFSAZNKEACFICFSFC3YXNQ  0547248288  0811877825  0385376715  037436