# Propuesta Proyecto RecSys 2025

Integrantes: Felipe Abarca, Nicolas Estevez, Alfredo Enrione

## Setup Datos

In [7]:
from IPython.display import clear_output
!pip install datasets
clear_output()

In [8]:
from datasets import load_dataset, load_from_disk
# Cargar datos
try:
  reviews = load_from_disk("reviews")
except:
  reviews = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Books", trust_remote_code=True)
  reviews.save_to_disk("reviews")
  reviews.cleanup_cache_files()

try:
  metadata = load_from_disk("metadata")
except:
  metadata = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Books", trust_remote_code=True)
  metadata.save_to_disk("metadata")
  metadata.cleanup_cache_files()

README.md: 0.00B [00:00, ?B/s]

Amazon-Reviews-2023.py: 0.00B [00:00, ?B/s]

Loading dataset shards:   0%|          | 0/33 [00:00<?, ?it/s]

Saving the dataset (0/33 shards):   0%|          | 0/29475453 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/28 [00:00<?, ?it/s]

Saving the dataset (0/28 shards):   0%|          | 0/4448181 [00:00<?, ? examples/s]

In [9]:
print(reviews.keys())

dict_keys(['full'])


In [10]:
# Ejemplos
print("Ejemplo de reseña:")
print(reviews["full"][0].keys())

print("\nEjemplo de metadata:")
print(metadata["full"][0].keys())

Ejemplo de reseña:
dict_keys(['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'])

Ejemplo de metadata:
dict_keys(['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together', 'subtitle', 'author'])


## Procesamiento de Datos

In [11]:
from datasets import DatasetDict
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('ggplot')

SEED = 42
TEST_SIZE = 0.2
VAL_SIZE = 0.5
REDUCE_PERCENT = 0.1
IGNORED_COLUMNS_REVIEWS = ['text', 'images', 'parent_asin', 'verified_purchase', 'timestamp']
IGNORED_COLUMNS_META = ['videos', 'images','parent_asin', 'bought_together', 'details', 'features', 'author', 'subtitle']


In [12]:
# Sacar columnas innecesarias
reviews = reviews.remove_columns(IGNORED_COLUMNS_REVIEWS)
metadata = metadata.remove_columns(IGNORED_COLUMNS_META)
print("Ejemplo de reseña:")
print(reviews["full"][0])

print("\nEjemplo de metadata:")
print(metadata["full"][0])

Ejemplo de reseña:
{'rating': 1.0, 'title': 'Not a watercolor book! Seems like copies imo.', 'asin': 'B09BGPFTDB', 'user_id': 'AFKZENTNBQ7A7V7UXW5JJI6UGRYQ', 'helpful_vote': 0}

Ejemplo de metadata:
{'main_category': 'Books', 'title': 'Chaucer', 'average_rating': 4.5, 'rating_number': 29, 'description': [], 'price': '8.23', 'store': 'Peter Ackroyd (Author)', 'categories': ['Books', 'Literature & Fiction', 'History & Criticism']}


In [13]:
# --- Reviews ---
reviews = reviews.shuffle(seed=SEED)

train_test_split = reviews['full'].train_test_split(test_size=TEST_SIZE, seed=SEED)
train_set = train_test_split["train"]
temp_set = train_test_split["test"]

val_test_split = temp_set.train_test_split(test_size=VAL_SIZE, seed=SEED)
val_set = val_test_split["train"]
test_set = val_test_split["test"]

reviews_dict = DatasetDict({
    "train": train_set,
    "validation": val_set,
    "test": test_set
})

In [14]:
# --- Metadata ---
metadata = metadata.shuffle(seed=SEED)

train_test_split = metadata['full'].train_test_split(test_size=TEST_SIZE, seed=SEED)
train_set = train_test_split["train"]
temp_set = train_test_split["test"]

val_test_split = temp_set.train_test_split(test_size=VAL_SIZE, seed=SEED)
val_set = val_test_split["train"]
test_set = val_test_split["test"]

metadata_dict = DatasetDict({
    "train": train_set,
    "validation": val_set,
    "test": test_set
})

In [15]:
print("Reviews (shape):")
print(f"train: {len(reviews_dict['train']):,}".replace(",", "."))
print(f"val: {len(reviews_dict['validation']):,}".replace(",", "."))
print(f"test: {len(reviews_dict['test']):,}".replace(",", "."))

Reviews (shape):
train: 23.580.362
val: 2.947.545
test: 2.947.546


In [16]:
# reducir para dfs
train_size = int(len(reviews_dict["train"]) * REDUCE_PERCENT)
reviews_dict["train"] = reviews_dict["train"].shuffle(seed=SEED).select(range(train_size))
df_train = reviews_dict["train"].to_pandas()

val_size = int(len(reviews_dict["validation"]) * REDUCE_PERCENT)
reviews_dict["validation"] = reviews_dict["validation"].shuffle(seed=SEED).select(range(val_size))
df_val = reviews_dict["validation"].to_pandas()

test_size = int(len(reviews_dict["test"]) * REDUCE_PERCENT)
reviews_dict["test"] = reviews_dict["test"].shuffle(seed=SEED).select(range(test_size))
df_test = reviews_dict["test"].to_pandas()

In [17]:
print("Metadata (shape):")
print(f"train: {len(metadata_dict['train']):,}".replace(",", "."))
print(f"val: {len(metadata_dict['validation']):,}".replace(",", "."))
print(f"test: {len(metadata_dict['test']):,}".replace(",", "."))

Metadata (shape):
train: 3.558.544
val: 444.818
test: 444.819


In [18]:
# reducir para dfs
train_size = int(len(metadata_dict["train"]) * REDUCE_PERCENT)
metadata_dict["train"] = metadata_dict["train"].shuffle(seed=SEED).select(range(train_size))
df_meta_train = metadata_dict["train"].to_pandas()

df_meta_val = metadata_dict["validation"].to_pandas()
df_meta_test = metadata_dict["test"].to_pandas()

In [19]:
print(df_train.head())

   rating                        title        asin  \
0     5.0                   Five Stars  1250032105   
1     5.0     SO ANGRY! ! HAT MAILMEN!  1689538961   
2     5.0        Still One of the Best  0387485376   
3     5.0             Wow what a ride!  B019AHMQS2   
4     5.0  Good read on human diseases  1285065921   

                        user_id  helpful_vote  
0  AGOOYE6ZO4VHZHCGZ5D2HL7EAKMA             0  
1  AHG6LQQ2R7YK76NL3GEHVDKHT35Q             1  
2  AEQV26NRNXQSNI6A3IEUED3FVIBA             6  
3  AHFBFONBYG3KFZ24X5NEI5AOQ7YQ             0  
4  AH5KMEIN3GIIXEW3NYE4OF6JF6VQ             0  


## Corriendo el modelo

In [27]:
## Corriendo el modelo

from pipeline import Pipeline
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

def create_interaction_matrix(df, user_col='user_id', item_col='asin', rating_col='rating', max_users=100, max_items=100):
    """
    Create a user-item interaction matrix from the Amazon dataset, limited to max_users and max_items.
    """
    print(f"Original dataset size: {len(df)}")
    unique_users = df[user_col].unique()
    unique_items = df[item_col].unique()
    print(f"Unique users: {len(unique_users)}, Unique items: {len(unique_items)}")
    # Sample users
    if len(unique_users) > max_users:
        sampled_users = np.random.choice(unique_users, max_users, replace=False)
        df = df[df[user_col].isin(sampled_users)]
        print(f"Sampled {max_users} users")
    # Sample items
    unique_items = df[item_col].unique()
    if len(unique_items) > max_items:
        sampled_items = np.random.choice(unique_items, max_items, replace=False)
        df = df[df[item_col].isin(sampled_items)]
        print(f"Sampled {max_items} items")
    print(f"Final dataset size: {len(df)}")
    user_encoder = LabelEncoder()
    item_encoder = LabelEncoder()
    df['user_idx'] = user_encoder.fit_transform(df[user_col])
    df['item_idx'] = item_encoder.fit_transform(df[item_col])
    df['interaction'] = (df[rating_col] > 0).astype(int)
    matrix = csr_matrix((df['interaction'], (df['user_idx'], df['item_idx'])),
                       shape=(len(user_encoder.classes_), len(item_encoder.classes_)))
    return matrix.toarray(), user_encoder, item_encoder

print("Creating interaction matrix from Amazon data...")
interaction_matrix, user_encoder, item_encoder = create_interaction_matrix(df_train, max_users=100, max_items=100)
print(f"Interaction matrix shape: {interaction_matrix.shape}")
print(f"Number of users: {interaction_matrix.shape[0]}")
print(f"Number of items: {interaction_matrix.shape[1]}")
print(f"Number of interactions: {interaction_matrix.sum()}")
print(f"Sparsity: {1 - interaction_matrix.sum() / (interaction_matrix.shape[0] * interaction_matrix.shape[1]):.4f}")

# Pipeline configuration
config = {
    'reg_p': 20,
    'alpha': 0.2,
    'beta': 0.3,
    'drop_p': 0.5,
    'xi': 0.3
}
gamma = 0.5

# Run pipeline
pipeline = Pipeline(interaction_matrix, config, gamma)
recommendation_matrix = pipeline.run(normalization="RLAE")
print("Recommendation Matrix shape:", recommendation_matrix.shape)
print("Recommendation Matrix sample:\n", recommendation_matrix[:3, :5])

Creating interaction matrix from Amazon data...
Original dataset size: 2358036
Unique users: 1713939, Unique items: 1047426
Sampled 100 users
Sampled 100 items
Final dataset size: 100
Interaction matrix shape: (91, 100)
Number of users: 91
Number of items: 100
Number of interactions: 100
Sparsity: 0.9890
Recommendation Matrix shape: (91, 100)
Recommendation Matrix sample:
 [[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


## Metricas

In [29]:
from metrics import RecommendationMetrics
from sklearn.model_selection import train_test_split

# 1. Initialize the metrics calculator
metrics_calculator = RecommendationMetrics(k_values=[1, 5, 10, 20])



user_indices = np.arange(interaction_matrix.shape[0])
train_idx, test_idx = train_test_split(user_indices, test_size=0.2, random_state=42)

# For metrics, we need lists of item indices for each user
def get_user_item_lists(matrix, user_indices):
    return [list(np.where(matrix[u] > 0)[0]) for u in user_indices]

train_data = get_user_item_lists(interaction_matrix, train_idx)
test_data = get_user_item_lists(interaction_matrix, test_idx)

# Now calculate metrics using these
metrics = metrics_calculator.calculate_metrics(
    train_data=train_data,
    test_data=test_data,
    recommendation_matrix=recommendation_matrix
)
metrics_calculator.print_metrics(metrics)

RECOMMENDATION SYSTEM METRICS

Metrics @ K=1:
  Recall@1: 0.0000
  Precision@1: 0.0000
  NDCG@1: 0.0000

Metrics @ K=5:
  Recall@5: 0.0526
  Precision@5: 0.0105
  NDCG@5: 0.0204

Metrics @ K=10:
  Recall@10: 0.0526
  Precision@10: 0.0053
  NDCG@10: 0.0204

Metrics @ K=20:
  Recall@20: 0.2105
  Precision@20: 0.0105
  NDCG@20: 0.0598

Other Metrics:
  MRR: 0.0390
  Hit Rate@10: 0.0526
