In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [2]:
import os, sys
from data_utils.data_utils import set_seed
set_seed(seed=42)
from data_utils.preprocess import clean_and_filter

# compute the absolute path to your project root:
root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# insert it at the front of Python’s module search path:
sys.path.insert(0, root)

In [3]:
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import numpy as np

import torch
import json
from torch.nn import functional as F
import torch.optim as optim

# Dataset wrapper
from data_utils.datasets import SASRTrainDataset

# Model and evaluation
from models.sasr import SASR
from evaluation import evaluate_ranking_model, evaluate_featureaware_model

In [4]:
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print("Running on", device)

Running on mps


In [5]:
from data_utils.preprocess import load_movielens, get_user_sequences, split_sequences
import json

def load_json_from_file(file_path):
    """
    Loads JSON data from a file.

    Args:
        file_path (str): The path to the JSON file.

    Returns:
        dict: A dictionary representing the JSON data, or None if an error occurs.
    """
    try:
        with open(file_path, 'r') as file:
            data = json.load(file)
            return data
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{file_path}'")
        return None

file_path = '../data/'


ratings, users, movies = load_movielens(file_path)
ratings, users, movies = clean_and_filter(ratings, users, movies, rating_threshold=4)
user_seqs = get_user_sequences(ratings)

user_splits = split_sequences(user_seqs, train_ratio=0.8, val_ratio=0.1)

train_exs = load_json_from_file(f'{file_path}train_data.json')
val_exs = load_json_from_file(f'{file_path}val_data.json')
test_exs = load_json_from_file(f'{file_path}test_data.json')

all_movies = set(movies["MovieID"].unique())
all_users = set(users["UserID"].unique())
num_total_movies = len(all_movies)
num_total_users = len(all_users)

print(train_exs[0])


{'UserID': 0, 'prefix': [3118, 1251], 'positive': 1673, 'negatives': [2621, 457, 103, 3039, 1127], 'padded_prefix': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3118, 1251], 'mask': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]}


In [58]:
batch_size = 1024

train_ds = SASRTrainDataset(train_exs, num_negatives=1)
val_ds   = SASRTrainDataset(val_exs,   num_negatives=1)

sasr = SASR(num_total_users, num_total_movies, d_model=64, n_head=2, num_layers=2).to(device)

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,  num_workers=2)
val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, num_workers=2)
optimizer = optim.Adam(sasr.parameters(), lr=1e-4)

def bpr_loss(sasr, encoded_seq_next, pos, neg):
    
    pos_scores = torch.sum(encoded_seq_next * sasr.item_embeddings(pos), -1)
    neg_scores = torch.sum(encoded_seq_next * sasr.item_embeddings(neg), -1)
    
    return -(pos_scores - neg_scores).sigmoid().log().mean()

In [59]:
epochs = 5
train_losses, val_losses = [], []

for epoch in range(1, epochs+1):
    # ——— train ———
    sasr.train()
    tot_train = 0.0
    for user, pos, neg, prefix in train_loader:
        user, pos, neg, prefix = (user.to(device), pos.to(device), neg.to(device), prefix.to(device))
        optimizer.zero_grad()
        encoded_seq_next = sasr(user, prefix)[:, -1, :]
        loss = bpr_loss(sasr, encoded_seq_next, pos, neg)
        loss.backward()
        optimizer.step()
        tot_train += loss.item()
    avg_train = tot_train / len(train_loader)
    train_losses.append(avg_train)

    # ——— val ———
    sasr.eval()
    tot_val = 0.0
    with torch.no_grad():
        for user, pos, neg, prefix in val_loader:
            user, pos, neg, prefix = user.to(device), pos.to(device), neg.to(device), prefix.to(device)
            encoded_seq_next = sasr(user, prefix)[:, -1, :]
            tot_val += bpr_loss(sasr, encoded_seq_next, pos, neg).item()
    avg_val = tot_val / len(val_loader)
    val_losses.append(avg_val)

    print(f"Epoch {epoch}/{epochs} — Train: {avg_train:.4f}, Val: {avg_val:.4f}")

# Cell 6: plot curves
plt.plot(range(1, epochs+1), train_losses, label="Train Loss")
plt.plot(range(1, epochs+1), val_losses,   label="Val Loss")
plt.xlabel("Epoch")
plt.ylabel("BPR Loss")
plt.title("Training & Validation Loss Curve")
plt.legend()
plt.show()

Epoch 1/5 — Train: 3.3994, Val: 3.1176
Epoch 2/5 — Train: 2.6655, Val: 2.5147
Epoch 3/5 — Train: 2.1278, Val: 2.0345
Epoch 4/5 — Train: 1.6958, Val: 1.6551


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x124c8c400>
Traceback (most recent call last):
  File "/Users/akshayd/miniconda3/envs/rec/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 1604, in __del__
    self._shutdown_workers()
  File "/Users/akshayd/miniconda3/envs/rec/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 1568, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/Users/akshayd/miniconda3/envs/rec/lib/python3.11/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/akshayd/miniconda3/envs/rec/lib/python3.11/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/akshayd/miniconda3/envs/rec/lib/python3.11/multiprocessing/connection.py", line 948, in wait
    ready = selector.select(timeout)
            ^^^^^^^

.

In [56]:
metrics = evaluate_ranking_model(
    model=sasr,
    user_splits=user_splits,
    global_items=all_movies,
    device=device,
    candidate_size=100,
    k=10,
    model_type='sasr'
)

print("Evaluation Metrics:")
for name, val in metrics.items():
    print(f"- {name}: {val:.4f}")

Evaluation Metrics:
- Hit@10: 0.5359
- Hit@10 Std: 0.0023
- NDCG@10: 0.2934
- NDCG@10 Std: 0.0020
- MRR: 0.2424
- MRR Std: 0.0021
- MAP: 0.2424
- MAP Std: 0.0021


In [57]:
# toy_story_emb = sasr.item_embeddings.weight[1]               # shape: [32]
# item_embs = F.normalize(sasr.item_embeddings.weight, dim=1)  # shape: [3883, 32]
# toy_story_emb = toy_story_emb / toy_story_emb.norm()       # shape: [32]
# similarities = item_embs @ toy_story_emb  # shape: [3883]
# # Step 4: Get top-k most similar item indices (excluding itself if needed)
# topk = torch.topk(similarities, k=10 + 1)
# 
# # ## Adding 1 here because 0 is my padding index
# # movie_id_to_index = {movie_id: i + 1 for i, movie_id in enumerate(movies['MovieID'].unique())}
# # index_to_movie_id = {v: k for k, v in movie_id_to_index.items()}
# 
# indices = topk.indices.tolist()
# # # Look them up in the movies DataFrame
# mapped_movies = []
# for values in indices:
#     mapped_movies.append(index_to_movie_id[values])
# similar_movies_df = movies[movies['MovieID'].isin(mapped_movies)]
# # # Optional: Sort by similarity score
# scores = similarities[indices].tolist()
# similar_movies_df['similarity'] = scores
# similar_movies_df = similar_movies_df.sort_values(by='similarity', ascending=False)
# 
# print(similar_movies_df[['MovieID', 'Title', 'similarity']])

from torch.nn import functional as F
toy_story_emb = sasr.item_embeddings.weight[0]               # shape: [32]   
item_embs = F.normalize(sasr.item_embeddings.weight, dim=1)  # shape: [3883, 32]
toy_story_emb = toy_story_emb / toy_story_emb.norm()       # shape: [32]
similarities = item_embs @ toy_story_emb  # shape: [3883]
# Step 4: Get top-k most similar item indices (excluding itself if needed)
topk = torch.topk(similarities, k=10 + 1)
indices = topk.indices.tolist()
# Look them up in the movies DataFrame
similar_movies_df = movies[movies['MovieID'].isin(indices)]
# Optional: Sort by similarity score
scores = similarities[indices].tolist()
similar_movies_df['similarity'] = scores
similar_movies_df = similar_movies_df.sort_values(by='similarity', ascending=False)

print(similar_movies_df[['MovieID', 'Title', 'similarity']])

      MovieID                                      Title  similarity
0           0                           Toy Story (1995)    1.000000
257       257  Star Wars: Episode IV - A New Hope (1977)    0.275273
589       589           Silence of the Lambs, The (1991)    0.273623
847       847                      Godfather, The (1972)    0.251519
1015     1015                        Mary Poppins (1964)    0.248279
1180     1180             Raiders of the Lost Ark (1981)    0.247023
1195     1195                          GoodFellas (1990)    0.245134
1220     1220                     Terminator, The (1984)    0.244651
1726     1726                  As Good As It Gets (1997)    0.244471
2789     2789                     American Beauty (1999)    0.240619
2890     2890                          Fight Club (1999)    0.230823


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_movies_df['similarity'] = scores
