Module imports and setup.

In [1]:
# import os, time, json, copy, pickle, random, requests, re
import numpy as np
# from PIL import Image

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader

In [2]:
from tqdm import tqdm
import time

In [23]:
import copy
from sklearn.model_selection import train_test_split

Read the movie and ratings files.

In [3]:
def create_films_dict(file_path):
    """Read a ``::``-delimited movie file into a dict keyed by movie id.

    Args:
        file_path: path to a latin-1 encoded text file in which every
            non-blank line starts with an integer id followed by ``::``.

    Returns:
        dict mapping each integer id to the full, stripped line it came from.

    Raises:
        ValueError: if a non-blank line does not start with an integer id.
    """
    master_dict = {}
    with open(file_path, "r", encoding="latin-1") as file:
        for line in file:
            full_line = line.strip()
            if not full_line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            # Only the first field is needed, so split once.
            film_id = int(full_line.split("::", 1)[0])
            master_dict[film_id] = full_line
    return master_dict

def create_ratings_matrix(file_path):
    """Load a ratings file as a 1-D NumPy array of raw text lines.

    Args:
        file_path: path to a latin-1 encoded text file.

    Returns:
        np.ndarray with one element per line of the file, each stripped of
        surrounding whitespace (lines are not parsed here).
    """
    with open(file_path, "r", encoding = "latin-1") as handle:
        stripped_lines = [raw_line.strip() for raw_line in handle]
    return np.array(stripped_lines)

In [24]:
def parse_ratings(ratings_array):
    """Split raw ``user::item::rating[::...]`` lines into three parallel arrays.

    Args:
        ratings_array: iterable of ``::``-delimited strings; only the first
            three fields of each entry are read.

    Returns:
        tuple ``(user_ids, item_ids, ratings)`` of NumPy arrays, where the
        first two fields are parsed with ``int`` and the third with ``float``.
    """
    users, items, scores = [], [], []

    for record in ratings_array:
        fields = record.split("::")
        users.append(int(fields[0]))
        items.append(int(fields[1]))
        scores.append(float(fields[2]))

    users_arr = np.array(users)
    items_arr = np.array(items)
    scores_arr = np.array(scores)
    return users_arr, items_arr, scores_arr

def prepare_item_profiles(films_dict):
    """Build a short text profile ("<title> <genres>") for every film.

    Args:
        films_dict: dict mapping item id to a raw ``id::title::genres`` line.

    Returns:
        dict mapping item id to the title followed by the genres, with the
        ``|`` genre separators replaced by spaces. Entries with fewer than
        three ``::`` fields are left out.
    """
    profiles = {}
    for film_id, raw_line in films_dict.items():
        fields = raw_line.split("::")
        if len(fields) < 3:
            # Not enough fields to carry both a title and genres.
            continue
        name, genre_field = fields[1], fields[2]
        profiles[film_id] = f"{name} {genre_field.replace('|', ' ')}"
    return profiles

In [25]:
###DEBUGGING: EXCLUDED FROM SCRIPT
# Interactive check of the loaders; the same steps are repeated in the main
# training cell further down.
# Both paths are relative to the notebook's working directory.
movies_path = "movies.dat"
ratings_path = "ratings.dat"
# filmsdict: movie id -> raw "::"-delimited line; ratingsmat: array of raw rating lines.
filmsdict = create_films_dict(movies_path)
ratingsmat = create_ratings_matrix(ratings_path)
# Parse ratings
user_ids, item_ids, ratings = parse_ratings(ratingsmat)

# Prepare item profiles
item_profiles = prepare_item_profiles(filmsdict)

print(f"Loaded {len(user_ids)} ratings")
print(f"Loaded {len(item_profiles)} item profiles")

Loaded 1000209 ratings
Loaded 3883 item profiles


Train and test data split. (80:20 split according to paper.)

In [26]:
# Split data (80-20)
# Split positions rather than the arrays themselves so one pair of index sets
# selects matching rows from user_ids, item_ids and ratings.
indices = np.arange(len(ratings))
# random_state is fixed so re-running the cell reproduces the same split.
train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)

train_users = user_ids[train_idx]
train_items = item_ids[train_idx]
train_ratings = ratings[train_idx]

test_users = user_ids[test_idx]
test_items = item_ids[test_idx]
test_ratings = ratings[test_idx]

print(f"Training samples: {len(train_ratings)}")
print(f"Test samples: {len(test_ratings)}")

Training samples: 800167
Test samples: 200042


Set device as GPU if possible.

In [28]:
# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression so the notebook displays the selected device.
device

device(type='cuda')

Reference: https://huggingface.co/google-bert/bert-base-uncased

In [5]:
###DEBUGGING: EXCLUDED FROM SCRIPT
# Load BERT model
# output_hidden_states=True is required because get_vector_batch reads
# `output.hidden_states` to combine the last four layers.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased",
                                  output_hidden_states=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Generate vector with BERT.

In [27]:
def get_vector_batch(texts, bert_model, bert_tokenizer, batch_size=16, device='cpu'):
  """Generate one BERT embedding per input text.

  Each text is embedded by summing the last four hidden layers and averaging
  over its real tokens. Padding positions are excluded from the average, so a
  text's embedding does not depend on which other texts share its batch.

  Args:
      texts: sequence of strings to embed.
      bert_model: model called with the tokenizer output; it must have been
          loaded with ``output_hidden_states=True``.
      bert_tokenizer: tokenizer matching ``bert_model``; its output must
          contain an ``attention_mask`` entry.
      batch_size: number of texts tokenized and encoded per forward pass.
      device: device the model and the encoded inputs are moved to.

  Returns:
      Tensor with one row per text, located on ``device``.
  """
  bert_model.eval()
  bert_model.to(device)
  all_embeddings = []

  for i in tqdm(range(0, len(texts), batch_size), desc="Generating BERT embeddings"):
    # list(...) so NumPy arrays or other sequences are accepted by the tokenizer.
    batch = list(texts[i:i+batch_size])
    encoded_input = bert_tokenizer(batch, return_tensors='pt',
                                   padding=True, truncation=True,
                                   max_length=512)
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

    with torch.no_grad():
      output = bert_model(**encoded_input)
      hidden_states = output.hidden_states

      # Sum last 4 layers
      last4 = torch.stack(hidden_states[-4:]).sum(0)
      # Masked mean pooling: zero out padded positions and divide by the
      # number of real tokens instead of the padded sequence length.
      mask = encoded_input["attention_mask"].unsqueeze(-1).to(last4.dtype)
      token_counts = mask.sum(dim=1).clamp_min(1.0)
      embedding = (last4 * mask).sum(dim=1) / token_counts
      all_embeddings.append(embedding)

  return torch.cat(all_embeddings, dim=0)

In [None]:
###DEBUGGING: EXCLUDED FROM SCRIPT
# Smoke test: two texts of different length should produce a (2, 768) tensor.
# get_vector_batch requires the model and tokenizer loaded in the BERT cell above.
vec = get_vector_batch(
    ["hello world bro i dont know what else to Say what's wrong", "wha tis going on"],
    bert_model,
    bert_tokenizer,
)
# print(vec)
print(vec.shape)

100%|██████████| 1/1 [00:00<00:00,  3.29it/s]

torch.Size([2, 768])





Implement Deep AutoEncoder.

In [7]:
class DeepAutoEncoder(nn.Module):
  """Symmetric fully connected autoencoder with sigmoid activations.

  The encoder maps input_dim -> 512 -> 256 -> 128 -> hidden_dim and the
  decoder mirrors it back to input_dim. Neither the bottleneck layer nor the
  final reconstruction layer is followed by an activation.
  """
  # could be modified to just take in a dims list
  def __init__(self, input_dim=768, hidden_dim=64):
    super().__init__()
    # Layers are listed in creation order; the Sequential indices (and thus
    # the state-dict keys) follow this order.
    encoder_layers = [
        nn.Linear(input_dim, 512), nn.Sigmoid(),
        nn.Linear(512, 256), nn.Sigmoid(),
        nn.Linear(256, 128), nn.Sigmoid(),
        nn.Linear(128, hidden_dim),
    ]
    self.encoder = nn.Sequential(*encoder_layers)

    decoder_layers = [
        nn.Linear(hidden_dim, 128), nn.Sigmoid(),
        nn.Linear(128, 256), nn.Sigmoid(),
        nn.Linear(256, 512), nn.Sigmoid(),
        nn.Linear(512, input_dim),
    ]
    self.decoder = nn.Sequential(*decoder_layers)

  def forward(self, x):
    """Return ``(reduced, reconstructed)`` for the input batch ``x``."""
    code = self.encoder(x)
    rebuilt = self.decoder(code)
    return code, rebuilt

Dataloader.

In [8]:
###DEBUGGING: EXCLUDED FROM SCRIPT
def get_dataloader(trainset, valset = None, batch_size = 64, num_workers = 2):
    """Wrap the training (and optionally validation) dataset in DataLoaders.

    Args:
        trainset: dataset used for training; batches are shuffled.
        valset: optional validation dataset; batches keep their order.
        batch_size: batch size used for both loaders.
        num_workers: worker process count used for both loaders.

    Returns:
        tuple ``(train_loader, val_loader)``; ``val_loader`` is ``None`` when
        no validation set was given.
    """
    train_loader = DataLoader(trainset, shuffle=True, num_workers=num_workers, batch_size=batch_size)
    # Compare against None explicitly: truth-testing a dataset goes through
    # its __len__/__bool__, so an empty dataset would be treated as "absent".
    if valset is not None:
      val_loader = DataLoader(valset, shuffle=False, num_workers=num_workers, batch_size=batch_size)
      return (train_loader, val_loader)
    return (train_loader, None)

In [9]:
class ItemProfileDataset(Dataset):
    """Dataset for autoencoder training: every sample is its own target."""

    def __init__(self,input):
        """
        input: (num_items, 768) tensor
        """
        self.vecs = input

    def __len__(self):
        # Number of rows along the first dimension of the stored tensor.
        row_count = self.vecs.size(0)
        return row_count

    def __getitem__(self, idx):
        # The same row is returned as both the input and the target.
        sample = self.vecs[idx]
        return sample, sample

Training DAE.

In [29]:
def train_DAE_epoch(model, train_loader, optimizer, criterion, device):
  """Train autoencoder for one epoch.

  Args:
      model: autoencoder whose call returns ``(reduced, reconstructed)``.
      train_loader: iterable of ``(input, target)`` batches.
      optimizer: optimizer stepping the model's parameters.
      criterion: loss applied to ``(reconstructed, target)``.
      device: device each batch is moved to before the forward pass.

  Returns:
      Mean of the per-batch loss values over the epoch.
  """
  model.train()
  total_loss = 0.0

  for x, y in tqdm(train_loader, desc="Training DAE"):
    x = x.to(device)
    y = y.to(device)

    optimizer.zero_grad()
    # Call the module itself rather than .forward() so registered hooks run.
    _, out = model(x)
    loss = criterion(out, y)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

  return total_loss / len(train_loader)

def train_DAE(model, trainset, optimizer=None, device='cpu', n_epochs=10, batch_size=64, lr=0.001):
    """Train deep autoencoder and restore the weights of its best epoch.

    Args:
        model: autoencoder to train; moved to ``device``.
        trainset: dataset yielding ``(input, target)`` pairs.
        optimizer: optional optimizer already bound to ``model``'s parameters.
            When omitted (the way the pipeline functions call this), an Adam
            optimizer with ``lr`` and ``weight_decay=5e-4`` is created.
        device: device used for training.
        n_epochs: number of passes over ``trainset``.
        batch_size: batch size of the shuffled training loader.
        lr: learning rate of the default optimizer; ignored when an
            ``optimizer`` is supplied.

    Returns:
        The same ``model``, loaded with the state from the epoch that had the
        lowest average training loss.
    """
    model.to(device)
    train_loader = DataLoader(trainset, shuffle=True, batch_size=batch_size)
    if optimizer is None:
        optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
    criterion = nn.MSELoss()

    best_loss = float('inf')
    best_state = None

    for epoch in range(n_epochs):
        avg_loss = train_DAE_epoch(model, train_loader,
                                   optimizer, criterion, device)
        print(f"Epoch {epoch+1}/{n_epochs} - Loss: {avg_loss:.6f}")

        if avg_loss < best_loss:
            best_loss = avg_loss
            # Deep copy: state_dict() returns references that later optimizer
            # steps would otherwise overwrite.
            best_state = copy.deepcopy(model.state_dict())

    if best_state is not None:
        model.load_state_dict(best_state)

    return model


In [37]:
def save_dae_model(dae_model, filepath='dae_model.pt'):
    """Write the DAE's state dict to ``filepath`` and print a confirmation."""
    weights = dae_model.state_dict()
    torch.save(weights, filepath)
    print(f"DAE model saved to {filepath}")

def train_and_save_dae(item_profiles_dict, item_ids,
                       hidden_dim=20, batch_size=64,
                       n_epochs=10, device='cpu',
                       save_path='dae_model.pt'):
    """
    Train DAE on item profiles and save it

    Args:
        item_profiles_dict: dict mapping item_id to profile text
        item_ids: array of item IDs (duplicates allowed; reduced with np.unique)
        hidden_dim: output dimension of encoder
        batch_size: batch size for both BERT encoding and DAE training
        n_epochs: number of training epochs
        device: 'cpu' or 'cuda'
        save_path: path to save the trained model

    Returns:
        trained DAE model
    """
    print("=== Training DAE Model ===")

    # Load BERT
    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    bert_model = BertModel.from_pretrained("bert-base-uncased",
                                          output_hidden_states=True)

    # Prepare item texts, one per unique id in sorted order; items without a
    # profile get an empty string.
    unique_items = np.unique(item_ids)
    item_texts = [item_profiles_dict.get(item_id, "") for item_id in unique_items]

    # Generate BERT embeddings
    print("Generating BERT embeddings...")
    bert_vecs = get_vector_batch(item_texts, bert_model, bert_tokenizer,
                                 batch_size=batch_size, device=device)

    # Train DAE
    print("Training Deep Autoencoder...")
    dae_model = DeepAutoEncoder(input_dim=768, hidden_dim=hidden_dim)
    dae_dataset = ItemProfileDataset(bert_vecs)
    # optimizer is passed explicitly: train_DAE declares it as a parameter
    # and builds its own Adam optimizer internally.
    dae_model = train_DAE(dae_model, dae_dataset,
                          optimizer=None,
                          n_epochs=n_epochs,
                          batch_size=batch_size,
                          device=device)

    # Save model
    save_dae_model(dae_model, save_path)

    return dae_model

In [38]:
###DEBUGGING: EXCLUDED FROM SCRIPT
def train_and_save_model(model, trainset, optimizer, device, batch_size=64, n_epochs=10, filepath=None):
    """Train ``model`` with train_DAE and write its state dict to disk.

    Args:
        model: autoencoder to train.
        trainset: dataset handed to train_DAE.
        optimizer: forwarded to train_DAE.
        device: device used for training.
        batch_size: training batch size.
        n_epochs: number of training epochs.
        filepath: destination file; defaults to
            ``models/trained_<ClassName>.pt``.

    Returns:
        The trained model.
    """
    import os

    model.to(device)
    if filepath is None:
        filepath = f"models/trained_{model.__class__.__name__}.pt"
    model = train_DAE(model, trainset, optimizer, device, n_epochs, batch_size)
    # torch.save does not create missing directories, so make sure the parent
    # (e.g. the default "models/") exists first.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    torch.save(model.state_dict(), filepath)
    return model

Using DAE to reduce dimensions.

In [17]:
###DEBUGGING: EXCLUDED FROM SCRIPT
def get_reduced(model, input):
  """Run ``input`` through ``model`` in eval mode and return the reduced code.

  The data is converted to a float tensor and moved to the notebook-level
  ``device``; gradients are not tracked.
  """
  batch = torch.FloatTensor(input).to(device)
  model.eval()
  with torch.no_grad():
    encoded, _reconstruction = model.forward(batch)
  return encoded

Implement factorization.

In [18]:
def cosine_similarity(X):
  """Pairwise cosine similarity between the rows of ``X``.

  Args:
      X: 2-D tensor with one vector per row. It may live on any device and
          may require grad; the computation is done on a detached CPU copy.

  Returns:
      CPU tensor of shape (rows, rows). Rows with zero norm get similarity 0
      to every row instead of NaN.
  """
  X = X.detach().cpu().numpy()
  numer = X @ X.T

  norms = np.sqrt(np.square(X).sum(axis=1))
  denom = np.outer(norms, norms)
  # A zero norm means the row is all zeros, so the matching numerators are 0
  # as well; dividing by 1 there yields 0 rather than 0/0 = NaN.
  denom[denom == 0] = 1.0

  result = numer / denom
  return torch.from_numpy(result)

In [31]:
class MF(nn.Module):
    """Matrix factorization with an optional item-neighbor regularizer.

    Ratings are predicted as the dot product of a user embedding (``P``) and
    an item embedding (``Q``). When item profiles ``theta`` are supplied, the
    loss also pulls every item embedding towards those of its ``k`` most
    similar items, weighted by similarity and by a per-item ``gamma``.
    """

    def __init__(self, num_users, num_items, num_factors=20, theta: torch.Tensor = None,  # [num_items, d]
                  k = 3,
                  beta= 0.01):
        """
        Args:
            num_users: number of rows in the user embedding table
            num_items: number of rows in the item embedding table
            num_factors: embedding dimension
            theta: the item profiles acquired from BERT+DAE, or None to
                disable the neighbor regularizer
            k: number of most-similar neighbors kept per item
            beta: weight of the squared-norm penalty on the embeddings
        """
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.f = num_factors
        self.beta = beta
        self.P = nn.Embedding(num_users, num_factors)
        self.Q = nn.Embedding(num_items, num_factors)

        #initialize randomly
        nn.init.normal_(self.P.weight, std=0.01)
        nn.init.normal_(self.Q.weight, std=0.01)

        if theta is not None:
            self.get_k_neighbors(theta, k)
        else:
            self.i_plus = None
            self.i_sims = None

        # Neutral default; set_gamma replaces it with count-based weights.
        self.register_buffer("gamma", torch.ones(num_items))

    # get the k most similar items to each item
    def get_k_neighbors(self, theta, k):
        """Register neighbor indices (``i_plus``) and similarities (``i_sims``)."""
        sim = cosine_similarity(theta)
        # Zero the diagonal so an item's self-similarity does not compete in topk.
        sim.fill_diagonal_(0)
        val, idx = torch.topk(sim, k=k, dim=1)
        self.register_buffer("i_plus", idx)
        self.register_buffer("i_sims", val)

    def set_gamma(self, item_count, z=5.0):
        """Set the per-item regularization weight to ``z / max(count, 1)``.

        Args:
            item_count: tensor with one rating count per item.
            z: numerator of the weight.
        """
        count = item_count.float().clamp_min(1.0)
        # Keep the buffer on the model's device: the counts are usually built
        # on the CPU after the model has already been moved, and ``loss``
        # indexes gamma with on-device item ids.
        self.gamma = (z / count).to(self.gamma.device)

    def forward(self, user_id, item_id):
        """Predict ratings as the dot product of user and item embeddings."""
        p = self.P(user_id)
        q = self.Q(item_id)
        return (p * q).sum(1)

    # objective function
    def loss(self, user_id, item_id, rating):
        """Squared error plus embedding and (optional) neighbor regularization.

        Args:
            user_id: tensor of user indices for the batch
            item_id: tensor of item indices for the batch
            rating: tensor of target ratings for the batch

        Returns:
            Scalar tensor: summed (not averaged) loss over the batch.
        """
        # Look the embeddings up once and reuse them for both the prediction
        # and the norm penalty.
        p = self.P(user_id)
        q = self.Q(item_id)
        pred = (p * q).sum(1)
        error = 0.5 * (rating - pred).square().sum()

        reg = self.beta * (p.square().sum() + q.square().sum())

        # this is the supplementary information regularization
        if (self.i_plus is not None) and (self.i_sims is not None):
            # Each item in the batch contributes once, however often it occurs.
            unique = torch.unique(item_id)
            q_self = self.Q(unique)
            q_neighbor = self.Q(self.i_plus[unique])
            diff = (q_self.unsqueeze(1) - q_neighbor).square().sum(dim=2)
            sim = self.i_sims[unique]
            gamma = self.gamma[unique].unsqueeze(1)
            reg = reg + (diff * sim * gamma).sum()
        return error + reg




In [33]:
def train_MF_epoch(model, train_loader, optimizer, device=None):
  """Run one optimization pass over ``train_loader``.

  Args:
      model: model exposing ``loss(user_idx, item_idx, rating)``.
      train_loader: iterable of ``(user_idx, item_idx, rating)`` batches.
      optimizer: optimizer stepping the model's parameters.
      device: device each batch is moved to. When omitted, the device of the
          model's first parameter is used, so the function no longer depends
          on a notebook-level ``device`` variable.

  Returns:
      Mean of the per-batch loss values over the epoch.
  """
  if device is None:
    device = next(model.parameters()).device

  model.train()
  total_loss = 0.0
  for u, i, r in tqdm(train_loader):
    u = u.to(device)
    i = i.to(device)
    r = r.to(device)

    optimizer.zero_grad()
    loss = model.loss(u, i, r)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

  return total_loss / len(train_loader)

def train_MF(model, n_epochs, train_loader, optimizer, verbose=False):
  """Train the MF model for ``n_epochs`` epochs, printing the loss each epoch.

  Args:
      model: model passed to train_MF_epoch.
      n_epochs: number of epochs to run.
      train_loader: loader of ``(user_idx, item_idx, rating)`` batches.
      optimizer: optimizer stepping the model's parameters.
      verbose: accepted for call compatibility; the per-epoch loss is
          printed regardless of its value.

  Returns:
      The trained model (same object that was passed in).
  """
  for epoch in range(1, n_epochs + 1):
    # train_MF_epoch takes (model, train_loader, optimizer); this function
    # has no device of its own to forward.
    avg_loss = train_MF_epoch(model, train_loader, optimizer)
    print(f"Epoch {epoch:02d} | loss = {avg_loss:.4f}")
  return model


Construct the ADLRS pipeline.

In [32]:
class RatingsDataset(Dataset):
  """Dataset of (user index, item index, rating) triples stored as tensors."""

  def __init__(self, user_ids, item_ids, ratings):
    # Indices are kept as int64 (suitable for embedding lookups), ratings as float32.
    self.user_ids, self.item_ids = (
        torch.as_tensor(ids, dtype=torch.long) for ids in (user_ids, item_ids)
    )
    self.ratings = torch.as_tensor(ratings,  dtype=torch.float32)

  def __len__(self):
    return len(self.ratings)

  def __getitem__(self, idx):
    triple = (self.user_ids[idx], self.item_ids[idx], self.ratings[idx])
    return triple

In [39]:
def run_adlrs_pipeline(
    user_ids,
    item_ids,
    ratings,
    item_profiles_dict,
    dae_file=None,
    num_factors=20,
    k_neighbors=3,
    z_gamma=5.0,
    batch_size=64,
    dae_epochs=10,
    mf_epochs=20,
    lr=1e-2,
    device='cpu'
):
    """
    Complete ADLRS pipeline: BERT item embeddings -> DAE reduction -> MF training

    Args:
        user_ids: array of user IDs
        item_ids: array of item IDs
        ratings: array of ratings
        item_profiles_dict: dict mapping item_id to profile text
        dae_file: path to a saved DAE state dict; when None a new DAE is trained
        num_factors: MF embedding size, also used as the DAE bottleneck size
        k_neighbors: number of similar items used by the MF regularizer
        z_gamma: numerator of the per-item regularization weight
        batch_size: batch size for BERT encoding, DAE training and MF training
        dae_epochs: DAE training epochs (only used when dae_file is None)
        mf_epochs: MF training epochs
        lr: learning rate of the MF Adam optimizer
        device: 'cpu' or 'cuda'

    Returns:
        dict with keys 'model' (trained MF), 'theta' (reduced item features,
        on CPU), 'user2idx', 'item2idx' and 'dae_model'
    """

    # Map IDs to continuous indices
    unique_users = np.unique(user_ids)
    unique_items = np.unique(item_ids)

    user2idx = {u: i for i, u in enumerate(unique_users)}
    item2idx = {i: j for j, i in enumerate(unique_items)}

    u_idx = np.array([user2idx[u] for u in user_ids])
    i_idx = np.array([item2idx[i] for i in item_ids])

    num_users = len(unique_users)
    num_items = len(unique_items)

    print(f"Users: {num_users}, Items: {num_items}, Ratings: {len(ratings)}")

    # Step 1: Generate BERT embeddings
    print("\n=== Step 1: Generating BERT Embeddings ===")
    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    bert_model = BertModel.from_pretrained("bert-base-uncased",
                                          output_hidden_states=True)

    # Prepare item profiles in the same order as the item indices;
    # items without a profile get an empty string.
    item_texts = [item_profiles_dict.get(item_id, "") for item_id in unique_items]

    bert_vecs = get_vector_batch(item_texts, bert_model, bert_tokenizer,
                                 batch_size=batch_size, device=device)

    # Step 2: Load or Train Deep Autoencoder
    if dae_file is not None:
        print(f"\n=== Step 2: Loading Pre-trained DAE from {dae_file} ===")
        dae_model = DeepAutoEncoder(input_dim=768, hidden_dim=num_factors)
        dae_model.load_state_dict(torch.load(dae_file, map_location=device))
        dae_model.to(device)
        print("DAE model loaded successfully")
    else:
        print("\n=== Step 2: Training Deep Autoencoder ===")
        dae_model = DeepAutoEncoder(input_dim=768, hidden_dim=num_factors)
        dae_dataset = ItemProfileDataset(bert_vecs)
        # optimizer is passed explicitly: train_DAE declares it as a
        # parameter and builds its own Adam optimizer internally.
        dae_model = train_DAE(dae_model, dae_dataset,
                              optimizer=None,
                              n_epochs=dae_epochs,
                              batch_size=batch_size,
                              device=device)

    # Step 3: Get reduced item representations
    print("\n=== Step 3: Extracting Item Features ===")
    dae_model.eval()
    with torch.no_grad():
        bert_vecs = bert_vecs.to(device)
        reduced, _ = dae_model(bert_vecs)
        # MF's similarity computation converts theta to NumPy, so keep it on the CPU.
        reduced = reduced.cpu()

    # Step 4: Build and train MF model
    print("\n=== Step 4: Training Matrix Factorization ===")
    model = MF(
        num_users=num_users,
        num_items=num_items,
        num_factors=num_factors,
        theta=reduced,
        k=k_neighbors,
        beta=0.01,
    ).to(device)

    # Set gamma based on item rating counts. The counts are moved to the
    # model's device first so the resulting gamma can be indexed with
    # on-device item ids inside MF.loss.
    item_counts = torch.bincount(
        torch.as_tensor(i_idx, dtype=torch.long),
        minlength=num_items,
    )
    model.set_gamma(item_counts.to(device), z=z_gamma)

    # Prepare training data
    train_dataset = RatingsDataset(u_idx, i_idx, ratings)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    # train_MF takes (model, n_epochs, train_loader, optimizer, verbose);
    # it has no device parameter.
    trained_model = train_MF(model, mf_epochs, train_loader,
                             optimizer, verbose=True)

    return {
        'model': trained_model,
        'theta': reduced,
        'user2idx': user2idx,
        'item2idx': item2idx,
        'dae_model': dae_model
    }

def get_rating_pred(model, user_id, item_id, user2idx, item2idx, device):
    """Predict ratings for aligned sequences of original user and item IDs.

    Args:
        model: trained model called as ``model(user_indices, item_indices)``
        user_id: iterable of original user IDs (all must be in ``user2idx``)
        item_id: iterable of original item IDs (all must be in ``item2idx``)
        user2idx: user ID to index mapping
        item2idx: item ID to index mapping
        device: device the index tensors are created on

    Returns:
        1-D NumPy array of predictions on the CPU. Callers in this notebook
        feed the result to NumPy routines (``np.argsort``, reversed slicing,
        arithmetic with NumPy rating arrays), which a device tensor does not
        support.

    Raises:
        KeyError: if an ID is missing from its mapping.
    """
    model.eval()
    u_idx = torch.tensor([user2idx[u] for u in user_id], dtype=torch.long).to(device)
    i_idx = torch.tensor([item2idx[i] for i in item_id], dtype=torch.long).to(device)

    with torch.no_grad():
      pred = model(u_idx, i_idx)
    return pred.detach().cpu().numpy()

In [36]:
def save_adlrs_model(result, filepath='adlrs_model.pt', config=None):
    """
    Save complete ADLRS model including MF model and metadata

    Args:
        result: dict returned from run_adlrs_pipeline
        filepath: path to save the model
        config: optional dict with configuration parameters
    """
    def _native(value):
        # NumPy scalars expose .item(); converting them keeps the checkpoint
        # free of pickled NumPy objects. Other values pass through unchanged.
        return value.item() if hasattr(value, "item") else value

    def _native_mapping(mapping):
        return {_native(key): _native(index) for key, index in mapping.items()}

    save_dict = {
        'mf_model_state_dict': result['model'].state_dict(),
        'user2idx': _native_mapping(result['user2idx']),
        'item2idx': _native_mapping(result['item2idx']),
        'theta': result['theta'],
        'num_users': result['model'].num_users,
        'num_items': result['model'].num_items,
        'num_factors': result['model'].f,
    }

    if config is not None:
        save_dict['config'] = config

    torch.save(save_dict, filepath)
    print(f"ADLRS model saved to {filepath}")

def load_adlrs_model(filepath, device='cpu'):
    """
    Load saved ADLRS model

    Args:
        filepath: path to saved model
        device: 'cpu' or 'cuda'

    Returns:
        dict with the MF model (on ``device``, in eval mode), the ID
        mappings, ``theta`` (on the CPU) and the saved config
    """
    # Load onto the CPU first: constructing MF with theta converts it to
    # NumPy to find item neighbors, which is not possible for a CUDA tensor.
    # NOTE(review): torch.load unpickles the file; only load checkpoints
    # that this notebook produced.
    checkpoint = torch.load(filepath, map_location='cpu')
    config = checkpoint.get('config', {})

    # Reconstruct MF model
    model = MF(
        num_users=checkpoint['num_users'],
        num_items=checkpoint['num_items'],
        num_factors=checkpoint['num_factors'],
        theta=checkpoint['theta'],
        k=config.get('k_neighbors', 3),
        beta=config.get('beta', 0.01)
    )

    model.load_state_dict(checkpoint['mf_model_state_dict'])
    # Only now move parameters and buffers to the requested device.
    model.to(device)
    model.eval()

    print(f"ADLRS model loaded from {filepath}")

    return {
        'model': model,
        'user2idx': checkpoint['user2idx'],
        'item2idx': checkpoint['item2idx'],
        'theta': checkpoint['theta'],
        'config': config
    }

Implement evaluation metrics.

In [34]:
def _as_float_array(values):
  """Convert a list, NumPy array or torch tensor to a float NumPy array."""
  if hasattr(values, "detach"):
    # Tensor-like: drop autograd tracking and copy to host memory first.
    values = values.detach().cpu().numpy()
  return np.asarray(values, dtype=float)

def MAE(y_true, y_pred):
  """Mean absolute error between targets and predictions.

  Both arguments may be lists, NumPy arrays or torch tensors of equal length.
  """
  return np.mean(np.abs(_as_float_array(y_pred) - _as_float_array(y_true)))

def RMSE(y_true, y_pred):
  """Root mean squared error between targets and predictions.

  Both arguments may be lists, NumPy arrays or torch tensors of equal length.
  """
  return np.sqrt(np.mean((_as_float_array(y_pred) - _as_float_array(y_true)) ** 2))

def HR(scores, user_items, N):
  """
  Compute Hit Ratio @ N for the whole dataset.

  A user counts as a hit when at least one of their relevant items is among
  their N highest-scored items.

  Args:
      scores (dict): user_id -> {item_id: predicted_score}
      user_items (dict): user_id -> set of ground-truth relevant items
      N (int): cutoff N in top-N

  Returns:
      float: Hit Ratio @ N over the users that have at least one relevant
      item; 0.0 when there are no such users
  """
  hits = 0
  users = 0

  for user, relevant_items in user_items.items():

    # Skip users with no ground truth items
    if len(relevant_items) == 0:
      continue

    users += 1
    # A user with ground truth but no scored items is a miss, not an error.
    user_scores = scores.get(user, {})
    # sort scores for this user, descending
    ranked_items = sorted(user_scores.items(),
                          key=lambda x: x[1],
                          reverse=True)

    # A set gives constant-time membership checks for the hit test below.
    topN_items = {item for item, _ in ranked_items[:N]}

    # check if *any* relevant item appears in top-N
    if not topN_items.isdisjoint(relevant_items):
      hits += 1

  # avoid division by zero
  if users == 0:
    return 0.0
  return hits / users


Training ADLRS model.

In [40]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ============================================
# Load and Prepare Data
# ============================================
# NOTE(review): this section and the split below repeat the earlier
# debugging cells line for line, so this cell rebinds the same globals.

# Load movie data
movies_path = "movies.dat"
ratings_path = "ratings.dat"

filmsdict = create_films_dict(movies_path)
ratingsmat = create_ratings_matrix(ratings_path)

# Parse ratings
user_ids, item_ids, ratings = parse_ratings(ratingsmat)

# Prepare item profiles
item_profiles = prepare_item_profiles(filmsdict)

print(f"Loaded {len(user_ids)} ratings")
print(f"Loaded {len(item_profiles)} item profiles")

# ============================================
# Train-Test Split
# ============================================

# Split data (80-20)
# Positions are split (not the arrays) so one pair of index sets selects
# matching rows from all three arrays; random_state fixes the split.
indices = np.arange(len(ratings))
train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)

train_users = user_ids[train_idx]
train_items = item_ids[train_idx]
train_ratings = ratings[train_idx]

test_users = user_ids[test_idx]
test_items = item_ids[test_idx]
test_ratings = ratings[test_idx]

print(f"Training samples: {len(train_ratings)}")
print(f"Test samples: {len(test_ratings)}")

# ============================================
# Option 1: Train DAE separately and save it
# ============================================

USE_PRETRAINED_DAE = False  # Set to True to use pre-trained DAE
DAE_PATH = "trained_dae.pt"

# NOTE(review): with USE_PRETRAINED_DAE = False the DAE trained here is saved
# to DAE_PATH but not handed to the pipeline below (dae_file is None in that
# case), so the pipeline trains a second DAE in the same run. This block uses
# all item_ids, whereas the pipeline only sees train_items.
if not USE_PRETRAINED_DAE:
    print("\n=== Training and Saving DAE Model ===")
    dae_model = train_and_save_dae(
        item_profiles_dict=item_profiles,
        item_ids=item_ids,
        hidden_dim=20,
        batch_size=64,
        n_epochs=10,
        device=device,
        save_path=DAE_PATH
    )
    # save_dae_model (called inside train_and_save_dae) already prints the
    # same message, so it appears twice in the output.
    print(f"DAE model saved to {DAE_PATH}")

# ============================================
# Option 2: Run ADLRS Pipeline (with or without pre-trained DAE)
# ============================================

print("\n=== Running ADLRS Pipeline ===")

# If USE_PRETRAINED_DAE is True, it will load the DAE from DAE_PATH
# If False, it will train a new DAE
# Only the training split is passed in, so user2idx/item2idx cover just the
# users and items that occur in the training data.
result = run_adlrs_pipeline(
    user_ids=train_users,
    item_ids=train_items,
    ratings=train_ratings,
    item_profiles_dict=item_profiles,
    dae_file=DAE_PATH if USE_PRETRAINED_DAE else None,  # Load pre-trained or train new
    num_factors=20,
    k_neighbors=3,
    z_gamma=5.0,
    batch_size=64,
    dae_epochs=10,  # Only used if dae_file is None
    mf_epochs=30,
    lr=0.01,
    device=device
)

model = result['model']
user2idx = result['user2idx']
item2idx = result['item2idx']

# ============================================
# Evaluate on Test Set
# ============================================

print("\n=== Evaluating on Test Set ===")

# Filter test data to only include known users/items
# (get_rating_pred looks IDs up in user2idx/item2idx and would raise KeyError
# for IDs that never occurred in the training split).
test_mask = np.array([
    (u in user2idx and i in item2idx)
    for u, i in zip(test_users, test_items)
])

filtered_test_users = test_users[test_mask]
filtered_test_items = test_items[test_mask]
filtered_test_ratings = test_ratings[test_mask]

print(f"Test samples after filtering: {len(filtered_test_ratings)}")

# Make predictions
predictions = get_rating_pred(
    model,
    filtered_test_users,
    filtered_test_items,
    user2idx,
    item2idx,
    device
)

# Calculate metrics
# NOTE(review): filtered_test_ratings is a NumPy array; MAE/RMSE combine it
# arithmetically with whatever type get_rating_pred returns — confirm the
# two are compatible on the active device.
mae = MAE(filtered_test_ratings, predictions)
rmse = RMSE(filtered_test_ratings, predictions)

print(f"\nTest MAE: {mae:.4f}")
print(f"Test RMSE: {rmse:.4f}")

# ============================================
# Evaluate Cold Items (items with < 5 ratings)
# ============================================

print("\n=== Evaluating Cold Items ===")

# Count ratings per item in training set
# NOTE(review): import placed mid-cell; the other imports live in the first cells.
from collections import Counter
train_item_counts = Counter(train_items)

# Find cold items in test set
# Counter returns 0 for unseen keys, but every filtered test item occurs in
# training (it is in item2idx), so counts here are at least 1.
cold_item_mask = np.array([
    train_item_counts[i] < 5
    for i in filtered_test_items
])

if cold_item_mask.sum() > 0:
    cold_predictions = predictions[cold_item_mask]
    cold_ratings = filtered_test_ratings[cold_item_mask]

    cold_mae = MAE(cold_ratings, cold_predictions)
    cold_rmse = RMSE(cold_ratings, cold_predictions)

    print(f"Cold items count: {cold_item_mask.sum()}")
    print(f"Cold items MAE: {cold_mae:.4f}")
    print(f"Cold items RMSE: {cold_rmse:.4f}")
else:
    print("No cold items in test set")

# ============================================
# Top-N Recommendation Example
# ============================================

print("\n=== Top-N Recommendation Example ===")

# Get a sample user
sample_user = filtered_test_users[0]
print(f"Sample user ID: {sample_user}")

# Get all items
all_items = np.array(list(item2idx.keys()))

# Predict ratings for all items
sample_user_array = np.full(len(all_items), sample_user)
all_predictions = get_rating_pred(
    model,
    sample_user_array,
    all_items,
    user2idx,
    item2idx,
    device
)
# Normalise to a CPU NumPy array so np.argsort and the reversed slice below
# work whether get_rating_pred hands back a tensor or an array.
all_predictions = torch.as_tensor(all_predictions).detach().cpu().numpy()

# Get top 10 recommendations
top_k = 10
# argsort is ascending: take the last top_k positions and reverse them.
top_indices = np.argsort(all_predictions)[-top_k:][::-1]
top_items = all_items[top_indices]
top_scores = all_predictions[top_indices]

print(f"\nTop {top_k} recommendations:")
for rank, (item_id, score) in enumerate(zip(top_items, top_scores), 1):
    item_info = filmsdict.get(item_id)
    # Raw lines look like "id::title::genres". For an item missing from
    # filmsdict, fall back to its id instead of splitting a placeholder
    # string that has no "::" fields.
    title = item_info.split('::')[1] if item_info is not None else f"Item {item_id}"
    print(f"{rank}. {title} (score: {score:.2f})")

# ============================================
# Save Model
# ============================================

print("\n=== Saving ADLRS Model ===")

# Save the complete model with configuration
# NOTE(review): these values are literals repeated from the
# run_adlrs_pipeline call above (beta mirrors the 0.01 hard-coded inside the
# pipeline); keep them in sync by hand. load_adlrs_model reads 'k_neighbors'
# and 'beta' from this config when rebuilding the MF model.
save_adlrs_model(
    result=result,
    filepath='adlrs_complete_model.pt',
    config={
        'num_factors': 20,
        'k_neighbors': 3,
        'z_gamma': 5.0,
        'beta': 0.01,
        'mf_epochs': 30,
        'lr': 0.01
    }
)

print("Model saved successfully!")

# ============================================
# Load Model (for inference)
# ============================================

print("\n=== Loading Saved Model ===")

# Load the saved model
loaded_result = load_adlrs_model('adlrs_complete_model.pt', device=device)

loaded_model = loaded_result['model']
loaded_user2idx = loaded_result['user2idx']
loaded_item2idx = loaded_result['item2idx']

print(f"Loaded model configuration: {loaded_result['config']}")

# Use loaded model for predictions
print("\n=== Testing Loaded Model ===")

# Make predictions with loaded model
loaded_predictions = get_rating_pred(
    loaded_model,
    filtered_test_users[:100],  # Test on first 100 samples
    filtered_test_items[:100],
    loaded_user2idx,
    loaded_item2idx,
    device
)

loaded_mae = MAE(filtered_test_ratings[:100], loaded_predictions)
print(f"Loaded model MAE (100 samples): {loaded_mae:.4f}")
# NOTE(review): this message is printed unconditionally; the loaded model's
# predictions are not compared against the in-memory model's.
print("Loaded model works correctly!")

Using device: cuda
Loaded 1000209 ratings
Loaded 3883 item profiles
Training samples: 800167
Test samples: 200042

=== Training and Saving DAE Model ===
=== Training DAE Model ===
Generating BERT embeddings...


Generating BERT embeddings:  52%|█████▏    | 30/58 [00:02<00:02, 10.53it/s]


KeyboardInterrupt: 

Loading ADLRS and making predictions.

In [None]:
"""
ADLRS Inference Script
Load a saved ADLRS model and make predictions
"""

import torch
import numpy as np

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ============================================
# Load Saved Model
# ============================================

print("\n=== Loading ADLRS Model ===")

# Same file name the training cell passes to save_adlrs_model.
model_path = 'adlrs_complete_model.pt'
# NOTE(review): load_adlrs_model and get_rating_pred are defined in earlier
# cells, so this cell is not standalone despite its docstring.
# The assignments below rebind result/model/user2idx/item2idx from the
# training cell.
result = load_adlrs_model(model_path, device=device)

model = result['model']
user2idx = result['user2idx']
item2idx = result['item2idx']
config = result['config']

print(f"Model loaded successfully!")
print(f"Configuration: {config}")
print(f"Number of users: {model.num_users}")
print(f"Number of items: {model.num_items}")
print(f"Latent factors: {model.f}")

# ============================================
# Make Predictions for Specific Users
# ============================================

def recommend_for_user(model, user_id, user2idx, item2idx,
                       top_k=10, device='cpu'):
    """
    Generate top-K recommendations for a specific user

    Args:
        model: trained MF model
        user_id: original user ID
        user2idx: user ID to index mapping
        item2idx: item ID to index mapping
        top_k: number of recommendations
        device: 'cpu' or 'cuda'

    Returns:
        list of (item_id, predicted_rating) tuples, best first; empty when
        the user is unknown or top_k is not positive
    """
    if user_id not in user2idx:
        print(f"User {user_id} not found in training data")
        return []

    # A slice of [-0:] selects the whole array, so a top_k of 0 would return
    # every item; treat non-positive values as "no recommendations".
    if top_k <= 0:
        return []

    # Get all items
    all_items = np.array(list(item2idx.keys()))

    # Predict ratings for all items
    user_array = np.full(len(all_items), user_id)
    predictions = get_rating_pred(
        model, user_array, all_items,
        user2idx, item2idx, device
    )
    # Normalise to a CPU NumPy array so np.argsort and the reversed slice
    # below work whether get_rating_pred returns a tensor or an array.
    predictions = torch.as_tensor(predictions).detach().cpu().numpy()

    # Get top-K items (argsort is ascending: take the tail and reverse it)
    top_indices = np.argsort(predictions)[-top_k:][::-1]
    top_items = all_items[top_indices]
    top_scores = predictions[top_indices]

    recommendations = list(zip(top_items, top_scores))
    return recommendations

# ============================================
# Example 1: Recommend for a single user
# ============================================

print("\n=== Example 1: Recommendations for Single User ===")

# Get first user from user2idx
sample_user_id = list(user2idx.keys())[0]
print(f"Generating recommendations for user {sample_user_id}")

recommendations = recommend_for_user(
    model=model,
    user_id=sample_user_id,
    user2idx=user2idx,
    item2idx=item2idx,
    top_k=10,
    device=device
)

print(f"\nTop 10 recommendations:")
for rank, (item_id, score) in enumerate(recommendations, 1):
    print(f"{rank}. Item {item_id}: predicted rating = {score:.2f}")

# ============================================
# Example 2: Predict specific user-item pairs
# ============================================

print("\n=== Example 2: Predict Specific Ratings ===")

# Create some user-item pairs to predict
# NOTE(review): this rebinds test_users/test_items, which the training cell
# defined as the held-out split arrays; re-running evaluation code after this
# cell would silently use these five-element lists instead.
test_users = list(user2idx.keys())[:5]
test_items = list(item2idx.keys())[:5]

print("Predicting ratings for 5 user-item pairs:")
# Users and items are paired positionally (first user with first item, ...),
# not as a full cross product.
predictions = get_rating_pred(
    model,
    np.array(test_users),
    np.array(test_items),
    user2idx,
    item2idx,
    device
)

for user, item, pred in zip(test_users, test_items, predictions):
    print(f"User {user} - Item {item}: predicted rating = {pred:.2f}")

# ============================================
# Example 3: Batch predictions for multiple users
# ============================================

print("\n=== Example 3: Batch Recommendations ===")

def batch_recommend(model, user_ids, user2idx, item2idx,
                   top_k=5, device='cpu'):
    """Generate recommendations for multiple users"""
    all_recommendations = {}

    for user_id in user_ids:
        recs = recommend_for_user(
            model, user_id, user2idx, item2idx,
            top_k, device
        )
        all_recommendations[user_id] = recs

    return all_recommendations

# Demonstrate batch mode on the first three users in the mapping
batch_users = list(user2idx)[:3]
batch_recs = batch_recommend(
    model, batch_users, user2idx, item2idx, top_k=5, device=device
)

# Print a ranked list (rank starts at 1) under a heading for each user
for user_id in batch_recs:
    recs = batch_recs[user_id]
    print(f"\nUser {user_id} top 5 items:")
    for rank, (item_id, score) in enumerate(recs, start=1):
        print(f"  {rank}. Item {item_id}: {score:.2f}")

# ============================================
# Example 4: Cold-start prediction for new user
# ============================================

print("\n=== Example 4: Cold-Start Handling ===")

# An ID 1000 above the largest known user ID cannot be a key of user2idx
new_user_id = max(user2idx) + 1000
print(f"Attempting recommendations for new user {new_user_id}")

new_user_recs = recommend_for_user(
    model, new_user_id, user2idx, item2idx, top_k=5, device=device
)

# An empty/falsy result is treated as "no predictions possible" and triggers
# the guidance below (presumably recommend_for_user returns an empty list for
# unknown users; confirm against its definition)
if not new_user_recs:
    print("\n".join([
        "Note: Model cannot make predictions for completely new users",
        "For new users, you would need to:",
        "  1. Collect some initial ratings",
        "  2. Use item popularity or content-based recommendations",
        "  3. Retrain the model to include the new user",
    ]))

# ============================================
# Example 5: Get similar items using theta
# ============================================

print("\n=== Example 5: Find Similar Items ===")

def find_similar_items(item_id, theta, item2idx, top_k=5):
    """
    Find the items whose embeddings are most cosine-similar to a target item.

    Args:
        item_id: target item ID
        theta: item embedding matrix (torch tensor, one row per item,
            rows addressed through item2idx)
        item2idx: item ID to row-index mapping
        top_k: maximum number of similar items to return

    Returns:
        list of (item_id, similarity) tuples sorted by decreasing similarity.
        The target item itself is never included, so at most
        ``theta.shape[0] - 1`` entries are returned. An empty list is
        returned (after printing a message) when item_id is unknown, and
        also when there are no other items or top_k is not positive.
    """
    if item_id not in item2idx:
        print(f"Item {item_id} not found")
        return []

    idx = item2idx[item_id]

    # Never ask for more neighbours than there are *other* items; otherwise
    # the masked-out target would be pulled back into the result.
    k = min(top_k, theta.shape[0] - 1)
    if k <= 0:
        return []

    # Inference only: no autograd graph is needed for the similarity scores.
    with torch.no_grad():
        item_vec = theta[idx].unsqueeze(0)

        # Compute cosine similarity with all items
        sim = torch.nn.functional.cosine_similarity(
            item_vec, theta, dim=1
        )

        # Exclude the item itself. -inf is used because -1 is a valid cosine
        # value: an exactly opposite item would tie with a -1 sentinel.
        sim[idx] = float("-inf")
        top_indices = torch.argsort(sim, descending=True)[:k].tolist()

    # Map row indices back to item IDs
    idx2item = {row: iid for iid, row in item2idx.items()}
    similar_items = [
        (idx2item[i], sim[i].item())
        for i in top_indices
    ]

    return similar_items

# Use the first item ID in the mapping as the query item
sample_item_id = list(item2idx)[0]
print(f"Finding items similar to item {sample_item_id}")

# Embeddings come from the 'theta' entry of the result object
similar = find_similar_items(sample_item_id, result['theta'], item2idx, top_k=5)

# One line per neighbour, ranked from 1, similarity shown to four decimals
print(f"\nTop 5 similar items:")
for rank, pair in enumerate(similar, start=1):
    item_id, similarity = pair
    print(f"{rank}. Item {item_id}: similarity = {similarity:.4f}")

# ============================================
# Save predictions to file (optional)
# ============================================

print("\n=== Saving Predictions ===")

# Top-20 recommendations for the first 100 users in user2idx. These are known
# users taken in mapping order, not a held-out test split. The per-user loop
# is delegated to batch_recommend instead of being repeated here.
all_test_users = list(user2idx.keys())[:100]  # First 100 users
all_predictions = batch_recommend(
    model, all_test_users, user2idx, item2idx,
    top_k=20, device=device
)

# Save to numpy file. np.save stores a dict as a pickled 0-d object array, so
# read it back with: np.load('predictions.npy', allow_pickle=True).item()
np.save('predictions.npy', all_predictions)
print("Predictions saved to predictions.npy")

print("\n=== Inference Complete ===")