Module imports and setup. (Run on Google Colab with Python 3 and a T4 GPU.)

In [None]:
# import os, time, json, copy, pickle, random, requests, re
import numpy as np
# from PIL import Image

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader

In [None]:
from tqdm import tqdm
import time

Read the movie and ratings files.

In [None]:
# Input data files, given as paths relative to the working directory.
movies_path = "movies.dat"
ratings_path = "ratings.dat"

def create_films_dict(file_path):
    """Index the raw lines of a "::"-delimited movies file by movie id.

    Args:
        file_path: Path to the movies file. It is decoded as latin-1 and every
            non-blank line must start with an integer id followed by "::".

    Returns:
        dict mapping the integer id to the full stripped line (id included).
        If an id occurs more than once, the last occurrence wins.

    Raises:
        ValueError: if a non-blank line does not start with an integer id.
    """
    master_dict = {}
    with open(file_path, "r", encoding="latin-1") as file:
        for line in file:
            full_line = line.strip()
            if not full_line:
                # Blank lines (e.g. a trailing newline at EOF) carry no record;
                # int("") would otherwise abort the whole load.
                continue
            raw_id, _, _ = full_line.partition("::")
            master_dict[int(raw_id)] = full_line
    return master_dict

def create_ratings_matrix(file_path):
    """Load every line of the ratings file into a 1-D NumPy array of strings.

    Each line is stripped of surrounding whitespace and stored unparsed; the
    "::"-separated fields are not split here.
    """
    with open(file_path, "r", encoding="latin-1") as handle:
        rows = [raw.strip() for raw in handle]
    return np.array(rows)
# Load both data files into memory: movie id -> raw line, and an array of raw rating lines.
filmsdict = create_films_dict(movies_path)
ratingsmat = create_ratings_matrix(ratings_path)

In [None]:
# Sanity check: the raw record for movie id 1, and the second line (index 1) of the ratings file.
print(filmsdict[1])
print(ratingsmat[1])

Reference: https://huggingface.co/google-bert/bert-base-uncased

In [None]:
# Load the pretrained BERT tokenizer and encoder.
# output_hidden_states=True makes the forward pass expose the per-layer hidden
# states, which get_vector_batch reads through output.hidden_states.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased",
                                  output_hidden_states=True)

Generate vector with BERT.

In [None]:
def get_vector_batch(texts, batch_size=16):
  """Embed texts with BERT: sum the last four hidden layers, then mean-pool over tokens.

  Padding positions are excluded from the mean (via the tokenizer's attention
  mask), so a text's embedding does not depend on which other texts share its
  batch.

  Args:
    texts: a string or a sequence of strings to embed.
    batch_size: number of texts tokenized and encoded per forward pass.

  Returns:
    Float tensor with one row per input text and one column per BERT hidden
    unit, on the same device as bert_model. Empty input yields a (0, hidden)
    tensor.
  """
  bert_model.eval()
  # A bare string is treated as a single text rather than a sequence of characters.
  texts = [texts] if isinstance(texts, str) else list(texts)
  model_device = next(bert_model.parameters()).device
  if not texts:
    # torch.cat([]) would raise; return an explicitly empty result instead.
    return torch.empty(0, bert_model.config.hidden_size, device=model_device)

  embeddings = []
  for start in tqdm(range(0, len(texts), batch_size)):
    batch = texts[start:start + batch_size]
    encoded_input = bert_tokenizer(batch, return_tensors='pt', padding=True, truncation=True)
    # Inputs must live where the model lives (matters once bert_model is moved to GPU).
    encoded_input = {name: tensor.to(model_device) for name, tensor in encoded_input.items()}
    with torch.no_grad():
      output = bert_model(**encoded_input)
    # Sum of the last four layers' token representations.
    last4 = torch.stack(output.hidden_states[-4:]).sum(0)
    # Masked mean over the token axis: padded positions get weight 0.
    mask = encoded_input["attention_mask"].unsqueeze(-1).to(last4.dtype)
    embedding = (last4 * mask).sum(dim=1) / mask.sum(dim=1).clamp_min(1.0)
    embeddings.append(embedding)
  return torch.cat(embeddings)

In [None]:
# Smoke test: embed two short strings and check the shape of the result.
vec = get_vector_batch(["hello world bro i dont know what else to Say what's wrong", "wha tis going on"])
# print(vec)
print(vec.shape)

100%|██████████| 1/1 [00:00<00:00,  3.29it/s]

torch.Size([2, 768])





Implement Deep AutoEncoder.

In [None]:
class DeepAutoEncoder(nn.Module):
  """Symmetric fully-connected autoencoder with sigmoid activations.

  The encoder maps input_dim -> 512 -> 256 -> 128 -> hidden_dim and the decoder
  mirrors it back to input_dim. Neither the bottleneck nor the reconstruction
  has a final activation.
  """
  # could be modified to just take in a dims list
  def __init__(self, input_dim=768, hidden_dim=64):
    """
    input_dim: size of each input vector (and of the reconstruction).
    hidden_dim: size of the bottleneck code returned by the encoder.
    """
    super().__init__()
    self.encoder = nn.Sequential(
        nn.Linear(input_dim, 512),
        nn.Sigmoid(),
        nn.Linear(512, 256),
        nn.Sigmoid(),
        nn.Linear(256, 128),
        nn.Sigmoid(),
        nn.Linear(128, hidden_dim)
      )
    self.decoder = nn.Sequential(
        # The decoder consumes the encoder's output, so its input width is hidden_dim.
        nn.Linear(hidden_dim, 128),
        nn.Sigmoid(),
        nn.Linear(128, 256),
        nn.Sigmoid(),
        nn.Linear(256, 512),
        nn.Sigmoid(),
        nn.Linear(512, input_dim),
      )

  def forward(self, x):
    """Return (bottleneck code, reconstruction) for a batch x of input vectors."""
    reduced = self.encoder(x)
    reconstructed = self.decoder(reduced)
    return reduced, reconstructed

Dataloader.

In [None]:
def get_dataloader(trainset, valset = None, batch_size = 64, num_workers = 2):
    """Build the training DataLoader and, when a validation set is given, a validation one.

    The training loader shuffles; the validation loader does not. Returns
    (train_loader, val_loader), where val_loader is None if valset is falsy.
    """
    shared = dict(num_workers=num_workers, batch_size=batch_size)
    train_loader = DataLoader(trainset, shuffle=True, **shared)
    val_loader = DataLoader(valset, shuffle=False, **shared) if valset else None
    return (train_loader, val_loader)

In [None]:
class ItemProfileDataset(Dataset):
    """Autoencoder dataset: every sample serves as both input and reconstruction target."""

    def __init__(self,input):
        """Wrap a (num_items, 768) tensor of item vectors."""
        self.vecs = input

    def __len__(self):
        # One sample per row of the wrapped tensor.
        return self.vecs.shape[0]

    def __getitem__(self, idx):
        # The same row is returned twice: once as input, once as target.
        row = self.vecs[idx]
        return row, row

Training DAE.

In [None]:
# Reconstruction loss read as a global by train_one_epoch and validate_model.
criterion = nn.MSELoss()
# NOTE(review): `vec` is the two-sentence smoke-test output from the earlier cell,
# so this training set is only a placeholder.
trainset = ItemProfileDataset(vec) #validation set needs to be set later
valset = None
# train_loader, val_loader = get_dataloader(trainset, valset)
# optimizer =
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression: displays the selected device as the notebook cell output.
device

In [None]:
def train_one_epoch(model, train_loader, optimizer):
  """Run one optimisation pass over train_loader.

  Relies on the module-level `criterion` and `device`. The model's forward
  must return a pair whose second element is compared against the target.

  Returns:
    (mean per-batch loss, elapsed wall-clock seconds)
  """
  t0 = time.time()
  model.train()
  running = 0
  for inputs, targets in tqdm(train_loader):
      inputs, targets = inputs.to(device), targets.to(device)

      # Only the reconstruction is scored; the bottleneck code is ignored here.
      _, recon = model.forward(inputs)
      batch_loss = criterion(recon, targets)
      running += batch_loss.item()

      optimizer.zero_grad()
      batch_loss.backward()
      optimizer.step()
  mean_loss = running / len(train_loader)
  return (mean_loss, time.time() - t0)

def validate_model(model, val_loader):
    """Score the model on val_loader without updating any weights.

    Relies on the module-level `criterion` and `device`.

    Returns:
        (mean per-batch loss, elapsed wall-clock seconds)
    """
    t0 = time.time()
    model.eval()
    running = 0
    with torch.no_grad():
        for inputs, targets in tqdm(val_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            # Only the reconstruction (second output) is scored.
            _, recon = model.forward(inputs)
            running += criterion(recon, targets).item()

    mean_loss = running / len(val_loader)
    return (mean_loss, time.time() - t0)

def train_model(model, n_epochs, train_loader, val_loader, optimizer, scheduler = None, verbose = False):
    """Train for n_epochs and restore the weights of the best epoch.

    "Best" is the epoch with the lowest validation loss. When val_loader is
    None (which get_dataloader returns if no validation set is supplied), the
    training loss is used for model selection instead.

    Args:
        model: module trained in place.
        n_epochs: number of passes over train_loader.
        train_loader: training batches.
        val_loader: validation batches, or None.
        optimizer: optimizer bound to model's parameters.
        scheduler: optional LR scheduler, stepped once per epoch.
        verbose: when True, print one progress line per epoch.

    Returns:
        (model, best_epoch_num, train_losses, total_time), where best_epoch_num
        is 1-based (0 if no epoch improved on infinity, e.g. n_epochs == 0)
        and total_time is the summed train + validation time in seconds.
    """
    # Local import: the notebook's top-level `import copy` is commented out.
    import copy

    best_loss = float('inf')
    best_state = None
    best_epoch_num = 0
    train_losses = []
    total_time = 0
    for i in range(n_epochs):
        train_loss, train_time = train_one_epoch(model, train_loader, optimizer)
        train_losses.append(train_loss)
        total_time += train_time

        if val_loader is not None:
            monitored_loss, val_time = validate_model(model, val_loader)
            total_time += val_time
        else:
            # No validation data: select on the training loss.
            monitored_loss = train_loss

        if monitored_loss < best_loss:
            best_loss = monitored_loss
            # Snapshot by value; state_dict() alone returns live references.
            best_state = copy.deepcopy(model.state_dict())
            best_epoch_num = i + 1
        if verbose:
            print(f"epoch {i + 1}/{n_epochs} | train loss = {train_loss:.6f} | monitored loss = {monitored_loss:.6f}")
        if scheduler is not None:
            scheduler.step()

    # best_state stays None when nothing ever improved (zero epochs, NaN losses).
    if best_state is not None:
        model.load_state_dict(best_state)
    return (model, best_epoch_num, train_losses, total_time)

In [None]:
def train_and_save_model(model, optimizer, batch_size=64, n_epochs=10, scheduler=None, filepath=None, verbose=False):
    """Train model on the global train/validation sets and save the best weights.

    Args:
        model: module to train; it is moved to the global `device`.
        optimizer: optimizer bound to model's parameters.
        batch_size: batch size for both data loaders.
        n_epochs: number of training epochs.
        scheduler: optional LR scheduler, passed through to train_model.
        filepath: where to save the state dict; defaults to
            "models/trained_<ClassName>.pt".
        verbose: passed through to train_model.

    Returns:
        (best_model, best_loss_epoch_num, total_time)
    """
    # Local import: `os` is not imported at the top of the notebook.
    import os

    model.to(device)
    # NOTE(review): full_trainset / full_valset are read as globals but are not
    # defined in the cells visible here (only trainset / valset are) -- confirm
    # they are created before this function is called.
    train_loader, val_loader = get_dataloader(full_trainset, full_valset, batch_size)
    if filepath is None:
        filepath = f"models/trained_{model.__class__.__name__}.pt"

    # torch.save does not create missing directories; make sure the target
    # folder exists before spending time on training.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    best_model, best_loss_epoch_num, _train_losses, total_time = train_model(model, n_epochs, train_loader, val_loader, optimizer, scheduler, verbose)
    torch.save(best_model.state_dict(), filepath)
    return (best_model, best_loss_epoch_num, total_time)

In [None]:
# Build the autoencoder and its optimizer, then train and save it.
model = DeepAutoEncoder()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay = 5e-4)
# LambdaLR sets lr = base_lr * lr_lambda(epoch), with epoch starting at 0.
# `0.95 * epoch` would give lr = 0 for the first epoch and then grow linearly;
# `0.95 ** epoch` is the intended 5%-per-epoch exponential decay.
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda = lambda epoch: 0.95 ** epoch)
best_model, best_loss_epoch_num, total_time = train_and_save_model(model, optimizer, scheduler=scheduler)

Using DAE to reduce dimensions.

In [None]:
def get_reduced(model, input):
  """Encode input with the autoencoder and return only the bottleneck codes.

  Args:
    model: autoencoder whose forward returns (reduced, reconstructed); it is
      expected to already be on the global `device`.
    input: tensor, NumPy array or nested list of item vectors. It is converted
      to float32 and moved to `device`.

  Returns:
    The reduced representation, computed in eval mode without gradients.
  """
  # torch.as_tensor accepts tensors on any device / of any dtype as well as
  # arrays and lists; the legacy torch.FloatTensor(...) constructor rejects
  # CUDA tensors and non-float32 tensors.
  data = torch.as_tensor(input, dtype=torch.float32, device=device)
  model.eval()
  with torch.no_grad():
    reduced, _ = model.forward(data)
  return reduced

In [None]:
# actually run it on all the data passed in

Implement factorization.

In [None]:
def cosine_similarity(X, eps=1e-12):
  """Pairwise cosine similarity between the rows of X.

  Args:
    X: 2-D tensor with one vector per row. It may live on any device and may
      be attached to an autograd graph; the computation is done on a detached
      CPU copy.
    eps: lower bound applied to the product of norms, so that all-zero rows
      yield similarity 0 instead of NaN.

  Returns:
    CPU tensor of shape (n, n) whose [i, j] entry is cos(X[i], X[j]).
  """
  # .numpy() alone raises for CUDA tensors and for tensors that require grad.
  X = X.detach().cpu().numpy()
  numer = X @ X.T

  norms = np.sqrt(np.square(X).sum(axis=1))
  # Clamp so that zero-norm rows do not trigger a 0/0 division.
  denom = np.maximum(np.outer(norms, norms), eps)

  result = np.divide(numer, denom)
  return torch.from_numpy(result)

In [None]:
class MF(nn.Module):
  """Matrix factorisation with an optional item-neighbourhood regulariser.

  Ratings are predicted as the dot product of a user factor (P) and an item
  factor (Q). When item profiles `theta` are supplied, each item's factor is
  additionally pulled towards the factors of its k most similar items,
  weighted by their cosine similarity and a per-item coefficient gamma.
  """

  def __init__(self, num_users, num_items, num_factors=20, theta: torch.Tensor = None,  # [num_items, d]
                 k = 3,
                 beta= 0.01):
    """
    num_users, num_items: sizes of the user / item embedding tables.
    num_factors: latent dimension of P and Q.
    theta: the item profiles acquired from BERT+DAE, or None to disable the
      neighbourhood term.
    k: number of nearest neighbours kept per item.
    beta: weight of the L2 penalty on the factors used in a batch.
    """
    super().__init__()
    self.num_users = num_users
    self.num_items = num_items
    self.f = num_factors
    self.beta = beta
    self.P = nn.Embedding(num_users, num_factors)
    self.Q = nn.Embedding(num_items, num_factors)

    #initialize randomly
    nn.init.normal_(self.P.weight, std=0.01)
    nn.init.normal_(self.Q.weight, std=0.01)

    # Declare the neighbour tables as (empty) buffers up front. Setting them as
    # plain `None` attributes would make a later register_buffer call raise
    # "attribute already exists".
    self.register_buffer("i_plus", None)
    self.register_buffer("i_sims", None)
    self.register_buffer("gamma", torch.ones(num_items))

    if theta is not None:
      self.get_k_neighbors(theta, k)

  # get the k most similar items to each item
  def get_k_neighbors(self, theta, k):
    """Store, for every item, the indices and similarities of its top-k neighbours."""
    # The similarity helper works on CPU data; detach/move so that profiles
    # coming straight from a GPU model are accepted.
    sim = cosine_similarity(theta.detach().cpu())
    # Zero the diagonal so an item's self-similarity of 1 does not win top-k.
    sim.fill_diagonal_(0)
    val, idx = torch.topk(sim, k=k, dim=1)
    target = self.Q.weight.device
    self.register_buffer("i_plus", idx.to(target))
    self.register_buffer("i_sims", val.to(target))

  def set_gamma(self, item_count, z=5.0):
    """Set gamma_i = z / max(count_i, 1) from per-item rating counts."""
    count = item_count.float().clamp_min(1.0)
    # Keep gamma on the module's device even if the counts were built on CPU.
    self.gamma = (z / count).to(self.gamma.device)

  def forward(self, user_id, item_id):
    """Predicted rating for each (user, item) index pair in the batch."""
    p = self.P(user_id)
    q = self.Q(item_id)
    return (p * q).sum(1)

  # objective function
  def loss(self, user_id, item_id, rating):
    """Batch objective: squared error + L2 penalty + neighbourhood penalty.

    The neighbourhood penalty is applied once per distinct item in the batch
    and is skipped entirely when no neighbours have been computed.
    """
    pred = self.forward(user_id, item_id)
    error = 0.5 * (rating - pred).square().sum()

    p = self.P(user_id)
    q = self.Q(item_id)
    reg = self.beta * (p.square().sum() + q.square().sum())

    if (self.i_plus is not None) and (self.i_sims is not None):
      unique = torch.unique(item_id)
      q_self = self.Q(unique)                    # (n_unique, f)
      q_neighbor = self.Q(self.i_plus[unique])   # (n_unique, k, f)
      diff = (q_self.unsqueeze(1) - q_neighbor).square().sum(dim=2)
      sim = self.i_sims[unique]
      gamma = self.gamma[unique].unsqueeze(1)
      reg = reg + (diff * sim * gamma).sum()
    return error + reg




In [None]:
def train_MF_epoch(model, train_loader, optimizer):
  """Run one optimisation pass of the MF model over train_loader.

  Each batch is a (user index, item index, rating) triple; batches are moved
  to the global `device` and scored with model.loss.

  Returns:
    Mean per-batch loss over the epoch.
  """
  model.train()
  total_loss = 0.0
  for u, i, r in tqdm(train_loader):
      u = u.to(device)
      i = i.to(device)
      r = r.to(device)

      optimizer.zero_grad()
      loss = model.loss(u, i, r)
      loss.backward()
      optimizer.step()

      total_loss += loss.item()

  # Average over the number of batches in the loader that was iterated.
  return total_loss / len(train_loader)

def train_MF(model, n_epochs, train_loader, optimizer, verbose=False):
  """Train the MF model for n_epochs, printing the mean loss after each epoch.

  Args:
    model: MF model, trained in place.
    n_epochs: number of passes over train_loader.
    train_loader: yields (user index, item index, rating) batches.
    optimizer: optimizer bound to model's parameters.
    verbose: currently unused; the per-epoch line is always printed.

  Returns:
    The trained model (the same object that was passed in).
  """
  for epoch in range(1, n_epochs + 1):
      avg_loss = train_MF_epoch(model, train_loader, optimizer)
      print(f"Epoch {epoch:02d} | loss = {avg_loss:.4f}")
  return model


Construct ALDRS class.

In [None]:
class RatingsDataset(Dataset):
  """(user, item, rating) triples held as three parallel tensors."""

  def __init__(self, user_ids, item_ids, ratings):
    # Indices are stored as int64, ratings as float32.
    index_kwargs = dict(dtype=torch.long)
    self.user_ids = torch.as_tensor(user_ids, **index_kwargs)
    self.item_ids = torch.as_tensor(item_ids, **index_kwargs)
    self.ratings = torch.as_tensor(ratings, dtype=torch.float32)

  def __len__(self):
    # One sample per rating.
    return len(self.ratings)

  def __getitem__(self, idx):
    # Same position in each of the three parallel tensors.
    user = self.user_ids[idx]
    item = self.item_ids[idx]
    score = self.ratings[idx]
    return (user, item, score)

In [None]:
def run_adlrs_pipeline(
  ratings,
  users,
  items, #this is the supporting item profiles
  DAE_model,
  num_factors=20,
  k_neighbors=3,
  z_gamma=5.0,
  batch_size=64,
  num_epochs=20,
  lr=1e-2,
):
    """Embed item texts, build the neighbourhood-regularised MF model and train it.

    NOTE(review): the input schema is an assumption reconstructed from the
    previously commented-out mapping code -- confirm against the real data:
      * ratings: pandas DataFrame with "user_id", "item_id" and "rating" columns.
      * items: pandas Series of item profile texts indexed by raw item id.
      * users: currently unused.

    Args:
        ratings: observed ratings (see note above).
        users: unused.
        items: item profile texts (see note above).
        DAE_model: trained autoencoder used to compress the BERT vectors.
        num_factors: latent dimension of the MF model.
        k_neighbors: neighbours per item in the regulariser.
        z_gamma: numerator z of gamma_i = z / count(i).
        batch_size: batch size for both BERT encoding and rating batches.
        num_epochs: MF training epochs.
        lr: SGD learning rate for MF training.

    Returns:
        dict with keys "model" (trained MF), "theta" (BERT+DAE item profiles),
        "user2idx" and "item2idx" (raw id -> contiguous index maps).
    """
    # ---------- 0. Map raw ids to 0..N-1 ----------
    # Item indices follow the order of `items`, so row j of theta is item j.
    item_ids = items.index.tolist()
    item2idx = {raw_id: j for j, raw_id in enumerate(item_ids)}

    # Ratings for items without a profile cannot be indexed; drop them.
    ratings_df = ratings[ratings["item_id"].isin(item_ids)].copy()
    user2idx = {u: i for i, u in enumerate(sorted(ratings_df["user_id"].unique()))}
    ratings_df["u_idx"] = ratings_df["user_id"].map(user2idx)
    ratings_df["i_idx"] = ratings_df["item_id"].map(item2idx)

    num_users = len(user2idx)
    num_items = len(item2idx)

    ##### Item Embedding
    # The tokenizer needs a plain list of strings.
    texts = [str(t) for t in items.tolist()]

    bert_vecs = get_vector_batch(texts, batch_size).cpu()
    DAE_model.to(device)
    reduced = get_reduced(DAE_model, bert_vecs).to(device)

    ##### Build ADLRS MF model & compute γ and neighbors
    model = MF(
      num_users=num_users,
      num_items=num_items,
      num_factors=num_factors,
      # Neighbour search runs on CPU data; the resulting buffers move with .to(device).
      theta=reduced.detach().cpu(),
      k=k_neighbors,
      beta=0.01,
    ).to(device)

    # per-item rating counts for γ = z / count(i)
    item_counts = torch.bincount(
        torch.as_tensor(ratings_df["i_idx"].to_numpy(), dtype=torch.long),
        minlength=num_items,
    )
    # Hand over counts on the model's device so gamma ends up there as well.
    model.set_gamma(item_counts.to(device), z=z_gamma)

    # ---------- 3. Prepare rating DataLoader ----------
    train_dataset = RatingsDataset(
        ratings_df["u_idx"].to_numpy(),
        ratings_df["i_idx"].to_numpy(),
        ratings_df["rating"].to_numpy(),
    )
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # ---------- 4. Train MF (Algorithm 1 lines 10–18) ----------
    # train_MF takes (model, n_epochs, train_loader, optimizer); the learning
    # rate is applied through the optimizer. The model is trained in place.
    optimizer = optim.SGD(model.parameters(), lr=lr)
    train_MF(model, num_epochs, train_loader, optimizer)

    # ---------- 5. Return everything we might want later ----------
    return {
        "model": model,
        "theta": reduced,               # BERT+DAE item profiles
        "user2idx": user2idx,
        "item2idx": item2idx,
    }

def get_rating_pred(model, user_id, item_id):
  """Return the model's prediction for (user_id, item_id), in eval mode and without gradients."""
  model.eval()
  with torch.no_grad():
    return model.forward(user_id, item_id)

Implement evaluation metrics.

In [None]:
def MAE(y_true, y_pred):
  """Mean absolute error between predictions and ground truth."""
  abs_err = np.abs(y_pred - y_true)
  return np.mean(abs_err)

def RMSE(y_true, y_pred):
  """Root mean squared error between predictions and ground truth.

  Inputs are converted with np.asarray, so NumPy arrays, lists and CPU tensors
  that do not require grad are all accepted (ndarrays have no .square()
  method, which the previous implementation relied on).
  """
  diff = np.asarray(y_pred) - np.asarray(y_true)
  return np.sqrt(np.mean(np.square(diff)))

def HR(scores, user_items, N):
  """
  Hit Ratio @ N over all users that have ground truth.

  A user counts as a hit when at least one of their relevant items is among
  their N highest-scored items.

  Args:
      scores (dict): user_id -> {item_id: predicted_score}
      user_items (dict): user_id -> set of ground-truth relevant items
      N (int): cutoff N in top-N

  Returns:
      float: hits / evaluated users, or 0.0 when no user has ground truth
  """
  hit_count = 0
  evaluated = 0

  for uid, truth in user_items.items():
      # Users without any ground-truth item are not evaluated.
      if len(truth) == 0:
          continue
      evaluated += 1

      # Highest predicted score first, then keep the ids of the first N.
      by_score = sorted(scores[uid].items(),
                        key=lambda pair: pair[1],
                        reverse=True)
      shortlist = [pair[0] for pair in by_score[:N]]

      # One relevant item in the shortlist is enough for a hit.
      for candidate in truth:
          if candidate in shortlist:
              hit_count += 1
              break

  # No evaluated user: report 0.0 rather than dividing by zero.
  return hit_count / evaluated if evaluated != 0 else 0.0
