In [1]:
!pip install transformers
!pip install torch
!pip install tqdm

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
!python -c "import torch; print(torch.__version__)"

2.5.1+cu124


In [1]:
# imports

import json
import pandas as pd


from collections import defaultdict

In [2]:
user_history_filepath = "/scratch/general/vast/u1471428/hugging_face_cache/user_history_full_data.json" 

In [3]:
json_file = open(user_history_filepath, 'r')
user_history = json.load(json_file)
json_file.close()

In [4]:
def count_user_with_history_sizes(user_h):
    cnt_dict = defaultdict(int)
    for user_id in user_history.keys():
        cnt_dict[len(user_history[user_id])]+=1
    print(cnt_dict)
    
def count_user_with_history_size_above(user_h, above):
    cnt = 0
    for user_id in user_history.keys():
        if len(user_history[user_id]) >= above:
            cnt+=1
    print(cnt)

def filter_users(user_h, min_history_length, max_history_length=-1)->dict:
    filtered_users = {}
    count=0
    for user, history in user_h.items():
        if len(history) >= min_history_length:
            filtered_users[user] = history[:max_history_length]
            count+=1
        
        if count==-1:
            break
    return filtered_users
        
    

In [5]:
filtered_users = filter_users(user_history, 20, 50)
print(len(filtered_users))

7297


In [6]:
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from collections import Counter

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [8]:
users = list(filtered_users.keys())

# Split into train (80%) and test (20%)
train_users, test_users = train_test_split(users, test_size=0.2, random_state=42)

# Further split train users into train (80%) and validation (20%) within the training set
train_users, val_users = train_test_split(train_users, test_size=0.2, random_state=42)

# Create train, validation, and test data dictionaries
train_data = {user: filtered_users[user] for user in train_users}
val_data = {user: filtered_users[user] for user in val_users}
test_data = {user: filtered_users[user] for user in test_users}

# Print dataset sizes for verification
print(f"Train users: {len(train_users)}, Validation users: {len(val_users)}, Test users: {len(test_users)}")

Train users: 4669, Validation users: 1168, Test users: 1460


In [9]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
main_category_encoder = LabelEncoder()
category_encoder = LabelEncoder()
product_encoder = LabelEncoder()

In [10]:
main_categories = [entry["main_category"] for user in users for entry in filtered_users[user]]
main_category_encoder.fit(main_categories)
categories = [entry["categories"] for user in users for entry in filtered_users[user]]
category_encoder.fit(categories)
product_ids = [entry["product_id"] for user in users for entry in filtered_users[user]]
product_encoder.fit(product_ids)

LabelEncoder()

In [11]:
def generate_category_product_mappings(data, category_encoder, product_encoder):
    category_index_to_products = defaultdict(list)
    for user, histories in data.items():
        for history in histories:
            #print([history['categories'][-1]])
            category_idx = category_encoder.transform([history['categories'][-1]])[0]
            product_idx = product_encoder.transform([history['product_id']])[0]
            category_index_to_products[category_idx].append(product_idx)
        
        category_index_to_products = {k: list(set(v)) for k, v in category_index_to_products.items()}
        return category_index_to_products

category_index_to_products = generate_category_product_mappings(filtered_users, category_encoder, product_encoder)

In [12]:
def normalize_ratings(histories):
    all_ratings = [h["rating"] for user in histories for h in histories[user]]
    scaler = MinMaxScaler(feature_range=(0,1))
    scaler.fit([[r] for r in all_ratings])
    return scaler

rating_scaler = normalize_ratings(filtered_users)

def preprocess_history(history):
    texts = [
        f"{h.get('review_title','')} {h.get('features','')} {h.get('main_category')}" for h in history
    ]
    
    texts = [text for text in texts if text.strip()]
    
    tokens = tokenizer(texts, padding="max_length", truncation=True, return_tensors="pt", max_length=128)
    
    ratings = torch.tensor([rating_scaler.transform([[h["rating"]]])[0][0] for h in history], dtype=torch.float32)
    
    #categories = torch.tensor(category_encoder.transform([h["main_category"] for h in history]))
    
    categories = torch.tensor(category_encoder.transform([h["categories"] for h in history]))
    
    product_ids = torch.tensor(product_encoder.transform([h["product_id"] for h in history]))
    
    return tokens, ratings, categories, product_ids

In [13]:
class UserDataset(Dataset):
    def __init__(self, data, tokenizer, category_encoder, product_encoder, seq_len=15, pred_len=5):
        self.data = data
        self.tokenizer = tokenizer
        self.category_encoder = category_encoder # Is it required?
        self.product_encoder = product_encoder # Is it required?
        self.seq_len = seq_len
        self.pred_len = pred_len
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        user, history = list(self.data.items())[idx]
        tokens, ratings, categories, product_ids = preprocess_history(history)
        input_tokens = {
            "input_ids":tokens["input_ids"][:self.seq_len],
            "attention_mask": tokens["attention_mask"][:self.seq_len],
            "token_type_ids": tokens["token_type_ids"][:self.seq_len],
        }
        input_ratings = ratings[:self.seq_len]
        input_categories = categories[:self.seq_len]
        
        future_categories = categories[self.seq_len:]
        
        future_products = product_ids[self.seq_len:]
        
        target_category_vector = torch.zeros(len(self.category_encoder.classes_))
        target_category_vector[future_categories] = 1
        
        target_vector = torch.zeros(len(self.product_encoder.classes_))
        target_vector[future_products] = 1
        
        return input_tokens, input_ratings, input_categories, target_category_vector, target_vector

train_dataset = UserDataset(train_data, tokenizer, category_encoder, product_encoder)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [14]:
class SingleVectorTransformerRecommendationModel(nn.Module):
    def __init__(self, bert_model_name="bert-base-uncased", num_categories=900, num_products=1000, d_model=128, nhead=8, num_encoder_layers=3):
        super(SingleVectorTransformerRecommendationModel, self).__init__()
        
        self.bert = BertModel.from_pretrained(bert_model_name)
        bert_hidden_size = self.bert.config.hidden_size
        
        self.category_embedding = nn.Embedding(num_categories, d_model)
        
        self.history_encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.history_encoder = nn.TransformerEncoder(self.history_encoder_layer, num_layers = num_encoder_layers)
         
        self.input_projection = nn.Linear(bert_hidden_size + d_model, d_model)
            
        self.product_category_fc = nn.Linear(d_model, num_categories)
        self.product_fc = nn.Linear(d_model, num_products)
        
        self.activation = nn.ReLU()
        self.layer_norm = nn.LayerNorm(d_model)
        
        self.sigmoid = nn.Sigmoid()
        self.init_weights()
        
    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
    
    def forward(self, tokens, ratings, categories, category_to_product_map=None):
        batch_size, seq_len, max_token_length = tokens["input_ids"].shape
        
        input_ids = tokens["input_ids"].view(-1, max_token_length)
        attention_mask = tokens["attention_mask"].view(-1, max_token_length)
        token_type_ids = tokens["token_type_ids"].view(-1, max_token_length)
        
        bert_output = self.bert(
            input_ids = input_ids,
            attention_mask = attention_mask,
            token_type_ids = token_type_ids
        )
        sequence_output = bert_output.last_hidden_state
        sequence_output = sequence_output[:,0,:].view(batch_size, seq_len, -1)

        
        category_embeds = self.category_embedding(categories)
        combined_features = torch.cat([sequence_output, category_embeds], dim=-1)
        projected_features = self.input_projection(combined_features)
        normalized_features = self.layer_norm(projected_features)  # Apply LayerNorm
        activated_features = self.activation(normalized_features) 
        
        history_output = self.history_encoder(activated_features)
        user_representation = history_output.mean(dim=1)
        
        normalized_output = self.layer_norm(user_representation)  # Apply LayerNorm
        activated_output = self.activation(normalized_output)  # Apply activation
        
        product_category_logits = self.product_category_fc(activated_output)
        product_logits = self.product_fc(activated_output)
        
        if category_to_product_map is not None:
            for i in range(batch_size):
                product_mask = torch.zeros_like(product_logits[i])  # [num_products]
                for category_idx, category_logit in enumerate(product_category_logits[i]):
                    if category_idx in category_to_product_map:
                        valid_product_indices = category_to_product_map[category_idx]
                        product_mask[valid_product_indices] += category_logit
                product_logits[i] *= product_mask
        
        
        
        product_probabilities = self.sigmoid(product_logits)  # [batch_size, num_products]
        product_category_probabilities = self.sigmoid(product_category_logits) 
        
        return product_probabilities, product_category_probabilities
        

In [15]:
def compute_top_k_accuracy(predicted_vector, target_vector, k=5):
    """
    Compute Top-K accuracy for multi-label classification.
    Args:
        predicted_vector (Tensor): Predicted probabilities for products [batch_size, num_products].
        target_vector (Tensor): Binary target vector [batch_size, num_products].
        k (int): Number of top predictions to consider.
    Returns:
        top_k_accuracy (float): Top-K accuracy for the batch.
    """
    # Get indices of the top-k predictions for each batch
    top_k_preds = torch.topk(predicted_vector, k=k, dim=-1).indices  # [batch_size, k]

    # Gather the target values corresponding to the top-k predictions
    true_positives = target_vector.gather(1, top_k_preds)  # [batch_size, k]

    # Count how many of the top-k predictions are correct
    top_k_correct = true_positives.sum(dim=-1)  # [batch_size]

    # Compute the accuracy as the mean of correct predictions
    top_k_accuracy = (top_k_correct > 0).float().mean().item()

    return top_k_accuracy

In [17]:
def compute_top_k_metrics(predicted_vector, target_vector, k=5):
    """
    Compute top-k accuracy and precision for classification tasks.
    
    Args:
        predicted_vector (torch.Tensor): Predicted probabilities or logits
        target_vector (torch.Tensor): Ground truth labels
        k (int, optional): Number of top predictions to consider. Defaults to 5.
    
    Returns:
        dict: A dictionary containing top-k accuracy and precision
    """
    # Get indices of the top-k predictions for each batch 
    top_k_preds = torch.topk(predicted_vector, k=k, dim=-1).indices  # [batch_size, k] 
 
    # Gather the target values corresponding to the top-k predictions 
    true_positives = target_vector.gather(1, top_k_preds)  # [batch_size, k] 
 
    # Compute top-k accuracy
    top_k_correct = true_positives.sum(dim=-1)  # [batch_size] 
    top_k_accuracy = (top_k_correct > 0).float().mean().item()
    
    # Compute top-k precision
    # Precision = (number of correct predictions in top-k) / (total number of top-k predictions)
    correct_predictions_count = true_positives.sum()
    top_k_precision = correct_predictions_count / (top_k_preds.shape[0] * k)
    
    return top_k_accuracy,top_k_precision.item()

In [26]:


def evaluate_top_k_accuracy(model, test_loader, k=5):
    """
    Evaluate average Top-K accuracy over the test set with a progress bar.
    Args:
        model (nn.Module): Trained model to evaluate.
        test_loader (DataLoader): DataLoader for the test dataset.
        k (int): Number of top predictions to consider.
    Returns:
        average_top_k_accuracy (float): Average Top-K accuracy over all test batches.
    """
    model.eval()
    total_top_k_accuracy = 0.0
    total_top_k_precision = 0.0
    total_batches = 0

    # Add a progress bar
    progress_bar = tqdm(test_loader, desc="Evaluating", leave=True)

    with torch.no_grad():
        for batch_tokens, ratings, categories, _, target_vector in progress_bar:
            # Move inputs and targets to the device
            tokens = {key: val.to(device) for key, val in batch_tokens.items()}
            target_vector = target_vector.to(device)
            categories = categories.to(device)

            # Make predictions
            predicted_vector,_ = model(tokens, ratings, categories, category_to_product_map = category_index_to_products)

            # Compute Top-K accuracy for the batch
            top_k_acc, top_k_prec = compute_top_k_metrics(predicted_vector, target_vector, k=k)

            # Update total accuracy and batch count
            total_top_k_accuracy += top_k_acc
            total_top_k_precision += top_k_prec
            total_batches += 1

            # Update progress bar
            progress_bar.set_postfix({"Batch Top-K Acc": top_k_acc,
                                     "Batch Top-K prec": top_k_prec})

    # Compute average Top-K accuracy
    average_top_k_accuracy = total_top_k_accuracy / total_batches
    print(f"Average Top-{k} Accuracy: {average_top_k_accuracy:.4f}")
    average_top_k_precision = total_top_k_precision / total_batches
    print(f"Average Top-{k} Precision: {average_top_k_precision:.4f}")
    return average_top_k_accuracy

# Example usage:
# average_top_k = evaluate_top_k_accuracy(model, test_loader, k=5)
val_dataset = UserDataset(val_data, tokenizer, category_encoder, product_encoder)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=True)




In [19]:
model = SingleVectorTransformerRecommendationModel(num_categories=len(category_encoder.classes_), num_products=len(product_encoder.classes_)).to(device)
loss_fn = nn.BCEWithLogitsLoss()
# category_loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam([
    {"params": model.bert.parameters(), "lr": 1e-5, "weight_decay": 1e-5},
    {"params": model.category_embedding.parameters(), "lr": 1e-4, "weight_decay": 1e-5},
    {"params": model.input_projection.parameters(), "lr": 1e-4, "weight_decay": 1e-5},
    {"params": model.product_category_fc.parameters(), "lr": 1e-2, "weight_decay": 1e-5},
    {"params": model.history_encoder.parameters(), "lr": 1e-4, "weight_decay": 1e-5},
    {"params": model.product_fc.parameters(), "lr": 1e-3, "weight_decay": 1e-5},
])

In [21]:
num_epochs = 2
for epoch in range(num_epochs):
    model.train()
    total_category_loss = 0.0
    total_product_loss = 0.0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}", leave=True)

    for tokens, ratings, categories, target_category_vector, target_vector in progress_bar:
        tokens = {key: val.to(device) for key, val in tokens.items()}
        ratings = ratings.to(device)
        categories = categories.to(device)
        target_vector = target_vector.to(device)
        target_category_vector = target_category_vector.to(device)
        

        optimizer.zero_grad()

        # Forward pass
        predicted_product_logits, predicted_category_logits = model(tokens, ratings, categories, category_to_product_map = category_index_to_products)

        # Compute loss
        product_loss = loss_fn(predicted_product_logits, target_vector.float())
        category_loss = loss_fn(predicted_category_logits, target_category_vector.float())
        
        loss = product_loss + category_loss
        
        loss.backward()
        
        optimizer.step()
        total_product_loss += product_loss.item()
        total_category_loss += category_loss.item()

        # Update progress bar
        progress_bar.set_postfix({
            "Prod Cat Loss": category_loss.item(),
            "Prod ID Loss": product_loss.item(),
        })

    print(f"Epoch {epoch + 1}")
    print(f"  Category Loss: {total_category_loss / len(train_loader):.4f}")
    print(f"  Product Loss: {total_product_loss / len(train_loader):.4f}")


Epoch 1: 100%|██████████| 292/292 [14:26<00:00,  2.97s/it, Prod Cat Loss=0.693, Prod ID Loss=0.974]


Epoch 1
  Category Loss: 0.6959
  Product Loss: 0.9737


Epoch 2: 100%|██████████| 292/292 [14:28<00:00,  2.98s/it, Prod Cat Loss=0.693, Prod ID Loss=0.974]

Epoch 2
  Category Loss: 0.6931
  Product Loss: 0.9737





In [22]:
torch.save(model.state_dict(), "single_vector_arch_change.pth")
print("Model's state dictionary saved to 'single_vector_arch_change.pth'")

Model's state dictionary saved to 'single_vector_arch_change.pth'


In [23]:
model.load_state_dict(torch.load("single_vector_arch_change.pth"))

  model.load_state_dict(torch.load("single_vector_arch_change.pth"))


<All keys matched successfully>

In [24]:
test_dataset = UserDataset(test_data, tokenizer, category_encoder, product_encoder)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=True)

In [27]:
average_top_k = evaluate_top_k_accuracy(model, test_loader, k=5)
# print(f"Epoch {epoch+1} Val result top k acc {average_top_k}")
average_top_k = evaluate_top_k_accuracy(model, test_loader, k=3)
# print(f"Epoch {epoch+1} Val result top k acc {average_top_k}")
average_top_k = evaluate_top_k_accuracy(model, test_loader, k=1)
# print(f"Epoch {epoch+1} Val result top k acc {average_top_k}")

Evaluating: 100%|██████████| 365/365 [03:19<00:00,  1.83it/s, Batch Top-K Acc=0, Batch Top-K prec=0]      


Average Top-5 Accuracy: 0.0007
Average Top-5 Precision: 0.0001


Evaluating: 100%|██████████| 365/365 [03:18<00:00,  1.84it/s, Batch Top-K Acc=0, Batch Top-K prec=0]


Average Top-3 Accuracy: 0.0000
Average Top-3 Precision: 0.0000


Evaluating: 100%|██████████| 365/365 [03:18<00:00,  1.84it/s, Batch Top-K Acc=0, Batch Top-K prec=0]

Average Top-1 Accuracy: 0.0000
Average Top-1 Precision: 0.0000



