In [1]:
!pip install transformers
!pip install torch
!pip install tqdm

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
!python -c "import torch; print(torch.__version__)"

2.5.1+cu124


### Imports

In [1]:
# imports

import json
import pandas as pd


from collections import defaultdict

In [2]:
!pwd

/uufs/chpc.utah.edu/common/home/u1471428/nsl/symbolic_recommendation


### Constants

In [3]:
#constants

user_history_filepath = "/scratch/general/vast/u1471428/hugging_face_cache/user_history_data.json" 

### Load Dataset

In [4]:
# load dataset

json_file = open(user_history_filepath, 'r')
user_history = json.load(json_file)
json_file.close()

In [17]:
def count_user_with_history_sizes(user_h):
    cnt_dict = defaultdict(int)
    for user_id in user_history.keys():
        cnt_dict[len(user_history[user_id])]+=1
    print(cnt_dict)
    
def count_user_with_history_size_above(user_h, above):
    cnt = 0
    for user_id in user_history.keys():
        if len(user_history[user_id]) >= above:
            cnt+=1
    print(cnt)

def filter_users(user_h, min_history_length, max_history_length=-1)->dict:
    filtered_users = {}
    count=0
    for user, history in user_h.items():
        if len(history) >= min_history_length:
            filtered_users[user] = history[:min_history_length]
            count+=1
        
        if count==-1:
            break
    return filtered_users
        
    

In [18]:
# count_user_with_history_sizes(user_history)
# count_user_with_history_size_above(user_history, 20)

filtered_users = filter_users(user_history, 20)
print(len(filtered_users))

7297


In [19]:
for user_id in filtered_users.keys():
    history = (filtered_users[user_id])
    print(len(history))
    print(history[0])
    break

20
{'rating': 3.0, 'review_title': "Didn't work for my needs", 'review_text': "Truth be told, I actually got this playpen as I was looking for an alternative to having a crate for the puppy I was getting. I am truly disappointed when I first used it. First, I had a real hard time putting this together. As a single 60 something year old woman, I had to painstakingly, reading instructions, try to put this together by myself. I guess I was hoping I could just open the box and 'pop' open the playpen. Instead, I had to put a load of pipe like pieces together and then string the netting and pace over it. After finally getting it together, I was ready to try the new puppy in there. The puppy was not happy (of course, probably to be expected) and I found her almost able to rip the netting apart. It also did not form the strongest of a surround set up (it was kind of flimsy). My puppy had more fun trying to hide under it when she was outside of it. So, this all being said, I am not sure how it 

In [20]:
assert all(len(user) == 20 for user in filtered_users.values()), "Not all users have a history length of 20"

In [21]:
# for user_id, user_data in filtered_users.items():
#     history_length = len(user_data)
#     print(f"User {user_id}: History length = {history_length}")

### Imports for training

In [22]:
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

### Prepare for Train, Validation and Test split

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [24]:
# device = torch.device("cpu")

In [34]:
# Prepare train and test set

users = list(filtered_users.keys())
train_users, test_users = train_test_split(users, test_size=0.2, random_state=42)
train_data = {user: filtered_users[user] for user in train_users}
test_data = {user: filtered_users[user] for user in test_users}

In [35]:
# Tokenizer and Label Encoder
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
category_encoder = LabelEncoder()
product_encoder = LabelEncoder()

In [36]:
#Fit Encoders

categories = [entry["categories"][-1] for user in users for entry in filtered_users[user]]
category_encoder.fit(categories)
product_ids = [entry["product_id"] for user in users for entry in filtered_users[user]]
product_encoder.fit(product_ids)

LabelEncoder()

In [37]:
def normalize_ratings(histories):
    all_ratings = [h["rating"] for user in histories for h in histories[user]]
    scaler = MinMaxScaler(feature_range=(0,1))
    scaler.fit([[r] for r in all_ratings])
    return scaler

rating_scaler = normalize_ratings(filtered_users)

def preprocess_history(history):
    texts = [
        f"{h.get('review_title','')} {h.get('review_text','')} {h.get('features','')} {h.get('description','')}" for h in history
    ]
    
    texts = [text for text in texts if text.strip()]
    
    tokens = tokenizer(texts, padding="max_length", truncation=True, return_tensors="pt", max_length=128)
    
    ratings = torch.tensor([rating_scaler.transform([[h["rating"]]])[0][0] for h in history], dtype=torch.float32)
    
    categories = torch.tensor(category_encoder.transform([h["categories"][-1] for h in history]))
    
    product_ids = torch.tensor(product_encoder.transform([h["product_id"] for h in history]))
    
    return tokens, ratings, categories, product_ids
    

In [38]:
# Converting into tensor OR dataloader

class UserDataset(Dataset):
    def __init__(self, data, tokenizer, category_encoder, product_encoder, seq_len=15, pred_len=5):
        self.data = data
        self.tokenizer = tokenizer
        self.category_encoder = category_encoder # Is it required?
        self.product_encoder = product_encoder # Is it required?
        self.seq_len = seq_len
        self.pred_len = pred_len
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        user, history = list(self.data.items())[idx]
        tokens, ratings, categories, product_ids = preprocess_history(history)
        input_tokens = {
            "input_ids":tokens["input_ids"][:self.seq_len],
            "attention_mask": tokens["attention_mask"][:self.seq_len],
            "token_type_ids": tokens["token_type_ids"][:self.seq_len],
        }
        target_ids = product_ids[self.seq_len: self.seq_len+self.pred_len]
        
        return input_tokens, ratings[:self.seq_len], categories[:self.seq_len], target_ids

train_dataset = UserDataset(train_data, tokenizer, category_encoder, product_encoder)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [39]:
# Defining the model architecture

def my_function(input_ids, attention_mask, token_type_ids):
    print("input_ids:", input_ids)
    print("attention_mask:", attention_mask)
    print("token_type_ids", token_type_ids)
    
    print("shape_input_ids:", input_ids.shape)
    print("shape_attention_mask:", attention_mask.shape)
    print("shape_token_type_ids", token_type_ids.shape)

# Call function with **tokens
# my_function(**tokens)

class TransformerRecommendationModel(nn.Module):
    def __init__(self, bert_model_name="bert-base-uncased", num_categories=3, num_products=1000, d_model=128, nhead=8, num_decoder_layers=3):
        super(TransformerRecommendationModel, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        bert_hidden_size = self.bert.config.hidden_size 
        
        self.fc_features = nn.Linear(bert_hidden_size+1, d_model) # Bert output + rating
        self.category_embedding = nn.Embedding(num_categories, d_model)
        
        self.tgt_embedding = nn.Embedding(num_products, d_model)
        
        # Transformer Decoder
        self.decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=num_decoder_layers)
        
        self.fc_out = nn.Linear(d_model, num_products)
        self.activation = nn.ReLU()
        
        self.layer_norm = nn.LayerNorm(d_model)
        
        self.init_weights()
    
    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
        
    
    def forward(self, tokens, ratings, categories, tgt):
        batch_size, seq_len, max_token_length = tokens["input_ids"].shape
        
        input_ids = tokens["input_ids"].view(-1, max_token_length)  # [batch_size * seq_len, max_token_length]
        attention_mask = tokens["attention_mask"].view(-1, max_token_length)
        token_type_ids = tokens["token_type_ids"].view(-1, max_token_length)
        
        pooler_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        ).pooler_output
        
#         print("pooler output shape", pooler_out.shape)
#         batch_size, seq_len, _ = ratings.shape
        ratings = ratings.view(batch_size * seq_len, -1).float()
#         print("ratings shape", ratings.shape)
        x = torch.cat([pooler_out, ratings], dim=-1) # add ratings
        x = self.fc_features(x)
        x = self.activation(x)
        x = self.layer_norm(x)
        
#         print("x shape", x.shape)
        
        category_embeds = self.category_embedding(categories).float()
#         print("category embeds shape",category_embeds.shape)
        x = x.view(batch_size,seq_len,-1)
        x = x + category_embeds
        
        x = self.activation(x)
        x = self.layer_norm(x)
#         print("x+category shape", x.shape)
        
        
#         memory = x.repeat(1, tgt.size(1), 1).reshape(batch_size, tgt.size(1), -1)
        memory = x.view(batch_size, seq_len, -1)
#         tgt = tgt.float()
#         print("memory shape", memory.shape)
        
        
        tgt_embeds = self.tgt_embedding(tgt)
#         print("tgt_embeds shape:", tgt_embeds.shape)
        
#         print(tgt_embeds)
        
#         print(memory)
        decoded = self.decoder(tgt=tgt_embeds, memory=memory)
#         print("decoded shape", decoded.shape)
        logits = self.fc_out(decoded)
#         print("logits shape", logits.shape)
        return logits

In [40]:
# Setting configuration
model = TransformerRecommendationModel(num_categories=len(category_encoder.classes_), num_products=len(product_encoder.classes_)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [41]:
# for param in model.parameters():
#     assert param.device.type == "cpu", "Model parameters must be on CPU"

# torch.cuda.empty_cache()

In [42]:
# cnt=0
# for tokens, ratings, categories, target_ids in train_loader:
#     cnt+=1
#     print(cnt)
#     # Check tokens shape consistency
#     token_shapes = {key: tensor.shape for key, tensor in tokens.items()}
#     assert all(shape == next(iter(token_shapes.values())) for shape in token_shapes.values()), \
#         f"Inconsistent shapes in tokens: {token_shapes}"

#     # Check ratings shape consistency
#     assert all(ratings[0].shape == rating.shape for rating in ratings), \
#         f"Inconsistent shapes in ratings: {[rating.shape for rating in ratings]}"

#     # Check categories shape consistency
#     category_shape = categories.shape
#     assert all(categories[0].shape == cat.shape for cat in categories), \
#         f"Inconsistent shapes in categories: {[cat.shape for cat in categories]}"

#     # Check target_ids shape consistency
# #     target_id_shape = target_ids.shape
#     assert all(target_ids[0].shape == target.shape for target in target_ids), \
#         f"Inconsistent shapes in target_ids: {[target.shape for target in target_ids]}"

#     print("Batch shapes are consistent for tokens, ratings, categories, and target_ids.")

In [43]:
# # Writing Train loop

# for epoch in range(5):
#     model.train()
#     cnt=0
#     for tokens, ratings, categories, target_ids in train_loader:
#         cnt+=1
#         print(cnt)
#         optimizer.zero_grad()
#         tokens = {
#             "input_ids": tokens["input_ids"].to(device),
#             "attention_mask": tokens["attention_mask"].to(device),
#             "token_type_ids": tokens["token_type_ids"].to(device),
#         }
#         ratings = ratings.to(device)
#         categories = categories.to(device)
#         target_ids = target_ids.to(device)
        
#         print("target ids shape", target_ids.shape)
        
        
#         logits = model(tokens, ratings, categories, target_ids)
# #         loss = criterion(logits.view(-1, logits.size(-1)), target_ids.view(-1))

#         logits = logits.view(-1, logits.size(-1))  # [batch_size * tgt_seq_len, num_products]
#         targets = target_ids.view(-1)
#         print(targets)
#         assert targets.dtype == torch.long
        
# #         print("Target range:", target.min().item(), target.max().item())
# #         print("Logits shape:", logits.shape)
# #         assert target.max() < logits.size(-1), "Target index out of range"
# #         assert target.min() >= 0, "Target index must be non-negative"

#         loss = criterion(logits, targets)
        
#         loss.backward()
#         print(loss.item())
# #         torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
#         optimizer.step()
#     print(f"Epoch {epoch+1}, Loss: {loss.item()}")

In [45]:
# Writing Train loop

for epoch in range(1):
    model.train()
    epoch_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=True)
    
    for tokens, ratings, categories, target_ids in progress_bar:
        
        optimizer.zero_grad()
        tokens = {
            "input_ids": tokens["input_ids"].to(device),
            "attention_mask": tokens["attention_mask"].to(device),
            "token_type_ids": tokens["token_type_ids"].to(device),
        }
        ratings = ratings.to(device)
        categories = categories.to(device)
        target_ids = target_ids.to(device)
        
#         print("target ids shape", target_ids.shape)
        
        
        logits = model(tokens, ratings, categories, target_ids)
#         loss = criterion(logits.view(-1, logits.size(-1)), target_ids.view(-1))
        
        predicted_indices = torch.argmax(logits, dim=-1)
#         print("Predicted items",predicted_indices.shape, predicted_indices)
        
        logits = logits.view(-1, logits.size(-1))  # [batch_size * tgt_seq_len, num_products]
        targets = target_ids.view(-1)
        
        
#         print(targets)
#         assert targets.dtype == torch.long
        
#         print("Target range:", target.min().item(), target.max().item())
#         print("Logits shape:", logits.shape)
#         assert target.max() < logits.size(-1), "Target index out of range"
#         assert target.min() >= 0, "Target index must be non-negative"

        loss = criterion(logits, targets)
        
        loss.backward()
#         print(loss.item())
#         torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
#         print(loss.item())
    
#     torch.save(model.state_dict(), "baseline_model.pth")
    print("Model's state dictionary saved to 'baseline_model.pth'")
    print(f"Epoch {epoch + 1}, Average Loss: {epoch_loss / len(train_loader):.4f}")
#     print(f"Epoch {epoch+1}, Loss: {loss.item()}")

Epoch 1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 365/365 [20:19<00:00,  3.34s/it, loss=10.2]

Model's state dictionary saved to 'baseline_model.pth'
Epoch 1, Average Loss: 10.2518





In [46]:
torch.save(model.state_dict(), "baseline_model.pth")
print("Model's state dictionary saved to 'baseline_model.pth'")

Model's state dictionary saved to 'baseline_model.pth'


In [47]:
model.load_state_dict(torch.load("baseline_model.pth"))

  model.load_state_dict(torch.load("baseline_model.pth"))


<All keys matched successfully>

In [48]:
test_dataset = UserDataset(test_data, tokenizer, category_encoder, product_encoder)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [49]:
def compute_recall(logits, target_ids, k=5):
    print("***********")
    preds = torch.topk(logits, k=k, dim=-1).indices
    
    correct = (preds == target_ids.unsqueeze(-1)).any(dim=-1)
    
    print(preds, target_ids.unsqueeze(-1))
    
    recall = correct.float().mean().item()
    return recall

def mean_reciprocal_rank(logits, target_ids):
    rankings = torch.argsort(logits, dim=-1, descending=True)
    target_ranks = (rankings==target_ids.unsqueeze(-1)).nonzero(as_tuple=True)[-1]+1
    
    reciprocal_ranks = 1.0/target_ranks.float()
    mrr = reciprocal_ranks.mean().item()
    return mrr

def precision_at_k(logits, target_ids, k=5):
    top_k_preds = torch.topk(logits, k=k, dim=-1).indices
    
    correct = (top_k_preds == target_ids.unsqueeze(-1)).float()
    
    precision = correct.sum(dim=-1)/k
    precision = precision.mean().item()
    return precision



def evaluate_model(model, test_loader, product_encoder, k=5, output_file="predictions.txt"):
    model.eval()
    recall = 0
    mrr = 0
    precision_k = 0
    total_samples = 0
    
    with open(output_file, "w") as f:
        with torch.no_grad():
            for batch in tqdm(test_loader, desc="Evaluating", leave=True):
                
                tokens, ratings, categories, target_ids = batch
#                 print(target_ids)
                ratings = ratings.to(device)
                categories = categories.to(device)
                target_ids = target_ids.to(device)

                tokens = {
                    "input_ids": tokens["input_ids"].to(device),
                    "attention_mask": tokens["attention_mask"].to(device),
                    "token_type_ids": tokens["token_type_ids"].to(device),
                }

                logits = model(tokens, ratings, categories, target_ids)
                
                
                predicted_indices = torch.argmax(logits, dim=-1)
#                 print("Predicted items",predicted_indices)
                
                for i in range(target_ids.size(0)):
                        print(target_ids[i])
                        actual = [product_encoder.inverse_transform([target_ids[i,j].item()])[0] for j in range(target_ids.size(1))]
                        print(actual)
                        predicted = [product_encoder.inverse_transform([predicted_indices[i,j].item()])[0] for j in range(target_ids.size(1))]
                        f.write(f"{predicted} ---- {actual}\n")

                batch_size = target_ids.size(0)
                recall+= compute_recall(logits, target_ids, k=k)*batch_size
                mrr+= mean_reciprocal_rank(logits, target_ids)*batch_size
                precision_k+=precision_at_k(logits, target_ids, k=k)*batch_size
                total_samples+=batch_size
                break

            metrics = {
                "Recall": recall/total_samples,
                "MRR": mrr/total_samples,
                "Precision@{}".format(k): precision_k/total_samples,
            }
            return metrics



In [50]:
test_metrics = evaluate_model(model, train_loader,product_encoder, k=5, output_file="baseline_predictions_comparision.txt")
print("Test Metrics:")
for metric, value in test_metrics.items():
    print(f"{metric}: {value: .4f}")

Evaluating:   0%|          | 0/365 [00:02<?, ?it/s]

tensor([28771, 38818,  8920,  9639,  3799], device='cuda:0')
['B0948G4LMS', 'B0C144331J', 'B00L9DSWGQ', 'B00O6UCXUK', 'B005C07F6K']
tensor([ 3407, 19918, 10536, 38791, 31042], device='cuda:0')
['B004T8NOR0', 'B07FK63PSH', 'B00TX1KILI', 'B0BZXB1CK6', 'B09NT5ZK6T']
tensor([ 6025, 12609, 38354,  3141,  8102], device='cuda:0')
['B00B1M2N88', 'B01AANNCP6', 'B0BXW17H4C', 'B004JU0H7I', 'B00HZFJQ12']
tensor([13234, 22257, 21550, 10111, 27026], device='cuda:0')
['B01DNYOKHS', 'B07QGVYYYX', 'B07MX3QGR1', 'B00Q4X2FSM', 'B08M1DGKK4']
tensor([27049,  2127, 14827, 36900, 13288], device='cuda:0')
['B08M6ZCRRT', 'B0034GGCY0', 'B01MD05Q8F', 'B0BQ7F2RW9', 'B01DY8MMQY']
tensor([36723, 32400, 22456, 20831, 25123], device='cuda:0')
['B0BPB8Z23R', 'B09Y7L9KBW', 'B07R7WLHXS', 'B07JVQ28QR', 'B085ZB581W']
tensor([22823,  2153,  6469, 21077, 16337], device='cuda:0')
['B07SSCCTCG', 'B0035ER4WU', 'B00C6CS8PE', 'B07KVVXLMY', 'B0728F31F6']
tensor([19792, 39702, 32026, 19259, 16719], device='cuda:0')
['B07F3Q518C', 




In [36]:
# Validation
for tokens, ratings, categories, target_ids in train_loader:
    print("Tokens:", tokens)
    print("Ratings:", ratings)
    print("Categories:", categories)
    print("Target IDs:", target_ids)
    break

Tokens: {'input_ids': tensor([[[  101, 29324,  2130,  ...,  2489,  1997,   102],
         [  101,  3008,  2022,  ...,  1010, 12109,   102],
         [  101,  2061, 14057,  ...,  2043,  2026,   102],
         ...,
         [  101,  3819,  3336,  ...,  8744,  3111,   102],
         [  101,  1996,  2069,  ...,  4906,  1025,   102],
         [  101,  1037,  2210,  ...,  2053, 11669,   102]],

        [[  101, 11937, 21756,  ...,  1012,  2061,   102],
         [  101,  2200,  3733,  ...,  7554, 20631,   102],
         [  101,  2053, 10514,  ...,  5127,  2838,   102],
         ...,
         [  101,  2204,  8962,  ...,  2675,  1007,   102],
         [  101,  2844,  5437,  ...,     0,     0,     0],
         [  101, 24325,  2023,  ...,  1006,  8698,   102]],

        [[  101,  2305,  7138,  ...,  1012,  2023,   102],
         [  101,  4268,  5527,  ...,  8026,  5761,   102],
         [  101,  4845,  1005,  ...,  2028,  3538,   102],
         ...,
         [  101,  6927,  3917,  ...,  2053,  97

In [9]:
# Evaluation