In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import pandas as pd
import random
import time
from torch.utils.data import Sampler
from collections import defaultdict
from pathlib import Path
import sys
import datetime
import json
from torch.utils.data import Dataset
from typing import Union, Optional
import torch.nn.init as init
import logging
from torch.cuda.amp import autocast, GradScaler
from torch import amp
from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from transformers import AutoTokenizer, AutoModel

# Check PyTorch version and device availability
print(f"Torch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Check the tokenizer and model loading functionality
model_path = "/root/emotion-retrieval-embeddings/models/roberta-base-go_emotions"
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModel.from_pretrained(model_path, local_files_only=True)

print(f"Tokenizer vocab size: {tokenizer.vocab_size}")
print(f"Model's config hidden size: {model.config.hidden_size}")


Torch version: 2.6.0+cu124
CUDA available: True
Tokenizer vocab size: 50265
Model's config hidden size: 768


In [4]:
print(model)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

In [20]:
# Redefine the updated SuperBalancedBatchSampler class
class SuperBalancedBatchSampler(Sampler):
    def __init__(self, dataset, batch_size, num_classes, labels, base_single=2, base_multi=2):
        self.dataset = dataset
        self.batch_size = batch_size
        self.num_classes = num_classes
        self.labels = labels
        self.base_single = base_single
        self.base_multi = base_multi
        self.init_goal = base_single + base_multi + 1

        self.class_to_single = defaultdict(list)
        self.class_to_multi = defaultdict(list)

        for idx, lbls in enumerate(labels):
            if isinstance(lbls, int):
                lbls = [lbls]
            if len(lbls) == 1:
                self.class_to_single[lbls[0]].append(idx)
            else:
                for l in lbls:
                    self.class_to_multi[l].append(idx)

    def __iter__(self):
        while True:
            batch_indices = set()
            class_counts = {c: 0 for c in range(self.num_classes)}

            # Phase 1: Add base_single and base_multi samples per class if available
            for c in range(self.num_classes):
                added = 0

                # Add up to base_single single-label samples
                random.shuffle(self.class_to_single[c])
                for idx in self.class_to_single[c]:
                    if idx not in batch_indices:
                        batch_indices.add(idx)
                        for l in self.labels[idx]:
                            class_counts[l] += 1
                        added += 1
                    if added >= self.base_single:
                        break

                # Add up to base_multi multi-label samples
                random.shuffle(self.class_to_multi[c])
                for idx in self.class_to_multi[c]:
                    if idx not in batch_indices:
                        batch_indices.add(idx)
                        for l in self.labels[idx]:
                            class_counts[l] += 1
                        added += 1
                    if added >= self.base_single + self.base_multi:
                        break

            # Phase 2: Iterative balancing using dynamic goal
            goal = self.init_goal
            while len(batch_indices) < self.batch_size:
                class_candidates = [c for c in range(self.num_classes) if class_counts[c] < goal]
                if not class_candidates:
                    goal += 1
                    continue

                c = random.choice(class_candidates)
                use_multi = random.random() < 0.5
                pool = self.class_to_multi[c] if use_multi else self.class_to_single[c]

                if not pool:
                    continue

                idx = random.choice(pool)
                if idx in batch_indices:
                    continue

                batch_indices.add(idx)
                for l in self.labels[idx]:
                    class_counts[l] += 1

            yield list(batch_indices)

    def __len__(self):
        return 1000000

In [None]:
# Load full test set from JSONL
jsonl_path = Path("../data/augmented_go_emotion/test.jsonl")
train_data = []
labels = []

with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        sample = json.loads(line)
        train_data.append(sample)
        labels.append(sample["labels"] if isinstance(sample["labels"], list) else [sample["labels"]])

# Initialize sampler
sampler = SuperBalancedBatchSampler(
    dataset=train_data,
    batch_size=128,
    num_classes=28,
    labels=labels
)

# Draw and evaluate one batch
batch_indices = next(iter(sampler))
batch_labels = [labels[i] for i in batch_indices]

class_counter = Counter()
single_label_count = 0
multi_label_count = 0

for lbls in batch_labels:
    if len(lbls) == 1:
        single_label_count += 1
    else:
        multi_label_count += 1
    for l in lbls:
        class_counter[l] += 1

class_freq_output = {f"Class {c}": class_counter[c] for c in sorted(class_counter)}
sample_type_output = {
    "Single-label samples": single_label_count,
    "Multi-label samples": multi_label_count,
    "Total batch size": len(batch_indices)
}

(class_freq_output, sample_type_output)

({'Class 0': 13,
  'Class 1': 6,
  'Class 2': 6,
  'Class 3': 11,
  'Class 4': 6,
  'Class 5': 7,
  'Class 6': 6,
  'Class 7': 9,
  'Class 8': 6,
  'Class 9': 6,
  'Class 10': 6,
  'Class 11': 6,
  'Class 12': 6,
  'Class 13': 6,
  'Class 14': 6,
  'Class 15': 7,
  'Class 16': 6,
  'Class 17': 8,
  'Class 18': 6,
  'Class 19': 6,
  'Class 20': 7,
  'Class 21': 6,
  'Class 22': 6,
  'Class 23': 6,
  'Class 24': 6,
  'Class 25': 12,
  'Class 26': 6,
  'Class 27': 12},
 {'Single-label samples': 62,
  'Multi-label samples': 66,
  'Total batch size': 128})

### NT-next loss

In [44]:
def nt_xent_loss_vectorized_stable(embeddings: torch.Tensor, labels: list[list[int]], temperature: float = 0.07) -> torch.Tensor:
    device = embeddings.device
    N = embeddings.size(0)

    # Cosine similarity matrix
    sim_matrix = torch.matmul(embeddings, embeddings.T) / temperature

    # Build positive pair mask
    label_sets = [set(l) for l in labels]
    pos_mask = torch.zeros((N, N), dtype=torch.bool, device=device)
    for i in range(N):
        for j in range(N):
            if i != j and label_sets[i].intersection(label_sets[j]):
                pos_mask[i, j] = True

    logits_mask = ~torch.eye(N, dtype=torch.bool, device=device)

    losses = []
    for i in range(N):
        positives = sim_matrix[i][pos_mask[i]]
        all_except_i = sim_matrix[i][logits_mask[i]]

        if positives.numel() == 0:
            continue

        # log-sum-exp trick
        c = torch.max(all_except_i).detach()
        pos_exp = torch.exp(positives - c).sum()
        neg_exp = torch.exp(all_except_i - c).sum()
        loss_i = -torch.log(pos_exp / neg_exp)

        losses.append(loss_i)

    return torch.stack(losses).mean() if losses else torch.tensor(0.0, device=device)


### My loss 

In [46]:
import torch
import torch.nn as nn
import torch.nn.functional as F

def compute_entropy_weight(logits: torch.Tensor, labels: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    """
    Compute entropy-based instance weights based on predicted class probabilities.
    """
    probs = torch.sigmoid(logits)  # Convert logits to probabilities
    entropy = - (probs * torch.log(probs + eps) + (1 - probs) * torch.log(1 - probs + eps))  # Per-class entropy
    per_sample_entropy = (entropy * labels).sum(dim=1) / (labels.sum(dim=1) + eps)  # Average only over active labels
    return per_sample_entropy.detach()

def compute_va_similarity_matrix(va: torch.Tensor) -> torch.Tensor:
    """
    Compute a cosine similarity matrix between VA embeddings of classes.
    """
    va_norm = F.normalize(va, p=2, dim=1)  # L2 normalize across valence/arousal
    return torch.matmul(va_norm, va_norm.T)  # Compute cosine similarity between class pairs


class MultiLabelContrastiveLoss(nn.Module):
    def __init__(self, temperature: float = 0.07, alpha: float = 1.0, use_entropy_weight: bool = True, va_matrix: torch.Tensor = None):
        super().__init__()
        self.temperature = temperature
        self.alpha = alpha
        self.use_entropy_weight = use_entropy_weight
        self.va_matrix = va_matrix  # shape (C, C), cosine similarity between classes

    def forward(self, embeddings: torch.Tensor, logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        """
        embeddings: (N, D) L2-normalized
        logits: (N, C) before sigmoid
        labels: (N, C) binary multi-label (0/1)
        """
        device = embeddings.device
        N, C = labels.shape

        # 1. BCE Loss
        bce_loss = F.binary_cross_entropy_with_logits(logits, labels.float())

        # 2. Contrastive Setup
        sim_matrix = torch.matmul(embeddings, embeddings.T) / self.temperature
        logits_mask = ~torch.eye(N, dtype=torch.bool, device=device)
        label_sets = [set(torch.nonzero(lbl, as_tuple=True)[0].tolist()) for lbl in labels]

        # Optional: entropy-based weighting
        entropy_weights = compute_entropy_weight(logits, labels) if self.use_entropy_weight else torch.ones(N, device=device)

        losses = []
        for i in range(N):
            pos_mask = torch.tensor([i != j and label_sets[i].intersection(label_sets[j]) for j in range(N)], device=device)
            if pos_mask.sum() == 0:
                continue

            positives = sim_matrix[i][pos_mask]
            negatives = sim_matrix[i][logits_mask[i]]

            # Optional: VA-based soft weights
            if self.va_matrix is not None:
                va_weights = []
                for j in range(N):
                    if i == j:
                        continue
                    shared = label_sets[i].intersection(label_sets[j])
                    if shared:
                        va_weights.append(1.0)
                    else:
                        sim = max(self.va_matrix[a][b] for a in label_sets[i] for b in label_sets[j])
                        va_weights.append(sim)
                va_weights = torch.tensor(va_weights, dtype=torch.float, device=device)
            else:
                va_weights = torch.ones_like(negatives)

            # Stable log-sum-exp trick
            c = torch.max(negatives).detach()
            pos_exp = torch.exp(positives - c).sum()
            neg_exp = (torch.exp(negatives - c) * va_weights).sum()
            loss_i = -torch.log(pos_exp / (neg_exp + 1e-8)) * entropy_weights[i]
            losses.append(loss_i)

        contrastive_loss = torch.stack(losses).mean() if losses else torch.tensor(0.0, device=device)
        return bce_loss + self.alpha * contrastive_loss


In [17]:
import torch.nn.functional as F

# Dummy labels for testing
labels = torch.tensor([0])  # Example label for a batch size of 1

# Normalize the output embeddings and compute similarity
embeddings = F.normalize(outputs.last_hidden_state[:, 0], p=2, dim=0)  # Use [CLS] token output
print(f"Embeddings shape: {embeddings.shape}")

# Compute similarity (dummy loss computation for testing)
sim_matrix = torch.matmul(embeddings, embeddings.T)
loss = -torch.mean(torch.log(sim_matrix))

print(f"Calculated loss: {loss.item()}")


Embeddings shape: torch.Size([1, 768])
Calculated loss: -6.643789768218994


In [18]:
# Placeholder for DataLoader setup (use actual dataset for this)
from torch.utils.data import Dataset, DataLoader

class SimpleDataset(Dataset):
    def __init__(self, tokenizer, num_samples=10):
        self.tokenizer = tokenizer
        self.samples = ["This is sample text."] * num_samples

    def __getitem__(self, idx):
        text = self.samples[idx]
        encoding = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
        return encoding

    def __len__(self):
        return len(self.samples)

# Create a simple dataset and dataloader
dataset = SimpleDataset(tokenizer)
dataloader = DataLoader(dataset, batch_size=2)

# Test DataLoader
for batch in dataloader:
    print(batch)
    break


{'input_ids': tensor([[[   0,  713,   16, 7728, 2788,    4,    2]],

        [[   0,  713,   16, 7728, 2788,    4,    2]]]), 'attention_mask': tensor([[[1, 1, 1, 1, 1, 1, 1]],

        [[1, 1, 1, 1, 1, 1, 1]]])}


In [22]:
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.cuda.amp import GradScaler

# Define simple training loop
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
scaler = GradScaler()

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        
        optimizer.zero_grad()
        
        with torch.cuda.amp.autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            embeddings = F.normalize(outputs.last_hidden_state[:, 0], p=2, dim=1)  # Use [CLS] token output
            
            # Dummy loss calculation (to be replaced with actual loss function)
            sim_matrix = torch.matmul(embeddings, embeddings.T)
            loss = -torch.mean(torch.log(sim_matrix))
        
        # Backward pass and optimization
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss}")


  scaler = GradScaler()


NameError: name 'device' is not defined

In [20]:
class EmotionEmbeddingModel(nn.Module):
    def __init__(self, 
                 model_dir: Union[str, Path] = "models/roberta-base-go_emotions", 
                 dropout_rate: float = 0.3, 
                 projection_dim: int = 128):
        super().__init__()

        # Load from your locally saved GoEmotions model
        self.encoder = AutoModel.from_pretrained(str(model_dir), local_files_only=True)

        self.dropout = nn.Dropout(dropout_rate)
        hidden_size = self.encoder.config.hidden_size  # Typically 768 for RoBERTa-base

        # Attach your projection head
        self.projection_head = ProjectionHead(
            input_dim=hidden_size,
            hidden_dim=256,
            output_dim=projection_dim,
            dropout=dropout_rate
        )
        self.projection_head.apply(init_weights)  # Reinitialize only the projection head

    def forward(self, input_ids, attention_mask):
        # Extracting output from RoBERTa
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        
        # Extract CLS token embedding (the first token)
        cls_embeddings = outputs.last_hidden_state[:, 0]  # CLS token is the first token
        cls_embeddings = self.dropout(cls_embeddings)  # Apply dropout

        # Pass the CLS token embedding through the projection head
        projected = self.projection_head(cls_embeddings)
        
        # Return the projected embeddings
        return projected


In [21]:
def train_stage_one(
    model,
    train_loader,
    val_dataset,
    device,
    num_epochs=10,
    lr_encoder=1e-5,
    lr_head=5e-5,
    temperature=0.05,
    checkpoint_dir=Path(__file__).resolve().parent.parent / "checkpoints",
    log_path=Path(__file__).resolve().parent.parent / "stage1_trainning" / "stage_one_training_log.jsonl",
    steps_per_epoch=1000,
    num_classes=28,
    use_compile=False,
):
    logging.info(f"Training started with parameters: num_epochs={num_epochs}, lr_encoder={lr_encoder}, lr_head={lr_head}, temperature={temperature}")

    # Proper compile control
    if use_compile:
        model = torch.compile(model)
    model.to(device)

    checkpoint_dir.mkdir(parents=True, exist_ok=True)
    log_path.parent.mkdir(parents=True, exist_ok=True)

    criterion = BSCLossSingleLabel(temperature=temperature)
    optimizer = torch.optim.AdamW([  
        {"params": model.encoder.parameters(), "lr": lr_encoder},
        {"params": model.projection_head.parameters(), "lr": lr_head}
    ], weight_decay=1e-5)
    scaler = torch.amp.GradScaler()

    best_metric = -1.0  # Higher is better (Precision@5)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        start_time = time.time()

        for step, batch in enumerate(train_loader):
            input_ids = batch["input_ids"].to(device, non_blocking=True)
            attention_mask = batch["attention_mask"].to(device, non_blocking=True)
            labels = batch["label"].to(device, non_blocking=True)

            optimizer.zero_grad()

            with torch.amp.autocast(device_type="cuda", dtype=torch.float16):
                # Forward pass
                embeddings = model(input_ids=input_ids, attention_mask=attention_mask)
                
                # Compute the loss
                loss = criterion(embeddings, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

            if step % 100 == 0:
                logging.info(f"Epoch {epoch + 1} Step {step} | Train Loss: {loss.item():.4f}")

            if step + 1 >= steps_per_epoch:
                break
            
        avg_train_loss = total_loss / (step + 1)
        val_metric = evaluate_embeddings_macro_precision(
            model=model,
            dataset=val_dataset,
            device=device,
            batch_size=64,
            top_k=5,
            num_classes=num_classes
        )

        log_entry = {
            "epoch": epoch + 1,
            "train_loss": avg_train_loss,
            "macro_precision_at_5": val_metric,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        }

        log_training_result(epoch + 1, step=None, train_loss=avg_train_loss, val_loss=1.0 - val_metric)

        # Now write the log_entry to a file
        with open(log_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(log_entry) + "\n")

        logging.info(
            f"[Epoch {epoch + 1}] Avg Train Loss: {avg_train_loss:.4f} | "
            f"Val Macro Precision@5: {val_metric:.6f} | "
            f"Time: {time.time() - start_time:.2f}s"
        )

        # Save checkpoint if best so far
        if val_metric > best_metric:
            best_metric = val_metric
            save_checkpoint(model, optimizer, scaler, epoch + 1, best_metric, checkpoint_dir / "best_checkpoint.pt")
            logging.info(f"Saved new best model at epoch {epoch + 1} with Precision@5: {val_metric:.6f}")

    # Save final checkpoint
    save_checkpoint(model, optimizer, scaler, num_epochs, best_metric, checkpoint_dir / "final_checkpoint.pt")
    logging.info("Training complete. Final model saved.")


NameError: name '__file__' is not defined