In [None]:
# Install the third-party libraries imported by the cells below.
# NOTE(review): no versions are pinned, so a fresh run may pull newer releases than the ones this notebook was written against.
%pip install transformers datasets torch scikit-learn

In [None]:
from datasets import load_dataset

# Fetch the GoEmotions dataset through the `datasets` library
dataset = load_dataset("go_emotions")

# Show the overall dataset object, then the first raw training record
print(dataset, dataset["train"][0], sep="\n")


In [None]:
import torch
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

# Names of the emotion classes, read from the dataset's own label feature
label_names = dataset["train"].features["labels"].feature.names
num_labels = len(label_names)

# Binarizer that turns a list of label indices into a multi-hot vector with one
# slot per class.  The class set is fixed up front, so it is fitted exactly once
# here rather than being re-fitted for every single example during `map`.
mlb = MultiLabelBinarizer(classes=list(range(num_labels)))
mlb.fit([[]])


def preprocess_data(example):
    """Replace an example's list of label indices with a multi-hot vector.

    Args:
        example: One dataset row; ``example["labels"]`` holds the indices of
            the classes assigned to that row.

    Returns:
        The same row, with ``example["labels"]`` overwritten by a 1-D array of
        length ``num_labels`` that is 1 at every assigned index and 0 elsewhere.
    """
    # transform() expects a batch of label lists, so wrap the row and unwrap the result.
    example["labels"] = mlb.transform([example["labels"]])[0]
    return example


# Encode the labels of every split.
# NOTE: `dataset` is rebound to the encoded version, so this cell should not be
# run twice on the same kernel: a second run would read the 0/1 vectors as if
# they were label indices.
dataset = dataset.map(preprocess_data)

# Check processed data
print(dataset["train"][0])


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the "bert-base-uncased" checkpoint (the same checkpoint name the model is built from later)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_data(example):
    """Run the module-level ``tokenizer`` over ``example["text"]``.

    Every sequence is truncated and padded to a fixed length of 128 tokens.
    ``example`` may be a single row or a batch of rows (it is used with
    ``batched=True``); the tokenizer's output is returned unchanged.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(example["text"], **encoding_options)

# Apply tokenizer (batched=True hands tokenize_data a batch of rows per call)
dataset = dataset.map(tokenize_data, batched=True)

# Return only the model inputs and labels, as torch tensors, when the dataset is indexed.
# Note: set_format only restricts/converts what is returned; it does not delete the other columns.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Check final processed data
print(dataset["train"][0])


In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel

class MultiLabelBERT(nn.Module):
    """Pretrained transformer encoder with a linear multi-label classification head.

    Args:
        model_name: Checkpoint name or path passed to ``AutoModel.from_pretrained``.
        num_labels: Number of labels, i.e. the width of the returned logits.
        dropout: Dropout probability applied to the pooled sentence vector
            before the classifier. Defaults to 0.3.
    """

    def __init__(self, model_name, num_labels, dropout=0.3):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Compute one raw logit per label for every sequence in the batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            The classifier output (logits). No sigmoid is applied here because
            the notebook trains with ``BCEWithLogitsLoss``, which applies it
            internally.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Prefer the encoder's pooled output (for BERT this is the first-token
        # hidden state passed through the pooler layer).  Encoders that do not
        # provide one fall back to the raw first-token hidden state.
        pooled_output = getattr(outputs, "pooler_output", None)
        if pooled_output is None:
            pooled_output = outputs.last_hidden_state[:, 0]

        dropped_output = self.dropout(pooled_output)
        return self.classifier(dropped_output)

In [None]:
from torch.utils.data import DataLoader

# Number of examples per mini-batch, shared by both loaders
BATCH_SIZE = 64

# Training batches are reshuffled; the validation loader uses the default (no shuffle)
train_dataloader = DataLoader(
    dataset["train"],
    shuffle=True,
    batch_size=BATCH_SIZE,
)
val_dataloader = DataLoader(
    dataset["validation"],
    batch_size=BATCH_SIZE,
)


In [None]:
# `transformers.AdamW` is deprecated in favour of the PyTorch implementation and
# is no longer importable from recent transformers releases, so take AdamW from torch.
from torch.optim import AdamW
from transformers import get_scheduler

# Define model (on the GPU when one is available, otherwise on the CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultiLabelBERT("bert-base-uncased", num_labels).to(device)

# Define optimizer.
# Fine-tuning a pretrained encoder needs a small step size; the usual range for
# BERT is about 2e-5 to 5e-5.  The previous value of 0.01 is several hundred
# times larger and destroys the pretrained weights within a few updates.
LEARNING_RATE = 2e-5
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)

# Learning rate scheduler: linear decay over the whole run, no warmup.
# The epoch count must equal EPOCHS in the training cell (10); otherwise the
# learning rate reaches zero too early or never finishes decaying.
NUM_SCHEDULED_EPOCHS = 10
num_training_steps = len(train_dataloader) * NUM_SCHEDULED_EPOCHS
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Loss function (Binary Cross-Entropy for multi-label); it takes raw logits
criterion = nn.BCEWithLogitsLoss()

In [None]:
# NOTE(review): this repeats the install from the first cell (with extra packages) and runs after
# torch/transformers/datasets were already imported above; consider merging it into the first cell.
%pip install torch torchvision torchaudio transformers datasets tqdm scikit-learn matplotlib numpy pandas seaborn


In [None]:
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np

EPOCHS = 10  # Must match the epoch count used to size the LR schedule (10)

# NOTE(review): `multi_label_accuracy` and `evaluate_model` are not defined in
# this cell; they must already exist in the kernel before this cell is run.

# Track losses and accuracies (one entry per epoch)
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []


def _mean_loss(model, dataloader):
    """Return the average `criterion` loss per batch over `dataloader`.

    Runs without gradients and in eval mode, then restores whichever
    train/eval mode the model was in, so callers see no change of state.
    """
    was_training = model.training
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for batch in dataloader:
            batch_logits = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
            )
            batch_labels = batch["labels"].to(device, dtype=torch.float)
            running_loss += criterion(batch_logits, batch_labels).item()
    model.train(was_training)
    return running_loss / max(len(dataloader), 1)


for epoch in range(EPOCHS):
    model.train()
    # The description is constant for the epoch, so set it once on construction.
    loop = tqdm(train_dataloader, desc=f"Epoch {epoch+1}", leave=True)

    total_loss = 0
    all_train_labels, all_train_preds = [], []

    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device, dtype=torch.float)  # BCEWithLogitsLoss needs float targets

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # Scheduler is stepped per batch, matching num_training_steps

        total_loss += loss.item()

        # Convert logits to probabilities and keep them for the epoch-level accuracy
        preds = torch.sigmoid(logits).detach().cpu().numpy()
        labels_np = labels.cpu().numpy()

        all_train_labels.append(labels_np)
        all_train_preds.append(preds)

        # Update progress bar
        loop.set_postfix(loss=loss.item())

    # Compute training accuracy over every batch seen this epoch
    all_train_labels = np.vstack(all_train_labels)
    all_train_preds = np.vstack(all_train_preds)
    train_acc = multi_label_accuracy(all_train_labels, all_train_preds)

    # Store training loss & accuracy
    train_losses.append(total_loss / len(train_dataloader))
    train_accuracies.append(train_acc)

    # Evaluate validation performance
    val_metrics = evaluate_model(model, val_dataloader)
    # Record an actual validation loss; previously the accuracy was stored here,
    # which made `val_losses` a copy of `val_accuracies`.
    val_losses.append(_mean_loss(model, val_dataloader))
    val_accuracies.append(val_metrics["Accuracy"])

    print(f"\nEpoch {epoch+1} Summary:")
    print(f"Train Loss: {train_losses[-1]:.4f} | Train Accuracy: {train_accuracies[-1]:.4f}")
    print(
        f"Val Loss: {val_losses[-1]:.4f} | Val Accuracy: {val_accuracies[-1]:.4f} "
        f"| Val AUROC: {val_metrics['AUROC']:.4f}"
    )

# After all epochs, evaluate on test set.
# evaluate_model is given a DataLoader for validation, so batch the test split
# the same way instead of passing the raw dataset.
test_dataloader = DataLoader(dataset["test"], batch_size=BATCH_SIZE)
test_metrics = evaluate_model(model, test_dataloader)
print(f"\nFinal Test Accuracy: {test_metrics['Accuracy']:.4f}")
print(f"Test AUROC: {test_metrics['AUROC']:.4f}")
