In [3]:
# !pip install numpy
# !pip install pandas
# !pip install scikit-learn
# !pip install torch
# !pip install transformers


from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, hamming_loss, roc_auc_score, average_precision_score
from collections import defaultdict
from torch.amp import autocast, GradScaler
import torch.nn.functional as F
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import time

## Hyperparameters
MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32
EPOCHS = 12
LEARNING_RATE = 1e-05
THRESHOLD = 0.5 # threshold for the sigmoid


## Dataset Class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len, target_column):
        self.tokenizer = tokenizer
        self.texts = df['text_bangla'].tolist()    # for bangla
        self.labels = df[target_column].tolist()  # Use 'label' column
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'targets': torch.tensor(self.labels[index], dtype=torch.long)
        }

## Data
train_file_path = '/content/train.csv'
val_file_path = '/content/val.csv'
test_file_path = '/content/test.csv'

train_df = pd.read_csv(train_file_path)
val_df = pd.read_csv(val_file_path)
test_df = pd.read_csv(test_file_path)

# Specify the target column (assumed to be 'label')
target_column = 'label'
target_names = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

## Tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base", use_fast=True)


train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN, target_column)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN, target_column)
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN, target_column)

#print(train_dataset[0])

## Data Loader
train_data_loader = torch.utils.data.DataLoader(train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

test_data_loader = torch.utils.data.DataLoader(test_dataset,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)
## Device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

## Model
class InceptionBlock(nn.Module):
    """
    Modified Inception block with 5 branches (including 1x1 conv)
    """
    def __init__(self, in_channels=768, branch_out=32):
        super().__init__()
        # Add 1x1 branch
        # self.branch1 = nn.Sequential(
        #     nn.Conv1d(in_channels, branch_out, kernel_size=1),
        #     nn.BatchNorm1d(branch_out),
        #     nn.ReLU()
        # )
        self.branch2 = nn.Sequential(
            nn.Conv1d(in_channels, branch_out, kernel_size=2),
            nn.BatchNorm1d(branch_out),
            nn.ReLU()
        )
        self.branch3 = nn.Sequential(
            nn.Conv1d(in_channels, branch_out, kernel_size=3),
            nn.BatchNorm1d(branch_out),
            nn.ReLU()
        )
        self.branch5 = nn.Sequential(
            nn.Conv1d(in_channels, branch_out, kernel_size=5),
            nn.BatchNorm1d(branch_out),
            nn.ReLU()
        )
        self.branch7 = nn.Sequential(
            nn.Conv1d(in_channels, branch_out, kernel_size=7),
            nn.BatchNorm1d(branch_out),
            nn.ReLU()
        )

    def forward(self, x):
        # No padding needed for 1x1 branch
        # b1 = self.branch1(x)
        b2 = F.pad(self.branch2(x), (0, 1))   # Right-pad 1
        b3 = F.pad(self.branch3(x), (1, 1))   # Pad 1 on both sides
        b5 = F.pad(self.branch5(x), (2, 2))   # Pad 2 on both sides
        b7 = F.pad(self.branch7(x), (3, 3))   # Pad 3 on both sides

        out = torch.cat([b2, b3, b5, b7], dim=1)
        return out

class XLMRInceptionAttention(nn.Module):
    def __init__(self, num_classes, num_transformer_layers=3):
        super().__init__()
        self.xlmr = AutoModel.from_pretrained('xlm-roberta-base')
        hidden_size = self.xlmr.config.hidden_size  # 768

        self.dropout = nn.Dropout(0.3)
        self.branch_out = 16
        self.inception = InceptionBlock(hidden_size, self.branch_out)

        self.fused_dim = hidden_size + 4 * self.branch_out  # 768 + 160 = 928

        self.attention = nn.MultiheadAttention(
            embed_dim=self.fused_dim,
            num_heads=8,
            batch_first=True
        )

        self.dense = nn.Sequential(
            nn.Linear(self.fused_dim, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.ReLU()
        )
        self.final_dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(256, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.xlmr(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        hidden_states = self.dropout(outputs.last_hidden_state)  # (B, L, 768)

        # Inception
        x = hidden_states.permute(0, 2, 1)
        inception_out = self.inception(x)  # (B, 160, L)
        inception_out = inception_out.permute(0, 2, 1)  # (B, L, 160)

        # Fusion
        fused_features = torch.cat([hidden_states, inception_out], dim=2)  # (B, L, 928)

        key_padding_mask = ~attention_mask.bool()
        attn_output, _ = self.attention(
            fused_features, fused_features, fused_features,
            key_padding_mask=key_padding_mask
        )

        # Adaptive pooling
        pooled = F.adaptive_avg_pool1d(attn_output.permute(0, 2, 1), 1).squeeze(-1)

        # Dense + residual
        dense_out = self.dense(pooled)

        logits = self.fc(self.final_dropout(dense_out))  # (B, num_classes)
        return logits


## Setting the model
model = XLMRInceptionAttention(num_classes=len(target_names))
model.to(device)

## Loss & Optimizer
def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets.long())

# define the optimizer
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-3)

## Training function
def train_model(training_loader, model, optimizer):
    losses = []
    correct_predictions = 0
    num_samples = 0
    total_batches = len(training_loader)

    # Set model to training mode (activate dropout, batch norm)
    model.train()

    for batch_idx, data in enumerate(training_loader):
        ids = data['input_ids'].to(device, dtype=torch.long, non_blocking=True)
        mask = data['attention_mask'].to(device, dtype=torch.long, non_blocking=True)
        targets = data['targets'].to(device, dtype=torch.long, non_blocking=True)

        # Forward pass
        outputs = model(ids, mask)
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())

        # Calculate training accuracy
        _, preds = torch.max(outputs, dim=1)
        correct_predictions += torch.sum(preds == targets)
        num_samples += targets.size(0)

        # Backward pass and optimizer step
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

    train_accuracy = float(correct_predictions) / num_samples
    average_loss = np.mean(losses)
    print(f"Training Accuracy: {train_accuracy:.4f} | Training Loss: {average_loss:.4f}")

    return model, train_accuracy, average_loss


def eval_model(validation_loader, model):
    model.eval()
    final_targets = []
    final_outputs = []
    final_probs = []
    losses = []

    with torch.no_grad():
        for data in validation_loader:
            ids = data['input_ids'].to(device, dtype=torch.long, non_blocking=True)
            mask = data['attention_mask'].to(device, dtype=torch.long, non_blocking=True)
            targets = data['targets'].to(device, dtype=torch.long, non_blocking=True)

            # Get model outputs
            outputs = model(ids, mask)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            # Predictions and probabilities
            probs = torch.softmax(outputs, dim=1)  # Softmax for probabilities
            _, preds = torch.max(outputs, dim=1)  # Predicted class indices
            final_outputs.extend(preds.cpu().numpy())
            final_probs.extend(probs.cpu().numpy())
            final_targets.extend(targets.cpu().numpy())

            torch.cuda.empty_cache()

    # Convert to numpy arrays
    final_targets = np.array(final_targets)
    final_outputs = np.array(final_outputs)
    final_probs = np.array(final_probs)

    # Accuracy
    acc = accuracy_score(final_targets, final_outputs)

    # Weighted metrics
    f1 = f1_score(final_targets, final_outputs, average='weighted')
    precision = precision_score(final_targets, final_outputs, average='weighted')
    recall = recall_score(final_targets, final_outputs, average='weighted')

    # Micro-averaged metrics
    micro_f1 = f1_score(final_targets, final_outputs, average='macro')
    micro_precision = precision_score(final_targets, final_outputs, average='macro')
    micro_recall = recall_score(final_targets, final_outputs, average='macro')

    # Hamming Loss
    hamming = hamming_loss(final_targets, final_outputs)

    # AUC-ROC and AUPR
    auc_roc = roc_auc_score(final_targets, final_probs, multi_class='ovr', average='macro')

    # Average Loss
    average_loss = np.mean(losses)

    # Print metrics
    print(f"Validation Accuracy: {acc:.4f}")
    print(f"Weighted F1 Score: {f1}")
    print(f"Macro F1 Score: {micro_f1}")
    print(f"Weighted Precision: {precision}")
    print(f"Macro Precision: {micro_precision}")
    print(f"Weighted Recall: {recall}")
    print(f"Macro Recall: {micro_recall}")
    print(f"AUC-ROC: {auc_roc}")
    print("\nClassification Report:\n", classification_report(final_targets, final_outputs, target_names=target_names))

    return acc, average_loss


#Learning Rate Scheduler
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

# Training & Evaluation Loop
# recording starting time
start = time.time()

history = defaultdict(list)
best_acc = 0.0  # Initialize best accuracy

for epoch in range(1, EPOCHS + 1):
    print(f'Epoch {epoch}/{EPOCHS}')
    model, train_acc, train_loss = train_model(train_data_loader, model, optimizer)
    val_acc, val_loss = eval_model(val_data_loader, model)

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    scheduler.step()

    # Save the best model based on accuracy
    if val_acc > best_acc:
        torch.save(model.state_dict(), "bangla_xlmr_inceptive_16.bin")
        best_acc = val_acc

# recording end time
end = time.time()
print(f"Total training and evaluation time: {end - start} seconds")


## Testing
# Loading pretrained model (best model)
print("\n\nTesting\n\n")
model = XLMRInceptionAttention(num_classes=len(target_names))
model.load_state_dict(torch.load("bangla_xlmr_inceptive_16.bin"))
model = model.to(device)

# recording starting time
start = time.time()
# Evaluate the model using the test data
eval_model(test_data_loader, model)
# recording end time
end = time.time()
print(f"Total test-set evaluation time: {end - start} seconds")

Epoch 1/12
Training Accuracy: 0.5463 | Training Loss: 1.1671
Validation Accuracy: 0.6066
Weighted F1 Score: 0.6025781196874966
Macro F1 Score: 0.6034965766323369
Weighted Precision: 0.6227345975579854
Macro Precision: 0.6298820077175903
Weighted Recall: 0.6066167290886392
Macro Recall: 0.6027723675735659
AUC-ROC: 0.8877179264036722

Classification Report:
               precision    recall  f1-score   support

       anger       0.74      0.59      0.66       756
     disgust       0.49      0.37      0.42      1518
        fear       0.60      0.50      0.55      1631
         joy       0.46      0.80      0.58      1310
     sadness       0.77      0.80      0.79      1784
    surprise       0.71      0.56      0.63      1011

    accuracy                           0.61      8010
   macro avg       0.63      0.60      0.60      8010
weighted avg       0.62      0.61      0.60      8010

Epoch 2/12
Training Accuracy: 0.6281 | Training Loss: 0.9716
Validation Accuracy: 0.6383
Weighted 