In [None]:
# !pip install numpy
# !pip install pandas
# !pip install scikit-learn
# !pip install torch
# !pip install transformers


from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, hamming_loss, roc_auc_score, average_precision_score
from collections import defaultdict
from torch.amp import autocast, GradScaler
import torch.nn.functional as F
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import time

## Hyperparameters
# Tokenization: each text is padded/truncated to this many tokens.
MAX_LEN = 128

# Batch size, identical for the train / validation / test loaders.
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = TEST_BATCH_SIZE = 32

# Optimization schedule.
EPOCHS = 12
LEARNING_RATE = 1e-5

# Threshold for the sigmoid.
# NOTE(review): nothing visible in this file reads THRESHOLD (predictions are
# taken with argmax over the logits) - confirm before relying on it.
THRESHOLD = 0.5


## Dataset Class
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        df: DataFrame holding the text column and the label column.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors`` keyword arguments and returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded or truncated to.
        target_column: Name of the column holding the class label.
        text_column: Name of the column holding the raw text. Defaults to
            ``'text_bangla'`` (the Bangla text column).
    """

    def __init__(self, df, tokenizer, max_len, target_column, text_column='text_bangla'):
        self.tokenizer = tokenizer
        self.texts = df[text_column].tolist()
        self.labels = df[target_column].tolist()
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return the tokenized example at ``index``.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors and
            a scalar ``'targets'`` tensor of dtype ``torch.long``.
        """
        # Collapse every run of whitespace (incl. newlines/tabs) to one space.
        text = " ".join(str(self.texts[index]).split())

        # Call the tokenizer directly: ``encode_plus`` is deprecated in
        # transformers in favour of ``__call__``.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it
            # so the DataLoader can stack items into (batch, max_len).
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'targets': torch.tensor(self.labels[index], dtype=torch.long),
        }

## Data
# Label column and the class names used when printing the classification report.
# NOTE(review): presumably class id i corresponds to target_names[i] - verify
# against how the CSV labels were encoded.
target_column = 'label'
target_names = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

# CSV locations of the three splits (relative to the working directory).
train_file_path, val_file_path, test_file_path = 'train.csv', 'val.csv', 'test.csv'

# Read the splits in train / val / test order.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_file_path, val_file_path, test_file_path)
)

## Tokenizer
tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/banglabert", use_fast=True)

# One dataset per split; all share the tokenizer, max length and label column.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_df, tokenizer, MAX_LEN, target_column)
    for split_df in (train_df, val_df, test_df)
)

#print(train_dataset[0])

## Data Loader
def _make_loader(dataset, batch_size, shuffle):
    """Wrap ``dataset`` in a DataLoader that loads in the main process (``num_workers=0``)."""
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=0,
    )


# Only the training split is shuffled; validation and test keep dataset order.
train_data_loader = _make_loader(train_dataset, TRAIN_BATCH_SIZE, shuffle=True)
val_data_loader = _make_loader(valid_dataset, VALID_BATCH_SIZE, shuffle=False)
test_data_loader = _make_loader(test_dataset, TEST_BATCH_SIZE, shuffle=False)
## Device
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

## Model
class BanglaBERTBase(nn.Module):
    """Pretrained "csebuetnlp/banglabert" encoder with a dropout + linear head.

    Args:
        num_classes: Number of output logits produced by the linear head.
    """

    def __init__(self, num_classes):
        super().__init__()
        encoder = AutoModel.from_pretrained("csebuetnlp/banglabert")
        # Attribute names (bert / drop / fc) determine the state_dict keys,
        # so they must stay stable for saved checkpoints to load.
        self.bert = encoder
        self.drop = nn.Dropout(p=0.3)
        self.fc = nn.Linear(
            in_features=encoder.config.hidden_size,
            out_features=num_classes,
        )

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class logits for a batch of token ids.

        The representation at the first sequence position (the [CLS] token)
        is passed through dropout and then the linear head; no softmax is
        applied here.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token = encoded.last_hidden_state[:, 0, :]
        return self.fc(self.drop(first_token))


## Setting the model
# One output logit per entry in target_names. nn.Module.to() returns the
# module itself, so construction and device placement can be chained.
model = BanglaBERTBase(num_classes=len(target_names)).to(device)

## Loss & Optimizer
def loss_fn(outputs, targets):
    """Return the mean cross-entropy between class logits and class indices.

    Args:
        outputs: Unnormalized logits, one row per sample and one column per class.
        targets: Integer class indices, one per sample; cast to int64 before use.
    """
    class_indices = targets.long()
    return F.cross_entropy(outputs, class_indices)

# Define the optimizer: AdamW over every model parameter (encoder and head alike).
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=1e-3,
)

## Training function
def train_model(training_loader, model, optimizer):
    """Train ``model`` for one pass over ``training_loader``.

    Relies on the module-level ``device`` and ``loss_fn``.

    Args:
        training_loader: Iterable of batches; each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'targets'`` tensors.
            It does not need to support ``len()``.
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns per-class logits.
        optimizer: Optimizer over ``model``'s parameters.

    Returns:
        Tuple ``(model, train_accuracy, average_loss)``. For a loader that
        yields no batches the accuracy is ``0.0`` and the loss is ``nan``.
    """
    losses = []
    correct_predictions = 0
    num_samples = 0

    # Training mode enables dropout.
    model.train()

    for data in training_loader:
        ids = data['input_ids'].to(device, dtype=torch.long, non_blocking=True)
        mask = data['attention_mask'].to(device, dtype=torch.long, non_blocking=True)
        targets = data['targets'].to(device, dtype=torch.long, non_blocking=True)

        # Forward pass
        outputs = model(ids, mask)
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())

        # Running accuracy: the predicted class is the arg-max logit.
        preds = outputs.argmax(dim=1)
        correct_predictions += (preds == targets).sum().item()
        num_samples += targets.size(0)

        # Backward pass; clip the global gradient norm to 1.0 before stepping.
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

    # Guard the empty-loader case instead of dividing by zero.
    train_accuracy = correct_predictions / num_samples if num_samples else 0.0
    average_loss = np.mean(losses) if losses else float('nan')
    print(f"Training Accuracy: {train_accuracy:.4f} | Training Loss: {average_loss:.4f}")

    return model, train_accuracy, average_loss


def eval_model(validation_loader, model):
    """Evaluate ``model`` on a loader, print a metric report, return accuracy and loss.

    Relies on the module-level ``device``, ``loss_fn`` and ``target_names``.
    Class ids are taken to be ``0 .. len(target_names) - 1``, matching the
    columns of the model's logits.

    Args:
        validation_loader: Iterable of batches; each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'targets'`` tensors.
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns per-class logits.

    Returns:
        Tuple ``(acc, average_loss)``: accuracy over all samples and the mean
        of the per-batch losses.
    """
    model.eval()
    final_targets = []
    final_outputs = []
    final_probs = []
    losses = []

    with torch.no_grad():
        for data in validation_loader:
            ids = data['input_ids'].to(device, dtype=torch.long, non_blocking=True)
            mask = data['attention_mask'].to(device, dtype=torch.long, non_blocking=True)
            targets = data['targets'].to(device, dtype=torch.long, non_blocking=True)

            # Get model outputs
            outputs = model(ids, mask)
            losses.append(loss_fn(outputs, targets).item())

            # Class probabilities (for AUC-ROC) and hard predictions (arg-max).
            probs = torch.softmax(outputs, dim=1)
            preds = outputs.argmax(dim=1)
            final_outputs.extend(preds.cpu().numpy())
            final_probs.extend(probs.cpu().numpy())
            final_targets.extend(targets.cpu().numpy())

    # Release cached GPU memory once per evaluation rather than once per batch.
    torch.cuda.empty_cache()

    # Convert to numpy arrays
    final_targets = np.array(final_targets)
    final_outputs = np.array(final_outputs)
    final_probs = np.array(final_probs)

    # Accuracy
    acc = accuracy_score(final_targets, final_outputs)

    # Weighted metrics
    f1 = f1_score(final_targets, final_outputs, average='weighted')
    precision = precision_score(final_targets, final_outputs, average='weighted')
    recall = recall_score(final_targets, final_outputs, average='weighted')

    # Macro-averaged metrics
    macro_f1 = f1_score(final_targets, final_outputs, average='macro')
    macro_precision = precision_score(final_targets, final_outputs, average='macro')
    macro_recall = recall_score(final_targets, final_outputs, average='macro')

    # One-vs-rest macro AUC-ROC. scikit-learn raises ValueError when a class
    # is absent from the targets; report nan instead of aborting the run.
    try:
        auc_roc = roc_auc_score(final_targets, final_probs, multi_class='ovr', average='macro')
    except ValueError as err:
        auc_roc = float('nan')
        print(f"AUC-ROC could not be computed: {err}")

    # Average Loss
    average_loss = np.mean(losses)

    # Print metrics
    print(f"Validation Accuracy: {acc:.4f}")
    print(f"Weighted F1 Score: {f1}")
    print(f"Macro F1 Score: {macro_f1}")
    print(f"Weighted Precision: {precision}")
    print(f"Macro Precision: {macro_precision}")
    print(f"Weighted Recall: {recall}")
    print(f"Macro Recall: {macro_recall}")
    print(f"AUC-ROC: {auc_roc}")
    # Passing explicit labels keeps the report aligned with target_names even
    # when some class never occurs in this split.
    class_ids = list(range(len(target_names)))
    print(
        "\nClassification Report:\n",
        classification_report(
            final_targets, final_outputs, labels=class_ids, target_names=target_names
        ),
    )

    return acc, average_loss


# Learning Rate Scheduler
# The scheduler is stepped once per epoch, so the cosine half-period must span
# the whole run. With a shorter T_max the learning rate reaches its minimum (0)
# before training ends and then climbs back up for the remaining epochs.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

# Training & Evaluation Loop
# recording starting time
start = time.time()

# Per-epoch metric curves, keyed by metric name.
history = defaultdict(list)
best_acc = 0.0  # best validation accuracy seen so far

for epoch in range(1, EPOCHS + 1):
    print(f'Epoch {epoch}/{EPOCHS}')
    model, train_acc, train_loss = train_model(train_data_loader, model, optimizer)
    val_acc, val_loss = eval_model(val_data_loader, model)

    epoch_stats = {
        'train_acc': train_acc,
        'train_loss': train_loss,
        'val_acc': val_acc,
        'val_loss': val_loss,
    }
    for stat_name, stat_value in epoch_stats.items():
        history[stat_name].append(stat_value)

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

    # Checkpoint whenever validation accuracy strictly improves.
    if val_acc > best_acc:
        torch.save(model.state_dict(), "bangla_banglaBERT_best.bin")
        best_acc = val_acc

# recording end time
end = time.time()
print(f"Total training and evaluation time: {end - start} seconds")


## Testing
# Rebuild the architecture and restore the best checkpoint saved during training.
print("\n\nTesting\n\n")
model = BanglaBERTBase(num_classes=len(target_names))
# map_location="cpu": the fresh model still lives on the CPU at this point, and
# this also lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
# weights_only=True: the file is a plain state_dict, so restrict unpickling to
# tensors and basic containers instead of arbitrary pickled objects.
model.load_state_dict(
    torch.load("bangla_banglaBERT_best.bin", map_location="cpu", weights_only=True)
)
model = model.to(device)

# recording starting time
start = time.time()
# Evaluate the model using the test data
# (eval_model labels its first printed line "Validation Accuracy" even here).
eval_model(test_data_loader, model)
# recording end time
end = time.time()
print(f"Total test-set evaluation time: {end - start} seconds")

Epoch 1/12
Training Accuracy: 0.5863 | Training Loss: 1.0643
Validation Accuracy: 0.6567
Weighted F1 Score: 0.6560102647645687
Macro F1 Score: 0.6519418581047699
Weighted Precision: 0.6596304811167607
Macro Precision: 0.6475131274857787
Weighted Recall: 0.6566791510611736
Macro Recall: 0.662027457370901
AUC-ROC: 0.9140971602230635

Classification Report:
               precision    recall  f1-score   support

       anger       0.60      0.74      0.66       756
     disgust       0.53      0.54      0.54      1518
        fear       0.64      0.59      0.61      1631
         joy       0.65      0.56      0.60      1310
     sadness       0.84      0.82      0.83      1784
    surprise       0.62      0.73      0.67      1011

    accuracy                           0.66      8010
   macro avg       0.65      0.66      0.65      8010
weighted avg       0.66      0.66      0.66      8010

Epoch 2/12
Training Accuracy: 0.6914 | Training Loss: 0.8087
Validation Accuracy: 0.6880
Weighted F

  model.load_state_dict(torch.load("bangla_banglaBERT_best.bin"))
