## Transformer-Based Experiment: Using `bert-base-uncased`

This notebook evaluates the performance of the `bert-base-uncased` model for multiclass classification of primary progressive aphasia (PPA) subtypes using clinical interview transcripts.

### Objective

To benchmark a transformer-based model against traditional machine learning pipelines by using direct fine-tuning for text classification.

### Preventing Data Leakage

To ensure valid evaluation, a **Group K-Fold cross-validation** strategy is applied:

- Each participant (`SubjectID`) is assigned to exactly one of the five folds, so all of a participant's utterances are held out together.
- This ensures that no data from the same individual is present in both training and testing sets, preventing data leakage and overestimation of performance.

### Experiment Details

- **Model**: `bert-base-uncased` from Hugging Face Transformers
- **Tokenization**: Applied using `AutoTokenizer` with truncation, padding, and a maximum length of 128 tokens
- **Training**:
  - Optimizer: AdamW (learning rate 2e-5)
  - Epochs: 10
  - Batch size: 16
- **Evaluation Metrics** (computed per fold, then averaged across folds):
  - F1-score (weighted)
  - Balanced Accuracy
  - Precision (weighted)
  - Recall (weighted)
  - Hamming Loss
  - AUC (One-vs-Rest multiclass setting, weighted average)

### Dataset Description

The dataset contains transcribed utterances labeled by subtype. It includes four target classes:

- Logopenic Variant (lvPPA)
- Semantic Variant (svPPA)
- Nonfluent Variant (nfvPPA)
- Healthy Controls

Each entry is associated with:
- `SubjectID` (participant ID)
- `Text` (utterance)
- `Subtype` (target label)

### Output

The notebook prints:

- Fold-wise performance metrics
- Averaged scores across all five folds

### Notes

This approach complements other experiments in the study by allowing the transformer model to operate in an end-to-end fine-tuning fashion, rather than as a feature extractor.


In [None]:
# imports
import pandas as pd
import io
import os
from sklearn.metrics import f1_score, balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import copy
import random
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from sklearn.model_selection import GroupKFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoTokenizer
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.metrics import (
    f1_score,
    balanced_accuracy_score,
    precision_score,
    recall_score,
    hamming_loss,
    roc_auc_score
)


In [None]:
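# import the data here: load the transcripts into a pandas DataFrame named `df`
# with the columns `SubjectID`, `Text`, and `Subtype` (the cells below rely on them)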

In [None]:
# For reproducibility: seed every random number generator used in this notebook
# (PyTorch CPU, PyTorch CUDA devices, NumPy, and Python's `random`) with one value.
seed_value = 42
for seed_rng in (
    torch.manual_seed,
    torch.cuda.manual_seed_all,
    np.random.seed,
    random.seed,
):
    seed_rng(seed_value)

In [None]:
# Replace the subtype names in `df['Subtype']` with integer class ids.
# The fitted encoder is kept so ids can be mapped back to the original names.
label_encoder = LabelEncoder().fit(df['Subtype'])
df['Subtype'] = label_encoder.transform(df['Subtype'])

In [None]:
# Load the tokenizer belonging to the "bert-base-uncased" checkpoint; the same
# instance is passed to every TextDataset built in this notebook.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Args:
        texts: Indexable sequence of raw texts; each item is converted with
            ``str()`` before tokenization.
        labels: Indexable sequence of integer class ids aligned with ``texts``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in ``__getitem__`` and returns
            a mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length that every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return the encoded text and its label at ``index`` as a dict of tensors.

        The dict has the keys ``input_ids``, ``attention_mask`` and ``labels``,
        which are the keyword names the training loop forwards to the model.
        """
        # Call the tokenizer directly: `encode_plus` is deprecated in
        # Transformers in favour of `__call__`, which takes the same arguments.
        encoding = self.tokenizer(
            str(self.texts[index]),
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # Flatten to 1-D so each item is a plain sequence of token ids /
            # mask values that the DataLoader can stack into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[index], dtype=torch.long),
        }

# Token budget per utterance; the cross-validation cell declares the same value again.
MAX_LEN = 128
# Dataset over the complete DataFrame. The cross-validation loop builds its own
# per-fold datasets and does not read this object.
dataset = TextDataset(
    texts=df['Text'].to_numpy(),
    labels=df['Subtype'].to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)


In [None]:
# Device for all computations: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Sanity check before grouping by subject: shows True if any row lacks a SubjectID.
df['SubjectID'].isna().any()

In [None]:

# define parameters
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 10
N_SPLITS = 5  # number of cross-validation folds (also shown in the summary header)
LEARNING_RATE = 2e-5


# define subject-grouped cross-validation: rows sharing a SubjectID are kept in
# the same fold, so no participant is split between training and validation
groups = df['SubjectID']
kf = GroupKFold(n_splits=N_SPLITS)

# number of target classes; it does not change between folds, so compute it once
num_labels = len(df['Subtype'].unique())

# initialize metrics storage (one entry per fold)
f1_scores = []
balanced_accuracies = []
precisions = []
recalls = []
hamming_losses = []
auc_scores = []

# perform cross-validation
for fold, (train_index, val_index) in enumerate(kf.split(df, groups=groups)):
    print(f"\nFold {fold + 1}")

    # split the data for the current fold
    train_texts, val_texts = df.iloc[train_index]['Text'], df.iloc[val_index]['Text']
    train_labels, val_labels = df.iloc[train_index]['Subtype'], df.iloc[val_index]['Subtype']

    # create datasets and dataloaders
    train_dataset = TextDataset(train_texts.to_numpy(), train_labels.to_numpy(), tokenizer, MAX_LEN)
    val_dataset = TextDataset(val_texts.to_numpy(), val_labels.to_numpy(), tokenizer, MAX_LEN)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Drop the references to the previous fold's model, optimizer and last
    # training outputs/loss before loading a new model, so the old weights and
    # optimizer state can be freed instead of coexisting with the new ones.
    # The model of the final fold stays available after the loop.
    model = optimizer = outputs = loss = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # initialize a fresh model and optimizer for each fold
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=num_labels
    )
    model = model.to(device)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

    # training loop
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0.0
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # passing `labels` makes the model return the classification loss
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        # mean training loss per batch for this epoch
        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

    # evaluation loop
    model.eval()
    true_labels = []
    pred_labels = []
    probabilities = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            probs = torch.softmax(logits, dim=-1)  # get probabilities for AUC

            # the reference labels are only needed for scoring, so they are
            # read from the batch directly instead of going through the device
            true_labels.extend(batch['labels'].numpy())
            pred_labels.extend(predictions.cpu().numpy())
            probabilities.extend(probs.cpu().numpy())

    # calculate metrics for this fold
    f1 = f1_score(true_labels, pred_labels, average='weighted')
    balanced_acc = balanced_accuracy_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels, average='weighted')
    recall = recall_score(true_labels, pred_labels, average='weighted')
    hamming = hamming_loss(true_labels, pred_labels)

    # calculate AUC (one-vs-rest for multiclass)
    try:
        auc = roc_auc_score(
            true_labels, probabilities, multi_class='ovr', average='weighted'
        )
    except ValueError as err:
        # AUC could not be computed for this fold (presumably a class is absent
        # from the validation labels - verify from the message); report the
        # reason instead of hiding it and record NaN so the fold is skipped by
        # the nan-aware average below
        print(f"AUC undefined for fold {fold + 1}: {err}")
        auc = np.nan

    # append metrics for this fold
    f1_scores.append(f1)
    balanced_accuracies.append(balanced_acc)
    precisions.append(precision)
    recalls.append(recall)
    hamming_losses.append(hamming)
    auc_scores.append(auc)

    print(
        f"Fold {fold + 1} - F1-Score: {f1:.4f}, Balanced Accuracy: {balanced_acc:.4f}, "
        f"Precision: {precision:.4f}, Recall: {recall:.4f}, Hamming Loss: {hamming:.4f}, AUC: {auc:.4f}"
    )

# calculate and print the average metrics across all folds
avg_f1 = np.mean(f1_scores)
avg_balanced_acc = np.mean(balanced_accuracies)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_hamming = np.mean(hamming_losses)
avg_auc = np.nanmean(auc_scores)  # ignores folds where AUC was recorded as NaN

print(f"\n{N_SPLITS}-Fold Cross-Validation Results:")
print(f"Average F1-Score: {avg_f1:.4f}")
print(f"Average Balanced Accuracy: {avg_balanced_acc:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average Hamming Loss: {avg_hamming:.4f}")
print(f"Average AUC: {avg_auc:.4f}")


