In [None]:
import pandas as pd
import logging
import torch
from collections import Counter
from ast import literal_eval
from datasets import Dataset
import numpy as np
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from transformers import (
    AlbertTokenizer,
    AlbertConfig,
    AlbertForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)


  from .autonotebook import tqdm as notebook_tqdm
2025-03-04 17:24:29.077386: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741109069.103438  557400 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741109069.111708  557400 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-04 17:24:29.139773: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# prepare logger
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
cuda_available = torch.cuda.is_available()

print('Cuda available? ',cuda_available)

Cuda available?  True


In [3]:
if cuda_available:
  import tensorflow as tf
  # Get the GPU device name.
  device_name = tf.test.gpu_device_name()
  # The device name should look like the following:
  if device_name == '/device:GPU:0':
      print('Found GPU at: {}'.format(device_name))
  else:
      raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


I0000 00:00:1741109078.387200  557400 gpu_device.cc:2022] Created device /device:GPU:0 with 13775 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:06.0, compute capability: 7.5


In [4]:
train_df = pd.read_csv("Data/augmented_train_split.csv")
val_df = pd.read_csv("Data/val_split.csv")
dev_df = pd.read_csv("Data/dev.csv")

In [5]:
train_df["text"] = train_df["text"].astype(str)
val_df["text"] = val_df["text"].astype(str)
dev_df["text"] = dev_df["text"].astype(str)

In [6]:
# ✅ **Model Name**
MODEL_NAME = "albert-base-v2"

# ✅ **Load Tokenizer**
tokenizer = AlbertTokenizer.from_pretrained(MODEL_NAME)

# ✅ **Load ALBERT Config with Modified Dropout**
config = AlbertConfig.from_pretrained(
    MODEL_NAME,
    num_labels=2,  # Adjust based on your classification task
    hidden_dropout_prob=0.2,  # Increase dropout for hidden layers
)

# ✅ **Ensure Labels are Integers**
train_df["label"] = train_df["label"].astype(int)
val_df["label"] = val_df["label"].astype(int)
dev_df["label"] = dev_df["label"].astype(int)

# ✅ **Convert Pandas DataFrames to Hugging Face Datasets**
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
dev_dataset = Dataset.from_pandas(dev_df)

# ✅ **Tokenization Function**
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
dev_dataset = dev_dataset.map(tokenize_function, batched=True)

# ✅ **Compute Class Weights**
labels = train_df["label"].values
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(labels), y=labels)
log_class_weights = torch.tensor(np.log1p(class_weights), dtype=torch.float32).to("cuda")

print(f"Class Weights: {class_weights}")  # Debugging output

# ✅ **Modify Trainer to Apply Loss Weights**
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute loss using weighted CrossEntropyLoss"""
        labels = inputs["labels"].view(-1)  # Ensure labels are 1D
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.logits

        # Apply weighted loss
        loss_fct = torch.nn.CrossEntropyLoss(weight=log_class_weights)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

# ✅ **Load ALBERT Model**
model = AlbertForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

# ✅ **Training Arguments**
training_args = TrainingArguments(
    output_dir="./results_albert",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=8,  # Set high, but early stopping will stop it early
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    fp16=True,
    learning_rate=1e-5,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # Monitor Validation Loss
    logging_dir="./logs_albert",
    logging_steps=10,
    report_to="none",
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    greater_is_better=False,  # Lower validation loss is better
)

# ✅ **Define Evaluation Metrics**
def compute_metrics(pred):
    predictions = np.argmax(pred.predictions, axis=1)
    return {"f1": f1_score(pred.label_ids, predictions)}

# ✅ **Use Custom Trainer (Train with val set, dev set reserved for final eval)**
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Use validation set for evaluation
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.0)],  # Stop if val loss increases for 2 consecutive epochs
)

# ✅ **Train ALBERT**
trainer.train()

# ✅ **Final Evaluation on Dev Set**
print("\n✅ Evaluating on Dev Set...")
results = trainer.predict(dev_dataset)
y_pred = np.argmax(results.predictions, axis=1).tolist()

# ✅ **Fix F1 Score Retrieval**
final_f1 = results.metrics.get("test_f1", 0.0)
print(f"✅ Final Dev F1 Score for ALBERT: {final_f1:.4f}")

Map: 100%|██████████| 12057/12057 [00:08<00:00, 1348.41 examples/s]
Map: 100%|██████████| 1675/1675 [00:01<00:00, 1341.06 examples/s]
Map: 100%|██████████| 2094/2094 [00:01<00:00, 1391.15 examples/s]
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class Weights: [0.55236394 5.27427822]


Epoch,Training Loss,Validation Loss,F1
1,0.5248,0.544039,0.234234
2,0.4939,0.470271,0.389916
3,0.4539,0.441532,0.41206
4,0.3658,0.432288,0.425225
5,0.3206,0.493964,0.38949
6,0.3588,0.430961,0.460888
7,0.3536,0.430228,0.454955
8,0.3499,0.434535,0.457014



✅ Evaluating on Dev Set...


✅ Final Dev F1 Score for ALBERT: 0.4765
