# Day 29: Validation Strategies and Early Stopping

In this notebook, we'll focus on implementing effective validation strategies and early stopping techniques for fine-tuning large language models.

## Overview

1. Setup and dependencies
2. Implementing K-fold cross-validation
3. Early stopping implementation
4. Analyzing learning curves

## 1. Setup and Dependencies

In [None]:
!pip install -q transformers datasets peft evaluate accelerate scikit-learn matplotlib seaborn

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from sklearn.model_selection import KFold
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType
)
import evaluate
from torch.utils.data import Subset

# Prefer CUDA when PyTorch can see a GPU; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

## 2. Loading and Preparing the Dataset

In [None]:
# Checkpoint name used for the tokenizer below
base_model_name = "roberta-base"

# Load the tokenizer that matches the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load a small dataset for demonstration
# We'll use a subset of SST-2 for faster execution
dataset = load_dataset("glue", "sst2")
train_dataset = dataset["train"].select(range(1000))  # First 1000 training rows
eval_dataset = dataset["validation"].select(range(200))  # First 200 validation rows


# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of rows for ``Dataset.map(batched=True)``.

    Args:
        examples: Batch handed over by ``map``; its "sentence" entry is read.

    Returns:
        The tokenizer output, truncated to at most 128 tokens per sentence.
    """
    # No padding here on purpose: the DataCollatorWithPadding created below
    # pads each batch to its own longest sequence, which avoids padding every
    # row to the full 128 tokens.
    return tokenizer(examples["sentence"], truncation=True, max_length=128)


tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

# Create a data collator that pads dynamically, batch by batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## 3. Implementing K-Fold Cross-Validation

K-fold cross-validation helps ensure our model's performance is robust and not dependent on a specific train-validation split.

In [None]:
# Accuracy metric object reused by every evaluation run
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: Pair of ``(predictions, labels)``. The predictions are
            reduced to class ids with an argmax over axis 1.

    Returns:
        Whatever ``accuracy_metric.compute`` returns for the predicted
        class ids and the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy_metric.compute(
        predictions=predicted_classes,
        references=references,
    )

In [None]:
# Factory for a LoRA-adapted sequence classifier
def create_lora_model():
    """Build a fresh classifier from ``base_model_name`` wrapped with LoRA.

    Returns:
        The PEFT model produced by ``get_peft_model`` for a newly loaded
        two-label sequence-classification model.
    """
    # Adapter hyperparameters: rank 8, alpha 16, dropout 0.1, applied to the
    # modules named "query", "key" and "value"; bias terms are left alone.
    adapter_settings = LoraConfig(
        bias="none",
        target_modules=["query", "key", "value"],
        lora_dropout=0.1,
        lora_alpha=16,
        r=8,
        task_type=TaskType.SEQ_CLS,
    )

    # Two output labels: binary sentiment classification
    classifier = AutoModelForSequenceClassification.from_pretrained(
        base_model_name, num_labels=2, return_dict=True
    )

    return get_peft_model(classifier, adapter_settings)

In [None]:
# Implement K-fold cross-validation
def k_fold_cross_validation(dataset, k=5, num_epochs=2):
    """Train and evaluate a fresh LoRA model on each of ``k`` folds.

    Args:
        dataset: Tokenized dataset to split. Objects exposing ``select``
            (Hugging Face ``datasets.Dataset``) are sliced with it so that
            ``Trainer`` still receives a real ``Dataset``; anything else
            falls back to ``torch.utils.data.Subset``.
        k: Number of folds handed to ``KFold``.
        num_epochs: Training epochs per fold.

    Returns:
        A list with one dict per fold containing "fold" (1-based),
        "accuracy" and "loss" taken from the fold's final evaluation.
    """

    def take_rows(row_indices):
        # Dataset.select keeps the Hugging Face Dataset type, which lets
        # Trainer drop unused columns (e.g. the raw text) on the dataset
        # itself. A torch Subset hides that type from Trainer.
        if hasattr(dataset, "select"):
            return dataset.select(row_indices)
        return Subset(dataset, row_indices)

    # Fixed seed so the fold assignment is reproducible
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    # Store results for each fold
    fold_results = []

    # KFold only needs something with the right length to derive indices
    indices = list(range(len(dataset)))

    # Perform K-fold cross-validation
    for fold, (train_idx, val_idx) in enumerate(kf.split(indices)):
        print(f"\nTraining fold {fold+1}/{k}")

        # Create train and validation subsets
        train_subset = take_rows(train_idx)
        val_subset = take_rows(val_idx)

        # Create a new model for each fold so no weights leak between folds
        model = create_lora_model().to(device)

        # Evaluate and checkpoint once per epoch; with no
        # metric_for_best_model given, Trainer ranks checkpoints by eval
        # loss and reloads the best one when training ends.
        training_args = TrainingArguments(
            output_dir=f"./results/fold_{fold+1}",
            learning_rate=3e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=32,
            num_train_epochs=num_epochs,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            push_to_hub=False,
            report_to="none",
            logging_steps=10
        )

        # Create the trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_subset,
            eval_dataset=val_subset,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics
        )

        # Train the model
        trainer.train()

        # Evaluate the reloaded best checkpoint on this fold's validation rows
        eval_results = trainer.evaluate()

        # Store results
        fold_results.append({
            "fold": fold + 1,
            "accuracy": eval_results["eval_accuracy"],
            "loss": eval_results["eval_loss"]
        })

        print(f"Fold {fold+1} accuracy: {eval_results['eval_accuracy']:.4f}")

    return fold_results

In [None]:
# Demonstration run: three folds keep the runtime short.
# K=5 or K=10 is the more usual choice in practice.
k_fold_results = k_fold_cross_validation(
    dataset=tokenized_train,
    k=3,
    num_epochs=2,
)

In [None]:
# Tabulate the per-fold results (one row per fold)
k_fold_df = pd.DataFrame(k_fold_results)
print("K-fold Cross-Validation Results:", k_fold_df, sep="\n")

# Summarize the fold accuracies: mean and standard deviation across folds
mean_accuracy = k_fold_df["accuracy"].mean()
std_accuracy = k_fold_df["accuracy"].std()

print(f"\nMean accuracy: {mean_accuracy:.4f} ± {std_accuracy:.4f}")

## 4. Early Stopping Implementation

Early stopping prevents overfitting by monitoring validation performance and stopping training when performance plateaus or degrades.

In [None]:
# Train once with early stopping and collect the logged metrics
def train_with_early_stopping(patience=3, num_epochs=10):
    """Fine-tune a fresh LoRA model with an early-stopping callback.

    Args:
        patience: Forwarded to ``EarlyStoppingCallback`` as
            ``early_stopping_patience``.
        num_epochs: Upper bound on the number of training epochs.

    Returns:
        A dict with:
            "train_metrics": list of {"epoch", "loss"} from training logs,
            "eval_metrics": list of {"epoch", "accuracy", "loss"} from
                evaluation logs,
            "model": the model that was trained,
            "trainer": the ``Trainer`` used for the run.
    """
    lora_model = create_lora_model().to(device)

    # Evaluate and checkpoint every epoch; accuracy decides which checkpoint
    # counts as best and is what the early-stopping callback monitors.
    run_args = TrainingArguments(
        output_dir="./results/early_stopping",
        num_train_epochs=num_epochs,  # Maximum number of epochs
        learning_rate=3e-5,
        weight_decay=0.01,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        evaluation_strategy="epoch",  # Evaluate after each epoch
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",  # Monitor accuracy for early stopping
        logging_steps=10,
        report_to="none",
        push_to_hub=False,
    )

    stopper = EarlyStoppingCallback(early_stopping_patience=patience)
    run_trainer = Trainer(
        model=lora_model,
        args=run_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[stopper],
    )

    run_trainer.train()

    history = run_trainer.state.log_history

    # Training log entries carry "loss" but no "eval_accuracy"
    training_points = [
        {"epoch": record["epoch"], "loss": record["loss"]}
        for record in history
        if "loss" in record and "epoch" in record and "eval_accuracy" not in record
    ]

    # Evaluation log entries carry "eval_accuracy" (and "eval_loss")
    evaluation_points = [
        {
            "epoch": record["epoch"],
            "accuracy": record["eval_accuracy"],
            "loss": record["eval_loss"],
        }
        for record in history
        if "eval_accuracy" in record and "epoch" in record
    ]

    return {
        "train_metrics": training_points,
        "eval_metrics": evaluation_points,
        "model": lora_model,
        "trainer": run_trainer,
    }

In [None]:
# Launch the early-stopping run: at most 10 epochs, patience of 2
print("Training with early stopping...")
early_stopping_results = train_with_early_stopping(
    num_epochs=10,
    patience=2,
)

## 5. Analyzing Learning Curves

Learning curves help us understand the training dynamics and identify potential issues like overfitting.

In [None]:
# Turn the logged metric records into DataFrames
train_df = pd.DataFrame(early_stopping_results["train_metrics"])
eval_df = pd.DataFrame(early_stopping_results["eval_metrics"])

# One figure, two side-by-side panels: losses on the left, accuracy on the right
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: training vs. validation loss over epochs
loss_ax.plot(train_df["epoch"], train_df["loss"], label="Training Loss")
loss_ax.plot(eval_df["epoch"], eval_df["loss"], label="Validation Loss")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("Loss Curves")
loss_ax.legend()
loss_ax.grid(True)

# Right panel: validation accuracy, one marker per evaluation
accuracy_ax.plot(eval_df["epoch"], eval_df["accuracy"], label="Validation Accuracy", marker="o")
accuracy_ax.set_xlabel("Epoch")
accuracy_ax.set_ylabel("Accuracy")
accuracy_ax.set_title("Accuracy Curve")
accuracy_ax.legend()
accuracy_ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Analyze when early stopping occurred
best_row = eval_df["accuracy"].idxmax()
best_epoch = eval_df.loc[best_row, "epoch"]
best_accuracy = eval_df["accuracy"].max()

# One evaluation record is logged per epoch, so the row count is the number
# of epochs that actually ran.
epochs_trained = len(eval_df)

# Read the epoch limit from the run itself instead of repeating the literal
# passed to train_with_early_stopping, so the two cannot drift apart.
max_epochs = early_stopping_results["trainer"].args.num_train_epochs

print(f"Best epoch: {best_epoch}")
print(f"Best validation accuracy: {best_accuracy:.4f}")
print(f"Total epochs trained: {epochs_trained}")

if epochs_trained < max_epochs:  # Training ended before the configured maximum
    print(f"Early stopping occurred after epoch {epochs_trained}")

## 6. Evaluating the Final Model

Let's evaluate our final model on the test set.

In [None]:
# Evaluate the final model on the held-out validation subset.
# The tokenized split must be passed here: the raw `eval_dataset` only holds
# the untokenized columns and cannot be fed to the model.
trainer = early_stopping_results["trainer"]
test_results = trainer.evaluate(tokenized_eval)

print("Test Results:")
for key, value in test_results.items():
    print(f"{key}: {value:.4f}")

## Conclusion

In this notebook, we've implemented and explored validation strategies and early stopping techniques for fine-tuning large language models. We've seen how to:

1. Implement K-fold cross-validation to get a robust estimate of model performance
2. Use early stopping to prevent overfitting and reduce training time
3. Analyze learning curves to understand training dynamics

These techniques are essential for developing high-performing models and ensuring they generalize well to unseen data. By carefully validating our models and stopping training at the right time, we can create more efficient and effective language models.