# Day 29: Hyperparameter Optimization for LLMs - Part 1

In this notebook, we'll implement hyperparameter optimization techniques for fine-tuning large language models. We'll focus on systematic approaches to hyperparameter search and validation strategies.

## Overview

1. Setup and dependencies
2. Defining the model and dataset
3. Implementing a hyperparameter search framework
4. Grid search implementation
5. Random search implementation
6. Analyzing search results
7. Comparing grid search and random search

## 1. Setup and Dependencies

In [None]:
!pip install -q transformers datasets peft evaluate accelerate optuna pandas matplotlib seaborn

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType
)
import evaluate
import optuna
from optuna.visualization import plot_param_importances, plot_optimization_history
from functools import partial
import json
import time
from datetime import datetime

# Select the compute device: CUDA when PyTorch reports one, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

## 2. Defining the Model and Dataset

We'll use a RoBERTa model with LoRA adapters for a text classification task. For demonstration purposes, we'll use the SST-2 sentiment analysis dataset.

In [None]:
# Checkpoint name shared by the tokenizer and by every model built later
base_model_name = "roberta-base"

# Tokenizer that belongs to that checkpoint
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# SST-2 configuration of the GLUE dataset
dataset = load_dataset("glue", "sst2")
print(dataset)

# Preview the first three training rows
for row_index in range(3):
    row = dataset["train"][row_index]
    print(f"Example {row_index + 1}:")
    print(f"Text: {row['sentence']}")
    print(f"Label: {row['label']}")
    print()

In [None]:
# Tokenization helper applied to every split of the dataset
def tokenize_function(examples):
    """Tokenize the ``sentence`` field of a batch of examples.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    """
    # NOTE(review): padding to a fixed length here leaves nothing for the
    # DataCollatorWithPadding created below to pad per batch.
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["sentence"], **tokenizer_options)


# Run the tokenizer over all splits in batched mode
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Splits used for training and for evaluation
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

# Batch collator built around the same tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## 3. Implementing a Hyperparameter Search Framework

We'll create a framework for hyperparameter search that can be used with different search strategies.

In [None]:
# Accuracy metric used to score every evaluation pass
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Turn raw model outputs into an accuracy score.

    Args:
        eval_pred: Two-item iterable of ``(predictions, labels)``; the
            predictions are reduced with ``argmax`` along axis 1.

    Returns:
        Whatever ``accuracy_metric.compute`` returns for the predicted
        classes and the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy_metric.compute(
        predictions=predicted_classes,
        references=references,
    )

In [None]:
# Factory for a LoRA-wrapped sequence classifier
def create_lora_model(lora_r, lora_alpha, lora_dropout, target_modules):
    """Load ``base_model_name`` as a two-label classifier and wrap it with LoRA.

    Args:
        lora_r: Value forwarded as ``r`` to ``LoraConfig``.
        lora_alpha: Value forwarded as ``lora_alpha`` to ``LoraConfig``.
        lora_dropout: Value forwarded as ``lora_dropout`` to ``LoraConfig``.
        target_modules: Module names forwarded as ``target_modules``.

    Returns:
        The model returned by ``get_peft_model``.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        base_model_name,
        num_labels=2,  # binary sentiment labels
        return_dict=True,
    )

    adapter_settings = dict(
        task_type=TaskType.SEQ_CLS,
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=target_modules,
        bias="none",
    )

    return get_peft_model(classifier, LoraConfig(**adapter_settings))

In [None]:
# Helper plus entry point for training/evaluating one hyperparameter configuration
def _supported_kwarg(target, preferred, legacy):
    """Return ``preferred`` when ``target`` accepts that keyword, otherwise ``legacy``."""
    import inspect  # local import keeps the helper self-contained

    accepted = inspect.signature(target).parameters
    return preferred if preferred in accepted else legacy


def train_and_evaluate(hyperparams, output_dir="./results/hp_search", num_epochs=3):
    """Fine-tune one LoRA configuration and return its evaluation metrics.

    Args:
        hyperparams: Mapping with the keys ``learning_rate``, ``batch_size``,
            ``weight_decay``, ``lora_r``, ``lora_alpha``, ``lora_dropout`` and
            ``target_modules``. It is written to disk with ``json.dump``, so
            its values must be JSON-serializable.
        output_dir: Parent directory; each call creates a timestamped
            sub-directory below it for checkpoints and JSON files.
        num_epochs: Number of training epochs passed to ``TrainingArguments``.

    Returns:
        The dict returned by ``trainer.evaluate()`` with an extra
        ``"training_time"`` entry holding the wall-clock seconds spent in
        ``trainer.train()``.

    Raises:
        KeyError: If ``hyperparams`` lacks one of the expected keys.
    """
    # Extract hyperparameters
    learning_rate = hyperparams["learning_rate"]
    batch_size = hyperparams["batch_size"]
    weight_decay = hyperparams["weight_decay"]
    lora_r = hyperparams["lora_r"]
    lora_alpha = hyperparams["lora_alpha"]
    lora_dropout = hyperparams["lora_dropout"]
    target_modules = hyperparams["target_modules"]

    # Create an output directory for this run.
    # NOTE: the timestamp has one-second resolution, so two runs started
    # within the same second would share (and overwrite) a directory.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_output_dir = f"{output_dir}/{timestamp}"
    os.makedirs(run_output_dir, exist_ok=True)

    # Save hyperparameters next to the checkpoints for later inspection
    with open(f"{run_output_dir}/hyperparams.json", "w") as f:
        json.dump(hyperparams, f, indent=2)

    # Create the model
    model = create_lora_model(lora_r, lora_alpha, lora_dropout, target_modules)
    model = model.to(device)

    # `evaluation_strategy` was renamed to `eval_strategy` in newer
    # Transformers releases; use whichever name the installed version accepts.
    eval_strategy_key = _supported_kwarg(
        TrainingArguments, "eval_strategy", "evaluation_strategy"
    )

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=run_output_dir,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size * 2,
        num_train_epochs=num_epochs,
        weight_decay=weight_decay,
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        report_to="none",
        metric_for_best_model="accuracy",
        **{eval_strategy_key: "epoch"},
    )

    # Likewise, `Trainer(tokenizer=...)` was superseded by `processing_class`.
    tokenizer_key = _supported_kwarg(Trainer, "processing_class", "tokenizer")

    # Create the trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
        **{tokenizer_key: tokenizer},
    )

    # Train the model and time the training phase only
    start_time = time.time()
    trainer.train()
    training_time = time.time() - start_time

    # Evaluate the model
    eval_results = trainer.evaluate()

    # Add training time to results
    eval_results["training_time"] = training_time

    # Save evaluation results
    with open(f"{run_output_dir}/eval_results.json", "w") as f:
        json.dump(eval_results, f, indent=2)

    return eval_results

## 4. Grid Search Implementation

Let's implement a grid search to systematically explore the hyperparameter space.

In [None]:
# Candidate values for every hyperparameter explored by the grid search
param_grid = dict(
    learning_rate=[1e-5, 3e-5, 5e-5],
    batch_size=[16, 32],
    weight_decay=[0.01, 0.1],
    lora_r=[4, 8, 16],
    lora_alpha=[16, 32],
    lora_dropout=[0.05, 0.1],
    target_modules=[
        ["query", "key", "value"],
        ["query", "key", "value", "dense"],
    ],
)

In [None]:
# Function to generate all combinations of hyperparameters
def generate_grid_combinations(param_grid):
    """Expand a parameter grid into every combination of its values.

    Args:
        param_grid: Mapping from a parameter name to an iterable of candidate
            values. Keys may be any hashable object, not only strings.

    Returns:
        A list of dicts, one per combination, each mapping every key of
        ``param_grid`` to one of its candidate values. The last key varies
        fastest. An empty grid yields ``[{}]``; a key with no candidate
        values yields ``[]``.
    """
    combinations = [{}]

    for name, choices in param_grid.items():
        # Materialize once so a one-shot iterable is not exhausted after the
        # first partial combination.
        choices = list(choices)
        # Dict unpacking (rather than dict(comb, **{...})) allows non-string keys.
        combinations = [
            {**partial_combo, name: choice}
            for partial_combo in combinations
            for choice in choices
        ]

    return combinations

# Size of the full grid: the product of the number of candidates per parameter
num_combinations = 1
for candidate_values in param_grid.values():
    num_combinations = num_combinations * len(candidate_values)

print(f"Grid search will evaluate {num_combinations} combinations of hyperparameters.")

# Only a handful of combinations are trained here to keep the demo short;
# a real search would evaluate the whole grid.
# NOTE(review): the slice takes the first entries of the grid, so the chosen
# combinations differ only in the last grid keys.
grid_combinations = generate_grid_combinations(param_grid)
subset_size = min(3, len(grid_combinations))
grid_subset = grid_combinations[:subset_size]

print(f"Using a subset of {subset_size} combinations for demonstration.")
for position, combination in enumerate(grid_subset, start=1):
    print(f"\nCombination {position}:")
    for param_name, param_value in combination.items():
        print(f"{param_name}: {param_value}")

In [None]:
# Train and score every configuration in the demo subset
grid_search_results = []

for run_number, hyperparams in enumerate(grid_subset, start=1):
    print(f"\nEvaluating combination {run_number}/{len(grid_subset)}")

    # Two epochs only, to keep the demonstration short
    metrics = train_and_evaluate(
        hyperparams,
        output_dir="./results/grid_search",
        num_epochs=2,
    )

    accuracy = metrics["eval_accuracy"]
    loss = metrics["eval_loss"]
    elapsed = metrics["training_time"]

    # Keep the configuration together with its scores
    grid_search_results.append(
        {
            "hyperparams": hyperparams,
            "eval_accuracy": accuracy,
            "eval_loss": loss,
            "training_time": elapsed,
        }
    )

    print(f"Accuracy: {accuracy:.4f}, Loss: {loss:.4f}, Time: {elapsed:.2f}s")

In [None]:
# Flatten the grid search results into one table row per run
hyperparam_columns = (
    "learning_rate",
    "batch_size",
    "weight_decay",
    "lora_r",
    "lora_alpha",
    "lora_dropout",
)

rows = []
for entry in grid_search_results:
    settings = entry["hyperparams"]
    row = {column: settings[column] for column in hyperparam_columns}
    # The module list is joined into a single string so it fits in one cell
    row["target_modules"] = "-".join(settings["target_modules"])
    row["accuracy"] = entry["eval_accuracy"]
    row["loss"] = entry["eval_loss"]
    row["training_time"] = entry["training_time"]
    rows.append(row)

# Best accuracy first
grid_results_df = pd.DataFrame(rows).sort_values("accuracy", ascending=False)

# Show the table (the bare expression renders it in a notebook)
print("Grid Search Results (sorted by accuracy):")
grid_results_df

## 5. Random Search Implementation

Now let's implement random search using Optuna. Random search scales better than grid search to high-dimensional spaces because it samples configurations instead of enumerating every combination.

In [None]:
# Define the objective function for Optuna
def objective(trial):
    """Sample one hyperparameter configuration, train it, and return its accuracy.

    Args:
        trial: Optuna trial used to sample every hyperparameter.

    Returns:
        The ``"eval_accuracy"`` entry of the results returned by
        ``train_and_evaluate``; the study is expected to maximize it.
    """
    # Optuna's categorical choices must be None/bool/int/float/str, so each
    # set of target modules is encoded as a comma-separated string and
    # decoded back into a list below.
    target_module_choices = [
        "query,key,value",
        "query,key,value,dense",
        # A bare "output" matches RoBERTa's `attention.output` / `output`
        # container modules rather than a Linear layer, which LoRA cannot
        # wrap; "output.dense" targets the Linear layers inside them.
        "query,key,value,output.dense",
    ]

    # Sample hyperparameters
    hyperparams = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-4, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [16, 32, 64]),
        "weight_decay": trial.suggest_float("weight_decay", 0.01, 0.2),
        "lora_r": trial.suggest_categorical("lora_r", [4, 8, 16, 32]),
        "lora_alpha": trial.suggest_categorical("lora_alpha", [16, 32, 64]),
        "lora_dropout": trial.suggest_float("lora_dropout", 0.0, 0.2),
        "target_modules": trial.suggest_categorical(
            "target_modules", target_module_choices
        ).split(","),
    }

    # Train and evaluate with these hyperparameters
    eval_results = train_and_evaluate(
        hyperparams,
        output_dir="./results/random_search",
        num_epochs=2,  # Reduced for demonstration
    )

    # Return the metric to optimize
    return eval_results["eval_accuracy"]

In [None]:
# Create a study object and optimize the objective function.
# Optuna's default sampler is TPE, so RandomSampler is passed explicitly to
# make this study an actual random search.
study = optuna.create_study(
    direction="maximize",
    study_name="lora_hyperparameter_optimization",
    sampler=optuna.samplers.RandomSampler(),
)

# Run a few trials for demonstration
# In practice, you would run more trials
study.optimize(objective, n_trials=3)

In [None]:
# Report the best trial found by the study and the parameters it used
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value:.4f}")
print("  Params:")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")

## 6. Analyzing Search Results

Let's visualize the results of our hyperparameter search to gain insights.

In [None]:
# Plot how much each hyperparameter influenced the objective.
# Best effort: a plotting failure is reported instead of stopping the notebook.
try:
    fig = plot_param_importances(study)
    fig.show()
except Exception as plot_error:
    print(f"Could not plot parameter importances: {plot_error}")

In [None]:
# Plot the objective value across trials.
# Best effort: a plotting failure is reported instead of stopping the notebook.
try:
    fig = plot_optimization_history(study)
    fig.show()
except Exception as plot_error:
    print(f"Could not plot optimization history: {plot_error}")

In [None]:
# Collect one record per completed trial: its number, its score and the
# parameters it sampled
optuna_results = [
    {
        "trial_number": finished.number,
        "accuracy": finished.value,
        **finished.params,
    }
    for finished in study.trials
    if finished.state == optuna.trial.TrialState.COMPLETE
]

optuna_df = pd.DataFrame(optuna_results)

# Show the table, best accuracy first (the bare expression renders it in a notebook)
print("Random Search Results (sorted by accuracy):")
optuna_df.sort_values("accuracy", ascending=False)

## 7. Comparing Grid Search and Random Search

Let's compare the results of our grid search and random search approaches.

In [None]:
# Best row of each search; None when that search produced no rows.
# grid_results_df is already sorted by accuracy, so its first row is the best.
grid_best = None if grid_results_df.empty else grid_results_df.iloc[0]
if optuna_df.empty:
    random_best = None
else:
    random_best = optuna_df.sort_values("accuracy", ascending=False).iloc[0]

if grid_best is None or random_best is None:
    print("Not enough data to compare grid search and random search.")
else:
    print("Best Grid Search Configuration:")
    print(f"  Accuracy: {grid_best['accuracy']:.4f}")
    print(f"  Learning Rate: {grid_best['learning_rate']}")
    print(f"  Batch Size: {grid_best['batch_size']}")
    print(f"  LoRA Rank: {grid_best['lora_r']}")
    print(f"  LoRA Alpha: {grid_best['lora_alpha']}")

    print("\nBest Random Search Configuration:")
    print(f"  Accuracy: {random_best['accuracy']:.4f}")
    print(f"  Learning Rate: {random_best['learning_rate']}")
    print(f"  Batch Size: {random_best['batch_size']}")
    print(f"  LoRA Rank: {random_best['lora_r']}")
    print(f"  LoRA Alpha: {random_best['lora_alpha']}")

## Conclusion

In this notebook, we've implemented and compared grid search and random search approaches for hyperparameter optimization of LLMs with LoRA adapters. We've seen how to:

1. Define a hyperparameter search space for LoRA fine-tuning
2. Implement grid search to systematically explore the hyperparameter space
3. Use Optuna for more efficient random search
4. Analyze and compare the results of different search strategies

Key takeaways:

- Grid search provides a comprehensive exploration but becomes impractical for high-dimensional spaces
- Random search is more efficient and can find good configurations with fewer trials
- Hyperparameter optimization is crucial for getting the best performance from LLMs
- Different hyperparameters have varying levels of importance for model performance

In Part 2, we'll explore more advanced techniques including early stopping, validation strategies, and preventing catastrophic forgetting.