### Fine-Tune to check HELLASWAG works
This notebook uses the HELLASWAG inspired data generated to fine-tune a LLM. Then, it also uses a portion of the dataset to measure the model improvements.

In [1]:
from pathlib import Path
import json
import evaluate
from datasets import load_dataset, Dataset, DatasetDict, Value
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, DataCollatorWithPadding)
from peft import LoraConfig, get_peft_model
import torch, random, numpy as np

from shared_models import HellaSwagEntry

  from .autonotebook import tqdm as notebook_tqdm


#### Load Data

In [2]:
DATA_PATH = "../data/hellaswag_format/personal_chat_sessions_train_hellaswag.jsonl"

In [3]:
def load_jsonl_pydantic(path: Path):
    """Yield HellaSwagEntry objects parsed with Pydantic."""
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            yield HellaSwagEntry.model_validate_json(line)

records = list(load_jsonl_pydantic(Path(DATA_PATH)))
print(f"Loaded {len(records):,} examples")

Loaded 22,282 examples


#### Explode into Hugging Face Dataset

In [4]:
def explode_examples(ex: HellaSwagEntry):
    endings = [ex.ending0, ex.ending1, ex.ending2, ex.ending3, ex.ending4]
    return [
        {
            "text": f"{ex.context.strip()} [SEP] {endings[i].strip()}",
            "label": int(ex.label == i),   # 1 if correct ending else 0
            "choice_id": i,
        }
        for i in range(5)
    ]

flat = [row for entry in records for row in explode_examples(entry)]

ds = Dataset.from_list(flat).cast_column("label",Value( "int8"))
ds = ds.train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict(train=ds["train"], test=ds["test"])

dataset

Casting the dataset: 100%|██████████| 111410/111410 [00:00<00:00, 405379.12 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'choice_id'],
        num_rows: 89128
    })
    test: Dataset({
        features: ['text', 'label', 'choice_id'],
        num_rows: 22282
    })
})

#### Pick a model

In [5]:
model_checkpoint = "distilbert-base-uncased"  # pick any seq-cls model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
if tokenizer.pad_token is None:                # safety
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

def preprocess(batch):
    return tokenizer(batch["text"],
                     truncation=True,
                     max_length=128)

tokenized = dataset.map(preprocess, batched=True,
                        remove_columns=["text"])

Map: 100%|██████████| 89128/89128 [00:45<00:00, 1963.18 examples/s]
Map: 100%|██████████| 22282/22282 [00:10<00:00, 2144.77 examples/s]


#### Setting the metrics

In [6]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return accuracy.compute(predictions=preds, references=labels)

#### Building the LoRA Model

In [7]:
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2  # binary (correct vs wrong)
)
base_model.resize_token_embeddings(len(tokenizer))

peft_cfg = LoraConfig(
    r=4, lora_alpha=32, target_modules=["q_lin", "v_lin"],
    lora_dropout=0.01, bias="none", task_type="SEQ_CLS",
)
model = get_peft_model(base_model, peft_cfg)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Fine tune setup

In [8]:
ckpt_dir = Path("../data/models/checkpoints")
ckpt_dir.mkdir(parents=True, exist_ok=True)

args = TrainingArguments(
    output_dir=str(ckpt_dir),
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=1e-4,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    logging_steps=50,
)

data_collator = DataCollatorWithPadding(tokenizer)

trainer = Trainer(
    model=base_model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


#### Measure Baseline

In [None]:
baseline_metrics = trainer.evaluate()
print("Baseline accuracy:", baseline_metrics["eval_accuracy"])

  trainer = Trainer(


Baseline accuracy: 0.6691051072614667


#### Let's Fine Tune

In [9]:
trainer.model = model             # swap in PEFT model
train_metrics = trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy
1,0.3799,0.372303,0.0045,0.856117
2,0.3622,0.368939,0.0045,0.856386
3,0.3922,0.371873,0.0045,0.856925


#### Final Accuracy

In [10]:
final_metrics = trainer.evaluate()
print("Final accuracy:", final_metrics["eval_accuracy"])

Final accuracy: 0.856924872094067
