Add the project root to Python's import path so local modules (src/...) can be imported easily.

In [None]:
import sys
from pathlib import Path

# The parent of the current working directory is treated as the project root
# (assumes the notebook is launched from a sub-directory of the project).
project_path = Path.cwd().parent

# Register the root on sys.path only once, so re-running this cell does not
# keep appending duplicate entries.
_project_root = str(project_path.resolve())
if _project_root not in sys.path:
    sys.path.append(_project_root)

Import dataset helpers, evaluation utilities, Hugging Face Transformers and PEFT components used for training and tokenization.

In [None]:
import os
from src.dataset.load_data_soda import SODADataLoader
from src.utils.story_eval import get_compute_metrics_function_for_stories
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType
)

Enable tokenizer parallelism via environment variable. This can speed up tokenization on multi-core machines.

In [None]:
# Opt in to parallel tokenization through the environment variable.
os.environ.update({"TOKENIZERS_PARALLELISM": "true"})

Create the SODA dataset loader with simple filtering options and retrieve the dataset object.

In [None]:
# Build the SODA loader with the subset/length filters and the narrative
# formatting switches below, then take the dataset object it exposes.
soda_dataset_obj = SODADataLoader(
    # Subset size and story-length bounds.
    percent_of_all_splits=10,
    max_story_length=250,
    min_story_length=20,
    # Formatting switches for the dialogue / narrative text.
    add_turns_count_in_narrative=True,
    add_characters_in_narrative=True,
    join_dialogue_and_speakers=True,
)
soda_ds = soda_dataset_obj.dataset

In [None]:
# One checkpoint name is used for both the seq2seq base model and its tokenizer.
model_name = "facebook/bart-base"
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Set LoRA finetuning config values.

In [None]:
# LoRA adapter settings for the seq2seq model.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # seq2seq language-modelling task
    target_modules=["q_proj", "v_proj"],  # adapt the attention query and value projections
    r=16,  # rank of the update matrices; a lower rank means fewer parameters
    lora_alpha=32,  # alpha scaling factor
    lora_dropout=0.1,
    bias="none",
)

In [None]:
# Wrap the base model with the LoRA configuration defined above.
model = get_peft_model(model=base_model, peft_config=lora_config)

Define a preprocessing function to tokenize inputs (narrative) and targets (dialogue). Then map it over the dataset.

In [None]:
# Truncation limits passed to the tokenizer as max_length:
# the first for narratives (inputs), the second for dialogues (labels).
max_source_length, max_target_length = 256, 1024

def preprocess(example):
    """Tokenize narratives as model inputs and dialogues as labels.

    Args:
        example: Mapping with "narrative" and "dialogue" entries. The function
            is mapped with ``batched=True``, so each entry holds a batch of
            texts.

    Returns:
        The tokenizer output for the narratives, with an added "labels" entry
        holding the token ids of the tokenized dialogues.
    """
    # Source side: narratives truncated to max_source_length.
    model_inputs = tokenizer(
        example["narrative"],
        truncation=True,
        max_length=max_source_length,
    )
    # Target side: dialogues truncated to max_target_length.
    label_encoding = tokenizer(
        text_target=example["dialogue"],
        truncation=True,
        max_length=max_target_length,
    )
    model_inputs["labels"] = label_encoding["input_ids"]
    return model_inputs

# Apply `preprocess` in batches and drop the columns listed for the train
# split, so the result keeps only what `preprocess` returns.
tokenized_datasets = soda_ds.map(
    preprocess,
    remove_columns=soda_ds["train"].column_names,
    batched=True,
)

Create a data collator to batch examples correctly for seq2seq training.

In [None]:
# Seq2seq collator built from the tokenizer and the LoRA-wrapped model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Set training arguments for the Hugging Face Trainer (epochs, batch size, save/eval strategies, etc.).

In [None]:
# Training configuration for the LoRA fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    # --- Run identity ---
    output_dir="./bart-soda-lora-finetuned-local",
    # The model is facebook/bart-base, so the run is labelled BART (was "BERT").
    run_name="story2dialogue-SODA-BART-LoRA-local",
    label_names=["labels"],

    # --- Optimisation ---
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-4,
    adam_epsilon=1e-8,
    label_smoothing_factor=0.0,
    weight_decay=0.01,
    # NOTE(review): fp16 mixed precision needs a compatible GPU; set this to
    # False for CPU-only runs.
    fp16=True,
    seed=42,

    # --- Evaluation / checkpointing ---
    # Evaluation and saving use the same step interval, which
    # load_best_model_at_end relies on to restore the best checkpoint.
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="rouge2",
    greater_is_better=True,

    # --- Generation during evaluation / prediction ---
    predict_with_generate=True,
    generation_num_beams=4,
    # Let generation run as long as the (truncated) reference dialogues.
    # Without an explicit cap, generation falls back to the model's default
    # maximum length, which can be much shorter than the targets and would
    # distort the generation-based metrics.
    generation_max_length=max_target_length,
    dataloader_num_workers=2,

    # --- Logging ---
    logging_strategy="steps",
    logging_steps=250,
    report_to="none",
)

Prepare the function used to compute evaluation metrics during training and evaluation.

In [None]:
# Metric callback handed to the trainer as `compute_metrics`. It is built from
# the tokenizer (presumably to decode generated token ids back to text;
# confirm in src/utils/story_eval).
compute_metrics_eval = get_compute_metrics_function_for_stories(tokenizer)

Create the Seq2SeqTrainer by wiring the model, training arguments, datasets, data collator and metric function together.

In [None]:
# Assemble the trainer: the LoRA-wrapped model, its training configuration,
# the train/validation splits, the batching collator and the metric callback.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    compute_metrics=compute_metrics_eval,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

Start training the model. This will run for the number of epochs set in the training arguments.

In [None]:
# Launch fine-tuning with the model, data and arguments held by `trainer`.
trainer.train()

Run predictions on the test split to get model outputs for evaluation.

In [None]:
# Run the trainer's prediction loop over the tokenized test split.
raw_predictions = trainer.predict(test_dataset=tokenized_datasets["test"])

In [None]:
# Build a second metric function for the test predictions: it uses the
# "test/" prefix and is asked to save predictions under "preds.csv".
cal_metrices_events_comp_fn = get_compute_metrics_function_for_stories(
    tokenizer,
    save_preds=True,
    save_preds_filename="preds.csv",
    metrics_prefix="test/",
)
# Score the raw test predictions with it.
metrics = cal_metrices_events_comp_fn(raw_predictions)