Andrew Marasco \
*Automated Dialogue Summarization for Messaging Platform* \
BART MVP \
Flatiron School Capstone Project #2 \
January, 2026

## STEP 1.0
Installing & Restarting Runtime

In [1]:
!pip -q install -U transformers datasets evaluate accelerate sentencepiece rouge_score


## Step 1.1

Imports + Global Settings for Model

In [2]:
import numpy as np
from datasets import load_dataset
import evaluate

from transformers import(
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

# Checkpoint identifier handed to from_pretrained for both tokenizer and model.
MODEL_NAME = "facebook/bart-base"
# max_length used when tokenizing dialogues (model inputs).
MAX_SOURCE_LEN = 512
# max_length used when tokenizing summaries (labels).
MAX_TARGET_LEN = 64
# Seed reused for the sampling shuffle and for the training arguments.
SEED = 42

In [3]:
# Record the installed transformers version alongside the results of this run.
import transformers
print(transformers.__version__)

5.0.0


In [4]:
!pip -q install -U transformers accelerate

## STEP 1.2

Loading SAMSum Dataset, Confirming Splits

In [6]:
# Load the SAMSum dataset and show the splits it contains.
ds = load_dataset("knkarthick/samsum")
print(ds)

# Row counts per split, collected once and reported on a single line.
n_train, n_val, n_test = (len(ds[split]) for split in ("train", "validation", "test"))
print("Train:", n_train, "Val:", n_val, "Test:", n_test)

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14731
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
})
Train: 14731 Val: 818 Test: 819


## STEP 1.3

Checkpoint: Token-Length Distribution of Dialogues and Summaries (500-example training sample)

In [7]:
from transformers import AutoTokenizer
tok_tmp = AutoTokenizer.from_pretrained(MODEL_NAME)

# Number of training rows inspected for the length check (capped by the split size below).
LENGTH_SAMPLE_SIZE = 500


def token_len(text):
    """Return how many token ids `tok_tmp` produces for `text`.

    The text is not truncated and special tokens are included in the count.
    """
    return len(tok_tmp(text, truncation=False, add_special_tokens=True)["input_ids"])


def _print_length_stats(name, lengths):
    """Print median, 95th percentile and max of `lengths` under the label `name`."""
    print(f"{name} token lengths ({len(lengths)} sample):",
          "median =", int(np.median(lengths)),
          "p95 =", int(np.percentile(lengths, 95)),
          "max =", int(np.max(lengths)))


# Never ask for more rows than the train split holds, otherwise select() fails.
n_sample = min(LENGTH_SAMPLE_SIZE, len(ds["train"]))
sample = ds["train"].shuffle(seed=SEED).select(range(n_sample))
dialog_lens = [token_len(x["dialogue"]) for x in sample]
sum_lens = [token_len(x["summary"]) for x in sample]

_print_length_stats("Dialogue", dialog_lens)

_print_length_stats("Summary", sum_lens)


Dialogue token lengths (500 sample): median = 120 p95 = 367 max = 809
Summary token lengths (500 sample): median = 24 p95 = 57 max = 78


## STEP 2.0

Loading Tokenizer and Model

In [8]:
# Load the tokenizer and the seq2seq model for the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

Loading weights:   0%|          | 0/259 [00:00<?, ?it/s]

## STEP 2.1

Preprocessing, Tokenizing (static padding + '-100' label masking)

In [9]:
def preprocess(batch):
  """Tokenize a batch of dialogues and summaries into model inputs and labels.

  Dialogues are truncated/padded to MAX_SOURCE_LEN and summaries to
  MAX_TARGET_LEN. Every pad token id in the labels is replaced by -100.

  Args:
    batch: mapping with "dialogue" and "summary" entries for the batch.

  Returns:
    The tokenizer output for the dialogues with an added "labels" entry.
  """
  pad_id = tokenizer.pad_token_id

  encoded = tokenizer(
      batch["dialogue"],
      max_length=MAX_SOURCE_LEN,
      truncation=True,
      padding="max_length",
  )

  target_ids = tokenizer(
      text_target=batch["summary"],
      max_length=MAX_TARGET_LEN,
      truncation=True,
      padding="max_length",
  )["input_ids"]

  # Swap padding positions for -100, one summary at a time.
  masked_labels = []
  for seq in target_ids:
    masked_labels.append([-100 if tok == pad_id else tok for tok in seq])

  encoded["labels"] = masked_labels
  return encoded

# Apply to every split in batches, dropping the original text columns.
tokenized = ds.map(preprocess, batched=True, remove_columns=ds["train"].column_names)
tokenized

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14731
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
})

## Step 2.2

Confirming -100 Masking

In [11]:
i = 0
raw_example = ds["train"][i]
label_row = tokenized["train"][i]["labels"]

# Show the raw text next to its encoded labels so the -100 padding is visible.
print("Example dialogue:\n", raw_example["dialogue"][:300], "...\n")
print("Example summary:\n", raw_example["summary"], "\n")

print("Label ids (first 30):", label_row[:30])
print("Count of -100 in labels:", sum(1 for tok_id in label_row if tok_id == -100))


Example dialogue:
 Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-) ...

Example summary:
 Amanda baked cookies and will bring Jerry some tomorrow. 

Label ids (first 30): [0, 10127, 5219, 17241, 15269, 8, 40, 836, 6509, 103, 3859, 4, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]
Count of -100 in labels: 51


## Step 3.0

Creating ROUGE Metric + compute_metrics

In [12]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
  """Decode predicted and reference token ids and score them with ROUGE.

  Args:
    eval_pred: pair ``(preds, labels)`` of token-id arrays. ``preds`` may also
      be a tuple, in which case its first element is used.

  Returns:
    dict mapping each key returned by ``rouge.compute`` to its value rounded
    to 4 decimal places.
  """
  preds, labels = eval_pred
  if isinstance(preds, tuple):
    preds = preds[0]

  # -100 is a masking sentinel, not a vocabulary id, so it cannot be decoded.
  # Replace it with the pad token (dropped by skip_special_tokens) in BOTH arrays.
  preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

  pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True)
  label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)

  result = rouge.compute(
      predictions=pred_str,
      references=label_str,
      use_stemmer=False,
  )
  return {k: round(v, 4) for k, v in result.items()}

## Step 3.1

Creating Training Arguments for Model

In [18]:
args = Seq2SeqTrainingArguments(
    # Regular (non-hidden) checkpoint directory; the previous ".bart_samsum_mvp"
    # lacked the "/" and therefore created a hidden dot-directory.
    output_dir="./bart_samsum_mvp",
    seed=SEED,

    # Evaluate and checkpoint every 500 steps, log every 100, keep 2 checkpoints.
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,

    # 2 examples per device x 8 accumulation steps = 16 examples per optimizer step.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    fp16=True,

    learning_rate=5e-5,
    num_train_epochs=1,
    predict_with_generate=True,

    # Generation settings for evaluation
    # Tied to the label length limit instead of repeating the literal 64.
    generation_max_length=MAX_TARGET_LEN,
    generation_num_beams=4,
)

## Step 3.1b

Setting Generation Defaults on the Model

In [20]:
# Decoding defaults written onto the model's generation config.
generation_overrides = {
    "no_repeat_ngram_size": 3,
    "repetition_penalty": 1.2,
    "max_length": 64,
    "num_beams": 4,
}
for option, value in generation_overrides.items():
    setattr(model.generation_config, option, value)

## Step 3.2

Creating Data Collator and Trainer

In [21]:
# Collator that batches the tokenized rows; it receives both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Trainer wired to the train split for fitting and the validation split for evaluation,
# scoring generations with compute_metrics.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

## Step 3.3

Baseline Evaluation before Training

In [22]:
# Baseline: score the checkpoint on the trainer's eval_dataset (validation split) before any fine-tuning.
trainer.evaluate()

{'eval_loss': 4.124423980712891,
 'eval_model_preparation_time': 0.0031,
 'eval_rouge1': 0.2956,
 'eval_rouge2': 0.0936,
 'eval_rougeL': 0.2257,
 'eval_rougeLsum': 0.2261,
 'eval_runtime': 336.1547,
 'eval_samples_per_second': 2.433,
 'eval_steps_per_second': 1.217}

## Step 3.4

Actual Training

In [23]:
# Fine-tune for the configured number of epochs; evaluation runs every eval_steps steps.
# NOTE(review): the logged training loss (~14.5) is on a much larger scale than the
# validation loss (~1.55). Presumably the logged value is not normalized by
# gradient_accumulation_steps (8) -- confirm against the installed transformers version.
trainer.train()

Step,Training Loss,Validation Loss,Model Preparation Time,Rouge1,Rouge2,Rougel,Rougelsum
500,14.527634,1.551138,0.0031,0.4748,0.2388,0.3973,0.3979


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=921, training_loss=14.758721168344106, metrics={'train_runtime': 567.3865, 'train_samples_per_second': 25.963, 'train_steps_per_second': 1.623, 'total_flos': 4491013883166720.0, 'train_loss': 14.758721168344106, 'epoch': 1.0})

## Step 3.4a - Evaluating after Training

In [24]:
# Re-score the validation split now that training has finished, for comparison with the baseline.
trainer.evaluate()

{'eval_loss': 1.5029836893081665,
 'eval_model_preparation_time': 0.0031,
 'eval_rouge1': 0.4863,
 'eval_rouge2': 0.2476,
 'eval_rougeL': 0.4049,
 'eval_rougeLsum': 0.4054,
 'eval_runtime': 169.8984,
 'eval_samples_per_second': 4.815,
 'eval_steps_per_second': 2.407,
 'epoch': 1.0}

## Step 3.4b - Checking for Label Bugs

In [25]:
i = 0
labels = tokenized["train"][i]["labels"]

# Tally masked (-100) positions once; every other position is a real token id.
masked_count = sum(1 for tok_id in labels if tok_id == -100)
kept_count = len(labels) - masked_count

print("First 40 label ids:", labels[:40])
print("Num -100:", masked_count)
print("Num non--100:", kept_count)


First 40 label ids: [0, 10127, 5219, 17241, 15269, 8, 40, 836, 6509, 103, 3859, 4, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]
Num -100: 51
Num non--100: 13


## Step 3.4c - Confirming Decoder Labels Decode to the Actual Summary

In [26]:
i = 0
pad_id = tokenizer.pad_token_id

# Undo the -100 masking so the label ids can be decoded back to text.
label_ids = []
for token_id in tokenized["train"][i]["labels"]:
    label_ids.append(pad_id if token_id == -100 else token_id)
decoded = tokenizer.decode(label_ids, skip_special_tokens=True)

print("REFERENCE SUMMARY:\n", ds["train"][i]["summary"])
print("\nDECODED LABELS:\n", decoded)


REFERENCE SUMMARY:
 Amanda baked cookies and will bring Jerry some tomorrow.

DECODED LABELS:
 Amanda baked cookies and will bring Jerry some tomorrow.


## Step 3.5

Evaluation Using ROUGE

In [27]:
# Same validation-split evaluation as in Step 3.4a, this time keeping the returned dict in `metrics`.
metrics = trainer.evaluate()
metrics

{'eval_loss': 1.5029836893081665,
 'eval_model_preparation_time': 0.0031,
 'eval_rouge1': 0.4863,
 'eval_rouge2': 0.2476,
 'eval_rougeL': 0.4049,
 'eval_rougeLsum': 0.4054,
 'eval_runtime': 165.9506,
 'eval_samples_per_second': 4.929,
 'eval_steps_per_second': 2.465,
 'epoch': 1.0}

## Step 3.6

Saving Model (for faster reloads for demos)

In [28]:
# Write the model (via the trainer) and the tokenizer files into one shared folder.
export_dir = "./bart_samsum_mvp_model"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('./bart_samsum_mvp_model/tokenizer_config.json',
 './bart_samsum_mvp_model/tokenizer.json')

## Step 4.0

Evaluating on the test split

In [29]:
# Evaluate on the test split; the explicit eval_dataset replaces the validation split
# the trainer was constructed with.
test_metrics = trainer.evaluate(eval_dataset=tokenized["test"])
test_metrics

{'eval_loss': 1.546617031097412,
 'eval_model_preparation_time': 0.0031,
 'eval_rouge1': 0.4719,
 'eval_rouge2': 0.2326,
 'eval_rougeL': 0.3958,
 'eval_rougeLsum': 0.3956,
 'eval_runtime': 166.673,
 'eval_samples_per_second': 4.914,
 'eval_steps_per_second': 2.46,
 'epoch': 1.0}

## Step 4.1

Building small demo function

In [38]:
import torch

def summarize_dialogue(dialogue, num_beams=4, max_new_tokens=None):
  """Generate a summary of `dialogue` with the notebook's model and tokenizer.

  Args:
    dialogue: dialogue text to summarize; truncated to MAX_SOURCE_LEN tokens.
    num_beams: beam width passed to ``model.generate``.
    max_new_tokens: upper bound on generated tokens. ``None`` (default) uses
      MAX_TARGET_LEN, the same limit applied to the training labels.

  Returns:
    The decoded summary string, or "" when `dialogue` is empty or only whitespace.
  """
  # Nothing to summarize: skip tokenization and generation entirely.
  if isinstance(dialogue, str) and not dialogue.strip():
    return ""

  # Resolved at call time so the limit follows the notebook-level constant.
  if max_new_tokens is None:
    max_new_tokens = MAX_TARGET_LEN

  inputs = tokenizer(
      dialogue,
      return_tensors="pt",
      truncation=True,
      max_length=MAX_SOURCE_LEN,
      padding=True,
  )
  # Move the input tensors to whichever device the model lives on.
  inputs = {k: v.to(model.device) for k, v in inputs.items()}

  with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        no_repeat_ngram_size=3,
        repetition_penalty=1.2,
        early_stopping=True,
    )

  return tokenizer.decode(output_ids[0], skip_special_tokens=True)

## Step 4.2

Generating 8 Demo Examples

In [39]:
demo_idxs = [0, 3, 7, 25, 50, 122, 333, 667]

# Print each chosen test example with its reference summary and the model's summary.
for i in demo_idxs:
  example = ds["test"][i]
  dialogue, ref = example["dialogue"], example["summary"]
  pred = summarize_dialogue(dialogue)

  separator = "=" * 100
  print(separator)
  print(f"TEST IDX: {i}\n")
  print("DIALOGUE: \n", dialogue)
  print("\nREFERENCE SUMMARY:\n", ref)
  print("\nMODEL SUMMARY:\n", pred)

TEST IDX: 0

DIALOGUE: 
 Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

REFERENCE SUMMARY:
 Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.

MODEL SUMMARY:
 Hannah doesn't know Betty's number. She texted Larry last time they were at the park.
TEST IDX: 3

DIALOGUE: 
 Will: hey babe, what do you want for dinner tonight?
Emma:  gah, don't even worry about it tonight
Will: what do you mean? everything ok?
Emma: not really, but it's ok, don't worry about cooking though, I'm not hungry
Will: Well what time will you be home?
Emma: soon, hopefully
Will: you sure? Maybe you want me to pick you up?
Emma: