Andrew Marasco \
*Automated Dialogue Summarization for Messaging Platform* \
BART MVP \
Flatiron School Capstone Project #2 \
January, 2026

## STEP 1.0
Installing & Restarting Runtime

In [1]:
!pip -q install -U transformers datasets evaluate accelerate sentencepiece rouge_score


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m153.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.6/47.6 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


## Step 1.1

Imports + Global Settings for Model

In [2]:
import numpy as np
from datasets import load_dataset
import evaluate

from transformers import(
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

# Hugging Face Hub checkpoint used for both the tokenizer and the seq2seq model.
MODEL_NAME = "facebook/bart-base"
# Token budget for the input dialogue; longer dialogues are truncated at tokenization.
MAX_SOURCE_LEN = 512
# Token budget for the target summary (labels).
MAX_TARGET_LEN = 64
# Seed reused for dataset shuffling and for the training arguments.
SEED = 42

In [3]:
# Print the installed transformers version as a record of the environment.
import transformers
print(transformers.__version__)

5.1.0


In [4]:
!pip -q install -U transformers accelerate

## STEP 1.2

Loading SAMSum Dataset, Confirming Splits

In [5]:
# Download the SAMSum dialogue-summarization dataset and show its structure.
ds = load_dataset("knkarthick/samsum")
print(ds)

# Report the row count of each split on a single line.
n_train, n_val, n_test = (len(ds[split]) for split in ("train", "validation", "test"))
print("Train:", n_train, "Val:", n_val, "Test:", n_test)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/14731 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14731
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
})
Train: 14731 Val: 818 Test: 819


## STEP 1.3

Checkpoint: Token-Length Distribution (sanity check for MAX_SOURCE_LEN / MAX_TARGET_LEN)

In [6]:
from transformers import AutoTokenizer
# Throwaway tokenizer used only for this length check.
tok_tmp = AutoTokenizer.from_pretrained(MODEL_NAME)

def token_len(text):
    """Return how many tokens `text` encodes to (special tokens included, no truncation)."""
    encoded = tok_tmp(text, truncation=False, add_special_tokens=True)
    return len(encoded["input_ids"])

def _length_stats(lengths):
    """Return (median, 95th percentile, max) of `lengths`, each cast to int."""
    return (
        int(np.median(lengths)),
        int(np.percentile(lengths, 95)),
        int(np.max(lengths)),
    )

# Measure a fixed, seeded sample of 500 training examples.
sample = ds["train"].shuffle(seed=SEED).select(range(500))
dialog_lens = [token_len(row["dialogue"]) for row in sample]
sum_lens = [token_len(row["summary"]) for row in sample]

med, p95, longest = _length_stats(dialog_lens)
print("Dialogue token lengths (500 sample):",
      "median =", med,
      "p95 =", p95,
      "max =", longest)

med, p95, longest = _length_stats(sum_lens)
print("Summary token lengths (500 sample):",
      "median =", med,
      "p95 =", p95,
      "max =", longest)


config.json: 0.00B [00:00, ?B/s]



vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Dialogue token lengths (500 sample): median = 120 p95 = 367 max = 809
Summary token lengths (500 sample): median = 24 p95 = 57 max = 78


## STEP 2.0

Loading Tokenizer and Model

In [7]:
# Load the tokenizer and the pretrained seq2seq model from the same checkpoint
# (MODEL_NAME) so their vocabularies match.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/259 [00:00<?, ?it/s]

## STEP 2.1

Preprocessing, Tokenizing (static padding + '-100' label masking)

In [8]:
def preprocess(batch):
  """Tokenize a batch of SAMSum rows into model inputs plus masked labels.

  Dialogues are truncated/padded to MAX_SOURCE_LEN and summaries to
  MAX_TARGET_LEN (static "max_length" padding). Pad positions in the labels
  are replaced with -100.

  Args:
    batch: Mapping with "dialogue" and "summary" lists, as passed by
      `Dataset.map(..., batched=True)`.

  Returns:
    The tokenizer output for the dialogues with an added "labels" entry.
  """
  model_inputs = tokenizer(
      batch["dialogue"],
      max_length=MAX_SOURCE_LEN,
      truncation=True,
      padding="max_length",
  )

  target_encoding = tokenizer(
      text_target=batch["summary"],
      max_length=MAX_TARGET_LEN,
      truncation=True,
      padding="max_length",
  )

  # Swap every pad token id in the targets for -100.
  pad_id = tokenizer.pad_token_id
  model_inputs["labels"] = [
      [-100 if tok == pad_id else tok for tok in seq]
      for seq in target_encoding["input_ids"]
  ]
  return model_inputs

# Tokenize every split in batches; the original columns are dropped so that only
# the fields returned by `preprocess` remain.
tokenized = ds.map(preprocess, batched=True, remove_columns=ds["train"].column_names)
# Display the resulting DatasetDict as the cell output.
tokenized

Map:   0%|          | 0/14731 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14731
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
})

## Step 2.2

Confirming -100 Masking

In [9]:
# Inspect one training example next to its tokenized labels.
i = 0
raw_example = ds["train"][i]
example_labels = tokenized["train"][i]["labels"]

print("Example dialogue:\n", raw_example["dialogue"][:300], "...\n")
print("Example summary:\n", raw_example["summary"], "\n")

# Padded label positions should appear as -100.
masked_count = sum(1 for tok in example_labels if tok == -100)
print("Label ids (first 30):", example_labels[:30])
print("Count of -100 in labels:", masked_count)


Example dialogue:
 Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-) ...

Example summary:
 Amanda baked cookies and will bring Jerry some tomorrow. 

Label ids (first 30): [0, 10127, 5219, 17241, 15269, 8, 40, 836, 6509, 103, 3859, 4, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]
Count of -100 in labels: 51


## Step 3.0

Creating ROUGE Metric + compute_metrics

In [10]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
  """Decode generated and reference token ids and score them with ROUGE.

  Args:
    eval_pred: Pair `(predictions, label_ids)` supplied by the trainer during
      evaluation. `predictions` may be an array of token ids or a tuple whose
      first element is that array.

  Returns:
    Dict of ROUGE scores (as returned by `rouge.compute`) rounded to 4 decimals.
  """
  preds, labels = eval_pred

  # Some configurations return a tuple; only the first element holds token ids.
  if isinstance(preds, tuple):
    preds = preds[0]

  # -100 is a masking/padding sentinel, not a vocabulary id, so it cannot be
  # decoded. Labels always contain it (see preprocessing); predictions may
  # contain it when the trainer pads gathered batches -- guard both.
  pad_id = tokenizer.pad_token_id
  preds = np.where(preds != -100, preds, pad_id)
  labels = np.where(labels != -100, labels, pad_id)

  pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True)
  label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # Strip stray leading/trailing whitespace left over from decoding.
  pred_str = [text.strip() for text in pred_str]
  label_str = [text.strip() for text in label_str]

  result = rouge.compute(
      predictions=pred_str,
      references=label_str,
      use_stemmer=False,
  )
  return {k: round(v, 4) for k, v in result.items()}

Downloading builder script: 0.00B [00:00, ?B/s]

## Step 3.1

Creating Training Arguments for Model

In [11]:
args = Seq2SeqTrainingArguments(
    # NOTE(review): the leading "." with no slash makes this a dot-prefixed
    # directory in the working directory, while the final model is saved under
    # "./bart_samsum_mvp_model". Confirm whether "./bart_samsum_mvp" was intended.
    output_dir=".bart_samsum_mvp",
    seed=SEED,

    # Evaluate and checkpoint every 500 steps, log every 100, and keep at most
    # 2 checkpoints on disk.
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,

    # 2 sequences per device x 8 accumulation steps = 16 sequences per optimizer step.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    # Mixed-precision training.
    # NOTE(review): presumably requires a GPU runtime -- confirm.
    fp16=True,

    learning_rate=5e-5,
    num_train_epochs=1,
    # Score evaluation with generated text (needed for ROUGE) rather than logits.
    predict_with_generate=True,

    # Generation settings for evaluation. The length cap reuses the same
    # constant that bounds the tokenized reference summaries.
    generation_max_length=MAX_TARGET_LEN,
    generation_num_beams=4,
)

## Step 3.1b

Setting Generation Defaults on the Model

In [12]:
# Decoding defaults stored on the model's generation config.
# Repetition controls:
model.generation_config.no_repeat_ngram_size = 3
model.generation_config.repetition_penalty = 1.2
# Length cap and beam width are taken from the shared constant and the training
# arguments instead of repeating the literals, so the settings cannot drift apart.
model.generation_config.max_length = MAX_TARGET_LEN
model.generation_config.num_beams = args.generation_num_beams

## Step 3.2

Creating Data Collator and Trainer

In [13]:
# Collator that assembles tokenized examples into batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Trainer wired to the tokenized train/validation splits; `compute_metrics`
# supplies the ROUGE scores reported at evaluation time.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

## Step 3.3

Baseline Evaluation before Training

In [14]:
# Evaluate the pretrained model on the validation split before fine-tuning to
# obtain a baseline; the metrics dict is displayed as the cell output.
trainer.evaluate()

{'eval_loss': 4.124423980712891,
 'eval_model_preparation_time': 0.003,
 'eval_rouge1': 0.2958,
 'eval_rouge2': 0.0935,
 'eval_rougeL': 0.226,
 'eval_rougeLsum': 0.2259,
 'eval_runtime': 321.4054,
 'eval_samples_per_second': 2.545,
 'eval_steps_per_second': 1.273}

## Step 3.4

Actual Training

In [15]:
# Fine-tune the model with the arguments and datasets configured on the trainer;
# the returned TrainOutput is displayed as the cell output.
trainer.train()

Step,Training Loss,Validation Loss,Model Preparation Time,Rouge1,Rouge2,Rougel,Rougelsum
500,14.525027,1.551265,0.003,0.4713,0.2355,0.3942,0.3944


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=921, training_loss=14.759862744458724, metrics={'train_runtime': 533.9519, 'train_samples_per_second': 27.589, 'train_steps_per_second': 1.725, 'total_flos': 4491013883166720.0, 'train_loss': 14.759862744458724, 'epoch': 1.0})

## Step 3.4a - Evaluating after Training

In [16]:
# Re-evaluate on the validation split now that training has finished, for
# comparison with the pre-training baseline.
trainer.evaluate()

{'eval_loss': 1.5033433437347412,
 'eval_model_preparation_time': 0.003,
 'eval_rouge1': 0.4866,
 'eval_rouge2': 0.2503,
 'eval_rougeL': 0.4078,
 'eval_rougeLsum': 0.4078,
 'eval_runtime': 161.0304,
 'eval_samples_per_second': 5.08,
 'eval_steps_per_second': 2.54,
 'epoch': 1.0}

## Step 3.4b - Checking for Label Bugs

In [17]:
# Sanity-check the label sequence of one training example: masked positions
# (-100) plus real token positions should account for the whole sequence.
i = 0
labels = tokenized["train"][i]["labels"]
num_masked = sum(1 for tok in labels if tok == -100)
num_real = len(labels) - num_masked

print("First 40 label ids:", labels[:40])
print("Num -100:", num_masked)
print("Num non--100:", num_real)


First 40 label ids: [0, 10127, 5219, 17241, 15269, 8, 40, 836, 6509, 103, 3859, 4, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]
Num -100: 51
Num non--100: 13


## Step 3.4c - Confirming Decoder Labels Decode to the Actual Summary

In [18]:
# Round-trip check: restore pad ids in place of -100, decode the labels, and
# compare the text with the original reference summary.
i = 0
pad_id = tokenizer.pad_token_id
label_ids = [pad_id if tok == -100 else tok for tok in tokenized["train"][i]["labels"]]
decoded = tokenizer.decode(label_ids, skip_special_tokens=True)

reference_summary = ds["train"][i]["summary"]
print("REFERENCE SUMMARY:\n", reference_summary)
print("\nDECODED LABELS:\n", decoded)


REFERENCE SUMMARY:
 Amanda baked cookies and will bring Jerry some tomorrow.

DECODED LABELS:
 Amanda baked cookies and will bring Jerry some tomorrow.


## Step 3.5

Evaluation Using ROUGE

In [19]:
# Run validation once more and keep the metrics dict in a variable;
# the bare name on the last line displays it as the cell output.
metrics = trainer.evaluate()
metrics

{'eval_loss': 1.5033433437347412,
 'eval_model_preparation_time': 0.003,
 'eval_rouge1': 0.4866,
 'eval_rouge2': 0.2503,
 'eval_rougeL': 0.4078,
 'eval_rougeLsum': 0.4078,
 'eval_runtime': 160.725,
 'eval_samples_per_second': 5.089,
 'eval_steps_per_second': 2.545,
 'epoch': 1.0}

## Step 3.6

Saving Model (for faster reloads for demos)

In [28]:
# Write the fine-tuned model and the tokenizer to the same directory so both
# can be reloaded together from that path.
trainer.save_model("./bart_samsum_mvp_model")
tokenizer.save_pretrained("./bart_samsum_mvp_model")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('./bart_samsum_mvp_model/tokenizer_config.json',
 './bart_samsum_mvp_model/tokenizer.json')

## Step 4.0

Evaluating on the test split

In [20]:
# Score the test split with the same trainer by overriding its default
# (validation) evaluation dataset; the metrics dict is displayed as cell output.
test_metrics = trainer.evaluate(eval_dataset=tokenized["test"])
test_metrics

{'eval_loss': 1.5471426248550415,
 'eval_model_preparation_time': 0.003,
 'eval_rouge1': 0.4725,
 'eval_rouge2': 0.2327,
 'eval_rougeL': 0.3961,
 'eval_rougeLsum': 0.3967,
 'eval_runtime': 157.9134,
 'eval_samples_per_second': 5.186,
 'eval_steps_per_second': 2.596,
 'epoch': 1.0}

## Step 4.1

Building small demo function

In [21]:
import torch

def summarize_dialogue(
    dialogue,
    num_beams=4,
    max_new_tokens=None,
    no_repeat_ngram_size=3,
    repetition_penalty=1.2,
):
  """Generate a summary for a single dialogue with the current `model`.

  The decoding settings are keyword parameters whose defaults reproduce the
  values this function previously hard-coded, so existing calls are unaffected.

  Args:
    dialogue: Dialogue text to summarize. It is truncated to MAX_SOURCE_LEN
      tokens.
    num_beams: Beam width for beam search.
    max_new_tokens: Upper bound on generated tokens. `None` means
      MAX_TARGET_LEN.
    no_repeat_ngram_size: Size of n-grams that may not repeat in the output.
    repetition_penalty: Penalty applied to repeated tokens during decoding.

  Returns:
    The first generated sequence decoded to text, special tokens removed.
  """
  if max_new_tokens is None:
    max_new_tokens = MAX_TARGET_LEN

  # Inference should never run with dropout active (e.g. if this is called
  # straight after training), so put the model in eval mode explicitly.
  model.eval()

  inputs = tokenizer(
      dialogue,
      return_tensors="pt",
      truncation=True,
      max_length=MAX_SOURCE_LEN,
      padding=True,
  )
  # Move the input tensors to whichever device the model lives on.
  inputs = {k: v.to(model.device) for k, v in inputs.items()}

  with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        no_repeat_ngram_size=no_repeat_ngram_size,
        repetition_penalty=repetition_penalty,
        early_stopping=True,
    )

  return tokenizer.decode(output_ids[0], skip_special_tokens=True)

## Step 4.2

Generating 10 Demo Examples

In [24]:
# Hand-picked test-set rows used for the qualitative demo.
demo_idxs = [0, 3, 7, 25, 50, 89, 122, 333, 555, 667]

for i in demo_idxs:
  # Fetch the row once, then compare the reference with the model output.
  example = ds["test"][i]
  dialogue, ref = example["dialogue"], example["summary"]
  pred = summarize_dialogue(dialogue)

  separator = "=" * 100
  print(separator)
  print(f"TEST IDX: {i}\n")
  print("DIALOGUE: \n", dialogue)
  print("\nREFERENCE SUMMARY:\n", ref)
  print("\nMODEL SUMMARY:\n", pred)

TEST IDX: 0

DIALOGUE: 
 Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

REFERENCE SUMMARY:
 Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.

MODEL SUMMARY:
 Hannah doesn't know Betty's number. She texted Larry last time they were at the park together.
TEST IDX: 3

DIALOGUE: 
 Will: hey babe, what do you want for dinner tonight?
Emma:  gah, don't even worry about it tonight
Will: what do you mean? everything ok?
Emma: not really, but it's ok, don't worry about cooking though, I'm not hungry
Will: Well what time will you be home?
Emma: soon, hopefully
Will: you sure? Maybe you want me to pick you 

## Step 4.3

Qualitative review of the demo summaries above (by test index): \
Great - 7, 89, 333 \
Decent - 555 \
Failure - 25

## Step 4.4

Improving Coherence for Demos

In [25]:
def summarize_dialogue(
    dialogue,
    num_beams=4,
    max_new_tokens=None,
    min_new_tokens=10,
    length_penalty=1.0,
    no_repeat_ngram_size=3,
    repetition_penalty=1.1,
):
  """Generate a summary for a single dialogue (coherence-tuned decoding).

  NOTE: this redefines `summarize_dialogue` from Step 4.1 and shadows it once
  the cell has run. Compared with that version it enforces a minimum output
  length, sets the length penalty explicitly, and uses a milder repetition
  penalty. The decoding settings are keyword parameters whose defaults
  reproduce the values this function previously hard-coded.

  Args:
    dialogue: Dialogue text to summarize. It is truncated to MAX_SOURCE_LEN
      tokens.
    num_beams: Beam width for beam search.
    max_new_tokens: Upper bound on generated tokens. `None` means
      MAX_TARGET_LEN.
    min_new_tokens: Lower bound on generated tokens.
    length_penalty: Length penalty used by beam search.
    no_repeat_ngram_size: Size of n-grams that may not repeat in the output.
    repetition_penalty: Penalty applied to repeated tokens during decoding.

  Returns:
    The first generated sequence decoded to text, special tokens removed.
  """
  if max_new_tokens is None:
    max_new_tokens = MAX_TARGET_LEN

  # Inference should never run with dropout active (e.g. if this is called
  # straight after training), so put the model in eval mode explicitly.
  model.eval()

  inputs = tokenizer(
      dialogue,
      return_tensors="pt",
      truncation=True,
      max_length=MAX_SOURCE_LEN,
      padding=True,
  )
  # Move the input tensors to whichever device the model lives on.
  inputs = {k: v.to(model.device) for k, v in inputs.items()}

  with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        length_penalty=length_penalty,
        min_new_tokens=min_new_tokens,
        no_repeat_ngram_size=no_repeat_ngram_size,
        repetition_penalty=repetition_penalty,
        early_stopping=True,
    )

  return tokenizer.decode(output_ids[0], skip_special_tokens=True)

## Step 4.5: