Andrew Marasco \
*Automated Dialogue Summarization for Messaging Platform* \
BART MVP \
Flatiron School Capstone Project #2 \
January, 2026

## STEP 1.0
Installing & Restarting Runtime

In [2]:
!pip -q install -U transformers datasets evaluate accelerate sentencepiece


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/515.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━[0m [32m419.8/515.2 kB[0m [31m12.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.6/47.6 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[?25h

## STEP 1.1

Imports + Global Settings for Model

In [3]:
import numpy as np
from datasets import load_dataset
import evaluate

from transformers import(
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

# Checkpoint identifier passed to `from_pretrained` for both the tokenizer and the model.
MODEL_NAME = "facebook/bart-base"
# Dialogue inputs are truncated and padded to this many tokens in `preprocess`.
MAX_SOURCE_LEN = 512
# Summary targets (labels) are truncated and padded to this many tokens in `preprocess`.
MAX_TARGET_LEN = 64
# Seed for reproducible shuffling (e.g. the 500-example token-length sample).
SEED = 42

## STEP 1.2

Loading SAMSum Dataset, Confirming Splits

In [4]:
# Load the SAMSum dialogue-summarization dataset by its Hub identifier.
ds = load_dataset("knkarthick/samsum")
print(ds)

# Report the number of examples in each split.
print(
    "Train:", ds["train"].num_rows,
    "Val:", ds["validation"].num_rows,
    "Test:", ds["test"].num_rows,
)

README.md: 0.00B [00:00, ?B/s]



train.csv: 0.00B [00:00, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/14731 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14731
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
})
Train: 14731 Val: 818 Test: 819


## STEP 1.3

Checkpoint: Token-Length Distribution (sanity check for `MAX_SOURCE_LEN` / `MAX_TARGET_LEN`)

In [5]:
# `AutoTokenizer` is already imported in the imports cell (Step 1.1), which must run
# before this cell anyway because it also defines MODEL_NAME.
# Tokenizer used by `token_len` below to measure token lengths.
tok_tmp = AutoTokenizer.from_pretrained(MODEL_NAME)

def token_len(text):
    """Return the number of token ids `tok_tmp` produces for `text`.

    The text is encoded without truncation and with special tokens included,
    so the count reflects the full, untruncated sequence length.
    """
    encoding = tok_tmp(text, truncation=False, add_special_tokens=True)
    return len(encoding["input_ids"])

def _print_length_stats(header, lengths):
    """Print the median, 95th percentile, and maximum of `lengths` after `header`."""
    median_len = int(np.median(lengths))
    p95_len = int(np.percentile(lengths, 95))
    max_len = int(np.max(lengths))
    print(header, "median =", median_len, "p95 =", p95_len, "max =", max_len)


# Reproducible 500-example sample of the training split.
sample = ds["train"].shuffle(seed=SEED).select(range(500))

# Token counts for every sampled dialogue and every sampled summary.
dialog_lens = [token_len(text) for text in sample["dialogue"]]
sum_lens = [token_len(text) for text in sample["summary"]]

_print_length_stats("Dialogue token lengths (500 sample):", dialog_lens)

_print_length_stats("Summary token lengths (500 sample):", sum_lens)


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Dialogue token lengths (500 sample): median = 120 p95 = 367 max = 809
Summary token lengths (500 sample): median = 24 p95 = 57 max = 78


## STEP 2.0

Loading Tokenizer and Model

In [6]:
# Load the tokenizer and the seq2seq model from the same pretrained checkpoint (MODEL_NAME).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/259 [00:00<?, ?it/s]

## STEP 2.1

Preprocessing, Tokenizing (static padding + '-100' label masking)

In [9]:
def preprocess(batch):
    """Tokenize a batch of dialogues and summaries into model inputs with labels.

    Args:
        batch: mapping with a "dialogue" entry (source texts) and a "summary"
            entry (target texts), as supplied by `Dataset.map(batched=True)`.

    Returns:
        The tokenizer output for the dialogues, truncated and padded to
        MAX_SOURCE_LEN, with an added "labels" entry holding the summary
        token ids truncated and padded to MAX_TARGET_LEN, where every
        padding id has been replaced by -100.
    """
    dialogues = batch["dialogue"]
    summaries = batch["summary"]

    # Static padding: every source sequence is padded out to MAX_SOURCE_LEN.
    encoded = tokenizer(
        dialogues,
        max_length=MAX_SOURCE_LEN,
        truncation=True,
        padding="max_length",
    )

    # Targets are tokenized via `text_target` and padded out to MAX_TARGET_LEN.
    target_encoding = tokenizer(
        text_target=summaries,
        max_length=MAX_TARGET_LEN,
        truncation=True,
        padding="max_length",
    )

    # Swap padding ids for -100 in the labels.
    masked_labels = []
    for label_ids in target_encoding["input_ids"]:
        masked_labels.append(
            [-100 if tok == tokenizer.pad_token_id else tok for tok in label_ids]
        )

    encoded["labels"] = masked_labels
    return encoded

# Run `preprocess` over every split in batches, dropping the original columns
# so only the fields returned by `preprocess` remain.
tokenized = ds.map(
    preprocess,
    batched=True,
    remove_columns=ds["train"].column_names,
)
tokenized

Map:   0%|          | 0/14731 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14731
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
})