# LLM For Chat Summarization
To make ACME Corp's chat client more efficient, we are implementing an AI-powered summarization feature. The initial training was done using the SAMSum dataset found on Hugging Face. The goal is to make it easier for subscribers to catch up on group chats, since a large number of unread messages can be overwhelming. Using an AI model, we will be able to condense a user's unread messages into a short summary. I will use ROUGE scores to evaluate the summarization feature's performance, as ROUGE is the standard metric for summarization. First, we need to install and import the dependencies, load the dataset, and perform the initial setup.

In [None]:
# Step 1: Install dependencies
!pip install datasets transformers torch evaluate rouge_score

from transformers import BertTokenizer, GPT2Tokenizer, EncoderDecoderModel
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup, DataCollatorForSeq2Seq
from tqdm.auto import tqdm
import evaluate
import numpy as np
import os

# Step 2: Hugging Face login
from huggingface_hub import login

# SECURITY: never hard-code an access token in a notebook. Anything committed
# here is visible to every reader of the file, and a token that has already
# been committed must be revoked at https://huggingface.co/settings/tokens.
# The token (read permission is enough) is taken from the HF_TOKEN environment
# variable instead.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    # Authentication is optional for public models and datasets, so a missing
    # token is reported rather than treated as an error.
    print("HF_TOKEN is not set; continuing without Hugging Face authentication.")

# Step 3: Load SAMSum dataset
from datasets import load_dataset

dataset = load_dataset("knkarthick/samsum")
print(dataset)

# Step 4: Explore dataset structure
train_split = dataset['train']
print(train_split[0])  # Example dialogue and summary
train_dialogues = train_split['dialogue']
train_summaries = train_split['summary']


# Step 5: Analyze basic characteristics
def _mean_word_count(texts):
    """Return the average number of whitespace-separated words per text."""
    word_total = sum(len(text.split()) for text in texts)
    return word_total / len(texts)


avg_dialogue_len = _mean_word_count(train_dialogues)
avg_summary_len = _mean_word_count(train_summaries)
print(f"Average dialogue length: {avg_dialogue_len:.1f} words")
print(f"Average summary length: {avg_summary_len:.1f} words")

# Step 6: Prepare data for model input
from transformers import AutoTokenizer

model_checkpoint = "facebook/bart-base"  # Or "t5-small" / "bert2bert"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
    """Tokenize a batch of SAMSum examples for sequence-to-sequence training.

    Args:
        examples: A batch mapping with "dialogue" and "summary" entries, each
            a sequence of strings (the shape `Dataset.map(batched=True)`
            passes in).

    Returns:
        The tokenizer output for the dialogues (truncated/padded to 512
        tokens) with an extra "labels" entry holding the token ids of the
        summaries (truncated/padded to 128 tokens).
    """
    model_inputs = tokenizer(
        list(examples["dialogue"]),
        max_length=512,
        truncation=True,
        padding="max_length",
    )

    # Tokenize summaries (targets). `text_target=` replaces the deprecated
    # `with tokenizer.as_target_tokenizer():` context manager.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split. The raw text columns ('id', 'dialogue', 'summary') are
# dropped and the result is exposed as torch tensors; otherwise the default
# DataLoader collation receives strings and plain Python lists and produces
# per-position tensor lists instead of [batch, seq_len] tensors.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
tokenized_datasets = tokenized_datasets.with_format("torch")

# Step 7: Create train-validation splits
train_dataset = tokenized_datasets["train"]
val_dataset = tokenized_datasets["validation"]

# Step 8: Build DataLoaders
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_dataloader = DataLoader(val_dataset, batch_size=8)

print("Training and validation DataLoaders are ready.")



Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=eb56bcdaed06e3d2411fc0ce41b3eb47cb77c5bd5b4425c87518ab3f37127827
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.6 rouge_score-0.1.2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/14731 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14731
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
})
{'id': '13818513', 'dialogue': "Amanda: I baked  cookies. Do you want some?\nJerry: Sure!\nAmanda: I'll bring you tomorrow :-)", 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}
Average dialogue length: 93.8 words
Average summary length: 20.3 words


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/14731 [00:00<?, ? examples/s]



Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Training and validation DataLoaders are ready.


# Model setup:
The next step is to set up the model. I will use pretrained models from Hugging Face to speed up training.

In [None]:


# Step 2: Choose the pre-trained checkpoints (BERT encoder, GPT-2 decoder)
encoder_checkpoint, decoder_checkpoint = "bert-base-uncased", "gpt2"

# One tokenizer per side of the model
encoder_tokenizer = BertTokenizer.from_pretrained(encoder_checkpoint)
decoder_tokenizer = GPT2Tokenizer.from_pretrained(decoder_checkpoint)
# GPT-2 ships without a pad token, so the EOS token doubles as padding
decoder_tokenizer.pad_token = decoder_tokenizer.eos_token

# Step 3: Join encoder and decoder into a single seq2seq model
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_checkpoint,
    decoder_checkpoint,
)

# Special-token ids and generation settings written onto the model config
generation_settings = {
    "decoder_start_token_id": decoder_tokenizer.bos_token_id,
    "eos_token_id": decoder_tokenizer.eos_token_id,
    "pad_token_id": decoder_tokenizer.pad_token_id,
    "vocab_size": model.config.decoder.vocab_size,
    "max_length": 128,
    "min_length": 20,
    "no_repeat_ngram_size": 3,
    "length_penalty": 2.0,
    "num_beams": 4,
}
for setting_name, setting_value in generation_settings.items():
    setattr(model.config, setting_name, setting_value)

# Step 4: Run on the GPU when one is present, otherwise on the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)

print("Encoder-decoder model ready for fine-tuning.")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.1.ln_cross_attn.bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.10.crossattention.c_attn.bias', 'transformer.h.10.crossattention.c_attn.weight', 'transformer.h.10.crossattention.c_proj.bias', 'transformer.h.10.cros

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Encoder-decoder model ready for fine-tuning.


# Training steps:
Next is training the model. The splits may need to be adjusted to reduce the total training time, and an evaluation is run after each epoch to track improvements.

In [None]:


# Load ROUGE for evaluation
rouge = evaluate.load("rouge")


# Tokenization Helper Function
def preprocess_function(examples):
    """Tokenize a batch of examples for the BERT-encoder / GPT-2-decoder model.

    Args:
        examples: A batch mapping with "dialogue" and "summary" entries. Each
            item may be a string or a list of strings; lists are joined with
            single spaces.

    Returns:
        The encoder tokenizer output for the dialogues (truncated/padded to
        512 tokens) with an extra "labels" entry holding the decoder
        tokenizer's ids for the summaries (truncated/padded to 128 tokens).
    """
    dialogues = [" ".join(d) if isinstance(d, list) else d for d in examples["dialogue"]]
    summaries = [" ".join(s) if isinstance(s, list) else s for s in examples["summary"]]

    # Tokenize inputs with the encoder (BERT) vocabulary
    model_inputs = encoder_tokenizer(
        dialogues,
        max_length=512,
        truncation=True,
        padding="max_length",
    )

    # Tokenize targets with the decoder (GPT-2) vocabulary. `text_target=`
    # replaces the deprecated `as_target_tokenizer()` context manager.
    labels = decoder_tokenizer(
        text_target=summaries,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

# Data loaders
data_collator = DataCollatorForSeq2Seq(tokenizer=encoder_tokenizer, model=model)
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=4, shuffle=True, collate_fn=data_collator)
val_dataloader = DataLoader(tokenized_datasets["validation"], batch_size=4, collate_fn=data_collator)

# Optimizer & Scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Early Stopping & Checkpointing
best_val_loss = float("inf")
patience, patience_counter = 2, 0
save_path = "best_model.pt"

# Id used to pad the label sequences (the decoder tokenizer reuses EOS for it)
pad_token_id = decoder_tokenizer.pad_token_id


def _mask_label_padding(labels, pad_token_id):
    """Return a copy of `labels` with padding replaced by -100 for the loss.

    The labels are padded to a fixed length, so without masking the loss is
    averaged mostly over padding positions. Because the pad id equals the EOS
    id here, the first pad token in each row is kept as the end-of-sequence
    target and only the pad tokens after it are set to -100 (the index the
    cross-entropy loss ignores).

    Args:
        labels: Integer tensor of label ids, one row per example.
        pad_token_id: Id that was used to pad `labels`.

    Returns:
        A new tensor of the same shape; `labels` itself is not modified.
    """
    is_pad = labels == pad_token_id
    # 1 for the first pad token in a row, 2 for the second, and so on.
    pad_rank = is_pad.long().cumsum(dim=-1)
    return labels.masked_fill(is_pad & (pad_rank > 1), -100)


# Training Loop
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in progress_bar:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass (loss ignores label padding)
        loss_labels = _mask_label_padding(batch["labels"], pad_token_id)
        outputs = model(**dict(batch, labels=loss_labels))
        loss = outputs.loss
        total_train_loss += loss.item()

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        progress_bar.set_postfix({"train_loss": loss.item()})

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"\nEpoch {epoch+1} Training Loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    total_val_loss = 0
    predictions, references = [], []

    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validation"):
            batch = {k: v.to(device) for k, v in batch.items()}

            # Same padding mask as in training so both losses are comparable
            loss_labels = _mask_label_padding(batch["labels"], pad_token_id)
            outputs = model(**dict(batch, labels=loss_labels))
            loss = outputs.loss
            total_val_loss += loss.item()

            # Generate summaries
            generated_tokens = model.generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128
            )
            decoded_preds = decoder_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            # Decode the unmasked labels: -100 is not a valid token id
            decoded_labels = decoder_tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)

            predictions.extend(decoded_preds)
            references.extend(decoded_labels)

    avg_val_loss = total_val_loss / len(val_dataloader)
    rouge_score = rouge.compute(predictions=predictions, references=references)

    print(f"Validation Loss: {avg_val_loss:.4f}")
    print(f"ROUGE: {rouge_score}")

    # Early Stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), save_path)
        print("Model improved and checkpoint saved.")
    else:
        patience_counter += 1
        print(f" No improvement. Early stopping patience: {patience_counter}/{patience}")
        if patience_counter >= patience:
            print(" Early stopping triggered.")
            break

print("Training complete!")
# Restore the best checkpoint; map_location keeps this working when the
# checkpoint was written on a different device than the current one.
model.load_state_dict(torch.load(save_path, map_location=device))







Map:   0%|          | 0/14731 [00:00<?, ? examples/s]



Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Epoch 1/3:   0%|          | 0/3683 [00:00<?, ?it/s]




Epoch 1 Training Loss: 0.7247


Validation:   0%|          | 0/205 [00:00<?, ?it/s]



Validation Loss: 0.6604
ROUGE: {'rouge1': np.float64(0.18941743186513177), 'rouge2': np.float64(0.03875878435362977), 'rougeL': np.float64(0.14729901818162006), 'rougeLsum': np.float64(0.14723116031803551)}
✅ Model improved and checkpoint saved.


Epoch 2/3:   0%|          | 0/3683 [00:00<?, ?it/s]


Epoch 2 Training Loss: 0.6061


Validation:   0%|          | 0/205 [00:00<?, ?it/s]

Validation Loss: 0.6268
ROUGE: {'rouge1': np.float64(0.22938396430697733), 'rouge2': np.float64(0.05668801463881078), 'rougeL': np.float64(0.17955126065648624), 'rougeLsum': np.float64(0.17918736775728075)}
✅ Model improved and checkpoint saved.


Epoch 3/3:   0%|          | 0/3683 [00:00<?, ?it/s]


Epoch 3 Training Loss: 0.5287


Validation:   0%|          | 0/205 [00:00<?, ?it/s]

Validation Loss: 0.6193
ROUGE: {'rouge1': np.float64(0.2448493742067912), 'rouge2': np.float64(0.06312447547734897), 'rougeL': np.float64(0.19081173181562705), 'rougeLsum': np.float64(0.19078156989400513)}
✅ Model improved and checkpoint saved.
Training complete!


<All keys matched successfully>

# Observations from training:
The model needs adjusting: training was slow and the overall accuracy was low. Each epoch took about 40 minutes on average. After three epochs the model ended up with a training loss of 0.529, a validation loss of 0.619, ROUGE-1 of 0.245, ROUGE-2 of 0.063, and ROUGE-L of 0.191. There is a lot of room for improvement overall. I will need to select a different model from Hugging Face to improve both the accuracy and the speed of training.

# Evaluation of initial model:
Below is a simple evaluation of the model's performance.

In [None]:
# Set model to evaluation mode
# NOTE: this cell relies on `model`, `val_dataloader`, `device`,
# `decoder_tokenizer` and `rouge` from the cells above; run them first in the
# same session.
model.eval()
predictions, references = [], []

for batch in val_dataloader:
    inputs = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"]

    with torch.no_grad():
        outputs = model.generate(
            inputs,
            attention_mask=attention_mask,
            max_length=128,
            num_beams=4
        )

    # Generated ids and labels are in the decoder (GPT-2) vocabulary, so they
    # must be decoded with the decoder tokenizer; any other tokenizer maps the
    # ids to unrelated tokens.
    decoded_preds = decoder_tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_labels = decoder_tokenizer.batch_decode(labels, skip_special_tokens=True)

    predictions.extend(decoded_preds)
    references.extend(decoded_labels)

# Compute ROUGE
rouge_results = rouge.compute(predictions=predictions, references=references)

# Print results safely (supports both legacy and modern evaluate versions)
for key, value in rouge_results.items():
    if isinstance(value, dict):
        print(f"{key}: {value['fmeasure']:.4f}")
    else:
        print(f"{key}: {value:.4f}")

# Optional: visualize the results
import matplotlib.pyplot as plt

rouge_scores = {k: (v['fmeasure'] if isinstance(v, dict) else v) for k, v in rouge_results.items()}
plt.bar(rouge_scores.keys(), rouge_scores.values(), color='skyblue')
plt.title("Final ROUGE Scores")
plt.ylabel("Score")
plt.ylim(0, 1)
plt.show()


NameError: name 'model' is not defined

# Evaluation summary:
The model needs improvement. The current ROUGE scores are less than ideal, but they are a decent starting point. With further improvements, the model should perform much better.