Step 1:  Dataset Exploration and Preparation

- Load the SAMSum dataset and explore its structure.
- Analyze the characteristics of the dialogues and summaries.
- Prepare the data for input to the BERT model:
- Implement appropriate tokenization.
- Create training and validation splits.
- Build data loaders for efficient model training.

In [1]:
#Load the SAMSum dataset from the datasets library
%pip install datasets transformers torch pandas
from datasets import load_dataset
from transformers import AutoTokenizer
import pandas as pd
# Load the SAMSum dataset
ds = load_dataset("knkarthick/samsum")

df_train = pd.DataFrame(ds['train'])
print(f"Dataset Splits: {ds.keys()}")
print(f"Average dialogue length: {df_train['dialogue'].apply(lambda x: len(x.split())).mean()}")

# Preparation for the Model
# An encoder-decoder model like BART or T5 is suitable for text summarization tasks.
# Tokenize the dialogues using a pre-trained tokenizer
model_ckpt = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["dialogue"], 
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        text_target=examples["summary"], 
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the preprocessing function to the dataset
tokenized_ds = ds.map(preprocess_function, batched=True)
print(tokenized_ds["train"][0])



Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm


Dataset Splits: dict_keys(['train', 'validation', 'test'])
Average dialogue length: 93.79274998302898


Map: 100%|██████████| 818/818 [00:00<00:00, 1630.97 examples/s]

{'id': '13818513', 'dialogue': "Amanda: I baked  cookies. Do you want some?\nJerry: Sure!\nAmanda: I'll bring you tomorrow :-)", 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.', 'input_ids': [0, 10127, 5219, 35, 38, 17241, 1437, 15269, 4, 1832, 47, 236, 103, 116, 50118, 39237, 35, 9136, 328, 50118, 10127, 5219, 35, 38, 581, 836, 47, 3859, 48433, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 




Splits and Data Loaders
- The kmkarthick/samsum dataset provides pre-defined training and validation splits.

In [2]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForSeq2Seq

#Training and Validation Splits
train_set = tokenized_datasets["train"]
val_set = tokenized_datasets["validation"]

# DataLoader Creation
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_ckpt)
train_dataloader = DataLoader(
    tokenized_datasets["train"], 
    batch_size=8, 
    shuffle=True, 
    collate_fn=data_collator
    )
# The SAMSum dataset is now loaded and preprocessed for training a text summarization model.
val_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    shuffle=False,
    collate_fn=data_collator
)

print("Status: Success. DataLoaders include both 'input_ids' and 'labels' for training.")


NameError: name 'tokenized_datasets' is not defined

Step 2: Model Architecture Implementation
- Implement an encoder-decoder architecture using BERT.
- Configure the model for the summarization task.
- Set up the necessary components:
    - Encoder (BERT-based)
    - Generation mechanism to include the decoder. A decoder example can be Chat GPT-2 or model on huggingface. 
        - Try to find a free model that will give you a proof-of-concept for text. 

In [None]:
from transformers import (EncoderDecoderModel, AutoTokenizer, GenerationConfig, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer)
import torch

#Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

#Define Encoder and Decoder Configurations
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased",
    "gpt2"
)
#Set Special Tokens
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id

#Generate Configurations
model.generation_config = GenerationConfig(
    decoder_start_token_id=model.config.decoder_start_token_id,
    eos_token_id=model.config.eos_token_id,
    pad_token_id=model.config.pad_token_id,
    max_length=128,
    min_length=30,
    no_repeat_ngram_size=3,
    early_stopping=True,
    length_penalty=2.0,
    num_beams=4
)
print("Model and Tokenizer are set up for text summarization.")

# Proof of Concept Inference
def generate_summary(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    with torch.no_grad():
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=128,
            min_length=30,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example dialogue
dialogue = """John: Hey, how are you?
Mary: I'm good, thanks! How about you?"""
summary = generate_summary(dialogue)
print(f"Generated Summary: {summary}")


Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.1.ln_cross_attn.bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.10.crossattention.c_attn.bias', 'transformer.h.10.crossattention.c_attn.weight', 'transformer.h.10.crossattention.c_proj.bias', 'transformer.h.10.cros

Model and Tokenizer are set up for text summarization.
Generated Summary: [unused12] [unused193] [unused193] [unused0] [unused39] [unused887] [unused335] [unused333] ∅ [unused324] [unused509] [unused279] [unused302] ᵈ [unused279] [unused905] [unused10] [unused700]ple [unused10] [unused351] [unused816] 2 [unused279] [unused461] [unused423] [unused10] [unused285] [unused351] [unused816] [unused402] 2 [unused279] thought ₗ [unused351] [unused646] [unused423] 代bag [unused334] [unused526] [unused12] [unused361] [unused39] [unused887] [unused321] [unused333] [unused639] [unused351] [unused816] people [unused279] [unused461] [unused12] [unused770] [unused760] [unused279] [unused782] け [unused321] [unused351] [unused816] ᵈ [unused279] [unused461] [unused321] [unused521] [unused193] [unused193] ⁺ most [unused10] [unused361] ʎ [unused760] [unused279] [unused461] [unused335] [unused282] [unused252] [unused830] [unused321] [unused351] ם cooperation [unused346] [unused10] [unused285] [unused321] [u

Step 3: Training and Optimization
- Implement the training loop.
- Set up appropriate loss functions and evaluation metrics.
- Configure optimization parameters.
- Implement early stopping and checkpointing.
- Monitor training progress.
- Manage computational resources effectively.

In [None]:
# Tokenize the dataset with BERT tokenizer
def preprocess_for_bert(examples):
    model_inputs = bert_tokenizer(
        examples["dialogue"], 
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    labels = bert_tokenizer(
        text=examples["summary"], 
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize dataset
tokenized_ds_bert = ds.map(preprocess_for_bert, batched=True)

# Create data collator for this model
data_collator_bert = DataCollatorForSeq2Seq(bert_tokenizer, model=model)

# Define compute_metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = bert_tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, bert_tokenizer.pad_token_id)
    decoded_labels = bert_tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: round(v * 100, 4) for k, v in result.items()}

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="bert_gpt2_summarizer_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    greater_is_better=True,
)

# Implement Early Stopping and Trainer
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds_bert["train"],
    eval_dataset=tokenized_ds_bert["validation"],
    tokenizer=bert_tokenizer,
    data_collator=data_collator_bert,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]
)

# Start Training
trainer.train()

# Final Model Evaluation
eval_results = trainer.evaluate()
trainer.save_model("bert_gpt2_summarizer_model")
print(f"Evaluation Results: {eval_results}")

Map:   0%|          | 0/14731 [00:00<?, ? examples/s]

Map: 100%|██████████| 14731/14731 [00:08<00:00, 1674.79 examples/s]
  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': None, 'bos_token_id': None, 'pad_token_id': 0}.


Epoch,Training Loss,Validation Loss
