In [1]:
!pip install --upgrade transformers accelerate peft datasets bitsandbytes

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [2]:
import json
import pandas as pd
from datasets import load_dataset

# Define the path to your JSON file
file_path = "booksum_text_summary_pairs.json"

# Load the JSON data
with open(file_path, 'r', encoding='utf-8') as f:
    booksum_data = json.load(f)

# Initialize lists to hold input texts and summaries
input_texts = []
target_summaries = []
invalid_summaries = 0

# Define a function to validate summary quality
def is_valid_summary(summary):
    # Criteria for a valid summary
    if len(summary.split()) < 5:
        return False
    placeholder_keywords = [
        "Read the full text", "translation HERE", "click here",
        "summary of the long text", "this is not the full text"
    ]
    for keyword in placeholder_keywords:
        if keyword.lower() in summary.lower():
            return False
    return True

# Iterate over each entry in the dataset
for entry in booksum_data:
    text = entry.get("text", "").strip()
    summaries = entry.get("summary", [])

    # Ensure summaries is a list
    if not isinstance(summaries, list):
        summaries = [summaries] if summaries else []

    for summary in summaries:
        # Extract the summary text
        if isinstance(summary, dict):
            summary_text = summary.get("text", "").strip()
        elif isinstance(summary, str):
            summary_text = summary.strip()
        else:
            summary_text = ""

        # Validate the summary
        if summary_text and is_valid_summary(summary_text):
            input_texts.append(text)
            target_summaries.append(summary_text)
        else:
            invalid_summaries += 1

print(f"Total valid input-summary pairs created: {len(input_texts)}")
print(f"Entries with invalid or placeholder summaries: {invalid_summaries}")

# Create a DataFrame from the extracted pairs
data = pd.DataFrame({
    "input_text": input_texts,
    "summary": target_summaries
})

# Save the processed data to a CSV file
data.to_csv("processed_booksum.csv", index=False)

Total valid input-summary pairs created: 299
Entries with invalid or placeholder summaries: 15


In [3]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
import torch

# ===========================================
# 1. Load and Split the Dataset
# ===========================================

# Load the processed CSV data
data = pd.read_csv("/content/processed_booksum.csv")

# Inspect the data structure
print(data.head())

# Assuming the CSV has columns: 'input_text' and 'summary'
# If there are additional columns, adjust accordingly

# Split into train, validation, and test sets (80% train, 10% val, 10% test)
train_data = data.sample(frac=0.8, random_state=42)
val_test_data = data.drop(train_data.index)
val_data = val_test_data.sample(frac=0.5, random_state=42)
test_data = val_test_data.drop(val_data.index)

# Convert pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_data.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_data.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_data.reset_index(drop=True))

# ===========================================
# 2. Load Tokenizer and Model
# ===========================================

# Define model name
model_name = "allenai/led-base-16384"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Ensure pad token is set

# Load the model
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    gradient_checkpointing=True,  # Saves memory by not storing intermediate activations
    use_cache=False               # Necessary when using gradient_checkpointing
)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# ===========================================
# 3. Preprocessing Function
# ===========================================

def process_data_to_model_inputs(batch):
    """
    Tokenizes the input_text and summary, creates global attention masks,
    and prepares labels by replacing pad tokens with -100.
    """
    # Tokenize the inputs
    inputs = tokenizer(
        batch["input_text"],
        padding="max_length",
        truncation=True,
        max_length=1024,  # Adjust as needed
    )

    # Tokenize the summaries (labels)
    outputs = tokenizer(
        batch["summary"],
        padding="max_length",
        truncation=True,
        max_length=256,    # Adjust as needed
    )

    # Assign tokenized inputs to the batch
    batch["input_ids"] = inputs["input_ids"]
    batch["attention_mask"] = inputs["attention_mask"]

    # Create global attention mask for LED (first token has global attention)
    # Assuming all inputs have at least one token
    batch["global_attention_mask"] = [
        [1] + [0] * (len(input_id) - 1) for input_id in inputs["input_ids"]
    ]

    # Prepare labels, replacing pad tokens with -100
    batch["labels"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in outputs["input_ids"]
    ]

    return batch

# ===========================================
# 4. Preprocess the Datasets
# ===========================================

# Define batch size for mapping
batch_size = 2

# Preprocess training dataset
train_dataset = train_dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["input_text", "summary"],  # Remove original columns
)

# Preprocess validation dataset
val_dataset = val_dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["input_text", "summary"],
)

# Preprocess test dataset
test_dataset = test_dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["input_text", "summary"],
)

# Set the format of the datasets to PyTorch tensors
columns_to_return = ["input_ids", "attention_mask", "global_attention_mask", "labels"]

train_dataset.set_format(
    type="torch",
    columns=columns_to_return,
)

val_dataset.set_format(
    type="torch",
    columns=columns_to_return,
)

test_dataset.set_format(
    type="torch",
    columns=columns_to_return,
)


                                          input_text  \
0  BOOK I.\n\n\n    Of Mans First Disobedience, a...   
1  BOOK I.\n\n\n    Of Mans First Disobedience, a...   
2  BOOK I.\n\n\n    Of Mans First Disobedience, a...   
3  General Introduction\n\nFor the Independent Jo...   
4  [Enter THOMAS; he crosses the stage; FAG follo...   

                                             summary  
0  Milton's epic poem opens on the fiery lake of ...  
1  Paradise Lost opens with Satan on the surface ...  
2  Milton's speaker begins Paradise Lost by stati...  
3  In this paper, Hamilton continues his defense ...  
4  The play begins with a preface written by the ...  


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/648M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Map:   0%|          | 0/239 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

TypeError: Seq2SeqTrainingArguments.__init__() got an unexpected keyword argument 'use_cache'

In [4]:
# ===========================================
# 5. Define Training Arguments
# ===========================================

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,          # Adjust based on GPU memory
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,          # Effective batch size = 2 * 4 = 8
    evaluation_strategy="steps",
    eval_steps=50,                           # Evaluate every 50 steps
    save_steps=50,                            # Save checkpoint every 50 steps
    save_total_limit=2,                       # Limit the total number of checkpoints
    learning_rate=1e-5,                        # Start with 1e-5; adjust if necessary
    weight_decay=0.01,
    num_train_epochs=10,                      # Adjust based on dataset size and observed convergence
    fp16=torch.cuda.is_available(),            # Enable mixed-precision training if using GPU
    optim="adamw_torch",
    logging_steps=10,
    load_best_model_at_end=True,              # Automatically load the best model at the end
    metric_for_best_model="loss",             # Use validation loss to determine best model
    greater_is_better=False,                  # Lower loss is better
    report_to="none",                          # Disable reporting to external services
    gradient_checkpointing=True              # Already set in model loading
                               # Already set in model loading
)

# ===========================================
# 6. Define Data Collator
# ===========================================

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,                  # As defined in preprocessing
    pad_to_multiple_of=8 if torch.cuda.is_available() else None
)

# ===========================================
# 7. Initialize Trainer
# ===========================================

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Stops training if no improvement in 3 eval steps
)

# ===========================================
# 8. Start Training
# ===========================================

trainer.train()

# ===========================================
# 9. Save the Fine-Tuned Model
# ===========================================

trainer.save_model("fine-tuned-led")

  trainer = Seq2SeqTrainer(
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss,Validation Loss
50,13.6444,3.369979
100,13.0077,3.326897
150,12.5504,3.304107
200,12.3297,3.298748
250,12.1975,3.293231
300,12.0145,3.291284


There were missing keys in the checkpoint model loaded: ['led.encoder.embed_tokens.weight', 'led.decoder.embed_tokens.weight', 'lm_head.weight'].
