# Setup

In [None]:
# Local directory of the model repository. The tokenizer and model are loaded
# from it, and it is also used as the Trainer's output directory.
MODEL_DIR = "bart-large-cnn"

In [None]:
# Initialise Git LFS, then clone the model repository from the Hugging Face Hub
# into ./bart-large-cnn (the recorded output shows LFS content being fetched).
!git lfs install
!git clone https://huggingface.co/booksouls/bart-large-cnn

Git LFS initialized.
Cloning into 'bart-large-cnn'...
remote: Enumerating objects: 176, done.[K
remote: Counting objects: 100% (173/173), done.[K
remote: Compressing objects: 100% (173/173), done.[K
remote: Total 176 (delta 55), reused 0 (delta 0), pack-reused 3 (from 1)[K
Receiving objects: 100% (176/176), 1.12 MiB | 1.70 MiB/s, done.
Resolving deltas: 100% (55/55), done.
Filtering content: 100% (19/19), 6.05 GiB | 37.05 MiB/s, done.


In [None]:
!pip install -q -U \
    transformers[torch]==4.40.2 \
    datasets==2.19.1 \
    evaluate==0.4.2 \
    rouge_score==0.1.2 \
    sentencepiece==0.2.0 \
    accelerate==0.30.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.2

# Data Loading

In [None]:
from datasets import load_dataset

# Load the BookSum (cleaned) dataset from the Hugging Face Hub. The recorded
# output shows train/validation/test splits with 'chapter' and 'summary' columns.
dataset = load_dataset("booksouls/booksum-cleaned")

# Last expression of the cell: display the split overview.
dataset

Downloading readme:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/103M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8145 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1259 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1215 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['chapter', 'summary'],
        num_rows: 8145
    })
    validation: Dataset({
        features: ['chapter', 'summary'],
        num_rows: 1259
    })
    test: Dataset({
        features: ['chapter', 'summary'],
        num_rows: 1215
    })
})

In [None]:
# Preview the first training example: the chapter text followed by its
# reference summary, each under a small underlined heading.
sample = dataset["train"][0]

print("Chapter:", "--------", sample["chapter"], sep="\n")
print()
print("Summary:", "--------", sample["summary"], sep="\n")

Chapter:
--------
THE FIRST BOOK

I. Of my grandfather Verus I have learned to be gentle and meek, and to
refrain from all anger and passion. From the fame and memory of him that
begot me I have learned both shamefastness and manlike behaviour. Of my
mother I have learned to be religious, and bountiful; and to forbear,
not only to do, but to intend any evil; to content myself with a spare
diet, and to fly all such excess as is incidental to great wealth. Of my
great-grandfather, both to frequent public schools and auditories, and
to get me good and able teachers at home; and that I ought not to think
much, if upon such occasions, I were at excessive charges.

II. Of him that brought me up, not to be fondly addicted to either of
the two great factions of the coursers in the circus, called Prasini,
and Veneti: nor in the amphitheatre partially to favour any of the
gladiators, or fencers, as either the Parmularii, or the Secutores.
Moreover, to endure labour; nor to need many things; when

# Tokenization

In [None]:
# Token budget for the tokenized chapters (inputs); longer chapters are truncated
# and shorter ones padded to this length by `tokenize`.
MAX_SOURCE_LENGTH = 1024
# Token budget for the tokenized summaries (labels), applied the same way.
MAX_TARGET_LENGTH = 256

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer from the local model directory (MODEL_DIR).
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

In [None]:
def tokenize(books):
    """Tokenize a batch of chapters and their reference summaries.

    Args:
        books: A batch of examples with ``"chapter"`` and ``"summary"``
            columns, as passed by ``dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the chapters, extended with a ``"labels"``
        entry that holds the summary token ids. Padding positions in the
        labels are set to -100 instead of the pad token id.
    """
    result = tokenizer(
        books["chapter"],
        max_length=MAX_SOURCE_LENGTH,
        padding="max_length",
        truncation=True,
    )
    labels = tokenizer(
        text_target=books["summary"],
        max_length=MAX_TARGET_LENGTH,
        padding="max_length",
        truncation=True,
    )

    # The summaries are padded to a fixed length, so most label positions are
    # padding. Mark them with -100 so the loss ignores them; otherwise the model
    # is also trained (and its loss measured) on predicting pad tokens.
    # `compute_metrics` maps -100 back to the pad token id before decoding.
    pad_token_id = tokenizer.pad_token_id
    result["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return result

# Tokenize every split in batches and drop the raw text columns, so that only
# the tokenizer outputs and the labels remain (see the recorded output).
tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=["chapter", "summary"],
)

# Last expression of the cell: display the resulting splits and features.
tokenized_dataset

Map:   0%|          | 0/8145 [00:00<?, ? examples/s]

Map:   0%|          | 0/1259 [00:00<?, ? examples/s]

Map:   0%|          | 0/1215 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 8145
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1259
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1215
    })
})

# Training

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the sequence-to-sequence model weights from the local model directory.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR)

In [None]:
import evaluate

# ROUGE metric object; `compute_metrics` calls `rouge.compute(...)` on the
# decoded predictions and references.
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
import nltk

# Download the NLTK "punkt" tokenizer data.
# NOTE(review): presumably required by `nltk.sent_tokenize`, which
# `postprocess_output` uses for sentence splitting — confirm.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import numpy as np

def postprocess_output(output):
    """Reformat decoded texts so that every sentence sits on its own line.

    Args:
        output: An iterable of decoded strings.

    Returns:
        A list with one string per input text: the text is stripped, split
        into sentences with ``nltk.sent_tokenize`` and re-joined with newlines.
    """
    processed = []
    for text in output:
        sentences = nltk.sent_tokenize(text.strip())
        # rougeLsum expects a newline after each sentence.
        processed.append("\n".join(sentences))
    return processed

def compute_metrics(eval_prediction):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_prediction: A ``(predictions, labels)`` pair of token-id arrays.
            If ``predictions`` is a tuple, only its first element is used.

    Returns:
        A dict with the values returned by ``rouge.compute`` plus ``gen_len``
        (mean number of non-pad tokens per prediction), all rounded to six
        decimal places.
    """
    predictions, labels = eval_prediction
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 marks ignored positions and cannot be decoded, so swap it for the
    # pad token id before turning the ids back into text.
    predictions = np.where(predictions == -100, pad_id, predictions)
    labels = np.where(labels == -100, pad_id, labels)

    decoded_predictions = postprocess_output(
        tokenizer.batch_decode(predictions, skip_special_tokens=True)
    )
    decoded_references = postprocess_output(
        tokenizer.batch_decode(labels, skip_special_tokens=True)
    )

    metrics = rouge.compute(
        predictions=decoded_predictions,
        references=decoded_references,
        use_stemmer=True,
    )

    # Length of each generated sequence, not counting padding.
    generated_lengths = [np.count_nonzero(row != pad_id) for row in predictions]
    metrics["gen_len"] = np.mean(generated_lengths)

    rounded = {}
    for name, score in metrics.items():
        rounded[name] = round(score, 6)
    return rounded

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    # Where checkpoints are written; this is the model repository directory.
    output_dir=MODEL_DIR,
    # Optimisation schedule.
    num_train_epochs=20,
    learning_rate=5e-5,
    # 4 examples per step, accumulated over 32 steps: 4 * 32 = 128 examples
    # per optimizer update on each device.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=32,
    per_device_eval_batch_size=4,
    # Log, evaluate and save once per epoch.
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Evaluate on generated summaries so that compute_metrics can score them.
    predict_with_generate=True,
    # Push to the Hub, including the latest checkpoint.
    # NOTE(review): presumably this is what provides the "last-checkpoint"
    # folder that the resume cells read — confirm against the Transformers docs.
    push_to_hub=True,
    hub_strategy="checkpoint",
)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

In [None]:
# Start fine-tuning from the loaded model weights (no checkpoint is resumed).
# The recorded output below shows the metrics after epoch 0.
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
0,2.7772,2.416368,0.384201,0.080916,0.184422,0.357845,255.520254


Non-default generation parameters: {'max_length': 256, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [None]:
# Resume training from the checkpoint stored in MODEL_DIR/last-checkpoint.
# Recorded run: this session produced the epoch 1 metrics shown below.
trainer.train(resume_from_checkpoint=f"{MODEL_DIR}/last-checkpoint")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.3811,2.353522,0.384243,0.082073,0.18616,0.357318,255.032566


Non-default generation parameters: {'max_length': 256, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [None]:
# Resume training from the checkpoint stored in MODEL_DIR/last-checkpoint.
# Recorded run: this session produced the epoch 2 metrics shown below.
trainer.train(resume_from_checkpoint=f"{MODEL_DIR}/last-checkpoint")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
2,2.2397,2.335172,0.389754,0.084422,0.188367,0.363838,255.181096


Non-default generation parameters: {'max_length': 256, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [None]:
# Resume training from the checkpoint stored in MODEL_DIR/last-checkpoint.
# Recorded run: this session produced the epoch 3 metrics shown below.
trainer.train(resume_from_checkpoint=f"{MODEL_DIR}/last-checkpoint")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
3,2.1367,2.325289,0.390032,0.084695,0.188212,0.363517,255.674345


Non-default generation parameters: {'max_length': 256, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [None]:
# Resume training from the checkpoint stored in MODEL_DIR/last-checkpoint.
# Recorded run: this session produced the epoch 4 metrics shown below.
trainer.train(resume_from_checkpoint=f"{MODEL_DIR}/last-checkpoint")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
4,2.0535,2.332619,0.396595,0.08825,0.191535,0.36918,255.524226


Non-default generation parameters: {'max_length': 256, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [None]:
# Resume training from the checkpoint stored in MODEL_DIR/last-checkpoint.
# Recorded run: this session produced the epoch 5 metrics shown below.
trainer.train(resume_from_checkpoint=f"{MODEL_DIR}/last-checkpoint")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
5,1.9846,2.334203,0.401321,0.089767,0.193986,0.373929,255.390786


Non-default generation parameters: {'max_length': 256, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [None]:
# Resume training from the checkpoint stored in MODEL_DIR/last-checkpoint.
# Recorded run: this session produced the epoch 6 metrics shown below.
trainer.train(resume_from_checkpoint=f"{MODEL_DIR}/last-checkpoint")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
6,1.9287,2.339814,0.399138,0.08949,0.193197,0.372272,255.493249


Non-default generation parameters: {'max_length': 256, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [None]:
# Resume training from the checkpoint stored in MODEL_DIR/last-checkpoint.
# Recorded run: this session produced the epoch 7 metrics shown below.
trainer.train(resume_from_checkpoint=f"{MODEL_DIR}/last-checkpoint")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
7,1.8845,2.350146,0.400111,0.089385,0.193749,0.372579,254.785544


Non-default generation parameters: {'max_length': 256, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [None]:
# Resume training from the checkpoint stored in MODEL_DIR/last-checkpoint.
# Recorded run: this session produced the epoch 8 metrics shown below.
trainer.train(resume_from_checkpoint=f"{MODEL_DIR}/last-checkpoint")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
8,1.8518,2.356118,0.39905,0.089006,0.192645,0.371683,254.877681


Non-default generation parameters: {'max_length': 256, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [None]:
# Resume training from the checkpoint stored in MODEL_DIR/last-checkpoint.
# Recorded run: this session produced the epoch 9 metrics shown below and ended
# with a TrainOutput at global_step=630.
trainer.train(resume_from_checkpoint=f"{MODEL_DIR}/last-checkpoint")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
9,1.8124,2.362492,0.398983,0.088744,0.192504,0.371333,254.986497


Non-default generation parameters: {'max_length': 256, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=630, training_loss=0.1812362791999938, metrics={'train_runtime': 7918.752, 'train_samples_per_second': 10.286, 'train_steps_per_second': 0.08, 'total_flos': 1.7633513436925133e+17, 'train_loss': 0.1812362791999938, 'epoch': 9.989690721649485})

In [None]:
# Resume training from the checkpoint stored in MODEL_DIR/last-checkpoint.
# Recorded run: this session produced the epoch 10 metrics shown below.
trainer.train(resume_from_checkpoint=f"{MODEL_DIR}/last-checkpoint")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
10,1.8408,2.362902,0.399668,0.088895,0.192257,0.372412,254.482923


Non-default generation parameters: {'max_length': 256, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
