In [None]:
# Pinned dependencies for this notebook. `%pip` (rather than `!pip`) installs
# into the environment of the running kernel.
# nltk is listed explicitly because the notebook imports it directly.
# NOTE(review): 3.8.1 is chosen because newer nltk releases look for the
# `punkt_tab` resource instead of the `punkt` one downloaded below; confirm it
# matches the environment the recorded outputs were produced with.
%pip install -q -U \
    transformers[torch]==4.40.2 \
    datasets==2.19.1 \
    evaluate==0.4.2 \
    rouge_score==0.1.2 \
    sentencepiece==0.2.0 \
    accelerate==0.30.1 \
    nltk==3.8.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.

In [None]:
import datasets
import evaluate
import nltk
import numpy as np
import transformers

In [None]:
# Enable Git LFS, then clone the model repository into ./pegasus-xsum.
# That directory is what MODEL_NAME refers to in the cells below.
!git lfs install
!git clone https://huggingface.co/booksouls/pegasus-xsum

Git LFS initialized.
Cloning into 'pegasus-xsum'...
remote: Enumerating objects: 146, done.[K
remote: Counting objects: 100% (143/143), done.[K
remote: Compressing objects: 100% (143/143), done.[K
remote: Total 146 (delta 44), reused 0 (delta 0), pack-reused 3 (from 1)[K
Receiving objects: 100% (146/146), 2.85 MiB | 11.50 MiB/s, done.
Resolving deltas: 100% (44/44), done.
Filtering content: 100% (18/18), 4.25 GiB | 45.00 MiB/s, done.


In [None]:
# Local directory produced by the `git clone` cell; it is passed to
# `from_pretrained` and is also used as the Trainer's `output_dir`.
MODEL_NAME = "pegasus-xsum"
# Maximum number of tokens kept from each input chapter (see `tokenize`).
MAX_SOURCE_LENGTH = 512
# Maximum number of tokens kept from each reference summary (see `tokenize`).
MAX_TARGET_LENGTH = 256

In [None]:
# Load the `booksouls/booksum-cleaned` dataset. Its "chapter" and "summary"
# columns are consumed by `tokenize`; the "train" and "validation" splits are
# the ones handed to the trainer.
dataset = datasets.load_dataset("booksouls/booksum-cleaned")

Downloading readme:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/103M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8145 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1259 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1215 [00:00<?, ? examples/s]

In [None]:
# Load the tokenizer from MODEL_NAME (the locally cloned repository directory).
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Load the sequence-to-sequence model weights from the same local directory.
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [None]:
def tokenize(books):
    """Tokenize a batch of chapters together with their reference summaries.

    Args:
        books: Batch mapping with "chapter" and "summary" entries, each a list
            of strings (the shape `dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the chapters, with an added "labels" entry
        holding the summary token ids. Padding positions in the labels are set
        to -100 so they are ignored by the loss; `compute_metrics` converts
        them back to the pad token id before decoding.
    """
    result = tokenizer(
        books["chapter"],
        max_length=MAX_SOURCE_LENGTH,
        padding="max_length",
        truncation=True,
    )
    labels = tokenizer(
        text_target=books["summary"],
        max_length=MAX_TARGET_LENGTH,
        padding="max_length",
        truncation=True,
    )
    # Summaries are padded up to MAX_TARGET_LENGTH. Without masking, every
    # padding position would count towards the training and validation loss.
    pad_token_id = tokenizer.pad_token_id
    result["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return result

# Apply `tokenize` to every split in batches. The raw text columns are dropped
# afterwards, since only the tokenized fields are needed from here on.
tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=["chapter", "summary"]
)

Map:   0%|          | 0/8145 [00:00<?, ? examples/s]

Map:   0%|          | 0/1259 [00:00<?, ? examples/s]

Map:   0%|          | 0/1215 [00:00<?, ? examples/s]

In [None]:
# ROUGE metric loaded through `evaluate`; used by `compute_metrics`.
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Download the "punkt" tokenizer data; `postprocess_output` calls
# `nltk.sent_tokenize`, which presumably depends on it.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def postprocess_output(output):
    """Reformat decoded texts so that each sentence sits on its own line.

    Args:
        output: Iterable of decoded strings.

    Returns:
        A list with one string per input text, sentences separated by "\\n".
    """
    formatted = []
    for text in output:
        sentences = nltk.sent_tokenize(text.strip())
        # rouge-Lsum expects a newline after each sentence.
        formatted.append("\n".join(sentences))
    return formatted

def compute_metrics(eval_prediction):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_prediction: Pair of (predictions, labels) token-id arrays. If
            predictions is a tuple, only its first element is used.

    Returns:
        Dict with the values returned by `rouge.compute` plus "gen_len" (the
        mean count of non-pad tokens per prediction), each rounded to 6 decimals.
    """
    generated, references = eval_prediction
    if isinstance(generated, tuple):
        generated = generated[0]

    # -100 marks padding and cannot be decoded, so swap in the real pad token id.
    pad_id = tokenizer.pad_token_id
    generated = np.where(generated != -100, generated, pad_id)
    references = np.where(references != -100, references, pad_id)

    generated_text = tokenizer.batch_decode(generated, skip_special_tokens=True)
    reference_text = tokenizer.batch_decode(references, skip_special_tokens=True)

    scores = rouge.compute(
        predictions=postprocess_output(generated_text),
        references=postprocess_output(reference_text),
        use_stemmer=True,
    )

    # Mean number of non-pad tokens across the generated sequences.
    lengths = [np.count_nonzero(row != pad_id) for row in generated]
    scores["gen_len"] = np.mean(lengths)

    return {name: round(score, 6) for name, score in scores.items()}

In [None]:
# Adafactor driven by an explicitly supplied learning rate: relative-step and
# parameter scaling are both switched off.
optimizer = transformers.Adafactor(
    model.parameters(),
    relative_step=False,
    scale_parameter=False,
    lr=1e-4,
)

training_args = transformers.Seq2SeqTrainingArguments(
    # Checkpoints go into the cloned model directory and are pushed to the Hub.
    output_dir=MODEL_NAME,
    push_to_hub=True,
    hub_strategy="checkpoint",
    # 2 samples per step x 64 accumulation steps = 128 samples per optimizer
    # update on each device.
    gradient_accumulation_steps=64,
    per_device_eval_batch_size=2,
    per_device_train_batch_size=2,
    num_train_epochs=10,
    # Evaluate, log and save once per epoch.
    save_strategy="epoch",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    # Evaluate on generated sequences so `compute_metrics` can score them.
    predict_with_generate=True,
)

trainer = transformers.Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=tokenized_dataset["validation"],
    train_dataset=tokenized_dataset["train"],
    compute_metrics=compute_metrics,
    # Only the optimizer is supplied; the scheduler slot is left as None.
    optimizers=(optimizer, None),
)

In [None]:
# Initial training run; the cells below continue it from
# `{MODEL_NAME}/last-checkpoint`.
# NOTE(review): the recorded output only shows epoch 0, so this run appears to
# have stopped early. Presumably a session limit; confirm.
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
0,4.3049,3.789676,0.237924,0.032123,0.148748,0.205741,247.694202


Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}


In [None]:
# Continue training from the `last-checkpoint` folder inside the output directory.
# NOTE(review): the "missing keys" warning in the output lists only embedding
# and lm_head weights. Presumably these are tied weights that are not stored
# separately in the checkpoint; confirm it is harmless.
trainer.train(resume_from_checkpoint=f"{MODEL_NAME}/last-checkpoint")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.6962,2.995634,0.248096,0.034499,0.159467,0.213062,252.77919


Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}


In [None]:
# Continue training from the `last-checkpoint` folder (recorded output: epoch 2).
trainer.train(resume_from_checkpoint=f"{MODEL_NAME}/last-checkpoint")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
2,3.0292,2.694836,0.256644,0.036827,0.160481,0.2217,252.403495


Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}


In [None]:
# Continue training from the `last-checkpoint` folder (recorded output: epoch 3).
trainer.train(resume_from_checkpoint=f"{MODEL_NAME}/last-checkpoint")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
3,2.8302,2.661316,0.264978,0.037959,0.158569,0.23096,252.957109


Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}


In [None]:
# Continue training from the `last-checkpoint` folder (recorded output: epoch 4).
trainer.train(resume_from_checkpoint=f"{MODEL_NAME}/last-checkpoint")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
4,2.7789,2.64169,0.267058,0.039428,0.161049,0.23348,252.521048


Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
4,2.7789,2.64169,0.267058,0.039428,0.161049,0.23348,252.521048


In [None]:
# Continue training from the `last-checkpoint` folder (recorded output: epoch 5).
trainer.train(resume_from_checkpoint=f"{MODEL_NAME}/last-checkpoint")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
5,2.7486,2.629568,0.262943,0.039119,0.159378,0.229928,253.563145


Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}


In [None]:
# Continue training from the `last-checkpoint` folder (recorded output: epoch 6).
trainer.train(resume_from_checkpoint=f"{MODEL_NAME}/last-checkpoint")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
6,2.7269,2.620732,0.273681,0.041908,0.161841,0.241066,253.263701


Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}


In [None]:
# Continue training from the `last-checkpoint` folder (recorded output: epoch 7).
trainer.train(resume_from_checkpoint=f"{MODEL_NAME}/last-checkpoint")


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
7,2.7109,2.615431,0.271269,0.040401,0.160898,0.238712,253.250993


Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}


In [None]:
# Continue training from the `last-checkpoint` folder (recorded output: epoch 8).
trainer.train(resume_from_checkpoint=f"{MODEL_NAME}/last-checkpoint")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
8,2.7004,2.61192,0.274069,0.041333,0.16243,0.241458,253.531374


Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}


In [None]:
# Final resume; the recorded output ends with the TrainOutput summary for the run.
# NOTE(review): that summary reports training_loss of about 0.267 while the
# logged loss for the epoch is 2.6674. Presumably the summary averages over all
# 630 global steps although only the resumed steps were accumulated; confirm
# before quoting the number.
trainer.train(resume_from_checkpoint=f"{MODEL_NAME}/last-checkpoint")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
9,2.6674,2.610904,0.272545,0.040774,0.161934,0.23992,253.127085


Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}


TrainOutput(global_step=630, training_loss=0.26674346923828124, metrics={'train_runtime': 8465.4896, 'train_samples_per_second': 9.621, 'train_steps_per_second': 0.074, 'total_flos': 1.1755641495984538e+17, 'train_loss': 0.26674346923828124, 'epoch': 9.989933709796219})