In [None]:
!pip install --upgrade pip
!pip install transformers==4.20.0
!pip install datasets
!pip install huggingface-hub
!pip install nltk
!pip install rouge-score
!pip install keras-nlp==0.3.1

In [23]:
import nltk
import numpy as np
np.random.seed(26)
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
MAX_INPUT_LENGTH = 1024  # Maximum length of the input to the model
MIN_TARGET_LENGTH = 5  # Minimum length of the output by the model
MAX_TARGET_LENGTH = 512  # Maximum length of the output by the model
BATCH_SIZE = 8  # Batch-size for training our model
LEARNING_RATE = 2e-5  # Learning-rate for training our model
MAX_EPOCHS = 1  # Maximum number of epochs we will train the model for
N_ROWS = 1000

# This notebook is built on the t5-small checkpoint from the Hugging Face Model Hub
MODEL_CHECKPOINT = "t5-small"

In [None]:
train = pd.read_csv("cnn_dailymail/train.csv")
test = pd.read_csv("cnn_dailymail/test.csv")
val = pd.read_csv("cnn_dailymail/validation.csv")

sub_train = train.sample(N_ROWS*3)
sub_train.to_csv("cnn_dailymail/sub_train.csv", index=False)

sub_test = test.sample(N_ROWS)
sub_test.to_csv("cnn_dailymail/sub_test.csv", index=False)

sub_val = test.sample(N_ROWS)
sub_val.to_csv("cnn_dailymail/sub_validation.csv", index=False)

del train, test, val, sub_train, sub_test, sub_val

In [12]:
data_files = {"train": "sub_train.csv", "test": "sub_test.csv", "validation": "sub_validation.csv"}
dataset = load_dataset("cnn_dailymail/", data_files=data_files)
dataset



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'article', 'highlights'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['id', 'article', 'highlights'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['id', 'article', 'highlights'],
        num_rows: 1000
    })
})

In [13]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [14]:
prefix = "summarize: "

In [15]:
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["highlights"], max_length=MAX_TARGET_LENGTH, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [16]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [17]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'article', 'highlights', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['id', 'article', 'highlights', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['id', 'article', 'highlights', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

In [18]:
batch_size = 8
model_name = "t5-small-summarization-fine-tuned"
model_dir = f"Models/{model_name}"

args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=30,
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="steps",
    save_steps=20,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1"
)

In [19]:
data_collator = DataCollatorForSeq2Seq(tokenizer)
metric = load_metric("rouge")

  metric = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [20]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [21]:
def model_init():
    return AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.d67b370cd9d75f81ad4eb421ee7b8db09e0b6a6c693b8c2b423af5d7bcac6205
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "len

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

storing https://huggingface.co/t5-small/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/fee5a3a0ae379232608b6eed45d2d7a0d2966b9683728838412caccc41b4b0ed.ddacdc89ec88482db20c676f0861a336f3d0409f94748c209847b49529d73885
creating metadata file for /root/.cache/huggingface/transformers/fee5a3a0ae379232608b6eed45d2d7a0d2966b9683728838412caccc41b4b0ed.ddacdc89ec88482db20c676f0861a336f3d0409f94748c209847b49529d73885
loading weights file https://huggingface.co/t5-small/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/fee5a3a0ae379232608b6eed45d2d7a0d2966b9683728838412caccc41b4b0ed.ddacdc89ec88482db20c676f0861a336f3d0409f94748c209847b49529d73885
All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use 

In [24]:
trainer.train()

loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.d67b370cd9d75f81ad4eb421ee7b8db09e0b6a6c693b8c2b423af5d7bcac6205
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "len

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
10,2.4548,2.276212,23.9505,10.8498,19.5321,22.514,19.0
20,2.6386,2.106733,24.4162,11.4641,20.1768,23.0314,19.0
30,2.2858,2.009944,24.5058,11.5887,20.271,23.1456,19.0
40,2.3341,1.949514,24.1663,11.5113,20.0773,22.8588,19.0
50,2.1831,1.90494,24.1339,11.5102,20.0376,22.7882,19.0
60,2.1315,1.874144,24.2018,11.528,20.0809,22.8313,19.0
70,1.9825,1.857059,24.2134,11.4393,20.0693,22.8242,19.0
80,2.0999,1.845397,24.0055,11.2597,19.8927,22.7012,19.0
90,2.0487,1.836158,24.0779,11.3784,19.9966,22.7717,19.0
100,1.9003,1.830714,24.1857,11.4273,20.0077,22.8402,19.0


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: article, id, highlights. If article, id, highlights are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


Step,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: article, id, highlights. If article, id, highlights are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to Models/t5-small-summarization-fine-tuned/checkpoint-20
Configuration saved in Models/t5-small-summarization-fine-tuned/checkpoint-20/config.json
Model weights saved in Models/t5-small-summarization-fine-tuned/checkpoint-20/pytorch_model.bin
tokenizer config file saved in Models/t5-small-summarization-fine-tuned/checkpoint-20/tokenizer_config.json
Special tokens file saved in Models/t5-small-summarization-fine-tuned/checkpoint-20/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: article, id, 

TrainOutput(global_step=375, training_loss=2.044940500895182, metrics={'train_runtime': 3057.7909, 'train_samples_per_second': 0.981, 'train_steps_per_second': 0.123, 'total_flos': 810923665391616.0, 'train_loss': 2.044940500895182, 'epoch': 1.0})

In [28]:
!zip -r /content/models.zip /content/Models

  adding: content/Models/ (stored 0%)
  adding: content/Models/t5-small-summarization-fine-tuned/ (stored 0%)
  adding: content/Models/t5-small-summarization-fine-tuned/runs/ (stored 0%)
  adding: content/Models/t5-small-summarization-fine-tuned/runs/Mar13_13-36-39_4b604639c0f6/ (stored 0%)
  adding: content/Models/t5-small-summarization-fine-tuned/runs/Mar13_13-36-39_4b604639c0f6/1678714744.3052428/ (stored 0%)
  adding: content/Models/t5-small-summarization-fine-tuned/runs/Mar13_13-36-39_4b604639c0f6/1678714744.3052428/events.out.tfevents.1678714744.4b604639c0f6.122.2 (deflated 63%)
  adding: content/Models/t5-small-summarization-fine-tuned/runs/Mar13_13-36-39_4b604639c0f6/events.out.tfevents.1678714613.4b604639c0f6.122.0 (deflated 72%)
  adding: content/Models/t5-small-summarization-fine-tuned/runs/Mar13_13-36-39_4b604639c0f6/1678714613.7530663/ (stored 0%)
  adding: content/Models/t5-small-summarization-fine-tuned/runs/Mar13_13-36-39_4b604639c0f6/1678714613.7530663/events.out.tfeve