In [1]:
!pip install transformers datasets evaluate rouge_score

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24936 sha256=19519dc179f7b4f9636073be5ae42e5377ed653aea337b89659e68d1ced98488
  Stored in directory: /root/.cache/pip/wheels/b0/3f/ac/cc3bc304f50c77ef38d79d8e4e2684313de39af543cb4eb3da
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.1 rouge_score-0.1.2
[0m

In [2]:
import numpy as np
import pandas as pd
from pprint import pprint
from tqdm.notebook import tqdm
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.utils import shuffle
import evaluate
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Prepare dataset

### Creating directories

In [3]:
DIR="t5"

!mkdir -p "{DIR}/dataset"
!mkdir -p "{DIR}/model"
!mkdir -p "{DIR}/tokenizer"

!pwd
!ls -r "{DIR}"

/notebooks/pretrained
tokenizer  model  dataset


### Getting Dataset

In [4]:
dataset = load_dataset("billsum")

train_test_valid = dataset["train"].train_test_split(test_size=0.2, seed=20)
test_valid = train_test_valid["test"].train_test_split(test_size=0.5, seed=20)

Downloading builder script:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/832 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset billsum/default (download: 64.14 MiB, generated: 259.80 MiB, post-processed: Unknown size, total: 323.94 MiB) to /root/.cache/huggingface/datasets/billsum/default/3.0.0/d1e95173aed3acb71327864be74ead49b578522e4c7206048b2f2e5351b57959...


Downloading data:   0%|          | 0.00/67.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

Dataset billsum downloaded and prepared to /root/.cache/huggingface/datasets/billsum/default/3.0.0/d1e95173aed3acb71327864be74ead49b578522e4c7206048b2f2e5351b57959. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

# Preprocess

### Tokenization

In [8]:
checkpoint = "t5-small"

In [9]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading tokenizer_config.json:   0%|          | 0.00/2.27k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

In [10]:
def preprocess_function(examples, max_length_inputs=1024, max_length_labels=128):
    inputs = ["summarize: " + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=max_length_inputs, truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_length_labels, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
train_test_valid

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 15159
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 3790
    })
})

In [12]:
test_valid

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1895
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1895
    })
})

In [13]:
tokenized_train = train_test_valid.map(preprocess_function, batched=True)

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [14]:
tokenized_train

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 15159
    })
    test: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3790
    })
})

In [15]:
tokenized_validation = test_valid["train"].map(preprocess_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [16]:
tokenized_validation

Dataset({
    features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1895
})

In [17]:
tokenized_test = test_valid["test"].map(preprocess_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [18]:
tokenized_test

Dataset({
    features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1895
})

### Data Collator

In [19]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

### Evaluation Metric

In [20]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

### Model

In [21]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/231M [00:00<?, ?B/s]

# Train

### Hyper-Parameters

In [25]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./output/results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=8,
    predict_with_generate=True,
    fp16=True,
)

In [26]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train["train"],
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Using cuda_amp half precision backend


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [27]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: title, summary, text. If title, summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15159
  Num Epochs = 8
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7584
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.3621,1.754534,0.2257,0.1757,0.2151,0.2151,19.0
2,1.9091,1.66525,0.2322,0.1824,0.2229,0.2231,18.9995
3,1.8365,1.621972,0.2352,0.1855,0.2268,0.2268,19.0
4,1.7821,1.594288,0.236,0.1866,0.2278,0.2279,19.0
5,1.7531,1.574268,0.2367,0.188,0.2285,0.2285,19.0
6,1.7337,1.564112,0.2369,0.1887,0.2289,0.2289,19.0
7,1.7221,1.557661,0.2376,0.1895,0.2293,0.2294,19.0
8,1.7181,1.555388,0.2375,0.1895,0.2293,0.2294,19.0


Saving model checkpoint to ./output/results/checkpoint-500
Configuration saved in ./output/results/checkpoint-500/config.json
Model weights saved in ./output/results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./output/results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./output/results/checkpoint-500/special_tokens_map.json
Copy vocab file to ./output/results/checkpoint-500/spiece.model
Deleting older checkpoint [output/results/checkpoint-3500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: title, summary, text. If title, summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1895
  Batch size = 16
Saving model checkpoint to ./output/results/checkpoint-1000
Configuration saved in ./output/results/checkpoint-1000/config.j

TrainOutput(global_step=7584, training_loss=1.829566307711702, metrics={'train_runtime': 10786.3099, 'train_samples_per_second': 11.243, 'train_steps_per_second': 0.703, 'total_flos': 3.282634189622477e+16, 'train_loss': 1.829566307711702, 'epoch': 8.0})

# Test

In [28]:
trainer.evaluate(eval_dataset=tokenized_test)

The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: title, summary, text. If title, summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1895
  Batch size = 16


{'eval_loss': 1.5680652856826782,
 'eval_rouge1': 0.2362,
 'eval_rouge2': 0.1893,
 'eval_rougeL': 0.2277,
 'eval_rougeLsum': 0.2277,
 'eval_gen_len': 19.0,
 'eval_runtime': 128.472,
 'eval_samples_per_second': 14.75,
 'eval_steps_per_second': 0.926,
 'epoch': 8.0}

# Inference

In [29]:
text = """summarize: The Inflation Reduction Act lowers prescription drug costs, 
health care costs, and energy costs. It's the most aggressive action on tackling the 
climate crisis in American history, which will lift up American workers and create good-paying, 
union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and 
corporations to pay their fair share. And no one making under $400,000 per year will 
pay a penny more in taxes."""

### Load Trained Model

In [30]:
model_output = "output/results/checkpoint-7500"

tokenizer = AutoTokenizer.from_pretrained(model_output)
inputs = tokenizer(text, return_tensors="pt").input_ids
model_ = AutoModelForSeq2SeqLM.from_pretrained(model_output)
outputs = model_.generate(inputs, max_new_tokens=100, do_sample=False)

Didn't find file output/results/checkpoint-7500/added_tokens.json. We won't load it.
loading file output/results/checkpoint-7500/spiece.model
loading file output/results/checkpoint-7500/tokenizer.json
loading file None
loading file output/results/checkpoint-7500/special_tokens_map.json
loading file output/results/checkpoint-7500/tokenizer_config.json
loading configuration file output/results/checkpoint-7500/config.json
Model config T5Config {
  "_name_or_path": "output/results/checkpoint-7500",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id

### Decode

In [31]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

'Inflation Reduction Act aims to lower prescription drug costs, health care costs, and energy costs.'

In [32]:
text = """summarize: Cake is a flour confection made from flour, sugar, and other ingredients and is usually baked. 
In their oldest forms, cakes were modifications of bread, but cakes now cover a wide range of preparations 
that can be simple or elaborate and which share features with desserts such as pastries, meringues, custards, and pies.
The most common ingredients include flour, sugar, eggs, fat (such as butter, oil, or margarine), a liquid, 
and a leavening agent, such as baking soda or baking powder. Common additional ingredients include dried, candied, or 
fresh fruit, nuts, cocoa, and extracts such as vanilla, with numerous substitutions for the primary ingredients. Cakes 
can also be filled with fruit preserves, nuts, or dessert sauces (like custard, jelly, cooked fruit, whipped cream, or 
syrups), iced with buttercream or other icings, and decorated with marzipan, piped borders, or candied fruit."""

inputs = tokenizer(text, return_tensors="pt").input_ids
outputs = model_.generate(inputs, max_new_tokens=100, do_sample=False)
tokenizer.decode(outputs[0], skip_special_tokens=True)

'flour confection made from flour, sugar, and other ingredients and is usually baked.'

In [33]:
text = """summarize: 31 minutos (English: 31 minutes) is a Chilean comedy television series and a children's music 
virtual band created by the production company Aplaplac (owned by Álvaro Díaz, Pedro Peirano and Juan Manuel Egaña) 
that began to be transmitted on March 15, 2003 by the signal of Televisión Nacional de Chile (TVN). The program 
is a parody to 60 minutos, a controversial news program of the same channel, transmitted in the 1970s and 1980s. It 
focuses on the adventures of the team of a news program of little prestige in which something unexpected 
always happens, whose presenter is Tulio Triviño. The program's notes are educational and leave an explicit 
or implicit message, while others are quite ridiculous.

In its first period, the series had three seasons, from 2003 to 2005, in addition to a participation for the 2003 Chilean 
Telethon and a Christmas special that same year. On March 27, 2008, the series was taken to the cinema under the 
title of 31 minutos, la película.

After the third season and for the next nine years the series had no new episodes. In 2012, the production company 
Aplaplac confirmed that the series would return to television with a fourth season, which was released on 
October 4, 2014 through TVN, and its last original episode was broadcast on the night of December 27, 2014. 
During its run, the series received universal acclaim from critics and viewers alike, with praise directed to its 
clever humour, soundtrack, accessibility towards children about complex issues and helping to revitalize the 
Chilean puppetry tradition.

From 2004 to 2007, it was broadcast throughout Latin America by Nickelodeon and from 2015, it began to be broadcast 
by Cartoon Network. It also broadcasts in Mexico on Canal Once and Once Niños, and its most recent season is 
available in the Netflix Latin America catalog.

31 minutos has performed throughout Chile and Mexico, making the program a musical band. On their tours they 
perform the songs broadcast on the program and their musical works outside of it."""

inputs = tokenizer(text, return_tensors="pt").input_ids
outputs = model_.generate(inputs, max_new_tokens=100, do_sample=False)
tokenizer.decode(outputs[0], skip_special_tokens=True)

"31 minutos is a Chilean comedy television series and a children's music virtual band created by the production company Aplaplac (owned by lvaro Daz, Pedro Peirano and Juan Manuel Egaa) that began to be transmitted on March 15, 2003 by the signal of Televisión Nacional de Chile (TVN). The program is a parody to 60 minutos, a controversial news program"

In [38]:
!tar -zcvf "pretrained.tar.gz" "output/results/checkpoint-7500"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
output/results/checkpoint-7500/
output/results/checkpoint-7500/trainer_state.json
output/results/checkpoint-7500/pytorch_model.bin
output/results/checkpoint-7500/training_args.bin
output/results/checkpoint-7500/optimizer.pt
output/results/checkpoint-7500/tokenizer.json
output/results/checkpoint-7500/config.json
output/results/checkpoint-7500/rng_state.pth
output/results/checkpoint-7500/scheduler.pt
output/results/checkpoint-7500/tokenizer_config.json
output/results/checkpoint-7500/scaler.pt
output/results/checkpoint-7500/special_tokens_map.json
output/results/checkpoint-7500/spiece.model
