## Install the dependencies

In [None]:
!pip install transformers datasets evaluate rouge_score

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# Presumably needed because the training cell sets push_to_hub=True.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Dataset loading

In [None]:
# Load only the "ca_test" split of the BillSum dataset; it is re-split into
# train/test portions in the next cell.
from datasets import load_dataset

billsum = load_dataset("billsum", split="ca_test")

In [None]:
# Hold out 20% of the examples for evaluation. The seed is fixed so that a
# Restart & Run All reproduces the same train/test partition (and therefore
# comparable metrics) on every run.
billsum = billsum.train_test_split(test_size=0.2, seed=42)

In [None]:
# Inspect the first training example (the record includes the bill 'text').
billsum['train'][0]

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 18120 of the Penal Code is amended to read:\n18120.\n(a) A person subject to a gun violence restraining order issued pursuant to this division shall not have in his or her custody or control, own, purchase, possess, or receive any firearms or ammunition while that order is in effect.\n(b) (1) Upon issuance of a gun violence restraining order issued pursuant to this division, the court shall order the restrained person to surrender all firearms and ammunition in the restrained person’s custody or control, or which the restrained person possesses or owns pursuant to paragraph (2).\n(2) The surrender ordered pursuant to paragraph (1) shall occur by immediately surrendering all firearms and ammunition in a safe manner, upon request of any law enforcement officer, to the control of the officer, after being served with the restraining order. A law enforcement officer serving a gun violence restraini

## Preprocessing

In [None]:
from transformers import AutoTokenizer

In [None]:
# Base checkpoint used for both the tokenizer here and the model loaded in the
# Train section.
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Task prompt prepended to every document before tokenization.
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: a batch mapping with a "text" list (source documents) and a
            "summary" list (target summaries).

    Returns:
        The tokenizer output for the prefixed documents (truncated to 1024
        tokens) with an added "labels" entry holding the token ids of the
        summaries (truncated to 128 tokens).
    """
    prompts = []
    for document in examples["text"]:
        prompts.append(prefix + document)

    encoded = tokenizer(prompts, max_length=1024, truncation=True)
    targets = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    # The target token ids are what the model is trained to produce.
    encoded["labels"] = targets["input_ids"]
    return encoded

In [None]:
# Apply the preprocessing to both splits; batched=True hands
# preprocess_function lists of examples rather than one example at a time.
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

## Dynamic padding

In [None]:
# Collator that batches the tokenized examples; per the section title it is
# used for dynamic (per-batch) padding.
from transformers import DataCollatorForSeq2Seq

# NOTE(review): `model` receives the checkpoint name string ("t5-small"), not a
# loaded model object — confirm this is intended. The model itself is only
# instantiated later, in the Train section.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

## Evaluation metrics

In [None]:
import evaluate

# ROUGE metric object, used by compute_metrics and compute_metrics_str below.
rouge = evaluate.load("rouge")

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Decode generated ids and reference labels, then score them with ROUGE.

    Args:
        eval_pred: a (predictions, labels) pair of token-id arrays as passed by
            the trainer. `predictions` may also arrive as a tuple whose first
            element holds the generated ids.

    Returns:
        dict mapping each ROUGE score name, plus "gen_len" (mean number of
        non-padding tokens per prediction), to a value rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is the "ignore" marker and is not a valid token id. It can appear in
    # the labels and also in the predictions when batches of different lengths
    # are gathered, so map it to the pad token in both before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Computed after the -100 replacement so padding is never counted as output.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    print('results of rouge: {}'.format(result.items()))

    return {k: round(v, 4) for k, v in result.items()}

## Train

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Start from the pretrained weights of the same checkpoint the tokenizer uses.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# !pip install transformers[torch]
!pip install accelerate -U



In [None]:
# Training configuration: evaluate once per epoch, generate summaries during
# evaluation (predict_with_generate) so ROUGE can be computed, train in fp16,
# and push results to the Hub.
# NOTE(review): the logged "Gen Len" is 19.0 at every epoch, which suggests
# evaluation-time generation hits a default length cap; consider setting
# generation_max_length — TODO confirm.
training_args = Seq2SeqTrainingArguments(
    output_dir="my_first_t5_billsum_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

train_output = trainer.train()

# Later cells load the model from output_dir. Checkpoints are only written on
# the save schedule, so the last training steps are not guaranteed to be on
# disk; save the final weights and tokenizer explicitly. With push_to_hub=True
# this is also expected to upload the final model.
trainer.save_model()

# Keep the training summary as the cell's displayed output.
train_output

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.617571,0.1314,0.0412,0.1087,0.1087,19.0
2,No log,2.528976,0.1412,0.0538,0.1197,0.1199,19.0
3,No log,2.476114,0.1802,0.0866,0.1541,0.1543,19.0
4,No log,2.441154,0.1924,0.0976,0.1642,0.1646,19.0
5,No log,2.417815,0.1949,0.0996,0.1669,0.1671,19.0
6,No log,2.400876,0.1955,0.1001,0.1674,0.1678,19.0
7,No log,2.388021,0.1949,0.0994,0.1668,0.167,19.0
8,No log,2.37989,0.196,0.1,0.1675,0.1678,19.0
9,2.588900,2.374719,0.1967,0.1,0.1682,0.1684,19.0
10,2.588900,2.372993,0.1957,0.0997,0.1673,0.1676,19.0


TrainOutput(global_step=620, training_loss=2.5650133194461944, metrics={'train_runtime': 725.5992, 'train_samples_per_second': 13.63, 'train_steps_per_second': 0.854, 'total_flos': 2677060833116160.0, 'train_loss': 2.5650133194461944, 'epoch': 10.0})

## Inference

In [None]:
from transformers import pipeline

# Sample input for a quick sanity check of the fine-tuned model.
# NOTE(review): the text already starts with the "summarize: " task prefix;
# check whether the summarization pipeline prepends a prefix of its own for
# this model — TODO confirm.
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."
# Load the fine-tuned model from the training output directory (absolute
# Colab-style path) and summarize with the pipeline's default settings.
summarizer = pipeline("summarization", model="/content/my_first_t5_billsum_model")
summarizer(text)

Your max_length is set to 200, but your input_length is only 103. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


[{'summary_text': "The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country."}]

In [None]:
# Manual inference, step 1: tokenize the prompt with the tokenizer saved next
# to the fine-tuned model. This rebinds the notebook-level `tokenizer`.
from transformers import AutoTokenizer

text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."
tokenizer = AutoTokenizer.from_pretrained("/content/my_first_t5_billsum_model")
# Only the input ids are kept, as a PyTorch tensor; the attention mask is dropped.
inputs = tokenizer(text, return_tensors="pt").input_ids

In [None]:
# Manual inference, step 2: reload the fine-tuned model from disk (rebinding
# `model`) and generate without sampling, capped at 20 new tokens.
model = AutoModelForSeq2SeqLM.from_pretrained("/content/my_first_t5_billsum_model")
outputs = model.generate(inputs, max_new_tokens=20, do_sample=False)

In [None]:
# Raw generated token ids (one row per input sequence).
outputs

tensor([[   0,   37,   86,   89, 6105,  419, 8291, 1983, 1364,    7, 7744, 2672,
         1358,    6,  533,  124, 1358,    6,   11,  827, 1358]])

In [None]:
# Manual inference, step 3: turn the first generated sequence back into text,
# dropping special tokens.
tokenizer.decode(outputs[0], skip_special_tokens=True)

'The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs'

## Evaluation

In [None]:
# Model evaluation using
def compute_metrics_str(predictions, labels):
    """Score already-decoded prediction strings against reference strings.

    String-level counterpart of compute_metrics: no token decoding is done.

    Args:
        predictions: list of predicted summary strings.
        labels: list of reference summary strings.

    Returns:
        A pair (rounded, raw): the ROUGE scores rounded to 4 decimals, and the
        unrounded result as returned by the metric.
    """
    result = rouge.compute(references=labels, predictions=predictions, use_stemmer=True)
    print('results of rouge: {}'.format(result.items()))

    rounded = {}
    for name, score in result.items():
        rounded[name] = round(score, 4)
    return rounded, result

# Sanity check: identical prediction and reference.
accuracy_matrix, results = compute_metrics_str(['again kerala'],['again kerala'])

In [None]:
# Rounded ROUGE scores from the sanity check above.
accuracy_matrix

{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}

In [None]:
# Unrounded ROUGE scores from the same sanity check.
results

{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}

## ONNX conversion

In [None]:
# Load the fine-tuned model and tokenizer from the training output directory.
# NOTE(review): the next cell reloads both, and `dummy_model_input` is not
# used anywhere else in the visible notebook — this cell looks redundant.
model_id = "/content/my_first_t5_billsum_model"
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
dummy_model_input = tokenizer("This is a sample", return_tensors="pt")

In [None]:
# Export the fine-tuned model to ONNX with the transformers.onnx utilities.
from pathlib import Path
import transformers
from transformers.onnx import FeaturesManager
from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM

# Load the model and tokenizer; `feature` selects which export configuration
# is looked up for the model.
model_id = "/content/my_first_t5_billsum_model"
feature = "seq2seq-lm"
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Resolve the ONNX config class for this model/feature pair (raises if the
# combination is unsupported) and instantiate it from the model's config.
model_kind, model_onnx_config = FeaturesManager.check_supported_model_or_raise(model, feature=feature)
onnx_config = model_onnx_config(model.config)

# Write the ONNX graph next to the saved model using opset 13.
onnx_inputs, onnx_outputs = transformers.onnx.export(
        preprocessor=tokenizer,
        model=model,
        config=onnx_config,
        opset=13,
        output=Path("/content/my_first_t5_billsum_model/t5-model.onnx")
)

verbose: False, log level: Level.ERROR



## ONNX inference

In [None]:
from onnxruntime import InferenceSession

# Open the exported graph. The providers are listed in preference order, with
# the CPU provider last.
session = InferenceSession("/content/my_first_t5_billsum_model/t5-model.onnx", providers=['CUDAExecutionProvider', 'AzureExecutionProvider', 'CPUExecutionProvider'])



In [None]:
# NOTE(review): `model_id` is assigned here but the tokenizer below is loaded
# from the same path written out literally.
model_id = "/content/my_first_t5_billsum_model"

text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."
tokenizer = AutoTokenizer.from_pretrained("/content/my_first_t5_billsum_model")

# NumPy tensors this time, since the result is fed to the ONNX Runtime session.
model_input = tokenizer(text, return_tensors="np")



In [None]:
# Supply decoder-side inputs by reusing the encoder-side arrays.
# NOTE(review): presumably this only satisfies the exported graph's input
# signature for a single forward pass; it is not a generation loop, so the
# result is not a summary — TODO confirm the intent.
model_input['decoder_input_ids'] = model_input['input_ids']
model_input['decoder_attention_mask'] = model_input['attention_mask']

In [None]:
# InferenceSession.run takes a *list* of output names and a plain dict of
# NumPy arrays; a bare string and the tokenizer's BatchEncoding object raise
# TypeError. A "seq2seq-lm" export exposes its result under "logits".
outputs = session.run(output_names=["logits"], input_feed=dict(model_input))

TypeError: ignored