In [1]:
!pip install pandas



In [2]:
import pandas as pd

In [3]:
df = pd.read_json('./BillData.json', orient='index')
df['idx'] = range(1, len(df) + 1)

In [4]:
df = df.rename(columns={"DESCRIPTION": "summary", "BILL_TEXT": "text"})
df.head()

Unnamed: 0,summary,BILL_URL,text,idx
HF955,Financial assurance required for feedlot permi...,www.revisor.mn.gov/bills/text.php?number=HF095...,Subd. 7f.\nFinancial assurance.\n(a) Before t...,1
HF911,Board of Animal Health modified.,www.revisor.mn.gov/bills/text.php?number=HF091...,"11\n two at-large members, one member who is ...",2
HF860,Local food promotion and education event fundi...,www.revisor.mn.gov/bills/text.php?number=HF086...,"APPROPRIATION.\n $25,000 in fiscal year 2024 a...",3
HF798,Future of Advanced Agriculture Research in Min...,www.revisor.mn.gov/bills/text.php?number=HF079...,FUTURE OF ADVANCED AGRICULTURE RESEARCH IN MI...,4
HF695,Agriculture-related business assistance fundin...,www.revisor.mn.gov/bills/text.php?number=HF069...,APPROPRIATIONS; BUSINESS ASSISTANCE GRANTS.\n ...,5


In [5]:
df.shape

(16, 4)

In [6]:
from transformers import AutoTokenizer
from datasets import Dataset

In [7]:
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)

In [8]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [9]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [10]:
tokenized_ds = dataset.map(preprocess_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['summary', 'BILL_URL', 'text', 'idx', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 12
    })
    test: Dataset({
        features: ['summary', 'BILL_URL', 'text', 'idx', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4
    })
})

In [12]:
# !pip install rouge_score


In [13]:
import evaluate

rouge = evaluate.load("rouge")

In [14]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [15]:
access_token = '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'

In [16]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small", use_auth_token=access_token)

In [17]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [18]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [19]:
from transformers import logging

In [20]:
logging.set_verbosity_info()

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="summarization_model",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=20,
    num_train_epochs=1000,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    report_to=None
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, __index_level_0__, text, idx, BILL_URL. If summary, __index_level_0__, text, idx, BILL_URL are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 12
  Num Epochs = 1000
  Instantaneous batch size per device = 16
  Total train batch

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,5.58798,0.1847,0.0217,0.1447,0.1447,19.0
2,No log,5.577743,0.1847,0.0217,0.1447,0.1447,19.0
3,No log,5.558047,0.1847,0.0217,0.1447,0.1447,19.0
4,No log,5.547746,0.1847,0.0217,0.1447,0.1447,19.0
5,No log,5.519469,0.1847,0.0217,0.1447,0.1447,19.0
6,No log,5.502828,0.1847,0.0217,0.1447,0.1447,19.0
7,No log,5.488652,0.1847,0.0217,0.1447,0.1447,19.0
8,No log,5.461714,0.1847,0.0217,0.1447,0.1447,19.0
9,No log,5.443684,0.1855,0.0217,0.1455,0.1455,19.0
10,No log,5.426117,0.1855,0.0217,0.1455,0.1455,19.0


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, __index_level_0__, text, idx, BILL_URL. If summary, __index_level_0__, text, idx, BILL_URL are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4
  Batch size = 16
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, __index_level_0__, text, idx, BILL_URL. If summary, __index_level_0__, text, idx, BILL_URL are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4
  Batch size = 16
Generate config GenerationConfig {
  "deco

In [None]:
trainer.save_model()

In [None]:
text = df['text'][0]
text = f'summarize: {text}'

In [None]:
from transformers import pipeline

summarizer = pipeline("summarization", model="./summarization_model/")

In [None]:
summarizer(text)

In [None]:
text

In [None]:
summary = df['summary'][0]
summary