In [1]:
# This notebook follows the Hugging Face Transformers summarization tutorial:
# https://huggingface.co/docs/transformers/tasks/summarization

In [2]:
pip install transformers datasets evaluate rouge_score


Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting absl-py (from rouge_score)
  Obtaining dependency information for absl-py from https://files.pythonhosted.org/packages/01/e4/dc0a1dcc4e74e08d7abedab278c795eef54a224363bb18f5692f416d834f/absl_py-2.0.0-py3-none-any.whl.metadata
  Downloading absl_py-2.0.0-py3-none-any.whl.metadata (2.3 kB)
Collecting nltk (from rouge_score)
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [

In [3]:
from datasets import load_dataset

# Fetch the BillSum dataset from Hugging Face, keeping only its "ca_test" split.
billsum = load_dataset(path="billsum", split="ca_test")

Downloading builder script: 100%|██████████| 3.66k/3.66k [00:00<00:00, 8.95MB/s]
Downloading metadata: 100%|██████████| 1.80k/1.80k [00:00<00:00, 16.5MB/s]
Downloading readme: 100%|██████████| 6.70k/6.70k [00:00<00:00, 33.7MB/s]
Downloading data: 100%|██████████| 67.3M/67.3M [00:02<00:00, 31.2MB/s]
Generating train split: 100%|██████████| 18949/18949 [00:00<00:00, 30669.16 examples/s]
Generating test split: 100%|██████████| 3269/3269 [00:00<00:00, 31337.19 examples/s]
Generating ca_test split: 100%|██████████| 1237/1237 [00:00<00:00, 22608.09 examples/s]


In [4]:
# Split into train and test sets.
#
# `test_size` is the proportion of examples held out for evaluation; the
# remaining 80% is used to train the model.
# A fixed `seed` makes the shuffle, and therefore the split and every metric
# computed on it, reproducible across kernel restarts.
# NOTE: this cell rebinds `billsum`, so it must be run exactly once after the
# loading cell; re-run the loading cell first if you need to split again.
billsum = billsum.train_test_split(test_size=0.2, seed=42)

In [5]:
billsum

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 989
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 248
    })
})

In [6]:
from transformers import AutoTokenizer 

# Load the tokenizer that belongs to the t5-small checkpoint.
tokenizer_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

Downloading (…)okenizer_config.json: 100%|██████████| 2.32k/2.32k [00:00<00:00, 8.13MB/s]
Downloading (…)ve/main/spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 4.67MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.39M/1.39M [00:00<00:00, 47.1MB/s]


In [7]:
# Task prefix prepended to every document so T5 knows it should summarize.
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: A batch mapping with a "text" entry (the documents) and a
            "summary" entry (the reference summaries).

    Returns:
        The tokenizer output for the prefixed documents (truncated to 1024
        tokens), with an added "labels" entry holding the token ids of the
        summaries (truncated to 128 tokens).
    """
    documents = [prefix + document for document in examples["text"]]
    encoded = tokenizer(documents, max_length=1024, truncation=True)

    # `text_target` tokenizes the summaries as targets rather than as inputs.
    target_ids = tokenizer(
        text_target=examples["summary"],
        max_length=128,
        truncation=True,
    )["input_ids"]

    encoded["labels"] = target_ids
    return encoded

In [9]:
# Tokenize both splits; batched=True hands preprocess_function many examples per call.
tokenized_billsum = billsum.map(
    function=preprocess_function,
    batched=True,
)

Map: 100%|██████████| 989/989 [00:01<00:00, 821.79 examples/s]
Map: 100%|██████████| 248/248 [00:00<00:00, 823.91 examples/s]


In [11]:
# Create a batch of examples

from transformers import DataCollatorForSeq2Seq

# NOTE(review): `model` is given the checkpoint name as a string rather than a
# loaded model object; presumably the collator expects the model itself, so
# confirm against the DataCollatorForSeq2Seq documentation. The model is only
# loaded in a later cell, so passing it here would require reordering cells.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="t5-small")

In [12]:
# Include a metric to evaluate your model's performance

import evaluate

# Load the ROUGE metric implementation through the `evaluate` library.
rouge = evaluate.load(path="rouge")

Downloading builder script: 100%|██████████| 6.27k/6.27k [00:00<00:00, 26.5MB/s]


In [13]:
# Compute the ROUGE metric for the model

import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is itself a tuple, its first element is used.

    Returns:
        A dict with the ROUGE results plus ``"gen_len"`` (the mean number of
        non-padding tokens per prediction), each rounded to 4 decimal places.
    """
    predictions, labels = eval_pred

    # Some configurations hand back a tuple whose first element holds the
    # generated token ids; unwrap it so the array operations below work.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks positions to ignore and is not a valid token id, so it must be
    # swapped for the pad token before decoding. This is done for predictions
    # as well as labels, because generated sequences can be padded with -100
    # when batches of different lengths are gathered.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Length of each prediction, counted as its number of non-padding tokens.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


# Time to Train the Model!

In [14]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Start from the pretrained t5-small weights; fine-tuning happens in the cells below.
model_checkpoint = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json: 100%|██████████| 1.21k/1.21k [00:00<00:00, 8.94MB/s]
Downloading model.safetensors: 100%|██████████| 242M/242M [00:03<00:00, 75.1MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 147/147 [00:00<00:00, 1.79MB/s]


In [20]:
# Fine-tuning hyperparameters, gathered in one mapping so they are easy to scan and tweak.
training_config = {
    "output_dir": "sample_billsum_model",
    "evaluation_strategy": "epoch",  # run evaluation once per epoch
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "weight_decay": 0.01,
    "save_total_limit": 3,
    "num_train_epochs": 4,
    "predict_with_generate": True,  # evaluate on generated text so ROUGE can be computed
    "fp16": False,
    "push_to_hub": False,
}
training_args = Seq2SeqTrainingArguments(**training_config)

In [24]:
# Wire the model, data, tokenizer, collator and metric function into one trainer.
train_split = tokenized_billsum["train"]
eval_split = tokenized_billsum["test"]

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Please USE CUDA to Train this Model. I am on Mac so I don't have access to a GPU. I might rent a cloud gpu!

In [25]:
# Run fine-tuning. The recorded run below took roughly 27 minutes for 248 steps
# and evaluated once at the end of each of the 4 epochs.
trainer.train()

  0%|          | 0/248 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  if unfinished_sequences.max() == 0:
                                                
 25%|██▌       | 62/248 [05:33<13:58,  4.51s/it]

{'eval_loss': 2.725334405899048, 'eval_rouge1': 0.1284, 'eval_rouge2': 0.0376, 'eval_rougeL': 0.1051, 'eval_rougeLsum': 0.105, 'eval_gen_len': 19.0, 'eval_runtime': 33.586, 'eval_samples_per_second': 7.384, 'eval_steps_per_second': 0.476, 'epoch': 1.0}


                                                 
 50%|█████     | 124/248 [12:53<19:37,  9.50s/it]

{'eval_loss': 2.5248260498046875, 'eval_rouge1': 0.137, 'eval_rouge2': 0.0465, 'eval_rougeL': 0.1128, 'eval_rougeLsum': 0.1128, 'eval_gen_len': 19.0, 'eval_runtime': 32.1068, 'eval_samples_per_second': 7.724, 'eval_steps_per_second': 0.498, 'epoch': 2.0}


                                                 
 75%|███████▌  | 186/248 [19:09<08:40,  8.39s/it]

{'eval_loss': 2.46444034576416, 'eval_rouge1': 0.1406, 'eval_rouge2': 0.0498, 'eval_rougeL': 0.1149, 'eval_rougeLsum': 0.1149, 'eval_gen_len': 19.0, 'eval_runtime': 29.6516, 'eval_samples_per_second': 8.364, 'eval_steps_per_second': 0.54, 'epoch': 3.0}


                                                 
100%|██████████| 248/248 [27:18<00:00,  6.61s/it]

{'eval_loss': 2.447948932647705, 'eval_rouge1': 0.1431, 'eval_rouge2': 0.05, 'eval_rougeL': 0.116, 'eval_rougeLsum': 0.116, 'eval_gen_len': 19.0, 'eval_runtime': 28.9727, 'eval_samples_per_second': 8.56, 'eval_steps_per_second': 0.552, 'epoch': 4.0}
{'train_runtime': 1638.4635, 'train_samples_per_second': 2.414, 'train_steps_per_second': 0.151, 'train_loss': 2.9873620310137348, 'epoch': 4.0}





TrainOutput(global_step=248, training_loss=2.9873620310137348, metrics={'train_runtime': 1638.4635, 'train_samples_per_second': 2.414, 'train_steps_per_second': 0.151, 'train_loss': 2.9873620310137348, 'epoch': 4.0})

In [27]:
# Persist the fine-tuned model so it can be reloaded from disk by path.
trainer.save_model(output_dir="./sample_billsum_model/")

In [22]:
# The model is now trained.

## Let's test our new model!

In [35]:
from transformers import pipeline
# Sample passage to summarize, carrying the same task prefix used during preprocessing.
# NOTE(review): the summarization pipeline may already prepend the prefix stored
# in the model config; confirm whether the explicit "summarize: " here is redundant.
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

# Load the fine-tuned model from the directory it was saved to.
saved_model_dir = "./sample_billsum_model/"
summarizer = pipeline("summarization", model=saved_model_dir, max_length=100)
summarizer(text)

[{'summary_text': "the Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs . it's the most aggressive action on tackling the climate crisis in American history . no one making under $400,000 per year will pay a penny more in taxes ."}]