## Importing Modules

In [1]:
import os
import pathlib
import numpy as np
import pandas as pd
import evaluate
from transformers import pipeline
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoTokenizer, TFAutoModel
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
import torch
from transformers import DefaultDataCollator

if torch.cuda.is_available():
    print("GPU is available!")
else:
    print("GPU is not available.")

  from .autonotebook import tqdm as notebook_tqdm



GPU is available!


In [2]:
MAIN_PATH = str(pathlib.Path().resolve())
DATASET_PATH = MAIN_PATH + '\\datasets'
MODEL_PATH = MAIN_PATH + '\\models'

In [3]:
models = os.listdir(MODEL_PATH)
models

['bert-base-cased',
 'bert-base-multilingual-cased',
 'bert-base-uncased',
 'bert-large-cased',
 'bert-large-uncased',
 'flan-t5-base',
 'flan-t5-large',
 'flan-t5-small',
 'gpt2',
 'gpt2-large',
 'gpt2-medium']

In [4]:
model_path = MODEL_PATH + '\\' + models[5]
model_path

'D:\\Python\\LLM_Environment\\models\\flan-t5-base'

In [5]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

## Import Dataset

In [6]:
# Load the dataset
billsum = load_dataset("billsum", split="ca_test")

In [7]:
billsum

Dataset({
    features: ['text', 'summary', 'title'],
    num_rows: 1237
})

In [8]:
billsum = billsum.train_test_split(test_size=0.2)

In [9]:
billsum

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 989
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 248
    })
})

In [10]:
example = billsum["train"][0]
for key in example:
    print("A key of the example: \"{}\"".format(key))
    print("The value corresponding to the key-\"{}\"\n \"{}\"".format(key, example[key]))

A key of the example: "text"
The value corresponding to the key-"text"
 "The people of the State of California do enact as follows:


SECTION 1.
The Legislature finds and declares all of the following:
(a) Corinthian Colleges, Inc., was the target of consumer and taxpayer protection enforcement efforts by the federal government, the Attorney General, and other state and federal authorities.
(b) Based on findings of harm to students enrolled at Corinthian Colleges campuses, the United States Department of Education announced debt relief programs to assist students, including all of the following:
(1) A student who attended a Corinthian Colleges campus that closed on April 27, 2015, and withdrew any time after June 20, 2014, is eligible to apply for a closed school loan discharge, so long as the student does not transfer earned credit and subsequently
complete
completes
a comparable program at another institution.
(2) A student who believes he or she was a victim of fraud or other violat

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [12]:
tokenized_text = tokenizer(example['text'])
for key in tokenized_text:
    print(key)
    print(tokenized_text[key])

Token indices sequence length is longer than the specified maximum sequence length for this model (3215 > 512). Running this sequence through the model will result in indexing errors


input_ids
[37, 151, 13, 8, 1015, 13, 1826, 103, 3, 35, 2708, 38, 6963, 10, 180, 3073, 9562, 1300, 37, 28204, 12902, 11, 15884, 7, 66, 13, 8, 826, 10, 41, 9, 61, 2487, 77, 17, 29022, 1888, 7, 6, 1542, 5, 6, 47, 8, 2387, 13, 3733, 11, 15375, 1711, 7961, 2231, 57, 8, 2822, 789, 6, 8, 10154, 2146, 6, 11, 119, 538, 11, 2822, 5779, 5, 41, 115, 61, 6719, 30, 7469, 13, 6263, 12, 481, 3, 15097, 44, 2487, 77, 17, 29022, 1888, 7, 25784, 6, 8, 907, 1323, 1775, 13, 2855, 2162, 2814, 4956, 1356, 12, 2094, 481, 6, 379, 66, 13, 8, 826, 10, 5637, 71, 1236, 113, 5526, 3, 9, 2487, 77, 17, 29022, 1888, 7, 4730, 24, 3168, 30, 1186, 14141, 1230, 6, 11, 28, 26, 60, 210, 136, 97, 227, 1515, 16047, 1412, 6, 19, 5573, 12, 1581, 21, 3, 9, 3168, 496, 2289, 12445, 6, 78, 307, 38, 8, 1236, 405, 59, 2025, 4964, 998, 11, 3, 14064, 743, 743, 7, 3, 9, 13289, 478, 44, 430, 6568, 5, 6499, 71, 1236, 113, 7228, 3, 88, 42, 255, 47, 3, 9, 7584, 13, 7712, 42, 119, 17880, 13, 538, 973, 57, 2487, 77, 17, 29022, 1888, 7, 54, 158

In [13]:
def preprocess_function(examples):
    # Prepends the string "summarize: " to each document in the 'text' field of the input examples.
    # This is done to instruct the T5 model on the task it needs to perform, which in this case is summarization.
    inputs = ["summarize: " + doc for doc in examples["text"]]

    # Tokenizes the prepended input texts to convert them into a format that can be fed into the T5 model.
    # Sets a maximum token length of 1024, and truncates any text longer than this limit.
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # Tokenizes the 'summary' field of the input examples to prepare the target labels for the summarization task.
    # Sets a maximum token length of 128, and truncates any text longer than this limit.
    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    # Assigns the tokenized labels to the 'labels' field of model_inputs.
    # The 'labels' field is used during training to calculate the loss and guide model learning.
    model_inputs["labels"] = labels["input_ids"]

    # Returns the prepared inputs and labels as a single dictionary, ready for training.
    return model_inputs

In [14]:
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map: 100%|██████████| 989/989 [00:01<00:00, 830.14 examples/s]
Map: 100%|██████████| 248/248 [00:00<00:00, 595.71 examples/s]


In [15]:
tokenized_billsum['test'][0]['text']

'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 1797.5 of the Fish and Game Code is amended to read:\n1797.5.\nFor the purposes of this chapter, the following terms shall have the following meanings:\n(a) “Bank” means a conservation bank, mitigation bank, or conservation and mitigation bank.\n(b) “Bank enabling instrument” means a written agreement with the department regarding the establishment, use, operation, and maintenance of the bank.\n(c) “Bank sponsor” means the person or entity responsible for establishing and operating a bank.\n(d) “Conservation bank” means a publicly or privately owned and operated site that is to be conserved and managed in accordance with a written agreement with the department that includes provisions for the issuance of credits, on which important habitat, including habitat for threatened, endangered, or other special status species, exists, has been, or will be created to do any of the following:\n(1) Compensate for 

In [16]:
tokenized_billsum['test'][0]['summary']

'Existing law requires the Department of Fish and Wildlife to administer the Significant Natural Areas Program, and requires the department, among other things, to develop and maintain a spatial data system that identifies those areas in the state that are most essential for maintaining habitat connectivity, including wildlife corridors and habitat linkages. Existing law requires the department, contingent upon the provision of certain funding, to investigate, study, and identify those areas in the state that are most essential as wildlife corridors and habitat linkages and prioritize vegetative data development in those areas. Existing law requires the department to seek input from representatives of other state agencies, local government, federal agencies, nongovernmental conservation organizations, landowners, agriculture, recreation, scientific entities, and industry in determining essential wildlife corridors and habitat linkages.\nThis bill would declare that it is the policy of 

In [17]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_path)

In [18]:
rouge = evaluate.load("rouge")

In [19]:
def compute_metrics(eval_pred):
    # Unpacks the evaluation predictions tuple into predictions and labels.
    predictions, labels = eval_pred

    # Decodes the tokenized predictions back to text, skipping any special tokens (e.g., padding tokens).
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replaces any -100 values in labels with the tokenizer's pad_token_id.
    # This is done because -100 is often used to ignore certain tokens when calculating the loss during training.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decodes the tokenized labels back to text, skipping any special tokens (e.g., padding tokens).
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Computes the ROUGE metric between the decoded predictions and decoded labels.
    # The use_stemmer parameter enables stemming, which reduces words to their root form before comparison.
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Calculates the length of each prediction by counting the non-padding tokens.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]

    # Computes the mean length of the predictions and adds it to the result dictionary under the key "gen_len".
    result["gen_len"] = np.mean(prediction_lens)

    # Rounds each value in the result dictionary to 4 decimal places for cleaner output, and returns the result.
    return {k: round(v, 4) for k, v in result.items()}

In [20]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_fine_tuned_t5_small_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
)



In [21]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [22]:
trainer.train()

                                                  
 25%|██▌       | 495/1980 [04:13<08:01,  3.08it/s]

{'eval_loss': nan, 'eval_rouge1': 0.103, 'eval_rouge2': 0.0296, 'eval_rougeL': 0.0858, 'eval_rougeLsum': 0.0858, 'eval_gen_len': 15.1613, 'eval_runtime': 96.7346, 'eval_samples_per_second': 2.564, 'eval_steps_per_second': 1.282, 'epoch': 1.0}


 25%|██▌       | 500/1980 [04:14<2:59:50,  7.29s/it] 

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 2e-05, 'epoch': 1.01}


                                                  
 50%|█████     | 990/1980 [08:23<04:31,  3.64it/s]

{'eval_loss': nan, 'eval_rouge1': 0.103, 'eval_rouge2': 0.0296, 'eval_rougeL': 0.0858, 'eval_rougeLsum': 0.0858, 'eval_gen_len': 15.1613, 'eval_runtime': 95.839, 'eval_samples_per_second': 2.588, 'eval_steps_per_second': 1.294, 'epoch': 2.0}


 51%|█████     | 1000/1980 [08:26<24:03,  1.47s/it] 

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 2e-05, 'epoch': 2.02}


                                                   
 75%|███████▌  | 1485/1980 [12:36<02:14,  3.68it/s]

{'eval_loss': nan, 'eval_rouge1': 0.103, 'eval_rouge2': 0.0296, 'eval_rougeL': 0.0858, 'eval_rougeLsum': 0.0858, 'eval_gen_len': 15.1613, 'eval_runtime': 97.14, 'eval_samples_per_second': 2.553, 'eval_steps_per_second': 1.277, 'epoch': 3.0}


 76%|███████▌  | 1500/1980 [12:40<04:03,  1.97it/s]  

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 2e-05, 'epoch': 3.03}


                                                   
100%|██████████| 1980/1980 [16:51<00:00,  1.96it/s]

{'eval_loss': nan, 'eval_rouge1': 0.103, 'eval_rouge2': 0.0296, 'eval_rougeL': 0.0858, 'eval_rougeLsum': 0.0858, 'eval_gen_len': 15.1613, 'eval_runtime': 96.3676, 'eval_samples_per_second': 2.573, 'eval_steps_per_second': 1.287, 'epoch': 4.0}
{'train_runtime': 1011.3412, 'train_samples_per_second': 3.912, 'train_steps_per_second': 1.958, 'train_loss': 0.0, 'epoch': 4.0}





TrainOutput(global_step=1980, training_loss=0.0, metrics={'train_runtime': 1011.3412, 'train_samples_per_second': 3.912, 'train_steps_per_second': 1.958, 'total_flos': 5417781448734720.0, 'train_loss': 0.0, 'epoch': 4.0})

In [23]:
trainer.save_model("my_fine_tuned_t5_small_model")

In [24]:
text = billsum['test'][100]['text']
text = "summarize: " + text
text

'summarize: The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 43.101 is added to the Civil Code, to read:\n43.101.\n(a) An emergency responder shall not be liable for any damage to an unmanned aircraft or unmanned aircraft system, if the damage was caused while the emergency responder was providing, and the unmanned aircraft or unmanned aircraft system was interfering with, the operation, support, or enabling of the emergency services listed in Section 853 of the Government Code.\n(b) (1) For purposes of this section, “emergency responder” means either of the following, if acting within the scope of authority implicitly or expressly provided by a public entity or a public employee to provide emergency services:\n(A) A paid or unpaid volunteer.\n(B) A private entity.\n(2) All of the following terms shall have the same meaning as the terms as used in Chapter 4.5 (commencing with Section 853) of Part 2 of Division 3.6 of Title 1 of the Government Code:\n(

In [25]:
summarizer = pipeline("summarization", model="my_fine_tuned_t5_small_model")
pred = summarizer(text)
pred

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Token indices sequence length is longer than the specified maximum sequence length for this model (1234 > 512). Running this sequence through the model will result in indexing errors


[{'summary_text': 'A bill to amend the Civil Code of California to protect public and private emergency responders from liability relating to the new and increasing proliferation of unmanned aircraft systems that disrupt the provision of emergency services.'}]

In [26]:
tokenizer = AutoTokenizer.from_pretrained("my_fine_tuned_t5_small_model")
inputs = tokenizer(text, return_tensors="pt").input_ids
inputs

Token indices sequence length is longer than the specified maximum sequence length for this model (1232 > 512). Running this sequence through the model will result in indexing errors


tensor([[21603,    10,    37,  ...,  2017,     5,     1]])

In [27]:
model = AutoModelForSeq2SeqLM.from_pretrained("my_fine_tuned_t5_small_model")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [28]:
pred[0]['summary_text']

'A bill to amend the Civil Code of California to protect public and private emergency responders from liability relating to the new and increasing proliferation of unmanned aircraft systems that disrupt the provision of emergency services.'

In [29]:
preds = [pred[0]['summary_text']]

In [30]:
labels = [billsum['test'][100]['summary']]

In [31]:
rouge.compute(predictions=preds, references=labels, use_stemmer=True)

{'rouge1': 0.11654135338345864,
 'rouge2': 0.033962264150943396,
 'rougeL': 0.09022556390977444,
 'rougeLsum': 0.10526315789473684}