
## Importing Modules

In [1]:
import os
import pathlib
import numpy as np
import pandas as pd
import evaluate
from transformers import pipeline
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoTokenizer, TFAutoModel
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
import torch
from transformers import DefaultDataCollator

if torch.cuda.is_available():
    print("GPU is available!")
else:
    print("GPU is not available.")

  from .autonotebook import tqdm as notebook_tqdm



GPU is available!


In [2]:
MAIN_PATH = str(pathlib.Path().resolve())
DATASET_PATH = MAIN_PATH + '\\datasets'
MODEL_PATH = MAIN_PATH + '\\models'

In [3]:
models = os.listdir(MODEL_PATH)
models

['bert-base-cased',
 'bert-base-multilingual-cased',
 'bert-base-uncased',
 'bert-large-cased',
 'bert-large-uncased',
 'flan-t5-base',
 'flan-t5-large',
 'flan-t5-small',
 'gpt2',
 'gpt2-large',
 'gpt2-medium']

In [4]:
model_path = MODEL_PATH + '\\' + models[7]
model_path

'D:\\Python\\LLM_Environment\\models\\flan-t5-small'

In [5]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

## Import Dataset

In [6]:
# Load the dataset
billsum = load_dataset("billsum", split="ca_test")

In [7]:
billsum

Dataset({
    features: ['text', 'summary', 'title'],
    num_rows: 1237
})

In [8]:
billsum = billsum.train_test_split(test_size=0.2)

In [9]:
billsum

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 989
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 248
    })
})

In [10]:
example = billsum["train"][0]
for key in example:
    print("A key of the example: \"{}\"".format(key))
    print("The value corresponding to the key-\"{}\"\n \"{}\"".format(key, example[key]))

A key of the example: "text"
The value corresponding to the key-"text"
 "The people of the State of California do enact as follows:


SECTION 1.
Chapter 6.3 (commencing with Section 21530) is added to Division 21 of the Elections Code, to read:
CHAPTER  6.3. County of Los Angeles Citizens Redistricting Commission
21530.
As used in this chapter, the following terms have the following meanings:
(a) “Board” means the Board of Supervisors of the County of Los Angeles.
(b) “Commission” means the Citizens Redistricting Commission in the County of Los Angeles established pursuant to Section 21532.
(c) “Immediate family member” means a spouse, child, in-law, parent, or sibling.
21531.
There is, in the County of Los Angeles, a Citizens Redistricting Commission. In the year following the year in which the decennial federal census is taken, the commission shall adjust the boundary lines of the supervisorial districts of the board in accordance with this chapter.
21532.
(a) The commission shall be

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [12]:
tokenized_text = tokenizer(example['text'])
for key in tokenized_text:
    print(key)
    print(tokenized_text[key])

Token indices sequence length is longer than the specified maximum sequence length for this model (2863 > 512). Running this sequence through the model will result in indexing errors


input_ids
[37, 151, 13, 8, 1015, 13, 1826, 103, 3, 35, 2708, 38, 6963, 10, 180, 3073, 9562, 1300, 8647, 3, 27865, 41, 287, 526, 4733, 28, 5568, 1401, 26918, 61, 19, 974, 12, 6022, 1401, 13, 8, 19488, 7, 3636, 6, 12, 608, 10, 9302, 2965, 5946, 3, 27865, 5, 1334, 13, 3144, 4975, 22540, 1624, 23, 20066, 53, 3527, 1401, 26918, 5, 282, 261, 16, 48, 5800, 6, 8, 826, 1353, 43, 8, 826, 2530, 7, 10, 41, 9, 61, 105, 279, 32, 986, 153, 598, 8, 2086, 13, 25795, 7, 13, 8, 1334, 13, 3144, 4975, 5, 41, 115, 61, 105, 5890, 5451, 153, 598, 8, 22540, 1624, 23, 20066, 53, 3527, 16, 8, 1334, 13, 3144, 4975, 2127, 19890, 288, 12, 5568, 1401, 4867, 4416, 41, 75, 61, 105, 196, 51, 5700, 342, 384, 1144, 153, 598, 3, 9, 9911, 6, 861, 6, 16, 18, 4207, 6, 4208, 6, 42, 108, 7428, 5, 1401, 4867, 5411, 290, 19, 6, 16, 8, 1334, 13, 3144, 4975, 6, 3, 9, 22540, 1624, 23, 20066, 53, 3527, 5, 86, 8, 215, 826, 8, 215, 16, 84, 8, 20, 75, 35, 7419, 2822, 23087, 19, 1026, 6, 8, 5473, 1522, 6142, 8, 20430, 2356, 13, 8, 14640

In [13]:
def preprocess_function(examples):
    # Prepends the string "summarize: " to each document in the 'text' field of the input examples.
    # This is done to instruct the T5 model on the task it needs to perform, which in this case is summarization.
    inputs = ["summarize: " + doc for doc in examples["text"]]

    # Tokenizes the prepended input texts to convert them into a format that can be fed into the T5 model.
    # Sets a maximum token length of 1024, and truncates any text longer than this limit.
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # Tokenizes the 'summary' field of the input examples to prepare the target labels for the summarization task.
    # Sets a maximum token length of 128, and truncates any text longer than this limit.
    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    # Assigns the tokenized labels to the 'labels' field of model_inputs.
    # The 'labels' field is used during training to calculate the loss and guide model learning.
    model_inputs["labels"] = labels["input_ids"]

    # Returns the prepared inputs and labels as a single dictionary, ready for training.
    return model_inputs

In [14]:
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map: 100%|██████████| 989/989 [00:01<00:00, 622.57 examples/s]
Map: 100%|██████████| 248/248 [00:00<00:00, 726.75 examples/s]


In [15]:
tokenized_billsum['test'][0]['text']

'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 1714.21 of the Civil Code is amended to read:\n1714.21.\n(a) For purposes of this section, the following definitions shall apply:\n(1) “AED” or “defibrillator” means an automated external defibrillator.\n(2) “CPR” means cardiopulmonary resuscitation.\n(b) Any person who, in good faith and not for compensation, renders emergency care or treatment by the use of an AED at the scene of an emergency is not liable for any civil damages resulting from any acts or omissions in rendering the emergency care.\n(c) A person or entity who provides CPR and AED training to a person who renders emergency care pursuant to subdivision (b) is not liable for any civil damages resulting from any acts or omissions of the person rendering the emergency care.\n(d) (1) A person or entity that acquires an AED for emergency use pursuant to this section is not liable for any civil damages resulting from any acts or omissions in t

In [16]:
tokenized_billsum['test'][0]['summary']

'Existing law exempts from civil liability any person who, in good faith and not for compensation, renders emergency care or treatment by the use of an automated external defibrillator (AED) at the scene of an emergency, except in the case of personal injury or wrongful death that results from the gross negligence or willful or wanton misconduct of the person who renders emergency care or treatment. Existing law also exempts from civil liability a person or entity that acquires an AED for emergency use, a physician who is involved with the placement of the AED, and any person or entity responsible for the site where the AED is located if specified conditions are met, including maintenance and regular testing of the AED and having a written plan that describes the procedures to be followed in case of an emergency that may involve the use of the AED. Under existing law, those specified conditions also require, when an AED is placed in a public or private K–12 school, the school principal

In [17]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_path)

In [18]:
rouge = evaluate.load("rouge")

In [19]:
def compute_metrics(eval_pred):
    # Unpacks the evaluation predictions tuple into predictions and labels.
    predictions, labels = eval_pred

    # Decodes the tokenized predictions back to text, skipping any special tokens (e.g., padding tokens).
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replaces any -100 values in labels with the tokenizer's pad_token_id.
    # This is done because -100 is often used to ignore certain tokens when calculating the loss during training.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decodes the tokenized labels back to text, skipping any special tokens (e.g., padding tokens).
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Computes the ROUGE metric between the decoded predictions and decoded labels.
    # The use_stemmer parameter enables stemming, which reduces words to their root form before comparison.
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Calculates the length of each prediction by counting the non-padding tokens.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]

    # Computes the mean length of the predictions and adds it to the result dictionary under the key "gen_len".
    result["gen_len"] = np.mean(prediction_lens)

    # Rounds each value in the result dictionary to 4 decimal places for cleaner output, and returns the result.
    return {k: round(v, 4) for k, v in result.items()}

In [20]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_fine_tuned_t5_small_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
)



In [21]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [22]:
trainer.train()


 25%|██▌       | 124/496 [01:20<02:48,  2.20it/s]

{'eval_loss': nan, 'eval_rouge1': 0.0854, 'eval_rouge2': 0.0236, 'eval_rougeL': 0.0704, 'eval_rougeLsum': 0.0702, 'eval_gen_len': 13.9798, 'eval_runtime': 23.6688, 'eval_samples_per_second': 10.478, 'eval_steps_per_second': 1.31, 'epoch': 1.0}


 50%|█████     | 248/496 [02:17<01:50,  2.25it/s]
 50%|█████     | 248/496 [02:41<01:50,  2.25it/s]

{'eval_loss': nan, 'eval_rouge1': 0.0854, 'eval_rouge2': 0.0236, 'eval_rougeL': 0.0704, 'eval_rougeLsum': 0.0702, 'eval_gen_len': 13.9798, 'eval_runtime': 24.0089, 'eval_samples_per_second': 10.329, 'eval_steps_per_second': 1.291, 'epoch': 2.0}


                                                 
 75%|███████▌  | 372/496 [04:02<00:57,  2.14it/s]

{'eval_loss': nan, 'eval_rouge1': 0.0854, 'eval_rouge2': 0.0236, 'eval_rougeL': 0.0704, 'eval_rougeLsum': 0.0702, 'eval_gen_len': 13.9798, 'eval_runtime': 24.2416, 'eval_samples_per_second': 10.23, 'eval_steps_per_second': 1.279, 'epoch': 3.0}


                                                 
100%|██████████| 496/496 [05:25<00:00,  1.52it/s]

{'eval_loss': nan, 'eval_rouge1': 0.0854, 'eval_rouge2': 0.0236, 'eval_rougeL': 0.0704, 'eval_rougeLsum': 0.0702, 'eval_gen_len': 13.9798, 'eval_runtime': 24.1329, 'eval_samples_per_second': 10.276, 'eval_steps_per_second': 1.285, 'epoch': 4.0}
{'train_runtime': 325.9456, 'train_samples_per_second': 12.137, 'train_steps_per_second': 1.522, 'train_loss': 0.0, 'epoch': 4.0}





TrainOutput(global_step=496, training_loss=0.0, metrics={'train_runtime': 325.9456, 'train_samples_per_second': 12.137, 'train_steps_per_second': 1.522, 'total_flos': 1470765673218048.0, 'train_loss': 0.0, 'epoch': 4.0})

In [23]:
trainer.save_model("my_fine_tuned_t5_small_model")

In [24]:
text = billsum['test'][100]['text']
text = "summarize: " + text
text

'summarize: The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 21159.21 of the Public Resources Code is amended to read:\n21159.21.\nA housing project qualifies for an exemption from this division pursuant to Section 21159.22, 21159.23, or 21159.24 if it meets the criteria in the applicable section and all of the following criteria:\n(a) The project is consistent with any applicable general plan, specific plan, and local coastal program, including any mitigation measures required by a plan or program, as that plan or program existed on the date that the application was deemed complete and with any applicable zoning ordinance, as that zoning ordinance existed on the date that the application was deemed complete, except that a project shall not be deemed to be inconsistent with the zoning designation for the site if that zoning designation is inconsistent with the general plan only because the project site has not been rezoned to conform with a more recen

In [25]:
summarizer = pipeline("summarization", model="my_fine_tuned_t5_small_model")
pred = summarizer(text)
pred

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Token indices sequence length is longer than the specified maximum sequence length for this model (1132 > 512). Running this sequence through the model will result in indexing errors


[{'summary_text': '(i) (1) The project site is not located on developed open space. (2) For the purposes of this subdivision, “developed open space” means land that meets all of the following criteria: (a) It is consistent with any applicable general plan, specific plan, and local coastal program, including any mitigation measures required by a plan or program; (b) It does not contain wetlands, does not have any value as a wildlife habitat; and the project does not harm any species protected by the federal Endangered Species Act of 1973 (16 U.S.C. Sec. 1531 et seq.) or protected under the Native Plant Protection Act (Chapter 10 (commencing with Section 2050) of Division 3 of the Fish and Game Code), and the program does not cause the destruction or removal of any species listed by zoning ordinance in effect at the time the application for the project was deemed complete.'}]

In [26]:
tokenizer = AutoTokenizer.from_pretrained("my_fine_tuned_t5_small_model")
inputs = tokenizer(text, return_tensors="pt").input_ids
inputs

Token indices sequence length is longer than the specified maximum sequence length for this model (1130 > 512). Running this sequence through the model will result in indexing errors


tensor([[21603,    10,    37,  ...,  3659,     5,     1]])

In [27]:
model = AutoModelForSeq2SeqLM.from_pretrained("my_fine_tuned_t5_small_model")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [28]:
pred[0]['summary_text']

'(i) (1) The project site is not located on developed open space. (2) For the purposes of this subdivision, “developed open space” means land that meets all of the following criteria: (a) It is consistent with any applicable general plan, specific plan, and local coastal program, including any mitigation measures required by a plan or program; (b) It does not contain wetlands, does not have any value as a wildlife habitat; and the project does not harm any species protected by the federal Endangered Species Act of 1973 (16 U.S.C. Sec. 1531 et seq.) or protected under the Native Plant Protection Act (Chapter 10 (commencing with Section 2050) of Division 3 of the Fish and Game Code), and the program does not cause the destruction or removal of any species listed by zoning ordinance in effect at the time the application for the project was deemed complete.'

In [29]:
preds = [pred[0]['summary_text']]

In [30]:
labels = [billsum['test'][100]['summary']]

In [31]:
rouge.compute(predictions=preds, references=labels, use_stemmer=True)

{'rouge1': 0.3558718861209964,
 'rouge2': 0.043010752688172046,
 'rougeL': 0.1423487544483986,
 'rougeLsum': 0.1708185053380783}