In [1]:
# importing libraries

import pandas as pd
import numpy as np
from datasets import load_dataset,DatasetDict
from transformers import AutoTokenizer, Seq2SeqTrainingArguments,Seq2SeqTrainer, TFAutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
import evaluate
from transformers import create_optimizer, AdamWeightDecay
from transformers.keras_callbacks import KerasMetricCallback

  from .autonotebook import tqdm as notebook_tqdm





In [131]:
# Name of the pretrained checkpoint; reused below for the tokenizer, the data collator and the model.
checkpoint = "google-t5/t5-small"

In [132]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [133]:
# Load the summarization CSV with the `datasets` CSV loader.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs on other machines.
try:
    df = load_dataset("csv", data_files=r"D:\vs code\python\DeepLearning\Projects\textSummarizer\Abstractive\utils\dataset_new.csv")
except Exception as e:
    # Report the failure, then re-raise: swallowing it would leave `df`
    # undefined (or stale from an earlier run) and make later cells fail
    # with a confusing, unrelated error.
    print("An error occurred:", e)
    raise

In [134]:
# Drop the 'Unnamed: 0' column (presumably the index column written out with the CSV — verify).
# NOTE(review): not idempotent — re-running this cell after the column is gone will likely raise.
df = df.remove_columns(column_names='Unnamed: 0')

DatasetDict({
    train: Dataset({
        features: ['summary', 'text'],
        num_rows: 98379
    })
})

In [135]:
# Longest article and longest summary in the training split, counted in
# whitespace-separated words (not tokenizer tokens).
max_text_length = max(len(article.split()) for article in df['train']['text'])
max_summary_length = max(len(headline.split()) for headline in df['train']['summary'])

In [136]:
# Report the word-count maxima computed in the previous cell.
print("max text length :",max_text_length)
print("max summary length :",max_summary_length)

max text length : 66
max summary length : 14


Alternatively, you can load a ready-made dataset directly with `load_dataset`:

In [3]:
# Alternative data source: the `ca_test` split of the `billsum` dataset.
# Requesting a single split returns one Dataset, so it is wrapped in a
# DatasetDict under the "train" key; that keeps the following split cell
# (`df['train'].train_test_split(...)`) working for either data source.
# NOTE: running this cell replaces any dataset already bound to `df`.
df = DatasetDict({"train": load_dataset("billsum", split="ca_test")})

In [137]:
# Hold out 20% of the examples for evaluation. The fixed seed makes the split
# reproducible across kernel restarts, so metrics are comparable between runs.
df = df['train'].train_test_split(test_size=0.2, seed=42)

In [156]:
# Display the resulting train/test splits.
df

DatasetDict({
    train: Dataset({
        features: ['summary', 'text'],
        num_rows: 78703
    })
    test: Dataset({
        features: ['summary', 'text'],
        num_rows: 19676
    })
})

In [139]:
# Text preprocessing

# Task prefix prepended to every input document before tokenization.
prefix = "summarize: "

def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: a batch mapping with 'text' and 'summary' entries, each an
            iterable of strings.

    Returns:
        The tokenizer output for the prefixed documents (truncated to 1024
        tokens) with an added "labels" entry holding the token ids of the
        summaries (truncated to 128 tokens).
    """
    prompts = []
    for doc in examples['text']:
        prompts.append(prefix + doc)

    model_inputs = tokenizer(prompts, max_length=1024, truncation=True)
    target_encoding = tokenizer(
        text_target=examples['summary'],
        max_length=128,
        truncation=True,
    )

    # Attach the summary token ids as the training targets.
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs


In [140]:
# Apply preprocess_function to every split, processing the examples in batches.
tokenized_df = df.map(preprocess_function, batched=True)


[A


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  0%|          | 0/12596 [03:04<?, ?it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Map: 100%|██████████| 78703/78703 [00:08<00:00, 9000.30 examples/s]
Map: 100%|██████████| 19676/19676 [00:02<00:00, 8247.81 examples/s]


In [146]:
# Peek at the first raw training example.
df['train'][0]

{'summary': 'govt plans make connaught place carfree weekends ',
 'text': 'new delhi municipal council reportedly planning stop vehicular movement connaught place weekends order decongest area gets around 5 lakh visitors every day. pilotlaunch plan scheduled august studying traffic flow parking issues council decide whether arrangement made permanent not. '}

In [141]:
# Collator that batches the tokenized examples and returns TensorFlow tensors.
# NOTE(review): `model` receives the checkpoint name (a string) here, not a model instance — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")

In [142]:
# Load the ROUGE metric used by compute_metrics.
rouge = evaluate.load("rouge")

In [143]:

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: a pair ``(predictions, labels)`` of token-id arrays.

    Returns:
        A dict with the ROUGE results plus "gen_len" (mean count of non-pad
        tokens per prediction), every value rounded to 4 decimal places.
    """
    predictions, labels = eval_pred

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # -100 marks label positions to ignore; swap it for the pad id so the
    # labels can be decoded.
    decodable_labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(decodable_labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )

    # Length of each prediction, ignoring padding.
    non_pad_counts = []
    for pred in predictions:
        non_pad_counts.append(np.count_nonzero(pred != tokenizer.pad_token_id))
    result["gen_len"] = np.mean(non_pad_counts)

    rounded = {}
    for key, value in result.items():
        rounded[key] = round(value, 4)
    return rounded

Training

In [150]:
# Load the TensorFlow seq2seq model from the same checkpoint as the tokenizer.
model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)




All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [151]:
# Adam optimizer with weight decay, used for fine-tuning.
optimizer = AdamWeightDecay(learning_rate=2e-5,weight_decay_rate=0.01)

In [154]:
# Settings shared by the training and evaluation pipelines.
_dataset_kwargs = dict(batch_size=16, collate_fn=data_collator)

# Training data is shuffled; evaluation data keeps its original order.
tf_train_set = model.prepare_tf_dataset(tokenized_df['train'], shuffle=True, **_dataset_kwargs)
tf_test_set = model.prepare_tf_dataset(tokenized_df['test'], shuffle=False, **_dataset_kwargs)

In [155]:
# Compile with the optimizer only; no loss argument is passed.
# NOTE(review): presumably the model falls back to its built-in loss — confirm.
model.compile(optimizer=optimizer)

In [158]:
# Run compute_metrics on the test set from within Keras training.
# predict_with_generate=True makes the callback call model.generate(), so
# compute_metrics receives generated token ids (which it decodes with the
# tokenizer) rather than the model's raw forward-pass outputs.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=tf_test_set,
    predict_with_generate=True,
)

In [159]:
# Fine-tune for 3 epochs, validating on the test set and running the metric callback.
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=[metric_callback])

Epoch 1/3


  0%|          | 0/19676 [17:49<?, ?it/s]



 423/4918 [=>............................] - ETA: 1:24:39 - loss: 2.7646

Inference

In [None]:
# Read the text to summarize from the user.
text = input("Enter text here")


tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Prepend the same task prefix and apply the same truncation that
# preprocess_function used, so inference inputs match the training inputs.
inputs = tokenizer(prefix + text, max_length=1024, truncation=True, return_tensors="tf").input_ids
# Generate without sampling, capped at 100 newly generated tokens.
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)


# Decode the generated token ids back into the text:

tokenizer.decode(outputs[0], skip_special_tokens=True)