In [1]:
!pip install -U transformers -q
!pip install -U accelerate -q
!pip install keras_nlp -q
!pip install datasets -q
!pip install huggingface-hub -q
!pip install rouge-score -q

In [2]:
pip install nltk


Note: you may need to restart the kernel to use updated packages.


In [3]:
import wandb

ModuleNotFoundError: No module named 'wandb'

In [None]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb-api-key")
wandb.login(key=secret_value_0)

In [None]:
import nltk
nltk.download("all",quiet=True)
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [None]:
from datasets import load_dataset
dataset = load_dataset("xsum", split="train")
print(dataset)

In [None]:
print(dataset[0])

In [None]:
datasets = dataset.train_test_split(train_size=0.05,test_size=0.02)

In [None]:
print(len(datasets['train']))
print(len(datasets['test']))

In [None]:
MAX_INPUT_LENGTH = 512
MIN_TARGET_LENGTH = 5
MAX_TARGET_LENGTH = 128
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
MAX_EPOCHS = 100
MODEL_CHECKPOINT = "t5-small" # Name of Model

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [None]:
if MODEL_CHECKPOINT in ["t5-small", "t5-base"]:
  prefix = "summarize: "
else:
  prefix = ""

In [None]:
#Preprocessing
def preprocess_function(examples):
  inputs = [prefix + doc for doc in examples["document"]]
  model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH,truncation=True)
  # Setup the tokenizer for targets
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(examples["summary"], max_length=MAX_TARGET_LENGTH, truncation=True)
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [None]:
train = datasets['train']
test = datasets['test']

In [None]:
tokenized_train = train.map(preprocess_function, batched=True)
tokenized_test = test.map(preprocess_function, batched=True)

In [None]:
import transformers
from transformers import TFAutoModelForSeq2SeqLM, AutoModelForSeq2SeqLM,DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import nltk
import numpy as np
from datasets import load_metric

metric = load_metric("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Replace -100 in the predictions as we can't decode them.
    preds = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


In [None]:
import torch

In [None]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and being used")
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU instead")


In [None]:
model_name = MODEL_CHECKPOINT.split("/")[-1]
# output_dir = "/kaggle/working/Model/"

args = Seq2SeqTrainingArguments(
    output_dir="./financial_report_summarization",
    evaluation_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=MAX_EPOCHS,
    predict_with_generate=True,
    fp16=True
)

In [None]:
# !pip install -U accelerate -q
import accelerate
accelerate.__version__

In [None]:
trainer = Seq2SeqTrainer(
    model.to(device),
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


In [None]:
trainer.train()
model.save
model.save_pretrained("./financial_report_summarization")
tokenizer.save_pretrained("./financial_report_summarization")

In [None]:
predict_results = trainer.predict(tokenized_test,max_length=128, num_beams=3)

In [None]:
if args.predict_with_generate:
    # Replace -100 with pad_token_id in predictions
    preds = np.where(predict_results.predictions != -100, predict_results.predictions, tokenizer.pad_token_id)
    # Decode batched predictions into text, skipping special tokens and cleaning up spaces
    predictions = tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    # Strip leading/trailing spaces from each prediction
    predictions = [pred.strip() for pred in predictions]


In [None]:
test['summary'][:2]

In [None]:
predictions[:2]

In [None]:
model.save("./Saved_model")