In [1]:
!pip install -U transformers -q
!pip install -U accelerate -q
!pip install keras_nlp -q
!pip install datasets -q
!pip install huggingface-hub -q
!pip install rouge-score -q

In [2]:
pip install nltk


Note: you may need to restart the kernel to use updated packages.


In [3]:
import wandb

In [4]:
from kaggle_secrets import UserSecretsClient
# Read the W&B API key from the Kaggle secret store and log in with it.
# The key is passed straight to wandb so it is not kept in a notebook variable.
user_secrets = UserSecretsClient()
wandb.login(key=user_secrets.get_secret("wandb-api-key"))

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
import nltk
# Only the Punkt sentence tokenizer is needed (nltk.sent_tokenize is the sole
# NLTK call in this notebook); requesting "all" fetches every NLTK corpus.
# "punkt_tab" is the resource name looked up by newer NLTK releases.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource, quiet=True)
import numpy as np
import tensorflow as tf
from tensorflow import keras



In [6]:
from datasets import load_dataset
# Load only the "train" split of the XSum dataset and print its summary
# (feature names and row count).
dataset = load_dataset("xsum", split="train")
print(dataset)

Downloading builder script:   0%|          | 0.00/2.05k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/954 [00:00<?, ?B/s]

Downloading and preparing dataset xsum/default (download: 245.38 MiB, generated: 507.60 MiB, post-processed: Unknown size, total: 752.98 MiB) to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

Dataset xsum downloaded and prepared to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934. Subsequent calls will reuse this data.
Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 204045
})


In [7]:
# Inspect the first record of the loaded split.
print(dataset[0])



In [8]:
# Subsample the loaded split: 5% for training and 2% for evaluation.
# A fixed seed makes the random subsample identical on every run, so metrics
# from different runs are comparable.
# NOTE(review): the variable name shadows the imported `datasets` package;
# it is kept because later cells index `datasets['train']` / `datasets['test']`.
datasets = dataset.train_test_split(train_size=0.05, test_size=0.02, seed=42)

In [9]:
# Report the number of examples in each split, train first and then test.
for split_name in ("train", "test"):
    print(len(datasets[split_name]))

10202
4081


In [10]:
# Hyperparameters and model selection shared by the cells below.
MAX_INPUT_LENGTH = 512  # token cap for source documents; longer inputs are truncated by the tokenizer
MIN_TARGET_LENGTH = 5  # NOTE(review): not referenced anywhere in the visible notebook — confirm it is still needed
MAX_TARGET_LENGTH = 128  # token cap for reference summaries; longer targets are truncated by the tokenizer
BATCH_SIZE = 8  # per-device batch size, used for both training and evaluation
LEARNING_RATE = 2e-5  # learning rate passed to Seq2SeqTrainingArguments
MAX_EPOCHS = 2  # number of training epochs
MODEL_CHECKPOINT = "t5-small" # checkpoint name used to load both the tokenizer and the model

In [11]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to MODEL_CHECKPOINT.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [12]:
# T5 checkpoints are multi-task models and need a task prefix on every input.
# All original T5 sizes are covered so that switching MODEL_CHECKPOINT to a
# larger T5 does not silently drop the prefix.
if MODEL_CHECKPOINT in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
  prefix = "summarize: "
else:
  prefix = ""

In [13]:
#Preprocessing
def preprocess_function(examples):
  """Tokenize a batch of documents and their reference summaries.

  Args:
    examples: Batch mapping with a "document" list (source texts) and a
      "summary" list (target texts) of equal length.

  Returns:
    The tokenizer output for the prefixed documents (truncated to
    MAX_INPUT_LENGTH tokens) with an added "labels" entry holding the
    token ids of the summaries (truncated to MAX_TARGET_LENGTH tokens).
  """
  inputs = [prefix + doc for doc in examples["document"]]
  model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)
  # Tokenize the targets via `text_target`; this replaces the deprecated
  # `tokenizer.as_target_tokenizer()` context manager. A separate call is
  # used because sources and targets have different length limits.
  labels = tokenizer(text_target=examples["summary"], max_length=MAX_TARGET_LENGTH, truncation=True)
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [14]:
# Short aliases for the two splits produced by train_test_split.
train, test = datasets["train"], datasets["test"]

In [15]:
# Tokenize both splits in batches (train first, then test).
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True) for split in (train, test)
)

  0%|          | 0/11 [00:00<?, ?ba/s]



  0%|          | 0/5 [00:00<?, ?ba/s]

In [16]:
import transformers
from transformers import TFAutoModelForSeq2SeqLM, AutoModelForSeq2SeqLM,DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Load the pretrained seq2seq model for MODEL_CHECKPOINT and build the
# collator that batches tokenized examples for it.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [17]:
import nltk
import numpy as np
from datasets import load_metric

# ROUGE scorer; compute_metrics calls metric.compute(...) on decoded text.
# NOTE(review): `load_metric` appears to be deprecated in later `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version.
metric = load_metric("rouge")

def compute_metrics(eval_pred):
    """Decode generated ids and references and score them with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Either
            array may use -100 as a filler value; ``predictions`` may also be
            a tuple whose first element holds the generated ids.

    Returns:
        dict mapping every key returned by ``metric.compute`` to its mid
        F-measure scaled to 0-100, plus ``"gen_len"`` (mean number of
        non-padding tokens per prediction). All values are rounded to four
        decimals.
    """
    predictions, labels = eval_pred
    # Some models hand back (generated_ids, extra_outputs); keep only the ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 cannot be decoded, so swap it for the pad id in both arrays.
    preds = np.where(predictions != -100, predictions, pad_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Keep the mid F-measure of every score, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Mean generated length. Count on the cleaned `preds`: the raw
    # `predictions` may still contain -100, which would be counted as real
    # tokens and inflate the length.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [18]:
import torch

In [19]:
# Pick the CUDA device when one is visible to torch, otherwise use the CPU,
# and report which one was chosen.
gpu_ready = torch.cuda.is_available()
device = torch.device("cuda" if gpu_ready else "cpu")
print(
    "GPU is available and being used"
    if gpu_ready
    else "GPU is not available, using CPU instead"
)


GPU is available and being used


In [20]:
model_name = MODEL_CHECKPOINT.split("/")[-1]

# Training configuration: evaluate once per epoch, generate summaries during
# evaluation (needed for ROUGE in compute_metrics), keep at most 3 checkpoints.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
args = Seq2SeqTrainingArguments(
    output_dir="./financial_report_summarization",
    evaluation_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=MAX_EPOCHS,
    predict_with_generate=True,
    # Mixed precision needs a GPU; enable it only when CUDA is present so the
    # notebook's CPU fallback keeps working.
    fp16=torch.cuda.is_available(),
)

In [21]:
# !pip install -U accelerate -q
import accelerate
# Display the installed accelerate version.
accelerate.__version__

'0.23.0'

In [22]:
# Wire the model (moved to the selected device), training arguments,
# tokenized splits, collator, tokenizer and ROUGE callback into one trainer.
trainer = Seq2SeqTrainer(
    model=model.to(device),
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [24]:
# Fine-tune the model; evaluation (with compute_metrics) runs once per epoch
# as configured in `args`.
trainer.train()


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.8812,2.643121,25.5943,6.1847,19.9135,19.907,18.7925
2,2.8601,2.634141,25.8626,6.2786,20.0837,20.0657,18.7974




TrainOutput(global_step=1276, training_loss=2.863147699720807, metrics={'train_runtime': 718.7066, 'train_samples_per_second': 28.39, 'train_steps_per_second': 1.775, 'total_flos': 2761421069746176.0, 'train_loss': 2.863147699720807, 'epoch': 2.0})

In [25]:
# Save the fine-tuned weights and the tokenizer files into the training
# output directory. Reading the path from `args` keeps it in sync with the
# trainer's checkpoints instead of repeating the literal.
model.save_pretrained(args.output_dir)
tokenizer.save_pretrained(args.output_dir)

('./financial_report_summarization/tokenizer_config.json',
 './financial_report_summarization/special_tokens_map.json',
 './financial_report_summarization/spiece.model',
 './financial_report_summarization/added_tokens.json',
 './financial_report_summarization/tokenizer.json')

In [26]:
# Generate summaries for the test split with beam search. The generation cap
# reuses MAX_TARGET_LENGTH so it always matches the label truncation length.
predict_results = trainer.predict(tokenized_test, max_length=MAX_TARGET_LENGTH, num_beams=3)



In [27]:
if args.predict_with_generate:
    generated_ids = predict_results.predictions
    # -100 cannot be decoded, so substitute the pad token id first.
    preds = np.where(generated_ids != -100, generated_ids, tokenizer.pad_token_id)
    # Decode to text (special tokens dropped, tokenization spaces cleaned up)
    # and trim surrounding whitespace from every summary.
    predictions = [
        text.strip()
        for text in tokenizer.batch_decode(
            preds, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
    ]


In [28]:
# First two reference summaries of the test split, for comparison with the
# generated ones.
test['summary'][:2]

["Venezuela's opposition, non-government organisations and international bodies have accused the government of using military courts against its critics.",
 "Taiwan's parliament has approved a bill banning the slaughter of cats and dogs for human consumption."]

In [29]:
# First two decoded model summaries.
predictions[:2]

['Venezuelan president Nicolás Maduro has said he plans to create a constitutional military assembly to deepen his revolution.',
 'Taiwan has passed a bill banning the sale of meat and fur of pets for what it calls "economic purposes".']

In [31]:
# model.save("./Saved_model