In [1]:
# Transformers installation
! pip install transformers datasets
! pip install transformers datasets evaluate rouge_score

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l- done
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=dd4562bf51aa979bbbb83795ac724feb5fa1cf2d87222989e00941cd7d63463c
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.1 rouge_score-0.1.2


In [2]:
!pip install wandb



In [3]:
from datasets import Dataset

In [4]:
import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value = user_secrets.get_secret("wandb-api-key")


wandb.login(key=secret_value)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from glob import glob

In [6]:
import pandas as pd
import random

# Load the dataset
filename = "/kaggle/input/cls-merged/merged_data.csv"
df = pd.read_csv(filename)

# Shuffle the dataset
df = df.sample(frac=1, random_state=42)  # Shuffle the DataFrame with a fixed random seed for reproducibility

# Limit the dataset to 20,000 rows
df = df.head(25000)

# Rename the columns to 'text' and 'summary'
df.rename(columns={'Input': 'text', 'Output': 'summary'}, inplace=True)



In [7]:
df

Unnamed: 0,text,summary
12796,Revenue share 3.6m Allergy and Autoimmune Main...,Revenue share 3.6m Allergy and Autoimmune Main...
1255,Michael Marx Chief Executive 30 March 2009 Nat...,The results were affected by the need to make ...
23441,I am pleased to report that the Group has cont...,I am pleased to report that the Group has cont...
3441,"In the Nordic region, AFDEC statistics show a ...","In the Nordic region, AFDEC statistics show a ..."
9733,These awards and the increased demand again fo...,These awards and the increased demand again fo...
...,...,...
1262,The 2009 account renewals Chief Executives Re...,The 2009 account renewals Chief Executives Re...
23596,We still completed four important wells during...,We still completed four important wells during...
17359,The combination of these wells and the newly- ...,The combination of these wells and the newly- ...
26209,We stock all the products motorists need to re...,We stock all the products motorists need to re...


In [8]:
df.columns

Index(['text', 'summary'], dtype='object')

In [9]:
dataset = Dataset.from_pandas(df[['text', 'summary']])

In [10]:
dataset

Dataset({
    features: ['text', 'summary', '__index_level_0__'],
    num_rows: 25000
})

In [11]:
dataset = dataset.train_test_split(test_size=0.0001)

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', '__index_level_0__'],
        num_rows: 24997
    })
    test: Dataset({
        features: ['text', 'summary', '__index_level_0__'],
        num_rows: 3
    })
})

In [13]:
dataset["train"][11]

{'text': 'This requires the faithful representation of the effects of transactions, other events and conditions in accordance with the definitions and recognition criteria for assets, liabilities, income and expenses set out in the International Accounting Standards Boards Framework for the Preparation and Presentation of Financial Statements. In virtually all circumstances, a fair presentation will be achieved by compliance with all applicable IFRS. Directors are also required to:  properly select and apply accounting policies;  present information, including accounting policies, in a manner that provides relevant, reliable, comparable and under- standable information; and  provide additional disclosures when compliance with the specific requirements in IFRS is insufficient to enable users to understand the impact of particular transactions, other events and conditions on the entitys financial position and financial performance. 28 Statement of Director Responsibilities In the case of

The next step is to load a T5 tokenizer to process `text` and `summary`:

In [14]:
# Loading Model and tokenizer
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig

tokenizer = BartTokenizer.from_pretrained('facebook/bart-large', add_prefix_space=True)

bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")



Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

In [15]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=250, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

To apply the preprocessing function over the entire dataset, use 🤗 Datasets [map](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map) method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once:

In [16]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
import pickle

# Assuming tokenized_dataset is your tokenized dataset
with open('tokenized_dataset.pkl', 'wb') as pkl_file:
    pickle.dump(tokenized_dataset, pkl_file)


Now create a batch of examples using [DataCollatorForSeq2Seq](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DataCollatorForSeq2Seq). It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [18]:
checkpoint = "facebook/bart-large"

In [19]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

## Evaluate

Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):

In [20]:
import evaluate

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Then create a function that passes your predictions and labels to [compute](https://huggingface.co/docs/evaluate/main/en/package_reference/main_classes#evaluate.EvaluationModule.compute) to calculate the ROUGE metric:

In [21]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training.

## Train

<Tip>

If you aren't familiar with finetuning a model with the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer), take a look at the basic tutorial [here](https://huggingface.co/docs/transformers/main/en/tasks/../training#train-with-pytorch-trainer)!

</Tip>

You're ready to start training your model now! Load T5 with [AutoModelForSeq2SeqLM](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForSeq2SeqLM):

In [22]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

At this point, only three steps remain:

1. Define your training hyperparameters in [Seq2SeqTrainingArguments](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments). The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) will evaluate the ROUGE metric and save the training checkpoint.
2. Pass the training arguments to [Seq2SeqTrainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainer) along with the model, dataset, tokenizer, data collator, and `compute_metrics` function.
3. Call [train()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train) to finetune your model.

In [23]:
import torch
torch.cuda.empty_cache()


In [24]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_dataset_model",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mamiteshpatra2020[0m ([33mteddyracnh[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.12 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20231015_144523-eom3myy7[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mfanciful-resonance-21[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/teddyracnh/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/teddyracnh/huggingface/runs/eom3myy7[0m


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.1658,0.018991,0.1336,0.1204,0.1336,0.1336,20.0
2,0.1509,0.018309,0.1336,0.1204,0.1336,0.1336,20.0
3,0.1587,0.024363,0.1336,0.1204,0.1336,0.1336,20.0
4,0.1491,0.013868,0.1336,0.1204,0.1336,0.1336,20.0
5,0.1298,0.012691,0.1336,0.1204,0.1336,0.1336,20.0




TrainOutput(global_step=31250, training_loss=0.15660687475585938, metrics={'train_runtime': 38693.7488, 'train_samples_per_second': 3.23, 'train_steps_per_second': 0.808, 'total_flos': 1.51978070125781e+17, 'train_loss': 0.15660687475585938, 'epoch': 5.0})

Once training is completed, share your model to the Hub with the [push_to_hub()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.push_to_hub) method so everyone can use your model:

In [25]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import os

# Define the directory where you want to save the model
output_directory = "model"

# Create the directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Save the model, tokenizer, and configuration to the specified directory
model.save_pretrained(output_directory)
tokenizer.save_pretrained(output_directory)



('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/vocab.json',
 'model/merges.txt',
 'model/added_tokens.json')

<Tip>

For a more in-depth example of how to finetune a model for summarization, take a look at the corresponding
[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)
or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb).

</Tip>

In [26]:
text = dataset["test"]["text"]

The simplest way to try out your finetuned model for inference is to use it in a [pipeline()](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.pipeline). Instantiate a `pipeline` for summarization with your model, and pass your text to it:

In [27]:
# # Load the saved model and tokenizer for testing
# model = AutoModelForSeq2SeqLM.from_pretrained(output_directory)
# tokenizer = AutoTokenizer.from_pretrained(output_directory)

# # Instantiate a pipeline for summarization with the saved model
# summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# # Generate the summary
# generated_summary = summarizer(text)

# # Print the generated summary
# print(generated_summary)


In [28]:
!zip -r bart.zip /kaggle/working

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/wandb/ (stored 0%)
  adding: kaggle/working/wandb/debug.log (deflated 68%)
  adding: kaggle/working/wandb/latest-run/ (stored 0%)
  adding: kaggle/working/wandb/latest-run/tmp/ (stored 0%)
  adding: kaggle/working/wandb/latest-run/tmp/code/ (stored 0%)
  adding: kaggle/working/wandb/latest-run/files/ (stored 0%)
  adding: kaggle/working/wandb/latest-run/files/conda-environment.yaml (deflated 66%)
  adding: kaggle/working/wandb/latest-run/files/output.log (deflated 98%)
  adding: kaggle/working/wandb/latest-run/files/wandb-summary.json (deflated 55%)
  adding: kaggle/working/wandb/latest-run/files/wandb-metadata.json (deflated 61%)
  adding: kaggle/working/wandb/latest-run/files/config.yaml (deflated 78%)
  adding: kaggle/working/wandb/latest-run/files/requirements.txt (deflated 58%)
  adding: kaggle/working/wandb/latest-run/logs/ (stored 0%)
  adding: kaggle/working/wandb/latest-run/logs/debug.log (deflated 68

In [29]:
from IPython.display import FileLink
FileLink(r'file.zip')