In [1]:
# Transformers installation
! pip install transformers datasets
! pip install transformers datasets evaluate rouge_score
!pip install sentencepiece

# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=1aea838f38bc3245747000a6e5da93e2a5f5f51d4fef3f5e134e13259d4a6649
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.1 rouge_score-0.1.2


In [2]:
!pip install wandb



In [3]:
from datasets import Dataset ,DatasetDict

In [4]:
import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value = user_secrets.get_secret("wandb-api-key")


wandb.login(key=secret_value)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from glob import glob

In [6]:
import pandas as pd
import random

# Load the dataset
filename = "/kaggle/input/merged-csv/merged_data.csv"
df = pd.read_csv(filename)

# Shuffle the dataset
df = df.sample(frac=1, random_state=42)  # Shuffle the DataFrame with a fixed random seed for reproducibility
df = df.head(17000)
# Rename the columns to 'text' and 'summary'
df.rename(columns={'Input': 'text', 'Output': 'summary'}, inplace=True)

In [7]:
df.columns

Index(['text', 'summary'], dtype='object')

In [8]:
df

Unnamed: 0,text,summary
12796,Revenue share 3.6m Allergy and Autoimmune Main...,Revenue share 3.6m Allergy and Autoimmune Main...
1255,Michael Marx Chief Executive 30 March 2009 Nat...,The results were affected by the need to make ...
23441,I am pleased to report that the Group has cont...,I am pleased to report that the Group has cont...
3441,"In the Nordic region, AFDEC statistics show a ...","In the Nordic region, AFDEC statistics show a ..."
9733,These awards and the increased demand again fo...,These awards and the increased demand again fo...
...,...,...
2056,Strategic Plan Transform core economics Contin...,Strategic Plan Transform core economics Contin...
30347,Lonmin at a Glance Worlds 3rd largest primary...,Lonmin at a Glance Worlds 3rd largest primary...
4553,Whilst we maintained our revenues at forecast...,Whilst we maintained our revenues at forecast...
20701,CHAIRMANS STATEMENT We are confident that Chl...,CHAIRMANS STATEMENT We are confident that Chl...


In [9]:
dataset = Dataset.from_pandas(df[['text', 'summary']])

In [10]:
dataset

Dataset({
    features: ['text', 'summary', '__index_level_0__'],
    num_rows: 17000
})

In [11]:
dataset = dataset.train_test_split(test_size=0.0001)

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', '__index_level_0__'],
        num_rows: 16998
    })
    test: Dataset({
        features: ['text', 'summary', '__index_level_0__'],
        num_rows: 2
    })
})

In [13]:
dataset["train"][0]['text']

'Dividends On 31 May 2013 the shareholders approved a final dividend  in respect of the financial year ended 31 December 2012  of USD 0.0824 per ordinary share and a Special Dividend in re- spect of the financial year ended 31 December 2012 of USD  0.0232 per ordinary share, in recognition of the successful  sale of our gold mining assets in Kazakhstan, Kyrgyzstan and  Romania. The Special Dividend and Final Dividend combined  represent an aggregate dividend payment of USD 0.1056 per  ordinary share for 2012. This resulted in a total payment of ap- proximately USD 320 million, which is around 33% of the Ad- justed Profit from Continuing Operations for 2012. During April 2014, the Board will decide whether to propose  a dividend for the financial year 2013, taking into account  the Companys dividend policy, the current market situation  and financial condition. Cost control In the year 2013 the gold price fell 29%. Most companies  in the sector, including PGIL, undertook cost control me

The next step is to load a  tokenizer to process `text` and `summary`:

In [14]:
from transformers import AutoTokenizer

checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [15]:
from transformers import DataCollator

prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding =True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



In [16]:
# tokenized_dataset = dataset.map(preprocess_function , batched = True)
tokenized_dataset = dataset.map(preprocess_function , batched = True)


  0%|          | 0/17 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
# import pickle

# # Assuming tokenized_dataset is your tokenized dataset
# with open('tpega.pkl', 'wb') as pkl_file:
#     pickle.dump(tokenized_dataset, pkl_file)

    


In [18]:
# import pickle

# # Assuming you have a pickle file named 'tpega.pkl'
# with open('/kaggle/input/pkl-file-pegasus/tpega.pkl', 'rb') as pkl_file:
#     loaded_tokenized_dataset = pickle.load(pkl_file)


In [19]:
# from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# # Load the Pegasus-XSum tokenizer
# tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')

# # Load the Pegasus-XSum model
# pegasus_model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')


Now create a batch of examples using [DataCollatorForSeq2Seq](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DataCollatorForSeq2Seq). It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [20]:
checkpoint = "t5-base"

In [21]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

## Evaluate

Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):

In [22]:
import evaluate

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Then create a function that passes your predictions and labels to [compute](https://huggingface.co/docs/evaluate/main/en/package_reference/main_classes#evaluate.EvaluationModule.compute) to calculate the ROUGE metric:

In [23]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training.

## Train

<Tip>

If you aren't familiar with finetuning a model with the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer), take a look at the basic tutorial [here](https://huggingface.co/docs/transformers/main/en/tasks/../training#train-with-pytorch-trainer)!

</Tip>

You're ready to start training your model now! Load T5 with [AutoModelForSeq2SeqLM](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForSeq2SeqLM):

At this point, only three steps remain:

1. Define your training hyperparameters in [Seq2SeqTrainingArguments](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments). The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) will evaluate the ROUGE metric and save the training checkpoint.
2. Pass the training arguments to [Seq2SeqTrainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainer) along with the model, dataset, tokenizer, data collator, and `compute_metrics` function.
3. Call [train()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train) to finetune your model.

In [24]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [25]:
import torch
torch.cuda.empty_cache()



In [26]:
training_args = Seq2SeqTrainingArguments(
    output_dir="t5-model-final-pytorch-merged",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmister[0m ([33mmsi[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.261,0.047675,0.201,0.1447,0.201,0.201,19.0
2,0.2351,0.044822,0.201,0.1447,0.201,0.201,19.0
3,0.2268,0.037903,0.201,0.1447,0.201,0.201,19.0
4,0.2261,0.039935,0.201,0.1447,0.201,0.201,19.0
5,0.2179,0.039621,0.201,0.1447,0.201,0.201,19.0




TrainOutput(global_step=5315, training_loss=0.2508500480472425, metrics={'train_runtime': 8832.6224, 'train_samples_per_second': 9.622, 'train_steps_per_second': 0.602, 'total_flos': 5.17553311186944e+16, 'train_loss': 0.2508500480472425, 'epoch': 5.0})

Once training is completed, share your model to the Hub with the [push_to_hub()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.push_to_hub) method so everyone can use your model:

In [27]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import os

# Define the directory where you want to save the model
output_directory = "model"

# Create the directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Save the model, tokenizer, and configuration to the specified directory
model.save_pretrained(output_directory)
tokenizer.save_pretrained(output_directory)



('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/spiece.model',
 'model/added_tokens.json',
 'model/tokenizer.json')

<Tip>

For a more in-depth example of how to finetune a model for summarization, take a look at the corresponding
[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)
or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb).

</Tip>

In [28]:
text = dataset["test"]["text"]

The simplest way to try out your finetuned model for inference is to use it in a [pipeline()](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.pipeline). Instantiate a `pipeline` for summarization with your model, and pass your text to it:

In [29]:
# # Load the saved model and tokenizer for testing
# model = AutoModelForSeq2SeqLM.from_pretrained(output_directory)
# tokenizer = AutoTokenizer.from_pretrained(output_directory)

# # Instantiate a pipeline for summarization with the saved model
# summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# # Generate the summary
# generated_summary = summarizer(text)

# # Print the generated summary
# print(generated_summary)


In [30]:
!zip -r bart.zip /kaggle/working

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/t5-model-final-pytorch-merged/ (stored 0%)
  adding: kaggle/working/t5-model-final-pytorch-merged/runs/ (stored 0%)
  adding: kaggle/working/t5-model-final-pytorch-merged/runs/Oct30_12-14-49_e0f8cfdfc278/ (stored 0%)
  adding: kaggle/working/t5-model-final-pytorch-merged/runs/Oct30_12-14-49_e0f8cfdfc278/events.out.tfevents.1698668095.e0f8cfdfc278.32.0 (deflated 63%)
  adding: kaggle/working/t5-model-final-pytorch-merged/checkpoint-4000/ (stored 0%)
  adding: kaggle/working/t5-model-final-pytorch-merged/checkpoint-4000/optimizer.pt (deflated 8%)
  adding: kaggle/working/t5-model-final-pytorch-merged/checkpoint-4000/tokenizer.json (deflated 74%)
  

In [31]:
from IPython.display import FileLink
FileLink(r'bart.zip')

In [32]:
!pip install huggingface-hub

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [33]:
import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("token")
secret_value_1 = user_secrets.get_secret("wandb-api-key")

# wandb.login(key=secret_value_0)

wandb.login(key=secret_value_1)



True

In [34]:
import huggingface_hub
huggingface_hub.login(token=secret_value_0, write_permission=True)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [36]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


# Define the name for your model on Hugging Face Hub
hub_model_name = "randomshit11/t5-base-fin-1"

# Save the model and tokenizer to the Hugging Face Model Hub
model.push_to_hub(hub_model_name)
tokenizer.push_to_hub(hub_model_name)

# Once the above is done, you can also save the configuration for the model
model.config.push_to_hub(hub_model_name)

# Commit your changes
model.push_to_hub(hub_model_name, commit_message="Initial commit")

print(f"Model and tokenizer are now available on the Hugging Face Model Hub with the name: {hub_model_name}")

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Model and tokenizer are now available on the Hugging Face Model Hub with the name: randomshit11/t5-base-fin-1
