In [None]:
# Transformers installation
! pip install transformers datasets
! pip install transformers datasets evaluate rouge_score

# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

In [None]:
import wandb
from kaggle_secrets import UserSecretsClient
import huggingface_hub
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("hf")
secret_value_1 = user_secrets.get_secret("wandb-key")

wandb.login(key=secret_value_1)

In [None]:
huggingface_hub.login(token = secret_value_0 ,write_permission =True )

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from glob import glob

In [2]:
from datasets import Dataset
import pandas as pd
import glob




# Create a Hugging Face dataset# Use your fine tuning file
filename = "/kaggle/input/bert-cls-in-abs/IN-abs_CLS.xlsx"

df = pd.read_excel(filename,index_col=0)
df = df.reset_index(drop=True)  # Reset the index without creating a new column
df.rename(columns = {'data':'text', 'summary':'summary'}, inplace = True)
len(df)
dataset = Dataset.from_pandas(df[['text', 'summary']])



In [3]:
dataset

Dataset({
    features: ['text', 'summary'],
    num_rows: 16539
})

In [4]:
dataset = dataset.train_test_split(test_size=0.2)

In [5]:
dataset["train"][0]

{'text': "The Schedule and the rules continued without repeal or amendment when the new section III (1) was substituted in 1936, and when this section made a reference to the rules in Schedule IV it could only be a reference to the rules in the Schedule IV which stood ' unaltered. If the phraseology employed in the Schedule was inappropriate to a class which fell within section 111(1), the, only effect would be that the tax could not be levied, because 976 of the defect in the law imposing the tax, but such a situation is not remedied by reference to the provision in the General Clauses Act on which the learned Judges have relied. If, therefore, the, tax was one not lawfully levied just prior to April 1, 1937 and was one brought in after the Government of India Act, 1935 came into force, and really only from April 1, 1942 assuming this to be lawful it is obvious that the validity of this tax could not be sustained as a continuation of a lawful pre existing levy under section 143 (2). I

The next step is to load a T5 tokenizer to process `text` and `summary`:

In [6]:
from transformers import AutoTokenizer

checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


The preprocessing function you want to create needs to:

1. Prefix the input with a prompt so T5 knows this is a summarization task. Some models capable of multiple NLP tasks require prompting for specific tasks.
2. Use the keyword `text_target` argument when tokenizing labels.
3. Truncate sequences to be no longer than the maximum length set by the `max_length` parameter.

In [7]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

To apply the preprocessing function over the entire dataset, use 🤗 Datasets [map](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map) method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once:

In [8]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

Now create a batch of examples using [DataCollatorForSeq2Seq](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DataCollatorForSeq2Seq). It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [9]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)



## Evaluate

Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):

In [10]:
import evaluate

rouge = evaluate.load("rouge")

Then create a function that passes your predictions and labels to [compute](https://huggingface.co/docs/evaluate/main/en/package_reference/main_classes#evaluate.EvaluationModule.compute) to calculate the ROUGE metric:

In [11]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training.

## Train

<Tip>

If you aren't familiar with finetuning a model with the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer), take a look at the basic tutorial [here](https://huggingface.co/docs/transformers/main/en/tasks/../training#train-with-pytorch-trainer)!

</Tip>

You're ready to start training your model now! Load T5 with [AutoModelForSeq2SeqLM](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForSeq2SeqLM):

In [12]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

At this point, only three steps remain:

1. Define your training hyperparameters in [Seq2SeqTrainingArguments](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments). The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) will evaluate the ROUGE metric and save the training checkpoint.
2. Pass the training arguments to [Seq2SeqTrainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainer) along with the model, dataset, tokenizer, data collator, and `compute_metrics` function.
3. Call [train()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train) to finetune your model.

In [13]:
import torch
torch.cuda.empty_cache()


In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="t5-base-cls",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mastro2105[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


Once training is completed, share your model to the Hub with the [push_to_hub()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.push_to_hub) method so everyone can use your model:

In [None]:
# Define additional training arguments for the next phase of training
additional_training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_dataset_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=1
    num_train_epochs=10,  # Train for an additional 10 epochs
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

# Create a new trainer for additional training
additional_trainer = Seq2SeqTrainer(
    model=model,
    args=additional_training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Continue training for additional epochs
additional_trainer.train()


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import os

# Define the directory where you want to save the model
output_directory = "model"

# Create the directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Load a pre-trained model and tokenizer (replace with your own model and tokenizer)
model_name = "t5-basev1"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Save the model, tokenizer, and configuration to the specified directory
model.save_pretrained(output_directory)
tokenizer.save_pretrained(output_directory)



In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


# Define the name for your model on Hugging Face Hub
hub_model_name = "astro21/t5-base-cls"

# Save the model and tokenizer to the Hugging Face Model Hub
bart_model.push_to_hub(hub_model_name)
tokenizer.push_to_hub(hub_model_name)

# Once the above is done, you can also save the configuration for the model
bart_model.config.push_to_hub(hub_model_name)

# Commit your changes
bart_model.push_to_hub(hub_model_name, commit_message="Initial commit")

print(f"Model and tokenizer are now available on the Hugging Face Model Hub with the name: {hub_model_name}")

<Tip>

For a more in-depth example of how to finetune a model for summarization, take a look at the corresponding
[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)
or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb).

</Tip>

In [None]:
text = dataset["test"]["text"]

The simplest way to try out your finetuned model for inference is to use it in a [pipeline()](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.pipeline). Instantiate a `pipeline` for summarization with your model, and pass your text to it:

In [None]:
# !zip -r file.zip /kaggle/working

In [None]:
# from IPython.display import FileLink
# FileLink(r'file.zip')

In [None]:
# # # Load the saved model and tokenizer for testing
# model = AutoModelForSeq2SeqLM.from_pretrained(output_directory)
# tokenizer = AutoTokenizer.from_pretrained(output_directory)



# # Define the maximum chunk size (in tokens)
# max_chunk_size = 1024  # Adjust as needed

# # Split the text into manageable chunks
# text_chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]

# # Initialize an empty list to store individual summaries
# individual_summaries = []

# # Generate summaries for each chunk separately
# for chunk in text_chunks:
#     # Tokenize the chunk
#     tokenized_input = tokenizer("summarize: " + chunk, truncation=True, max_length=max_chunk_size)

#     # Generate the summary for the chunk
#     summary = model.generate(tokenized_input["input_ids"], max_length=128, do_sample=False)

#     # Decode the generated summary
#     generated_summary = tokenizer.decode(summary[0], skip_special_tokens=True)
    
#     print(generated_summary)

#     # Append the individual summary to the list
#     individual_summaries.append(generated_summary)

# # Concatenate individual summaries into a single summary for the entire document
# full_document_summary = " ".join(individual_summaries)

# # Print or save the full document summary
# print(full_document_summary)
