# Fine-tuning T5-small for a text summarization task
## with Big Patent Dataset

## Installing dependencies

In [None]:
pip install datasets transformers evaluate numpy tensorflow==2.15 huggingface_hub rouge_score



In [None]:
pip install -U accelerate

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/302.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from t

## Creating a virtual GPU with a specific memory limit (only run for training)

In [None]:
import tensorflow as tf

# NOTE(review): this cell only configures TensorFlow's view of the GPU. The
# model and trainer classes imported later in this notebook
# (AutoModelForSeq2SeqLM / Seq2SeqTrainer) are the PyTorch ones — confirm that
# this cell is still required for training.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Restrict TensorFlow to only use the first GPU
    tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

    # Create a virtual GPU with 20GB of memory.
    # memory_limit is expressed in megabytes, so 20 GB == 20 * 1024 MB
    # (the previous 20 * 1024 * 1024 asked for roughly 20 TB).
    virtual_gpu = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=20 * 1024)
    tf.config.experimental.set_virtual_device_configuration(gpus[0], [virtual_gpu])
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print(e)

## Imports

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np

## Log in to the Hugging Face Hub

In [None]:
# Authenticate against the Hugging Face Hub from inside the notebook.
# The training arguments further down set push_to_hub=True, which is
# presumably why a login is needed here — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Declaring constants

In [None]:
# Checkpoint used for both the tokenizer and (later) the model weights.
model_name = "KipperDev/t5_summarizer_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Task prefix prepended to every input document in preprocess_function.
prefix = "summarize: "

# ROUGE scorer used by compute_metrics.
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

## Import dataset



In [None]:
# Load configuration "e" of the big_patent dataset and pull out its three splits.
dataset = load_dataset("big_patent", "e")
train_dataset, validation_dataset, test_dataset = (
    dataset["train"],
    dataset["validation"],
    dataset["test"],
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.71k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.13G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/506M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/508M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/34443 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1914 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1914 [00:00<?, ? examples/s]

## Function to preprocess data to model inputs

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs with labels.

    Each entry of ``examples["description"]`` is prefixed with the module-level
    ``prefix`` and tokenized (truncated to 1024 tokens). The matching
    ``examples["abstract"]`` entries are tokenized as targets (truncated to
    256 tokens) and their ids are stored under the ``"labels"`` key.

    Returns the tokenizer output for the inputs, extended with ``"labels"``.
    """
    prompts = []
    for description in examples["description"]:
        prompts.append(prefix + description)

    encoded = tokenizer(prompts, max_length=1024, truncation=True)

    target_encoding = tokenizer(
        text_target=examples["abstract"],
        max_length=256,
        truncation=True,
    )
    encoded["labels"] = target_encoding["input_ids"]

    return encoded

## Function to compute the ROUGE metrics between the predictions and labels

In [None]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated token ids.

    Returns:
        A dict with the ROUGE results plus ``"gen_len"`` (mean number of
        non-padding tokens per prediction), every value rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # Some model/trainer combinations return extra outputs alongside the ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is used as an "ignore" filler and is not a valid token id, so it has
    # to be swapped for the pad token before decoding. This already happened for
    # the labels; predictions can carry the same filler when batches of
    # different lengths are concatenated.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Count only real tokens; the -100 filler was mapped to the pad id above,
    # so it no longer inflates the generated length.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

## Load the model and data collator, then preprocess the datasets

In [None]:
# Load the model first so the collator below can be given the model object.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Generation settings stored on the model config: summaries between 50 and
# 256 tokens, without early stopping.
model.config.max_length = 256
model.config.min_length = 50
model.config.early_stopping = False

# The collator's `model` argument expects the model instance, not the
# checkpoint name. A plain string has no
# prepare_decoder_input_ids_from_labels method, so passing model_name made the
# collator silently skip preparing decoder_input_ids.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)



In [None]:
# Tokenize the train / validation / test splits (in that order) with the same
# batched preprocessing function.
tokenized_train, tokenized_eval, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, validation_dataset, test_dataset)
)

Map:   0%|          | 0/34443 [00:00<?, ? examples/s]

Map:   0%|          | 0/1914 [00:00<?, ? examples/s]

Map:   0%|          | 0/1914 [00:00<?, ? examples/s]

## Declaring training args

In [None]:
training_args = Seq2SeqTrainingArguments(
    # Output location and Hub upload
    output_dir="t5_summarizer_model",
    push_to_hub=True,
    save_total_limit=3,
    # Schedule and optimisation
    num_train_epochs=12,
    learning_rate=3e-5,
    lr_scheduler_type="linear",
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation
    evaluation_strategy="epoch",
    predict_with_generate=True,
)



## Create trainer instance

In [None]:
# Wire the model, tokenized splits, collator and metric function into a trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
)

In [None]:
import os

# Sets the PYTORCH_CUDA_ALLOC_CONF environment variable for this process.
# NOTE(review): this cell runs after the model and trainer have already been
# created; presumably the setting has to be in place before PyTorch first
# touches CUDA to take effect — confirm, and consider moving it to the top of
# the notebook.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

## Test the model after training

In [None]:
# Evaluate on the tokenized test split and print the resulting metrics.
# NOTE(review): no trainer.train() call is visible in this notebook, so as
# written this scores the checkpoint loaded from model_name — confirm whether a
# training cell was removed.
results = trainer.evaluate(eval_dataset=tokenized_test)
print(results)