In [1]:
from transformers import AutoModelForSeq2SeqLM, T5Tokenizer, EarlyStoppingCallback, Seq2SeqTrainingArguments,Seq2SeqTrainer, DataCollatorForSeq2Seq
import torch
import time
import pandas as pd

import numpy as np
import re
from datasets import  load_from_disk,load_dataset


  from .autonotebook import tqdm as notebook_tqdm


<a name='1.2'></a>
### 1.2 - Load Dataset and LLM


In [3]:
# Hub identifier of the preprocessed correction corpus used throughout the notebook.
DATASET_ID = "Ahmadsameh8/QalbPreprocessedAndMergedwithPunct"
dataset = load_dataset(DATASET_ID)


Found cached dataset parquet (/home/ahmadsameh8/.cache/huggingface/datasets/Ahmadsameh8___parquet/Ahmadsameh8--QalbPreprocessedAndMergedwithPunct-770424d0fbf6464d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 3/3 [00:00<00:00, 294.81it/s]


In [3]:
# Force a full garbage-collection pass before the model is loaded.
# gc.collect() returns the number of unreachable objects it found, which is
# what the cell displays as its output.
import gc
gc.collect()

31

In [4]:
# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# torch.cuda.set_device only accepts CUDA devices; calling it with the CPU
# fallback raises, so only select the device when we actually got a GPU.
if device.type == "cuda":
    torch.cuda.set_device(device)

In [5]:
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` has ``numel()`` and
            a ``requires_grad`` attribute.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count, and the trainable share as a percentage with two
        decimals. Despite the name, nothing is printed; the caller decides
        whether to print the returned text.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count

    # A model without any parameters would otherwise divide by zero.
    if all_model_params:
        percentage = 100 * trainable_model_params / all_model_params
    else:
        percentage = 0.0

    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )



In [None]:
# Checkpoint identifier used for both the seq2seq model and its tokenizer.
model_name = "UBC-NLP/AraT5v2-base-1024"
# device_map={"": 0} requests that the whole model be placed on device index 0.
# NOTE(review): presumably this needs a visible GPU and the `accelerate`
# package, so it would not work on a CPU-only host — confirm.
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name,device_map={"":0})
tokenizer = T5Tokenizer.from_pretrained(model_name)
# The collator is built from the tokenizer alone; no model is passed to it.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
def preprocess_function(example,padding="max_length"):
    """Tokenize source/target text into model-ready features.

    Reads the "incorrect" column as the encoder input and the "correct"
    column as the target. No task prefix is prepended to the input. Both
    sides are truncated to 512 tokens and padded according to ``padding``.

    Args:
        example: Mapping with "incorrect" and "correct" entries (a batch of
            rows when used through ``dataset.map(..., batched=True)``).
        padding: Padding strategy forwarded to the tokenizer.

    Returns:
        The tokenizer output for the inputs with an extra "labels" entry
        holding the target token ids.
    """
    # Encoder side.
    model_inputs = tokenizer(example["incorrect"], max_length=512, padding=padding, truncation=True)

    # Decoder side: `text_target` makes the tokenizer treat the text as labels.
    labels = tokenizer(text_target=example["correct"], max_length=512, padding=padding, truncation=True)

    # With fixed-length padding the pad positions are swapped for -100 so
    # that they are ignored when the loss is computed.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        labels["input_ids"] = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split in batches, then drop the raw text columns so only
# the model features remain.
tokenized_datasets = (
    dataset
    .map(preprocess_function, batched=True)
    .remove_columns(['correct', 'incorrect'])
)
train_feature_names = list(tokenized_datasets['train'].features)
print(f"Keys of tokenized dataset: {train_feature_names}")


                                                                   

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']




Check the shapes of all three parts of the dataset:

In [10]:
# Report (rows, columns) for each split, then the full DatasetDict summary.
print("Shapes of the datasets:")
for split_label, split_key in (("Training", "train"), ("Validation", "validation"), ("Test", "test")):
    print(f"{split_label}: {tokenized_datasets[split_key].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (18350, 3)
Validation: (2293, 3)
Test: (2295, 3)
DatasetDict({
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2293
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 18350
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2295
    })
})


<a name='2.2'></a>
### 2.2 - Fine-Tune the Model with the Preprocessed Dataset



In [11]:
# Directory that receives checkpoints written during training.
output_dir = "./textcorrection_model"

# Full fine-tuning configuration (all model weights are updated).
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    learning_rate=5e-5,
    num_train_epochs=4,
    warmup_steps=512,
    logging_steps=500,
    # Evaluate and checkpoint once per epoch. Step-based `eval_steps` /
    # `save_steps` are not set because they are ignored under the "epoch"
    # strategies.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Reload the checkpoint with the lowest validation loss when training ends;
    # the same metric drives early stopping below.
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # NOTE(review): T5-family models are sometimes reported to overflow under
    # fp16; if the loss turns NaN, consider bf16 or full precision — confirm.
    fp16=True,
)

# Patience is counted in evaluations, i.e. epochs here. With only 4 epochs a
# patience of 4 could never trigger, so stop after 2 consecutive epochs
# without improvement in validation loss.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Seq2SeqTrainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    callbacks=[early_stopping_callback],
)

In [None]:
# Run fine-tuning, then export the trainer's model weights and the tokenizer
# files into the same directory so it can be loaded with from_pretrained.
# NOTE(review): with load_best_model_at_end=True, trainer.model is presumably
# the best checkpoint rather than the last one — confirm.
trainer.train()
model_path= "modelwithpunct"
trainer.model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [32]:
# Run another garbage-collection pass after training; the displayed number is
# the count of unreachable objects reported by gc.collect().
import gc
gc.collect()

1925

In [None]:
# Load the TensorBoard notebook extension and point it at the "runs"
# sub-directory of output_dir ({output_dir} is interpolated by IPython).
# NOTE(review): assumes the Trainer wrote its event files there — confirm.
%load_ext tensorboard
%tensorboard --logdir '{output_dir}'/runs

In [None]:
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM
import torch

# Use the first CUDA GPU when available; set_device rejects a CPU device, so
# it is only called when a GPU was actually selected.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    torch.cuda.set_device(device)

# Load the artifacts this notebook saved after training ("modelwithpunct");
# the previously referenced "modelwithoutpunct" directory is never created here.
inference_model_path = "modelwithpunct"
tokenizer1 = T5Tokenizer.from_pretrained(inference_model_path)
model1 = AutoModelForSeq2SeqLM.from_pretrained(inference_model_path).to(device)

# Correct a single test sentence. The input is truncated to the same 512-token
# limit used during preprocessing, and the attention mask is passed explicitly.
inp = dataset["test"]["incorrect"][2]
encoded = tokenizer1(inp, return_tensors="pt", truncation=True, max_length=512).to(device)
input_ids = encoded.input_ids
outputs = model1.generate(
    input_ids=input_ids,
    attention_mask=encoded.attention_mask,
    max_length=512,
)
out = tokenizer1.decode(outputs[0], skip_special_tokens=True)
# Display the corrected text as the cell output.
out