**Supervised Fine Tuning of CodeT5 models**

In [None]:
!pip install datasets
!pip install torch
!pip install transformers

Loading needed Libraries

In [None]:
#Loading needed libraries
import Datasets.DataLoader as DataLoader
import ModelArguments
import transformers
from transformers import TrainingArguments as HFTrainingArguments
from transformers import Trainer
from transformers import AutoTokenizer
from dataclasses import dataclass, field
from typing import Optional


Logging in to Hugging Face to get access to the models

In [None]:
import os

from huggingface_hub import login

# Read the access token from the HF_TOKEN environment variable instead of
# hard-coding it, so the secret is never saved with (or committed alongside)
# the notebook. When the variable is unset, `token` is None and `login`
# falls back to asking for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

A class to handle Training Arguments

In [None]:
def _hf_arg(default, help_text):
    """Build a dataclass field with ``default`` and a ``help`` metadata entry."""
    return field(default=default, metadata={"help": help_text})


@dataclass
class TrainingArguments(HFTrainingArguments):
    """
    Project defaults for fine-tuning Salesforce/codet5p-770m or a similar
    encoder-decoder model.

    Every field declared here either overrides a default inherited from
    ``HFTrainingArguments`` or adds an extra option; each one carries its
    description in the ``help`` metadata entry.
    """

    # --- Model / tokenizer handling -------------------------------------
    cache_dir: Optional[str] = _hf_arg(
        None,
        "Directory for caching pre-trained models and tokenizers.",
    )
    optim: str = _hf_arg(
        "adamw_torch",
        "Optimizer to use during training. Default is AdamW implemented in PyTorch.",
    )
    model_max_length: int = _hf_arg(
        1024,
        "Maximum sequence length. Sequences longer than this will be truncated.",
    )

    # --- Optimisation hyper-parameters ----------------------------------
    learning_rate: float = _hf_arg(5e-5, "Learning rate for training.")
    warmup_steps: int = _hf_arg(
        0,
        "Number of warmup steps for the learning rate scheduler.",
    )
    weight_decay: float = _hf_arg(0.01, "Weight decay for the AdamW optimizer.")

    # --- Batch sizes ------------------------------------------------------
    per_device_train_batch_size: int = _hf_arg(
        2,
        "Batch size per device for training.",
    )
    per_device_eval_batch_size: int = _hf_arg(
        2,
        "Batch size per device for evaluation.",
    )

    # --- Evaluation / checkpointing / logging cadence -------------------
    # NOTE(review): recent transformers releases appear to call this option
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy: str = _hf_arg(
        "steps",
        "Evaluation strategy to use. Choose from 'no', 'steps', or 'epoch'.",
    )
    save_steps: int = _hf_arg(500, "Number of steps between saving checkpoints.")
    logging_steps: int = _hf_arg(50, "Number of steps between logging outputs.")

    # --- Generation and precision -----------------------------------------
    # NOTE(review): presumably only a seq2seq-aware trainer reads this flag —
    # verify that the trainer used with these arguments honours it.
    predict_with_generate: bool = _hf_arg(
        True,
        "Whether to use `generate()` to compute metrics during evaluation for sequence-to-sequence tasks.",
    )
    # NOTE(review): mixed precision is on by default; this presumably needs a
    # GPU — confirm before running on CPU-only machines.
    fp16: bool = _hf_arg(True, "Use 16-bit (mixed) precision instead of 32-bit.")


This code defines a custom trainer, `DynamicTrainer`, that uses a classical cross-entropy loss. The model's primary objective is to predict optimized code by minimizing the cross-entropy loss, which measures how accurately the predicted code matches the expected optimized output.

In [None]:
class DynamicTrainer(Trainer):
    """Trainer whose training loss is the one the model reports for the labels."""

    def __init__(self, *args, **kwargs):
        # No extra state of our own: hand every argument straight to Trainer.
        super().__init__(*args, **kwargs)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Run one forward pass and return the loss attached to the model output.

        Only ``input_ids``, ``attention_mask`` and ``labels`` are read from
        ``inputs``; a key that is absent is forwarded as ``None``. The loss
        itself is produced by the model from ``labels`` (described by the
        notebook as a cross-entropy loss); nothing is added to it here.

        Args:
        - model: Callable model that accepts ``input_ids``, ``attention_mask``
          and ``labels`` keyword arguments and returns an object with a
          ``loss`` attribute.
        - inputs: Mapping holding the batch (inputs and labels).
        - return_outputs (optional): When true, also return the model outputs.
        - **kwargs: Accepted so callers may pass extra arguments; unused.

        Returns:
        - ``outputs.loss`` when ``return_outputs`` is false.
        - ``(outputs.loss, outputs)`` when ``return_outputs`` is true.
        """
        # Collect exactly the three tensors the forward pass is given.
        forward_kwargs = {
            name: inputs.get(name)
            for name in ("input_ids", "attention_mask", "labels")
        }
        outputs = model(**forward_kwargs)
        loss = outputs.loss

        if not return_outputs:
            return loss
        return loss, outputs


Now we can effectively fine-tune our model

In [None]:
# Dataset and model
# NOTE(review): "Datset" may be a misspelling of the directory name — confirm
# against the folder on disk before changing this path.
dataset_path = "../Datset/Code_pairs.csv"
model_path = "Salesforce/codet5p-770m"

# Single source of truth for the sequence length used when tokenizing.
MAX_LENGTH = 512

# Tokenizer initialization
# NOTE(review): truncation/padding/max_length are normally options of the
# tokenizer *call* rather than of `from_pretrained`; they probably have no
# effect here — confirm and drop them if so.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    truncation=True,
    padding='max_length',
    max_length=MAX_LENGTH,
)

# Load the dataset and tokenize it.
# `import Datasets.DataLoader as DataLoader` binds the *module*, and a module
# object is not callable, so the class has to be reached through the module.
# NOTE(review): assumes the class is named `DataLoader` like its module — confirm.
data_loader = DataLoader.DataLoader()
data_loader.load(dataset_path)
data_loader.tokenize(tokenizer, max_length=MAX_LENGTH)

# Split into train and test
data_loader.split_data()

# Model initialization
# Same module-vs-class fix as above: `import ModelArguments` binds the module.
# NOTE(review): assumes the class is named `ModelArguments` like its module — confirm.
model_args = ModelArguments.ModelArguments()
# NOTE(review): this rebinds `tokenizer`; the dataset above was tokenized with
# the instance created from `model_path` — verify both are the same tokenizer.
model, tokenizer = model_args.load_model_and_tokenizer()

# Training arguments definition (values not listed fall back to the defaults
# declared on the TrainingArguments class in this notebook)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    evaluation_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
)

# Trainer initialization: train/test splits come from the tokenized dataset
trainer = DynamicTrainer(
    model=model,
    args=training_args,
    train_dataset=data_loader.tokenized_dataset["train"],
    eval_dataset=data_loader.tokenized_dataset["test"],
)

# Training
trainer.train()