In [None]:
!pip install datasets

In [None]:
!pip install evaluate

In [3]:
import evaluate

In [4]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import os

In [26]:
class model_trainer:
  """Fine-tune a Hugging Face sequence classifier on labelled text pairs.

  The input CSV must have the columns ``x_i``, ``x_j`` and ``Label``; each
  ``Label`` must be one of the names in ``id2label``. Constructing the object
  immediately rewrites the data into ``COMBINED_CSV`` (see ``prep_data``).

  Args:
    name: Checkpoint name or path passed to ``from_pretrained``.
    dataset: Path of the input CSV file.
    isPEFT: Request PEFT fine-tuning. Not implemented; ``prep_model`` and
      ``train`` raise ``NotImplementedError`` when it is true.
    seed: Seed for the train/test split. ``None`` gives an unseeded split.
  """

  # Intermediate file written by prep_data() and read back by dataset_train().
  COMBINED_CSV = "LLM_dataset_comb.csv"
  # Directory handed to TrainingArguments as output_dir.
  OUTPUT_DIR = "model_saved"
  # Every example is truncated/padded to this many tokens.
  MAX_LENGTH = 512
  # Fraction of rows held out for evaluation.
  TEST_SIZE = 0.2

  def __init__(self, name, dataset, isPEFT=False, seed=42):
    self.name = name
    self.isPEFT = isPEFT
    self.dataset = dataset
    self.seed = seed
    self.id2label = {0: "GPT2", 1: "GPT4o", 2: "GPT_NEO", 3: "Gemini", 4: "Reformer"}
    self.label2id = {label: idx for idx, label in self.id2label.items()}
    self.prep_data()

  def _require_full_finetune(self):
    """Raise NotImplementedError when the unimplemented PEFT mode was requested."""
    if self.isPEFT:
      raise NotImplementedError("Error 1: PEFT fine-tuning is not implemented yet")

  def prep_data(self):
    """Merge the two text columns, encode the labels and write ``COMBINED_CSV``.

    Missing cells are replaced by a single space before the texts are joined
    as ``x_i + " " + x_j`` into the column ``x_i+x_j``. ``Label`` is replaced
    by its integer id, and ``x_i`` / ``x_j`` are dropped.

    Raises:
      ValueError: If any ``Label`` value (including a missing one) is not a
        key of ``label2id``. Without this check such rows would be written
        with a NaN label and only fail later, inside training.
    """
    df = pd.read_csv(self.dataset).fillna(" ")

    label_ids = df["Label"].map(self.label2id)
    unknown = label_ids.isna()
    if unknown.any():
      bad_values = sorted(df.loc[unknown, "Label"].astype(str).unique())
      raise ValueError(
        f"Unknown label(s) in {self.dataset!r}: {bad_values}; "
        f"expected one of {sorted(self.label2id)}"
      )

    # astype(str) keeps the concatenation valid even if a text column was
    # parsed as numeric; a scalar separator avoids list broadcasting.
    df["x_i+x_j"] = df["x_i"].astype(str) + " " + df["x_j"].astype(str)
    df["Label"] = label_ids.astype(int)
    df = df.drop(columns=["x_i", "x_j"])
    df.to_csv(self.COMBINED_CSV, index=False)

  def dataset_train(self):
    """Load ``COMBINED_CSV``, split it into train/test and tokenize both parts.

    Also stores the tokenizer on ``self.tokenizer``.

    Returns:
      The tokenized train/test splits, with ``Label`` renamed to ``label``
      and ``x_i+x_j`` renamed to ``text``.
    """
    dataset = load_dataset("csv", data_files=self.COMBINED_CSV)
    dataset = dataset["train"].train_test_split(
      test_size=self.TEST_SIZE, shuffle=True, seed=self.seed
    )
    self.tokenizer = AutoTokenizer.from_pretrained(self.name)

    def preprocess_function(examples):
      # Called with a batch: examples["x_i+x_j"] is a list of strings.
      return self.tokenizer(
        examples["x_i+x_j"],
        truncation=True,
        padding='max_length',
        max_length=self.MAX_LENGTH,
      )

    # batched=True tokenizes many rows per call instead of one at a time.
    tokenized_data = dataset.map(preprocess_function, batched=True)
    tokenized_data = tokenized_data.rename_column("Label", "label")
    tokenized_data = tokenized_data.rename_column("x_i+x_j", "text")
    return tokenized_data

  def prep_model(self):
    """Load the classifier and freeze its backbone parameters.

    Parameters whose name starts with ``<base_model_prefix>.`` get
    ``requires_grad=False``; for DistilBERT checkpoints that prefix is
    ``distilbert``. All other parameters stay trainable.

    Returns:
      The loaded model.

    Raises:
      NotImplementedError: If ``isPEFT`` is true.
    """
    # Fail before the (potentially large) checkpoint is loaded.
    self._require_full_finetune()

    model = AutoModelForSequenceClassification.from_pretrained(
      self.name,
      num_labels=len(self.id2label),
      id2label=self.id2label,
      label2id=self.label2id,
    )

    backbone_prefix = getattr(model, "base_model_prefix", "")
    if backbone_prefix:
      for param_name, param in model.named_parameters():
        if param_name.startswith(backbone_prefix + "."):
          param.requires_grad = False
    return model

  def train(self):
    """Run fine-tuning and return the path of the best checkpoint.

    Evaluates and saves once per epoch, reports accuracy, and reloads the best
    model at the end of training.

    Returns:
      ``trainer.state.best_model_checkpoint``.

    Raises:
      NotImplementedError: If ``isPEFT`` is true.
    """
    import inspect

    # Fail before spending time on tokenization.
    self._require_full_finetune()

    tokenized_data = self.dataset_train()
    model = self.prep_model()
    accuracy = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
      # Turn per-class scores into predicted class ids before scoring.
      predictions, labels = eval_pred
      predictions = np.argmax(predictions, axis=1)
      return accuracy.compute(predictions=predictions, references=labels)

    # exist_ok avoids swallowing unrelated errors such as permission failures.
    os.makedirs(self.OUTPUT_DIR, exist_ok=True)

    training_args = TrainingArguments(
      output_dir=self.OUTPUT_DIR,
      learning_rate=2e-5,
      per_device_train_batch_size=16,
      per_device_eval_batch_size=16,  # Batch size
      num_train_epochs=2,  # Total epochs
      weight_decay=0.01,
      eval_strategy="epoch",
      save_strategy="epoch",
      load_best_model_at_end=True,
      save_total_limit=2,
    )

    # Newer transformers releases name this Trainer argument
    # `processing_class`; older ones only accept `tokenizer`.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=tokenized_data["train"],
      eval_dataset=tokenized_data["test"],
      compute_metrics=compute_metrics,
      **{tokenizer_kwarg: self.tokenizer},
    )
    trainer.train()
    return trainer.state.best_model_checkpoint  # Best checkpoint of the run


In [27]:
# Checkpoint identifier passed to the tokenizer/model loaders.
name = "distilbert/distilbert-base-uncased"
# CSV file with the raw text pairs and their labels.
dataset = "LLM_dataset.csv"

In [None]:
# Build the trainer, run fine-tuning, and show the value returned by train()
# as the cell output.
trainer = model_trainer(name, dataset)
best_checkpoint = trainer.train()
best_checkpoint