<a href="https://colab.research.google.com/github/bayesmaxxing/gpt-text-classifier/blob/main/gpt_classification_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
from google.colab import userdata

In [None]:
# Load the raw model-output CSVs and the prompt list.
gpt_data = pd.read_csv('./training_data_gpts.csv')
llama_data = pd.read_csv('./pplx_training_data.csv')

# pd.concat expects ONE iterable of frames as its first argument; passing the
# frames as separate positional arguments raises a TypeError.
training_data = pd.concat([gpt_data, llama_data], ignore_index=True)
prompts = pd.read_csv('./prompts.csv')
unique_prompts = prompts['prompt'].unique()

# Split on unique prompt values (not on rows) so that every row sharing a
# prompt lands on the same side of the train/eval split.
train_prompts, eval_prompts = train_test_split(unique_prompts, test_size=0.2, random_state=42)

train_df = prompts[prompts['prompt'].isin(train_prompts)]
# Was `isin(eval)`, which passed the Python builtin `eval` instead of the
# held-out prompt array and raised a TypeError.
eval_df = prompts[prompts['prompt'].isin(eval_prompts)]

# NOTE(review): this rebinds `training_data`, discarding the concatenated
# GPT/Llama frame built above; the datasets are built from `prompts` only.
# Presumably the split was meant to filter the concatenated frame by prompt --
# confirm against the CSV schemas before training.
training_data = Dataset.from_pandas(train_df)
eval_data = Dataset.from_pandas(eval_df)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[[{'generated_text': 'Can you explain lambda functions?\n\nA:\n\nThe way we use the language of maths is different than the way students use the language of computer science.  Computer programs are written in a different way to that of a human being, and the way we use mathematics is different the way we do this.  The way we use the language of maths is not the way computer scientists write their programs, because computers are the people who write the programs.\nWhat is important is that we write mathematical programs in the same way as computers do.\nWhen we write mathematical programs, we look at the mathematics, and we try to apply the mathematics to the problem at hand.  The important part of this is that we use the same terminology that computers use.  We don'}]]


In [None]:
# Pick the compute device: use CUDA when torch reports it as available,
# otherwise fall back to the CPU.
from torch import cuda

if cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

In [None]:
# Import tokenizer and apply it to the prompts
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

# Name of the dataset column that holds the text to classify.
# NOTE(review): the only column this notebook visibly reads from the CSVs is
# 'prompt'; "text" is kept from the original code -- confirm the real name.
TEXT_COLUMN = "text"

def preprocess_function(data):
  """Tokenize the text column of a batch of examples.

  Args:
    data: a batch passed in by `Dataset.map(..., batched=True)`; indexed by
      column name, it must contain `TEXT_COLUMN`.

  Returns:
    The tokenizer's output for that column, with truncation enabled.
  """
  return tokenizer(data[TEXT_COLUMN], truncation=True)

# `Dataset.map` does not accept `return_tensors`; passing it raised a TypeError.
train_data = training_data.map(preprocess_function, batched=True)
eval_data = eval_data.map(preprocess_function, batched=True)

In [None]:
# Map labels to ids and ids to labels
# TODO: check the labels to make sure that they are correct
# NOTE(review): the two hand-written maps previously disagreed
# ("GPT-3.5-turbo" vs "GPT-3.5"). `id2label` is treated as the source of truth
# here -- verify the spelling against the label values in the training data.
id2label = {0: "GPT-4o", 1: "GPT-3.5-turbo", 2: "Llama 3 sonar small"}
# Derive the inverse map so the two dictionaries cannot drift apart.
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
# Load evaluation metric
# `evaluate` is used below but is not imported in the notebook's import cell,
# so it is imported here to avoid a NameError on a fresh kernel.
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
  """Compute accuracy for one evaluation pass.

  Args:
    eval_pred: unpacks into `(predictions, labels)`. The predictions are
      reduced with argmax over axis 1 -- presumably per-class scores with one
      row per example (TODO confirm).

  Returns:
    The result of `accuracy.compute` for the predicted class ids against the
    reference labels.
  """
  predictions, labels = eval_pred
  # Index of the highest score along axis 1 is the predicted class id.
  predictions = np.argmax(predictions, axis=1)
  return accuracy.compute(predictions=predictions, references=labels)

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# Load the DistilBERT checkpoint with a 3-way sequence-classification
# configuration, wiring in the label maps defined earlier in the notebook.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)
# Move the model to the selected device (GPU when available).
model.to(device)

In [None]:
# Training configuration for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="distilbert_gpt_classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # The field is `num_train_epochs`; `num_training_epochs` is not a
    # TrainingArguments parameter and raised a TypeError.
    num_train_epochs=3,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same schedule (every epoch) so that
    # `load_best_model_at_end` has a checkpoint for each evaluation.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False
)

# Assemble the Trainer from the model, arguments, tokenized datasets and
# metric callback defined in the cells above.
# No explicit data_collator is passed; per the transformers docs, Trainer then
# falls back to padding-aware collation when a tokenizer is supplied
# (NOTE(review): confirm for the installed transformers version).
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=compute_metrics,
)