<a href="https://colab.research.google.com/github/bhussn/SecSplitLLM/blob/main/SecSplitLLM/notebooks/gpt-2/Benchmark_GPT2_SST_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
!pip install transformers[torch] accelerate -U
!pip install datasets evaluate
!pip install trl
!pip install wandb

In [None]:
# Log in to Weights & Biases up front; the Trainer configured further down
# reports its metrics there (report_to="wandb").
import wandb
wandb.login()

In [None]:

import pandas as pd  # DataFrame library used to load and inspect the raw dataset

# Read the IMDB training split (parquet) from the Hugging Face Hub via an hf:// path.
# NOTE(review): the notebook file name mentions SST-2, but this loads IMDB —
# confirm which dataset the benchmark is meant to use.
df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet")

# Keep a 1.4% random subset so the run stays small. The fixed random_state
# makes the subset identical across reruns, so benchmark numbers are comparable.
df_sample = df.sample(frac=0.014, random_state=42)
print(df_sample)

import os

# Configure the Weights & Biases integration through environment variables:
# the project the run is filed under and the WANDB_WATCH setting.
os.environ.update(
    {
        "WANDB_PROJECT": "gpt2-classification",
        "WANDB_WATCH": "all",
    }
)

from datasets import Dataset
from transformers import GPT2Tokenizer

# Convert the sampled pandas DataFrame to a Hugging Face Dataset.
# preserve_index=False stops the shuffled pandas index (left over from
# df.sample) from being stored as an extra "__index_level_0__" column.
dataset = Dataset.from_pandas(df_sample, preserve_index=False)

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 doesn't have a padding token by default; reuse the end-of-sequence
# token so that sequences can be padded.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize a batch of examples for GPT-2.

    Args:
        examples: A batch from the dataset; its "text" entry holds the raw
            review strings (the function is mapped with batched=True).

    Returns:
        The tokenizer output for the batch, truncated and padded with
        padding="max_length" (no explicit max_length is given, so the
        tokenizer's own default length applies).
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


# Run the tokenizer over the whole dataset in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split into train (80%) and validation (20%) sets. The fixed seed keeps the
# held-out examples the same on every run, so evaluation results are
# comparable between runs.
train_test_split = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

from transformers import GPT2ForSequenceClassification

# Number of target classes for the classifier (positive/negative sentiment)
num_labels = 2

# Load pretrained GPT-2 with a sequence classification head sized for num_labels
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=num_labels)
# Tell the model which token id is padding (the tokenizer's pad token was set
# to EOS above).
# NOTE(review): presumably the classification head uses pad_token_id to find
# the last non-padding token of each sequence — confirm against the
# GPT2ForSequenceClassification documentation.
model.config.pad_token_id = tokenizer.pad_token_id

from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

# Define evaluation metric
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        Whatever the loaded accuracy metric returns for the arg-max
        predictions compared against the reference labels.
    """
    scores, gold_labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_classes)


# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Output directory
    report_to="wandb",  # Send training/eval logs to Weights & Biases
    run_name="gpt2_classification_12_2",  # Run name
    num_train_epochs=2,  # Number of training epochs
    per_device_train_batch_size=8,  # Batch size per device during training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    # Warm up over the first 10% of optimizer steps. A fixed warmup of 500
    # steps was longer than the entire run (roughly 70 optimizer steps for the
    # 1.4% sample with an 80/20 split, batch size 8 and 2 epochs — assuming the
    # 25k-row IMDB train split on a single device), so the learning rate never
    # left the warmup ramp. A ratio scales with the dataset size instead.
    warmup_ratio=0.1,
    weight_decay=0.01,  # Strength of weight decay
    logging_dir="./logs",  # Directory for storing logs
    logging_steps=1,  # Log every optimizer step
    eval_strategy="epoch",  # Evaluate at the end of each epoch
)

# Build the Trainer from the model, its training configuration, the two
# dataset splits, and the accuracy callback.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Run fine-tuning.
trainer.train()

# Evaluate once more on the validation split and report the loss.
results = trainer.evaluate()
validation_loss = results["eval_loss"]
print(f"Validation Loss: {validation_loss}")

# Save the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("fine-tuned-gpt2")
