<a href="https://colab.research.google.com/github/bhussn/SecSplitLLM/blob/main/SecSplitLLM/notebooks/benchmarks/gpt-2/Benchmark_GPT2_SST_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install (and, for the first line, upgrade via -U) the packages this notebook
# uses. These are IPython shell escapes, so they only work inside a notebook.
!pip install transformers[torch] accelerate -U
!pip install datasets evaluate
# NOTE(review): trl is installed here but never imported in the code shown in
# this notebook -- confirm it is still needed.
!pip install trl
!pip install wandb

In [None]:
import wandb
# Log in to Weights & Biases up front; the training arguments further down
# use report_to="wandb", so the run's metrics are sent there.
wandb.login()

In [None]:
import csv
import os
import time  # Import time for tracking time

import torch # Import torch for GPU memory
from transformers import TrainerCallback

class CSVLoggerCallback(TrainerCallback):
    """Append training and evaluation metrics from a ``Trainer`` run to a CSV file.

    Every row has the same eight columns: ``model``, ``run_name``, ``epoch``,
    ``train_loss``, ``val_loss``, ``eval_accuracy``, ``gpu_memory_mb`` and
    ``epoch_duration_seconds``.  ``on_log`` produces two kinds of rows:

    * a training row whenever the logged dict contains ``loss`` and ``epoch``
      (evaluation and duration columns are left empty; the GPU column holds
      the memory allocated at the time of the log call);
    * an evaluation row, at most once per integer epoch, whenever the dict
      contains ``eval_loss``, ``eval_accuracy`` and ``epoch`` (the GPU column
      holds the peak recorded since the epoch began, and the last column the
      epoch's wall-clock duration in seconds).

    The file is opened in append mode so that several runs can share one log;
    the header row is written only when the file is new or empty.

    Args:
        csv_filepath: Path of the CSV file to append to.
        model_name: Value written to the ``model`` column of every row.
        run_name: Value written to the ``run_name`` column of every row.
    """

    _HEADER = ['model', 'run_name', 'epoch', 'train_loss', 'val_loss', 'eval_accuracy', 'gpu_memory_mb', 'epoch_duration_seconds']
    _BYTES_PER_MB = 1024 * 1024

    def __init__(self, csv_filepath, model_name, run_name):
        # Set first so that close()/__del__ stay safe even if open() raises.
        self.csv_file = None
        self.csv_filepath = csv_filepath
        self.model_name = model_name
        self.run_name = run_name

        # The header decision must be taken BEFORE opening the file: opening
        # in append mode creates it, so an existence check made afterwards
        # would always succeed.
        needs_header = (
            not os.path.exists(csv_filepath)
            or os.path.getsize(csv_filepath) == 0
        )
        self.csv_file = open(csv_filepath, 'a', newline='')
        self.writer = csv.writer(self.csv_file)
        if needs_header:
            self.writer.writerow(self._HEADER)

        self.epoch_start_time = None
        self.peak_gpu_memory_mb = 0
        # Most recent training loss; None (written as an empty cell) until the
        # first training log arrives, so an early evaluation log cannot fail.
        self.last_train_loss = None
        self.logged_epochs = set()  # integer epochs that already have an eval row

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Start the epoch timer and reset the peak-GPU-memory tracking."""
        self.epoch_start_time = time.time()
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()
            self.peak_gpu_memory_mb = 0

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Write a training and/or evaluation row for ``logs`` and flush the file.

        Nothing is written when ``logs`` is ``None`` or when the log file has
        already been closed through ``close()``.
        """
        if self.csv_file is None or self.csv_file.closed:
            return
        if logs is not None:
            self._write_rows(logs)
        self.csv_file.flush()

    def close(self):
        """Close the CSV file; safe to call more than once."""
        csv_file = getattr(self, 'csv_file', None)
        if csv_file is not None and not csv_file.closed:
            csv_file.close()

    def __del__(self):
        self.close()

    def _current_gpu_memory_mb(self):
        """Return the currently allocated GPU memory in MB and update the epoch peak."""
        if not torch.cuda.is_available():
            return 0
        current_mb = torch.cuda.memory_allocated() / self._BYTES_PER_MB
        # The peak counter is reset in on_epoch_begin, so this is the true
        # per-epoch peak rather than the largest value seen at a log call.
        peak_mb = torch.cuda.max_memory_allocated() / self._BYTES_PER_MB
        self.peak_gpu_memory_mb = max(self.peak_gpu_memory_mb, current_mb, peak_mb)
        return current_mb

    def _write_rows(self, logs):
        """Translate one ``logs`` dict into zero, one or two CSV rows."""
        epoch = logs.get('epoch')
        train_loss = logs.get('loss')
        eval_loss = logs.get('eval_loss')
        eval_accuracy = logs.get('eval_accuracy')

        gpu_memory_mb = self._current_gpu_memory_mb()

        if epoch is None:
            return

        # Training row: evaluation columns and duration stay empty.
        if train_loss is not None:
            self.last_train_loss = train_loss
            self.writer.writerow([self.model_name, self.run_name, f'{epoch:.2f}', train_loss, '', '', gpu_memory_mb, ''])

        # Evaluation row: only the first evaluation seen for an integer epoch
        # is recorded, so a repeated evaluate() call does not duplicate it.
        if eval_loss is not None and eval_accuracy is not None:
            int_epoch = int(epoch)
            if int_epoch in self.logged_epochs:
                return
            if self.epoch_start_time is not None:
                epoch_duration = time.time() - self.epoch_start_time
            else:
                epoch_duration = 0
            self.writer.writerow([self.model_name, self.run_name, int_epoch, self.last_train_loss, eval_loss, eval_accuracy, self.peak_gpu_memory_mb, epoch_duration])
            self.logged_epochs.add(int_epoch)


%load_ext cudf.pandas
import pandas as pd # Imports the panda library from huggin face to load and visualize the dataset
df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet")
df_sample = df.sample(frac=0.012)
print(df_sample)

import os

# Weights & Biases settings are supplied through environment variables.
os.environ.update(
    {
        "WANDB_PROJECT": "gpt2-classification",
        "WANDB_WATCH": "all",
    }
)

from datasets import Dataset
from transformers import GPT2Tokenizer

# Wrap the sampled pandas DataFrame in a Hugging Face Dataset so it can be
# tokenized with .map() and split further down.
dataset = Dataset.from_pandas(df_sample)

# Load the pretrained "gpt2" tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 doesn't have a padding token by default, so the end-of-sequence token
# is reused for padding.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the text
def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of ``examples`` with the module-level tokenizer.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; its result is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Tokenize the whole dataset, passing examples to tokenize_function in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split into train (80%) and validation (20%) sets. The seed is fixed so the
# partition is identical on every run and benchmark results stay comparable.
train_test_split = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

from transformers import GPT2ForSequenceClassification

# Number of target classes (positive/negative).
num_labels = 2

# Load pretrained GPT-2 with a sequence classification head sized for num_labels.
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=num_labels)
# Tell the model which token id is padding, matching the tokenizer's pad token.
# NOTE(review): presumably required so the classification head can ignore
# padded positions -- see the GPT2ForSequenceClassification documentation.
model.config.pad_token_id = tokenizer.pad_token_id

from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

# Define the evaluation metric (accuracy)
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy metric for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class of
    each example is the index of its largest logit along the last axis.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)


# Define training arguments.
# NOTE(review): warmup_steps=500 looks large for this run -- only a 1.2% sample
# of the data is used, 80% of that for training, with batch size 8 and 2
# epochs, so training may finish before warmup completes and the learning rate
# would never reach its configured value. Confirm the total step count;
# a warmup_ratio may be a better fit.
training_args = TrainingArguments(
        output_dir="./results",  # Where checkpoints and outputs are written
        report_to="wandb",  # Send metrics to Weights & Biases
        run_name="gpt2_classification_11_1",  # Run name (also written to the CSV log)
        num_train_epochs=2,  # Number of training epochs
        per_device_train_batch_size=8,  # Batch size per device during training
        per_device_eval_batch_size=8,   # Batch size per device during evaluation
        warmup_steps=500,  # Number of warmup steps for the learning rate scheduler
        weight_decay=0.01,  # Strength of weight decay
        logging_dir="./logs",  # Directory for storing logs
        logging_steps=15,  # Log training metrics every 15 steps
        eval_strategy="epoch",  # Evaluate at the end of every epoch
        save_strategy="epoch",  # Save a checkpoint at the end of every epoch
    )
# Resolve the model name and run name that the CSV logger writes on every row.
# Falls back to "unknown_model" when the model has no config or the config
# carries no _name_or_path.
model_name = getattr(getattr(model, 'config', None), '_name_or_path', "unknown_model")
run_name = training_args.run_name

# Build the callback that appends this run's metrics to the shared CSV log.
csv_logger_callback = CSVLoggerCallback('./trainingGPT2_log.csv', model_name, run_name)

# Create the Trainer that ties together the model, data, metric and CSV logger.
trainer = Trainer(
        model=model,  # The model to train
        args=training_args,  # The training arguments
        train_dataset=train_dataset,  # The training dataset
        eval_dataset=eval_dataset,  # The evaluation dataset
        compute_metrics=compute_metrics, # Turns (logits, labels) into the accuracy metric
        callbacks=[csv_logger_callback],  # Also record metrics in the CSV log
    )

# Start training
trainer.train()

# Run one more evaluation on the validation set and report its loss.
results = trainer.evaluate()
print(f"Validation Loss: {results['eval_loss']}")

# Save the fine-tuned model and its tokenizer side by side in one directory.
model.save_pretrained('fine-tuned-gpt2')
tokenizer.save_pretrained('fine-tuned-gpt2')
