In [16]:
from datasets import load_dataset, load_from_disk
import evaluate
import torch
from torch import nn
from transformers import (
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)

from model import GPT, GPTConfig
from transformers import GPT2Tokenizer
from transformers.modeling_outputs import SequenceClassifierOutput
import numpy as np

In [17]:
# 2. Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT2 doesn't have a pad token

In [18]:
from transformers import GPT2ForSequenceClassification
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# 1. Load the SST-2 dataset
dataset = load_dataset("glue", "sst2")
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

In [20]:
# 3. Preprocessing
def preprocess(example):
    return tokenizer(
        example["sentence"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

encoded_train = train_dataset.map(preprocess, batched=True)
encoded_eval = eval_dataset.map(preprocess, batched=True)

# 4. Metric
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=preds, references=labels)

# 5. Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 6. Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-sst2-cls",
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    fp16=torch.cuda.is_available(),
    save_total_limit=1,
    report_to="none",  # No wandb
    warmup_ratio=0.1,
)

# 7. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train,
    eval_dataset=encoded_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 8. Train!
trainer.train()


  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy
500,1.9058,0.530439,0.732798
1000,0.5057,0.32873,0.869266
1500,0.4269,0.415834,0.873853
2000,0.411,0.292624,0.896789
2500,0.375,0.360433,0.896789
3000,0.3336,0.343878,0.893349
3500,0.344,0.360243,0.892202
4000,0.3364,0.412279,0.885321
4500,0.3332,0.463412,0.872706
5000,0.3174,0.333349,0.895642


TrainOutput(global_step=25257, training_loss=0.27535226657799616, metrics={'train_runtime': 1374.2033, 'train_samples_per_second': 147.028, 'train_steps_per_second': 18.379, 'total_flos': 1.3198556141715456e+16, 'train_loss': 0.27535226657799616, 'epoch': 3.0})