## **Dataset**

In [1]:
# With small dataset: 50% full data
from datasets import load_dataset

# Load IMDb dataset
dataset = load_dataset("imdb")
# rename label to labels
dataset = dataset.rename_column("label", "labels")

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'labels'],
        num_rows: 50000
    })
})

## **Tokenizer**

In [3]:
from transformers import AutoTokenizer

model_name = "binhphap5/gpt-small-c4"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
# Tokenize function
def tokenize(example):
    return tokenizer(
        example["text"], truncation=True, max_length=512
    )

# Tokenize full dataset
tokenized_ds = dataset.map(tokenize, batched=True)
tokenized_ds = tokenized_ds.remove_columns(["text"])
tokenized_ds.set_format("torch")

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [5]:
# data collator
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

In [6]:
tokenizer.pad_token_id

1

## **Model**

In [7]:
from transformers import AutoModelForSequenceClassification

id2label = {0: 'Negative', 1: 'Positive'}
label2id = {'Negative': 0, 'Positive': 1}

model = AutoModelForSequenceClassification.from_pretrained(
    "binhphap5/gpt-small-c4",
    num_labels=2, id2label=id2label, label2id=label2id,
)
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at binhphap5/gpt-small-c4 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(100000, 512)
    (wpe): Embedding(512, 512)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=512, out_features=2, bias=False)
)

In [9]:
# forward pass
import torch

model
model.eval()
inputs = tokenized_ds["train"][0]['input_ids'].unsqueeze(0)  # Add batch dimension
with torch.no_grad():
    outputs = model(inputs)
print(outputs)

SequenceClassifierOutputWithPast(loss=None, logits=tensor([[ 0.1510, -0.7583]]), past_key_values=((tensor([[[[ 9.1632e-01, -1.8706e+00,  1.1628e+00,  ..., -6.4231e-01,
            8.4895e-01, -1.4081e+00],
          [-1.6359e-01,  1.2209e-01,  1.2591e-01,  ...,  3.2923e-01,
            3.6682e-01, -6.4372e-02],
          [ 1.1112e+00, -2.6102e+00,  1.1262e+00,  ...,  1.8748e-01,
            1.1027e+00, -1.6410e+00],
          ...,
          [ 1.3032e+00, -1.1904e+00,  6.7368e-01,  ...,  2.9726e+00,
            6.3672e-01, -1.6338e+00],
          [-1.4234e+00,  9.3294e-01, -1.1388e+00,  ..., -4.1344e-01,
            2.1108e-01,  1.5182e+00],
          [-9.2435e-01, -1.0644e+00,  4.1519e-01,  ...,  2.0647e+00,
            2.1216e-01, -2.1505e+00]],

         [[ 2.0982e-01, -8.4246e-01, -1.3742e+00,  ...,  6.2791e-01,
            9.7276e-01,  4.4668e-01],
          [ 1.4081e+00,  9.5891e-01, -2.0184e-01,  ...,  4.2944e-01,
            5.5174e-01,  9.5081e-01],
          [-2.8520e-01, -7.1

## **Training**

In [10]:
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return accuracy.compute(predictions=preds, references=labels)

In [11]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="imdb2-full-gpt2-small",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_total_limit=1,
    fp16=True,
)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/3910 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 0.3778, 'grad_norm': 11.165544509887695, 'learning_rate': 9.002557544757034e-06, 'epoch': 1.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.27316516637802124, 'eval_accuracy': 0.88684, 'eval_runtime': 36.717, 'eval_samples_per_second': 680.884, 'eval_steps_per_second': 10.649, 'epoch': 1.0}
{'loss': 0.2483, 'grad_norm': 43.23914337158203, 'learning_rate': 8.002557544757034e-06, 'epoch': 2.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.28208571672439575, 'eval_accuracy': 0.88896, 'eval_runtime': 36.1116, 'eval_samples_per_second': 692.299, 'eval_steps_per_second': 10.828, 'epoch': 2.0}
{'loss': 0.1894, 'grad_norm': 20.37828254699707, 'learning_rate': 7.002557544757034e-06, 'epoch': 3.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.27936893701553345, 'eval_accuracy': 0.8974, 'eval_runtime': 36.5715, 'eval_samples_per_second': 683.592, 'eval_steps_per_second': 10.691, 'epoch': 3.0}
{'loss': 0.1433, 'grad_norm': 19.281404495239258, 'learning_rate': 6.002557544757034e-06, 'epoch': 4.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.2980090379714966, 'eval_accuracy': 0.89988, 'eval_runtime': 36.6389, 'eval_samples_per_second': 682.335, 'eval_steps_per_second': 10.672, 'epoch': 4.0}
{'loss': 0.1061, 'grad_norm': 33.30314254760742, 'learning_rate': 5.002557544757034e-06, 'epoch': 5.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.30796879529953003, 'eval_accuracy': 0.90044, 'eval_runtime': 36.4216, 'eval_samples_per_second': 686.406, 'eval_steps_per_second': 10.735, 'epoch': 5.0}
{'loss': 0.08, 'grad_norm': 39.59245681762695, 'learning_rate': 4.0076726342711e-06, 'epoch': 6.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.3629363775253296, 'eval_accuracy': 0.89852, 'eval_runtime': 36.4567, 'eval_samples_per_second': 685.746, 'eval_steps_per_second': 10.725, 'epoch': 6.0}
{'loss': 0.0559, 'grad_norm': 22.916973114013672, 'learning_rate': 3.0076726342710997e-06, 'epoch': 7.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.43037480115890503, 'eval_accuracy': 0.89784, 'eval_runtime': 35.3164, 'eval_samples_per_second': 707.886, 'eval_steps_per_second': 11.071, 'epoch': 7.0}
{'loss': 0.0433, 'grad_norm': 1.7801939249038696, 'learning_rate': 2.0076726342711e-06, 'epoch': 8.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.4582618772983551, 'eval_accuracy': 0.89744, 'eval_runtime': 42.3723, 'eval_samples_per_second': 590.008, 'eval_steps_per_second': 9.228, 'epoch': 8.0}
{'loss': 0.0326, 'grad_norm': 1.8193930387496948, 'learning_rate': 1.0076726342710998e-06, 'epoch': 9.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.5097520351409912, 'eval_accuracy': 0.89928, 'eval_runtime': 42.3976, 'eval_samples_per_second': 589.656, 'eval_steps_per_second': 9.222, 'epoch': 9.0}
{'loss': 0.0269, 'grad_norm': 29.244569778442383, 'learning_rate': 7.672634271099745e-09, 'epoch': 10.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.5305415391921997, 'eval_accuracy': 0.90104, 'eval_runtime': 42.2903, 'eval_samples_per_second': 591.153, 'eval_steps_per_second': 9.246, 'epoch': 10.0}
{'train_runtime': 1422.0743, 'train_samples_per_second': 175.8, 'train_steps_per_second': 2.75, 'train_loss': 0.13036981787523041, 'epoch': 10.0}


TrainOutput(global_step=3910, training_loss=0.13036981787523041, metrics={'train_runtime': 1422.0743, 'train_samples_per_second': 175.8, 'train_steps_per_second': 2.75, 'total_flos': 1.452763121811456e+16, 'train_loss': 0.13036981787523041, 'epoch': 10.0})

In [13]:
log = trainer.state.log_history
log

[{'loss': 0.3778,
  'grad_norm': 11.165544509887695,
  'learning_rate': 9.002557544757034e-06,
  'epoch': 1.0,
  'step': 391},
 {'eval_loss': 0.27316516637802124,
  'eval_accuracy': 0.88684,
  'eval_runtime': 36.717,
  'eval_samples_per_second': 680.884,
  'eval_steps_per_second': 10.649,
  'epoch': 1.0,
  'step': 391},
 {'loss': 0.2483,
  'grad_norm': 43.23914337158203,
  'learning_rate': 8.002557544757034e-06,
  'epoch': 2.0,
  'step': 782},
 {'eval_loss': 0.28208571672439575,
  'eval_accuracy': 0.88896,
  'eval_runtime': 36.1116,
  'eval_samples_per_second': 692.299,
  'eval_steps_per_second': 10.828,
  'epoch': 2.0,
  'step': 782},
 {'loss': 0.1894,
  'grad_norm': 20.37828254699707,
  'learning_rate': 7.002557544757034e-06,
  'epoch': 3.0,
  'step': 1173},
 {'eval_loss': 0.27936893701553345,
  'eval_accuracy': 0.8974,
  'eval_runtime': 36.5715,
  'eval_samples_per_second': 683.592,
  'eval_steps_per_second': 10.691,
  'epoch': 3.0,
  'step': 1173},
 {'loss': 0.1433,
  'grad_norm': 

## **Visualize Training Metrics**

In [14]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Extract metrics
epochs = []
eval_losses = []
eval_accuracies = []

for entry in log:
    if 'eval_loss' in entry:
        epochs.append(entry['epoch'])
        eval_losses.append(entry['eval_loss'])
        eval_accuracies.append(entry['eval_accuracy'])

# Create subplot with two y-axes
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=epochs, y=eval_losses, name="Evaluation Loss"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=epochs, y=eval_accuracies, name="Evaluation Accuracy"),
    secondary_y=True,
)

# Set x-axis title
fig.update_xaxes(title_text="Epoch")

# Set y-axes titles
fig.update_yaxes(title_text="Loss", secondary_y=False)
fig.update_yaxes(title_text="Accuracy", secondary_y=True)

fig.update_layout(
    title_text="Training Metrics over Epochs",
    hovermode='x unified'
)

fig.show()

## **Save Training Log**

In [15]:
import json
import os
from datetime import datetime

# Create a logs directory if it doesn't exist
os.makedirs('logs', exist_ok=True)

# Generate filename with timestamp
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_filename = f'logs/training_log_{timestamp}.json'

# Save the log
with open(log_filename, 'w') as f:
    json.dump(log, f, indent=4)

print(f'Training log saved to {log_filename}')

Training log saved to logs/training_log_20250420_212657.json
