## **Dataset**

In [1]:
# With small dataset: 50% full data
from datasets import load_dataset

# Load IMDb dataset
dataset = load_dataset("imdb")
# rename label to labels
dataset = dataset.rename_column("label", "labels")

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'labels'],
        num_rows: 50000
    })
})

## **Tokenizer**

In [3]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.normalizers import NFKC
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

# Initialize BPE tokenizer
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.normalizer = NFKC()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(
    vocab_size=100_000,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
)

tokenizer.train_from_iterator(dataset["train"]["text"], trainer)
tokenizer.save("gpt_tokenizer.json")

In [4]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(tokenizer_file="gpt_tokenizer.json")
tokenizer.add_special_tokens({
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>",
})

tokenizer.save_pretrained("gpt-tokenizer")



('gpt-tokenizer\\tokenizer_config.json',
 'gpt-tokenizer\\special_tokens_map.json',
 'gpt-tokenizer\\tokenizer.json')

In [5]:
# Tokenize function
MAX_LENGTH = 512

def tokenize(example):
    return tokenizer(example["text"], truncation=True, max_length=MAX_LENGTH)

tokenized_ds = dataset.map(
    tokenize, batched=True
)
tokenized_ds = tokenized_ds.remove_columns(["text"])
tokenized_ds.set_format("torch")

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [6]:
# data collator
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

In [7]:
tokenizer.pad_token_id

1

## **Model**

In [8]:
from transformers import GPT2Config, GPT2ForSequenceClassification

id2label = {0: 'Negative', 1: 'Positive'}
label2id = {'Negative': 0, 'Positive': 1}

config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=MAX_LENGTH,
    n_embd=512,
    n_layer=6,
    n_head=8,
    num_labels=2,
    haha=id2label,
    label2id=label2id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

model = GPT2ForSequenceClassification(config)
model.config.pad_token_id = tokenizer.pad_token_id

In [9]:
model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(100000, 512)
    (wpe): Embedding(512, 512)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=512, out_features=2, bias=False)
)

In [10]:
# forward pass
import torch

model
model.eval()
inputs = tokenized_ds["train"][0]['input_ids'].unsqueeze(0)  # Add batch dimension
with torch.no_grad():
    outputs = model(inputs)
print(outputs)

SequenceClassifierOutputWithPast(loss=None, logits=tensor([[0.2615, 0.2846]]), past_key_values=((tensor([[[[-0.3040,  0.2290, -0.1085,  ...,  0.3097, -0.6714,  0.3411],
          [-0.2150, -1.0689,  0.7427,  ...,  0.1514,  0.3218, -0.2961],
          [ 0.7309,  0.4143,  0.1799,  ...,  0.5573, -0.1998, -0.0141],
          ...,
          [ 0.1807, -0.0502, -0.2924,  ...,  0.2674,  0.5167,  0.3025],
          [-0.7637, -0.2971, -0.2623,  ...,  0.2479,  0.5016,  0.4805],
          [-0.1259,  0.2085, -0.0356,  ..., -0.1224, -0.0296, -0.0432]],

         [[-0.6074, -0.1605, -1.1309,  ...,  0.1464,  0.2645,  0.6637],
          [ 0.1508,  0.4145, -0.1393,  ...,  0.2592, -0.1940, -0.0610],
          [ 0.3143, -0.2252, -1.4627,  ..., -0.4961,  0.7226,  0.4839],
          ...,
          [-0.0255,  0.1705,  0.1320,  ..., -0.0258,  0.4380, -0.2093],
          [ 0.0239,  0.5075,  0.2427,  ..., -0.0450, -0.8513,  0.0474],
          [-0.1410, -0.3488,  0.3815,  ...,  0.3930, -0.1121,  0.3978]],

     

## **Training**

In [11]:
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return accuracy.compute(predictions=preds, references=labels)

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="imdb2-full-gpt2-small",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_total_limit=1,
    fp16=True,
)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/3910 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 0.5892, 'grad_norm': 32.99339294433594, 'learning_rate': 4.501278772378517e-05, 'epoch': 1.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.5043091177940369, 'eval_accuracy': 0.767, 'eval_runtime': 42.665, 'eval_samples_per_second': 585.96, 'eval_steps_per_second': 9.164, 'epoch': 1.0}
{'loss': 0.2838, 'grad_norm': 32.96120834350586, 'learning_rate': 4.001278772378517e-05, 'epoch': 2.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.3074481189250946, 'eval_accuracy': 0.86896, 'eval_runtime': 42.5533, 'eval_samples_per_second': 587.498, 'eval_steps_per_second': 9.188, 'epoch': 2.0}
{'loss': 0.174, 'grad_norm': 16.529510498046875, 'learning_rate': 3.501278772378517e-05, 'epoch': 3.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.31480881571769714, 'eval_accuracy': 0.8716, 'eval_runtime': 43.5217, 'eval_samples_per_second': 574.426, 'eval_steps_per_second': 8.984, 'epoch': 3.0}
{'loss': 0.1159, 'grad_norm': 38.455718994140625, 'learning_rate': 3.0012787723785167e-05, 'epoch': 4.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.4323212802410126, 'eval_accuracy': 0.85916, 'eval_runtime': 43.4276, 'eval_samples_per_second': 575.671, 'eval_steps_per_second': 9.003, 'epoch': 4.0}
{'loss': 0.0737, 'grad_norm': 1.9497218132019043, 'learning_rate': 2.501278772378517e-05, 'epoch': 5.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.5753242373466492, 'eval_accuracy': 0.86288, 'eval_runtime': 36.931, 'eval_samples_per_second': 676.937, 'eval_steps_per_second': 10.587, 'epoch': 5.0}
{'loss': 0.0501, 'grad_norm': 56.89706039428711, 'learning_rate': 2.0025575447570334e-05, 'epoch': 6.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.7401742339134216, 'eval_accuracy': 0.86068, 'eval_runtime': 36.5743, 'eval_samples_per_second': 683.54, 'eval_steps_per_second': 10.691, 'epoch': 6.0}
{'loss': 0.0309, 'grad_norm': 97.20088958740234, 'learning_rate': 1.5025575447570333e-05, 'epoch': 7.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.8668599128723145, 'eval_accuracy': 0.85644, 'eval_runtime': 36.7408, 'eval_samples_per_second': 680.442, 'eval_steps_per_second': 10.642, 'epoch': 7.0}
{'loss': 0.0238, 'grad_norm': 0.18740540742874146, 'learning_rate': 1.0025575447570333e-05, 'epoch': 8.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.8530956506729126, 'eval_accuracy': 0.85888, 'eval_runtime': 36.2921, 'eval_samples_per_second': 688.854, 'eval_steps_per_second': 10.774, 'epoch': 8.0}
{'loss': 0.0139, 'grad_norm': 0.11805669218301773, 'learning_rate': 5.025575447570333e-06, 'epoch': 9.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.9873777627944946, 'eval_accuracy': 0.85424, 'eval_runtime': 36.3041, 'eval_samples_per_second': 688.628, 'eval_steps_per_second': 10.77, 'epoch': 9.0}
{'loss': 0.0101, 'grad_norm': 0.1803833395242691, 'learning_rate': 2.557544757033248e-08, 'epoch': 10.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 1.0260963439941406, 'eval_accuracy': 0.85448, 'eval_runtime': 36.4065, 'eval_samples_per_second': 686.691, 'eval_steps_per_second': 10.74, 'epoch': 10.0}
{'train_runtime': 1459.6485, 'train_samples_per_second': 171.274, 'train_steps_per_second': 2.679, 'train_loss': 0.13654282489396116, 'epoch': 10.0}


TrainOutput(global_step=3910, training_loss=0.13654282489396116, metrics={'train_runtime': 1459.6485, 'train_samples_per_second': 171.274, 'train_steps_per_second': 2.679, 'total_flos': 1.452757219909632e+16, 'train_loss': 0.13654282489396116, 'epoch': 10.0})

In [14]:
log = trainer.state.log_history
log

[{'loss': 0.5892,
  'grad_norm': 32.99339294433594,
  'learning_rate': 4.501278772378517e-05,
  'epoch': 1.0,
  'step': 391},
 {'eval_loss': 0.5043091177940369,
  'eval_accuracy': 0.767,
  'eval_runtime': 42.665,
  'eval_samples_per_second': 585.96,
  'eval_steps_per_second': 9.164,
  'epoch': 1.0,
  'step': 391},
 {'loss': 0.2838,
  'grad_norm': 32.96120834350586,
  'learning_rate': 4.001278772378517e-05,
  'epoch': 2.0,
  'step': 782},
 {'eval_loss': 0.3074481189250946,
  'eval_accuracy': 0.86896,
  'eval_runtime': 42.5533,
  'eval_samples_per_second': 587.498,
  'eval_steps_per_second': 9.188,
  'epoch': 2.0,
  'step': 782},
 {'loss': 0.174,
  'grad_norm': 16.529510498046875,
  'learning_rate': 3.501278772378517e-05,
  'epoch': 3.0,
  'step': 1173},
 {'eval_loss': 0.31480881571769714,
  'eval_accuracy': 0.8716,
  'eval_runtime': 43.5217,
  'eval_samples_per_second': 574.426,
  'eval_steps_per_second': 8.984,
  'epoch': 3.0,
  'step': 1173},
 {'loss': 0.1159,
  'grad_norm': 38.455718

## **Visualize Training Metrics**

In [15]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Extract metrics
epochs = []
eval_losses = []
eval_accuracies = []

for entry in log:
    if 'eval_loss' in entry:
        epochs.append(entry['epoch'])
        eval_losses.append(entry['eval_loss'])
        eval_accuracies.append(entry['eval_accuracy'])

# Create subplot with two y-axes
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=epochs, y=eval_losses, name="Evaluation Loss"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=epochs, y=eval_accuracies, name="Evaluation Accuracy"),
    secondary_y=True,
)

# Set x-axis title
fig.update_xaxes(title_text="Epoch")

# Set y-axes titles
fig.update_yaxes(title_text="Loss", secondary_y=False)
fig.update_yaxes(title_text="Accuracy", secondary_y=True)

fig.update_layout(
    title_text="Training Metrics over Epochs",
    hovermode='x unified'
)

fig.show()

## **Save Training Log**

In [16]:
import json
import os
from datetime import datetime

# Create a logs directory if it doesn't exist
os.makedirs('logs', exist_ok=True)

# Generate filename with timestamp
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_filename = f'logs/training_log_{timestamp}.json'

# Save the log
with open(log_filename, 'w') as f:
    json.dump(log, f, indent=4)

print(f'Training log saved to {log_filename}')

Training log saved to logs/training_log_20250420_204009.json
