## **Dataset**

In [1]:
# With small dataset: 50% full data
from datasets import load_dataset, concatenate_datasets

# Load IMDb dataset
dataset = load_dataset("imdb")
# rename label to labels
dataset = dataset.rename_column("label", "labels")
print(dataset)
# Tách riêng nhãn
train_dataset = dataset["train"]
positive = train_dataset.filter(lambda x: x["labels"] == 1)
negative = train_dataset.filter(lambda x: x["labels"] == 0)

# Lấy 50% (balanced): mỗi lớp lấy 12500 * 0.5 = 6250 mẫu
subset_pos = positive.shuffle(seed=42).select(range(6250))
subset_neg = negative.shuffle(seed=42).select(range(6250))

# Gộp lại và shuffle toàn bộ
balanced_subset = concatenate_datasets([subset_pos, subset_neg]).shuffle(seed=42)

# Thay vào tập train
dataset["train"] = balanced_subset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'labels'],
        num_rows: 50000
    })
})


In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 12500
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'labels'],
        num_rows: 50000
    })
})

## **Tokenizer**

In [3]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.normalizers import NFKC
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

# Initialize BPE tokenizer
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.normalizer = NFKC()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(
    vocab_size=100_000,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
)

tokenizer.train_from_iterator(dataset["train"]["text"], trainer)
tokenizer.save("gpt_tokenizer.json")

In [4]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(tokenizer_file="gpt_tokenizer.json")
tokenizer.add_special_tokens({
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>",
})

tokenizer.save_pretrained("gpt-tokenizer")



('gpt-tokenizer\\tokenizer_config.json',
 'gpt-tokenizer\\special_tokens_map.json',
 'gpt-tokenizer\\tokenizer.json')

In [5]:
# Tokenize function
MAX_LENGTH = 512

def tokenize(example):
    return tokenizer(example["text"], truncation=True, max_length=MAX_LENGTH)

tokenized_ds = dataset.map(
    tokenize, batched=True
)
tokenized_ds = tokenized_ds.remove_columns(["text"])
tokenized_ds.set_format("torch")

In [6]:
# data collator
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

In [7]:
tokenizer.pad_token_id

1

## **Model**

In [8]:
from transformers import GPT2Config, GPT2ForSequenceClassification

id2label = {0: 'Negative', 1: 'Positive'}
label2id = {'Negative': 0, 'Positive': 1}

config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=MAX_LENGTH,
    n_embd=512,
    n_layer=6,
    n_head=8,
    num_labels=2,
    haha=id2label,
    label2id=label2id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

model = GPT2ForSequenceClassification(config)
model.config.pad_token_id = tokenizer.pad_token_id

In [10]:
model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(100000, 512)
    (wpe): Embedding(512, 512)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=512, out_features=2, bias=False)
)

In [12]:
# forward pass
import torch

model
model.eval()
inputs = tokenized_ds["train"][0]['input_ids'].unsqueeze(0)  # Add batch dimension
with torch.no_grad():
    outputs = model(inputs)
print(outputs)

SequenceClassifierOutputWithPast(loss=None, logits=tensor([[-0.0547, -0.8340]]), past_key_values=((tensor([[[[-0.2233,  0.2127,  0.3309,  ...,  0.9154,  0.5481,  0.0409],
          [ 0.0654, -0.1480,  0.2067,  ...,  0.9718,  0.4507, -0.7858],
          [ 0.1334, -0.0690,  0.3539,  ...,  0.3917,  0.6559,  0.4890],
          ...,
          [ 0.5805,  0.3378,  0.0202,  ..., -0.2326,  0.1182, -0.4077],
          [-0.5227, -0.1525, -0.0203,  ...,  0.4222, -0.6786, -0.4448],
          [-0.2692, -0.1326,  0.9521,  ..., -0.6867,  0.2072,  0.6282]],

         [[-0.4947,  0.5100, -0.4459,  ..., -1.0064, -0.0599, -0.2149],
          [-0.3767,  0.6193,  0.1077,  ...,  0.0618,  0.0299, -0.2817],
          [-0.2031,  0.6366,  0.8675,  ..., -0.0696, -0.0331, -0.2904],
          ...,
          [ 0.0515,  0.4890,  0.1167,  ...,  0.0982, -0.5603,  0.1197],
          [-0.8670,  0.3733, -0.2511,  ..., -0.3418,  0.1098,  0.1869],
          [-0.7320,  0.0771,  0.2766,  ..., -0.6837, -0.1786, -0.5623]],

   

## **Training**

In [13]:
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return accuracy.compute(predictions=preds, references=labels)

In [14]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="imdb2-small-gpt2-small",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_total_limit=1,
    fp16=True,
)

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

trainer.train()

  0%|          | 0/1960 [00:00<?, ?it/s]

{'loss': 0.7221, 'grad_norm': 50.928245544433594, 'learning_rate': 4.502551020408164e-05, 'epoch': 1.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.4920104444026947, 'eval_accuracy': 0.76048, 'eval_runtime': 36.3235, 'eval_samples_per_second': 688.26, 'eval_steps_per_second': 10.764, 'epoch': 1.0}
{'loss': 0.3909, 'grad_norm': 106.07376098632812, 'learning_rate': 4.002551020408164e-05, 'epoch': 2.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.3664415180683136, 'eval_accuracy': 0.84628, 'eval_runtime': 36.701, 'eval_samples_per_second': 681.181, 'eval_steps_per_second': 10.654, 'epoch': 2.0}
{'loss': 0.2224, 'grad_norm': 30.860477447509766, 'learning_rate': 3.502551020408164e-05, 'epoch': 3.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.3858794569969177, 'eval_accuracy': 0.84376, 'eval_runtime': 36.8485, 'eval_samples_per_second': 678.454, 'eval_steps_per_second': 10.611, 'epoch': 3.0}
{'loss': 0.1408, 'grad_norm': 58.14004898071289, 'learning_rate': 3.0025510204081635e-05, 'epoch': 4.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.5420794486999512, 'eval_accuracy': 0.85892, 'eval_runtime': 37.5134, 'eval_samples_per_second': 666.428, 'eval_steps_per_second': 10.423, 'epoch': 4.0}
{'loss': 0.0968, 'grad_norm': 11.716331481933594, 'learning_rate': 2.502551020408163e-05, 'epoch': 5.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.5041273832321167, 'eval_accuracy': 0.84844, 'eval_runtime': 43.8361, 'eval_samples_per_second': 570.306, 'eval_steps_per_second': 8.92, 'epoch': 5.0}
{'loss': 0.0563, 'grad_norm': 0.2504132390022278, 'learning_rate': 2.0025510204081632e-05, 'epoch': 6.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.765141487121582, 'eval_accuracy': 0.8588, 'eval_runtime': 44.5933, 'eval_samples_per_second': 560.623, 'eval_steps_per_second': 8.768, 'epoch': 6.0}
{'loss': 0.0556, 'grad_norm': 0.016125699505209923, 'learning_rate': 1.5076530612244898e-05, 'epoch': 7.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.7957320213317871, 'eval_accuracy': 0.85408, 'eval_runtime': 36.6845, 'eval_samples_per_second': 681.487, 'eval_steps_per_second': 10.658, 'epoch': 7.0}
{'loss': 0.0233, 'grad_norm': 0.24969352781772614, 'learning_rate': 1.0076530612244899e-05, 'epoch': 8.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.9121775031089783, 'eval_accuracy': 0.85164, 'eval_runtime': 35.307, 'eval_samples_per_second': 708.075, 'eval_steps_per_second': 11.074, 'epoch': 8.0}
{'loss': 0.0149, 'grad_norm': 0.04063006490468979, 'learning_rate': 5.076530612244898e-06, 'epoch': 9.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.8862590789794922, 'eval_accuracy': 0.85652, 'eval_runtime': 35.5185, 'eval_samples_per_second': 703.858, 'eval_steps_per_second': 11.008, 'epoch': 9.0}
{'loss': 0.0118, 'grad_norm': 0.03294048458337784, 'learning_rate': 7.653061224489796e-08, 'epoch': 10.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.9772167801856995, 'eval_accuracy': 0.8542, 'eval_runtime': 35.3112, 'eval_samples_per_second': 707.991, 'eval_steps_per_second': 11.073, 'epoch': 10.0}
{'train_runtime': 913.7683, 'train_samples_per_second': 136.796, 'train_steps_per_second': 2.145, 'train_loss': 0.17351058794527638, 'epoch': 10.0}


TrainOutput(global_step=1960, training_loss=0.17351058794527638, metrics={'train_runtime': 913.7683, 'train_samples_per_second': 136.796, 'train_steps_per_second': 2.145, 'total_flos': 7263183651569664.0, 'train_loss': 0.17351058794527638, 'epoch': 10.0})

In [16]:
log = trainer.state.log_history
log

[{'loss': 0.7221,
  'grad_norm': 50.928245544433594,
  'learning_rate': 4.502551020408164e-05,
  'epoch': 1.0,
  'step': 196},
 {'eval_loss': 0.4920104444026947,
  'eval_accuracy': 0.76048,
  'eval_runtime': 36.3235,
  'eval_samples_per_second': 688.26,
  'eval_steps_per_second': 10.764,
  'epoch': 1.0,
  'step': 196},
 {'loss': 0.3909,
  'grad_norm': 106.07376098632812,
  'learning_rate': 4.002551020408164e-05,
  'epoch': 2.0,
  'step': 392},
 {'eval_loss': 0.3664415180683136,
  'eval_accuracy': 0.84628,
  'eval_runtime': 36.701,
  'eval_samples_per_second': 681.181,
  'eval_steps_per_second': 10.654,
  'epoch': 2.0,
  'step': 392},
 {'loss': 0.2224,
  'grad_norm': 30.860477447509766,
  'learning_rate': 3.502551020408164e-05,
  'epoch': 3.0,
  'step': 588},
 {'eval_loss': 0.3858794569969177,
  'eval_accuracy': 0.84376,
  'eval_runtime': 36.8485,
  'eval_samples_per_second': 678.454,
  'eval_steps_per_second': 10.611,
  'epoch': 3.0,
  'step': 588},
 {'loss': 0.1408,
  'grad_norm': 58.

## **Visualize Training Metrics**

In [17]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Extract metrics
epochs = []
eval_losses = []
eval_accuracies = []

for entry in log:
    if 'eval_loss' in entry:
        epochs.append(entry['epoch'])
        eval_losses.append(entry['eval_loss'])
        eval_accuracies.append(entry['eval_accuracy'])

# Create subplot with two y-axes
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=epochs, y=eval_losses, name="Evaluation Loss"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=epochs, y=eval_accuracies, name="Evaluation Accuracy"),
    secondary_y=True,
)

# Set x-axis title
fig.update_xaxes(title_text="Epoch")

# Set y-axes titles
fig.update_yaxes(title_text="Loss", secondary_y=False)
fig.update_yaxes(title_text="Accuracy", secondary_y=True)

fig.update_layout(
    title_text="Training Metrics over Epochs",
    hovermode='x unified'
)

fig.show()

## **Save Training Log**

In [18]:
import json
import os
from datetime import datetime

# Create a logs directory if it doesn't exist
os.makedirs('logs', exist_ok=True)

# Generate filename with timestamp
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_filename = f'logs/training_log_{timestamp}.json'

# Save the log
with open(log_filename, 'w') as f:
    json.dump(log, f, indent=4)

print(f'Training log saved to {log_filename}')

Training log saved to logs/training_log_20250421_015254.json
