## **Dataset**

In [1]:
# With small dataset: 50% full data
from datasets import load_dataset, concatenate_datasets

# Load IMDb dataset
dataset = load_dataset("imdb")
# rename label to labels
dataset = dataset.rename_column("label", "labels")
print(dataset)
# Tách riêng nhãn
train_dataset = dataset["train"]
positive = train_dataset.filter(lambda x: x["labels"] == 1)
negative = train_dataset.filter(lambda x: x["labels"] == 0)

# Lấy 50% (balanced): mỗi lớp lấy 12500 * 0.5 = 6250 mẫu
subset_pos = positive.shuffle(seed=42).select(range(6250))
subset_neg = negative.shuffle(seed=42).select(range(6250))

# Gộp lại và shuffle toàn bộ
balanced_subset = concatenate_datasets([subset_pos, subset_neg]).shuffle(seed=42)

# Thay vào tập train
dataset["train"] = balanced_subset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'labels'],
        num_rows: 50000
    })
})


In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 12500
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'labels'],
        num_rows: 50000
    })
})

## **Tokenizer**

In [3]:
from transformers import AutoTokenizer

model_name = "binhphap5/gpt-small-c4"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
# Tokenize function
def tokenize(example):
    return tokenizer(
        example["text"], truncation=True, max_length=512
    )

# Tokenize full dataset
tokenized_ds = dataset.map(tokenize, batched=True)
tokenized_ds = tokenized_ds.remove_columns(["text"])
tokenized_ds.set_format("torch")

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:
# data collator
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

In [6]:
tokenizer.pad_token_id

1

## **Model**

In [7]:
from transformers import AutoModelForSequenceClassification

id2label = {0: 'Negative', 1: 'Positive'}
label2id = {'Negative': 0, 'Positive': 1}

model = AutoModelForSequenceClassification.from_pretrained(
    "binhphap5/gpt-small-c4",
    num_labels=2, id2label=id2label, label2id=label2id
)
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at binhphap5/gpt-small-c4 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(100000, 512)
    (wpe): Embedding(512, 512)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=512, out_features=2, bias=False)
)

In [9]:
# forward pass
import torch

model
model.eval()
inputs = tokenized_ds["train"][0]['input_ids'].unsqueeze(0)  # Add batch dimension
with torch.no_grad():
    outputs = model(inputs)
print(outputs)

SequenceClassifierOutputWithPast(loss=None, logits=tensor([[-1.1940, -0.6812]]), past_key_values=((tensor([[[[-0.1308,  0.1069,  0.3249,  ..., -0.2846,  0.1199, -1.3680],
          [-0.7837, -1.0238,  0.3990,  ...,  1.2165, -0.6532, -1.6430],
          [ 1.0589, -1.8017,  1.1344,  ...,  1.9712,  0.7138, -2.4735],
          ...,
          [ 0.4745, -0.7296,  0.7905,  ...,  0.6624,  0.4398, -1.7292],
          [ 0.1973,  0.0286, -0.5627,  ..., -0.8715, -0.4840,  0.4927],
          [-0.4916, -1.3149,  0.8568,  ...,  1.8119,  0.3936, -2.3611]],

         [[ 0.1239, -1.5156, -0.9522,  ...,  1.0415,  1.6890, -0.7142],
          [ 0.3164,  0.1083, -0.8988,  ..., -0.2610,  0.8030,  0.0190],
          [ 0.3050,  0.3035, -1.5784,  ..., -0.3005, -0.8333, -1.0332],
          ...,
          [ 0.6190, -0.5242, -0.4804,  ..., -0.3217, -0.3617,  0.0390],
          [-0.1187, -0.4420,  0.0983,  ...,  0.0218,  0.4178,  1.3835],
          [ 1.4370, -1.4236, -1.1639,  ...,  0.1105,  0.1519, -0.1020]],

   

## **Training**

In [10]:
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return accuracy.compute(predictions=preds, references=labels)

In [11]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="imdb2-small-gpt2-small",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_total_limit=1,
    fp16=True,
)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/1960 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 0.4358, 'grad_norm': 41.559959411621094, 'learning_rate': 9.005102040816327e-06, 'epoch': 1.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.31055134534835815, 'eval_accuracy': 0.86876, 'eval_runtime': 36.854, 'eval_samples_per_second': 678.352, 'eval_steps_per_second': 10.609, 'epoch': 1.0}
{'loss': 0.2827, 'grad_norm': 26.93545150756836, 'learning_rate': 8.005102040816327e-06, 'epoch': 2.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.29730936884880066, 'eval_accuracy': 0.8736, 'eval_runtime': 38.518, 'eval_samples_per_second': 649.047, 'eval_steps_per_second': 10.151, 'epoch': 2.0}
{'loss': 0.2133, 'grad_norm': 4.854419231414795, 'learning_rate': 7.005102040816327e-06, 'epoch': 3.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.29191502928733826, 'eval_accuracy': 0.88896, 'eval_runtime': 44.8296, 'eval_samples_per_second': 557.668, 'eval_steps_per_second': 8.722, 'epoch': 3.0}
{'loss': 0.1589, 'grad_norm': 21.91950225830078, 'learning_rate': 6.005102040816327e-06, 'epoch': 4.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.30126485228538513, 'eval_accuracy': 0.88864, 'eval_runtime': 36.9126, 'eval_samples_per_second': 677.276, 'eval_steps_per_second': 10.593, 'epoch': 4.0}
{'loss': 0.1202, 'grad_norm': 24.916946411132812, 'learning_rate': 5.005102040816327e-06, 'epoch': 5.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.3328348696231842, 'eval_accuracy': 0.89052, 'eval_runtime': 37.072, 'eval_samples_per_second': 674.364, 'eval_steps_per_second': 10.547, 'epoch': 5.0}
{'loss': 0.0836, 'grad_norm': 36.7038459777832, 'learning_rate': 4.005102040816327e-06, 'epoch': 6.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.36387917399406433, 'eval_accuracy': 0.88988, 'eval_runtime': 43.4565, 'eval_samples_per_second': 575.287, 'eval_steps_per_second': 8.997, 'epoch': 6.0}
{'loss': 0.063, 'grad_norm': 28.68521499633789, 'learning_rate': 3.0102040816326534e-06, 'epoch': 7.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.4230346381664276, 'eval_accuracy': 0.8912, 'eval_runtime': 41.9991, 'eval_samples_per_second': 595.251, 'eval_steps_per_second': 9.31, 'epoch': 7.0}
{'loss': 0.0446, 'grad_norm': 16.597305297851562, 'learning_rate': 2.0102040816326533e-06, 'epoch': 8.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.49091866612434387, 'eval_accuracy': 0.88852, 'eval_runtime': 42.2716, 'eval_samples_per_second': 591.414, 'eval_steps_per_second': 9.25, 'epoch': 8.0}
{'loss': 0.0367, 'grad_norm': 2.623213052749634, 'learning_rate': 1.010204081632653e-06, 'epoch': 9.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.5251381993293762, 'eval_accuracy': 0.88932, 'eval_runtime': 36.7923, 'eval_samples_per_second': 679.49, 'eval_steps_per_second': 10.627, 'epoch': 9.0}
{'loss': 0.0271, 'grad_norm': 0.3470457196235657, 'learning_rate': 1.0204081632653063e-08, 'epoch': 10.0}


  0%|          | 0/391 [00:00<?, ?it/s]

{'eval_loss': 0.5393058061599731, 'eval_accuracy': 0.88904, 'eval_runtime': 36.6162, 'eval_samples_per_second': 682.758, 'eval_steps_per_second': 10.678, 'epoch': 10.0}
{'train_runtime': 935.5903, 'train_samples_per_second': 133.605, 'train_steps_per_second': 2.095, 'train_loss': 0.1466042263167245, 'epoch': 10.0}


TrainOutput(global_step=1960, training_loss=0.1466042263167245, metrics={'train_runtime': 935.5903, 'train_samples_per_second': 133.605, 'train_steps_per_second': 2.095, 'total_flos': 7263348450828288.0, 'train_loss': 0.1466042263167245, 'epoch': 10.0})

In [13]:
log = trainer.state.log_history
log

[{'loss': 0.4358,
  'grad_norm': 41.559959411621094,
  'learning_rate': 9.005102040816327e-06,
  'epoch': 1.0,
  'step': 196},
 {'eval_loss': 0.31055134534835815,
  'eval_accuracy': 0.86876,
  'eval_runtime': 36.854,
  'eval_samples_per_second': 678.352,
  'eval_steps_per_second': 10.609,
  'epoch': 1.0,
  'step': 196},
 {'loss': 0.2827,
  'grad_norm': 26.93545150756836,
  'learning_rate': 8.005102040816327e-06,
  'epoch': 2.0,
  'step': 392},
 {'eval_loss': 0.29730936884880066,
  'eval_accuracy': 0.8736,
  'eval_runtime': 38.518,
  'eval_samples_per_second': 649.047,
  'eval_steps_per_second': 10.151,
  'epoch': 2.0,
  'step': 392},
 {'loss': 0.2133,
  'grad_norm': 4.854419231414795,
  'learning_rate': 7.005102040816327e-06,
  'epoch': 3.0,
  'step': 588},
 {'eval_loss': 0.29191502928733826,
  'eval_accuracy': 0.88896,
  'eval_runtime': 44.8296,
  'eval_samples_per_second': 557.668,
  'eval_steps_per_second': 8.722,
  'epoch': 3.0,
  'step': 588},
 {'loss': 0.1589,
  'grad_norm': 21.9

## **Visualize Training Metrics**

In [14]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Extract metrics
epochs = []
eval_losses = []
eval_accuracies = []

for entry in log:
    if 'eval_loss' in entry:
        epochs.append(entry['epoch'])
        eval_losses.append(entry['eval_loss'])
        eval_accuracies.append(entry['eval_accuracy'])

# Create subplot with two y-axes
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=epochs, y=eval_losses, name="Evaluation Loss"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=epochs, y=eval_accuracies, name="Evaluation Accuracy"),
    secondary_y=True,
)

# Set x-axis title
fig.update_xaxes(title_text="Epoch")

# Set y-axes titles
fig.update_yaxes(title_text="Loss", secondary_y=False)
fig.update_yaxes(title_text="Accuracy", secondary_y=True)

fig.update_layout(
    title_text="Training Metrics over Epochs",
    hovermode='x unified'
)

fig.show()

## **Save Training Log**

In [15]:
import json
import os
from datetime import datetime

# Create a logs directory if it doesn't exist
os.makedirs('logs', exist_ok=True)

# Generate filename with timestamp
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_filename = f'logs/training_log_{timestamp}.json'

# Save the log
with open(log_filename, 'w') as f:
    json.dump(log, f, indent=4)

print(f'Training log saved to {log_filename}')

Training log saved to logs/training_log_20250420_201226.json
