In [1]:
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from pathlib import Path
import os
os.environ["WANDB_DISABLED"] = "true"

model_name   = "cahya/gpt2-small-indonesian-522M"
data_file    = Path("data/data_komen_preprocessed.txt")   # ‚Üê your dataset
output_dir   = Path("finetuned-gpt2/finetuned-gpt2-IGkomen")

# Make sure the dataset exists
assert data_file.exists(), f"File not found: {data_file}"





In [2]:
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # suppress padding warnings

model = GPT2LMHeadModel.from_pretrained(model_name)

In [3]:
# Each line in the txt is treated as contiguous text;
# block_size defines how many tokens are packed together.
block_size = 64  # Increase to 128/256 if VRAM allows

train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=str(data_file),
    block_size=block_size,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,      # causal LM (GPT‚Äëstyle) ‚Üí NOT masked
)
print(f"Loaded {len(train_dataset)} training blocks.")



Loaded 198 training blocks.


In [4]:
training_args = TrainingArguments(
    output_dir=str(output_dir),
    overwrite_output_dir=True,
    num_train_epochs=3,            # try 1 first if you‚Äôre GPU‚Äëlimited
    per_device_train_batch_size=4, # drop to 2 if OOM
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    prediction_loss_only=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,5.2352


In [None]:
output_dir.mkdir(exist_ok=True, parents=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir.resolve()}")


In [None]:
from transformers import pipeline, set_seed
set_seed(42)

generator = pipeline(
    "text-generation",
    model=str(output_dir),
    tokenizer=str(output_dir),
)

prompt = "yaelah netizen"
sample  = generator(
    prompt,
    max_length=40,
    do_sample=True,
    temperature=1.2,
    top_p=0.95,
)
print(sample[0]["generated_text"])
