In [None]:
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from pathlib import Path
import os

# Switch off the Weights & Biases integration for this session so that
# training does not try to log to (or prompt for) a W&B account.
os.environ.update({"WANDB_DISABLED": "true"})

# ✅ Step 1: Load and prepare dataset
import pandas as pd

input_path = "data/dataset_FND.csv"
output_txt_path = "data/finetuner/dataset_FND_text_only.txt"

# Read the comma-separated source file.
df = pd.read_csv(input_path, sep=",")

# Fail early with a readable message when the expected column is absent
# (same exception type a bare df["Text"] lookup would raise).
if "Text" not in df.columns:
    raise KeyError(
        f"Column 'Text' not found in {input_path}; "
        f"available columns: {list(df.columns)}"
    )

# Keep only usable rows of the 'Text' column:
#   * missing values are dropped instead of becoming blank lines,
#   * embedded line breaks are folded into a single space so that every
#     record occupies exactly one line of the output file,
#   * rows that are empty after trimming are discarded.
texts = (
    df["Text"]
    .dropna()
    .astype(str)
    .str.replace(r"\s*[\r\n]+\s*", " ", regex=True)
    .str.strip()
)
texts = texts[texts != ""]

# The target folder is not guaranteed to exist on a fresh checkout.
Path(output_txt_path).parent.mkdir(parents=True, exist_ok=True)

# Write the lines directly rather than through Series.to_csv: the CSV writer
# wraps any value containing a comma, quote or newline in double quotes (and
# doubles inner quotes), which would leak CSV syntax into the training text.
with open(output_txt_path, "w", encoding="utf-8") as txt_file:
    txt_file.writelines(f"{line}\n" for line in texts)

print(f"Saved cleaned training text to: {output_txt_path}")




KeyboardInterrupt

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x0000022FC74B4F90>>
Traceback (most recent call last):
  File "C:\Users\MASTER CORE\.conda\envs\RALBERT\Lib\site-packages\ipykernel\ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 

KeyboardInterrupt



In [None]:
# Base checkpoint to fine-tune.
model_name = "cahya/gpt2-small-indonesian-522M"

# Plain-text training corpus (← your dataset).
data_file = Path("data") / "finetuner" / "dataset_FND_text_only.txt"

# Destination for checkpoints and the final fine-tuned model.
output_dir = Path("finetuned-gpt2", "finetuned-gpt2-FND")

# Stop right here if the corpus has not been generated yet.
assert data_file.exists(), f"File not found: {data_file}"

In [None]:
# Pretrained causal-LM weights for the chosen checkpoint.
model = GPT2LMHeadModel.from_pretrained(model_name)

# Matching tokenizer; the end-of-sequence token is reused as the padding
# token, which suppresses padding warnings.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# The text file is consumed as contiguous text and packed into blocks of
# `block_size` tokens. Raise to 128/256 if VRAM allows.
block_size = 64

dataset_options = {
    "tokenizer": tokenizer,
    "file_path": str(data_file),
    "block_size": block_size,
}
train_dataset = TextDataset(**dataset_options)

# mlm=False selects the causal (GPT-style) objective, i.e. NOT masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

print(f"Loaded {len(train_dataset)} training blocks.")

In [None]:
# All tunable training options in one mapping, then handed to
# TrainingArguments in a single call.
training_options = {
    "output_dir": str(output_dir),
    "overwrite_output_dir": True,
    "num_train_epochs": 3,             # try 1 first if you're GPU-limited
    "per_device_train_batch_size": 4,  # drop to 2 if OOM
    "save_steps": 500,
    "save_total_limit": 2,
    "logging_steps": 100,
    "prediction_loss_only": True,
}
training_args = TrainingArguments(**training_options)

In [None]:
# Wire the model, the packed dataset, the causal-LM collator and the
# training options together.
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    data_collator=data_collator,
    args=training_args,
)

# Run fine-tuning; left as the last expression so the notebook shows the result.
trainer.train()

In [None]:
# Ensure the destination exists, then write the model followed by the
# tokenizer into the same folder.
output_dir.mkdir(parents=True, exist_ok=True)
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f"Model saved to {output_dir.resolve()}")


In [None]:
from transformers import pipeline, set_seed

# Fixed seed so the sampled continuation is repeatable.
set_seed(42)

# Load both model and tokenizer back from the saved fine-tuned folder.
saved_model_dir = str(output_dir)
generator = pipeline(
    "text-generation",
    model=saved_model_dir,
    tokenizer=saved_model_dir,
)

prompt = "yaelah netizen"

# Sampling configuration passed to the generation call.
sampling_options = {
    "max_length": 200,
    "do_sample": True,
    "temperature": 1.2,
    "top_p": 0.95,
}
sample = generator(prompt, **sampling_options)
print(sample[0]["generated_text"])
