In [None]:
!pip install -q transformers datasets torch accelerate

In [None]:
import inspect
import json
import math
import warnings

import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
model_name = "HuggingFaceTB/SmolLM-135M"

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the EOS token as the padding token.
# NOTE(review): the original note said SmolLM "defines" a PAD token; this
# assignment overrides whatever the checkpoint ships — confirm that is intended.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

In [None]:
# Domain-specific data.
# Imagine the model has never seen this text before.

JSON_PATH = "sk_wondr.json"


def _iter_document_texts(document):
    """Yield candidate training strings from the parsed JSON, in document order.

    For each entry of ``document["sections"]`` this yields its ``"title"``,
    then, for each of its ``"clauses"``, the clause ``"text"`` followed by the
    ``"text"`` of each of its ``"subclauses"``. Missing keys and missing/null
    lists are skipped.
    """
    for section in document.get("sections") or []:
        if "title" in section:
            yield section["title"]
        for clause in section.get("clauses") or []:
            if "text" in clause:
                yield clause["text"]
            for sub in clause.get("subclauses") or []:
                if "text" in sub:
                    yield sub["text"]


with open(JSON_PATH, "r", encoding="utf-8") as f:
    raw = json.load(f)

# Keep only real, non-blank strings: null / non-string values cannot be
# tokenized as text, and blank entries would add examples with nothing to learn from.
texts = [
    text
    for text in _iter_document_texts(raw)
    if isinstance(text, str) and text.strip()
]

# Fail here with a clear message instead of later, deep inside training.
if not texts:
    raise ValueError(
        f"No usable text found in {JSON_PATH!r}: expected 'sections' with "
        "'title' / 'clauses' / 'subclauses' text entries."
    )

dataset = Dataset.from_dict({"text": texts})
dataset

In [None]:

def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of a batch of examples.

    The module-level ``tokenizer`` is called with truncation enabled, a
    maximum length of 512 and padding disabled; its result is returned as-is.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": 512,
        "padding": False,
    }
    batch_texts = examples["text"]
    return tokenizer(batch_texts, **tokenizer_options)

# Tokenize the dataset in batches; the raw "text" column is removed from the result.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Collator configured for causal language modelling (mlm=False).
# Per the original note it takes care of padding and of building the labels;
# presumably the one-token shift itself happens inside the model — TODO confirm.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [None]:
training_args = TrainingArguments(
    output_dir="./smollm-cpt-results",
    overwrite_output_dir=True,
    num_train_epochs=8,
    per_device_train_batch_size=4,
    save_steps=10,
    logging_steps=1,         # log the training loss every N steps
    learning_rate=5e-5,      # a small rate is typically used to limit catastrophic forgetting
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # use mixed precision when a GPU is available
    report_to="none",        # do not report to wandb
    push_to_hub=False,
)

# Recent transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; older releases only accept `tokenizer`.
# The install is unpinned, so use whichever keyword this Trainer accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    **{_tokenizer_kwarg: tokenizer},
)

In [None]:
def evaluate_perplexity(model, dataset, tokenizer):
    """Compute the perplexity of ``model`` over ``dataset``.

    Batches of 4 examples are built with the module-level ``data_collator``
    and moved to ``model.device``. Each batch loss is weighted by the number
    of label positions it was averaged over, so the result is
    ``exp(mean negative log-likelihood per token)`` across the whole dataset
    rather than an average of per-batch means (which over-weights short or
    partial batches).

    Assumes the model's loss is a mean over the labels shifted by one
    position with ``-100`` entries ignored (the Hugging Face causal-LM
    convention) — confirm if a different model type is passed.

    Args:
        model: Callable model exposing ``eval()``, ``device`` and returning
            an object with a ``loss`` tensor when called with the batch.
        dataset: Dataset accepted by ``torch.utils.data.DataLoader``; the
            collated batches must contain a ``"labels"`` tensor.
        tokenizer: Unused; kept for interface compatibility.

    Returns:
        The perplexity as a float.

    Raises:
        ValueError: If the dataset yields no scorable label positions
            (for example, when it is empty).
    """
    model.eval()
    dataloader = DataLoader(
        dataset,
        batch_size=4,
        collate_fn=data_collator
    )

    total_nll = 0.0
    total_tokens = 0

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(model.device) for k, v in batch.items()}
            # Positions that contribute to the loss: labels after the
            # one-token shift, excluding the ignore index.
            num_tokens = int((batch["labels"][..., 1:] != -100).sum().item())
            if num_tokens == 0:
                # Nothing to score in this batch; its mean loss would be undefined.
                continue
            outputs = model(**batch)
            # Turn the batch mean back into a sum so every token counts once.
            total_nll += outputs.loss.item() * num_tokens
            total_tokens += num_tokens

    if total_tokens == 0:
        raise ValueError("Cannot compute perplexity: the dataset has no scorable tokens.")

    return math.exp(total_nll / total_tokens)

In [None]:
# Baseline: perplexity of the model before training, measured on the same
# tokenized dataset that the trainer uses below.
ppl_before = evaluate_perplexity(model, tokenized_dataset, tokenizer)
print(f"Perplexity before CPT: {ppl_before:.2f}")

# Run continued pre-training, then save the result to the directory that the
# following cell loads the model from.
trainer.train()
trainer.save_model("./smollm-cpt-wondr")

In [None]:
# Reload the saved model from disk and move it to the active device.
model = AutoModelForCausalLM.from_pretrained("./smollm-cpt-wondr")
model = model.to(device)

# Measured on the same dataset as the "before" value, so the two are comparable.
ppl_after = evaluate_perplexity(model, tokenized_dataset, tokenizer)
print(f"Perplexity after CPT: {ppl_after:.2f}")

# Positive means perplexity went down after training.
print(f"Improvement: {ppl_before - ppl_after:.2f} points")

In [None]:
def generate_text(prompt, max_new_tokens=50):
    """Sample a continuation of ``prompt`` with the module-level ``model``.

    Args:
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded first generated sequence, with special tokens stripped.
    """
    model.eval()  # required for inference

    # Move the inputs to the device the model actually lives on, so this keeps
    # working even if the model and the global `device` ever disagree.
    inputs = tokenizer(
        prompt,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.3,        # original note: CPT is more stable at 0.2–0.5
            top_p=0.9,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id,
            # Pass the pad id explicitly instead of leaving generate() to pick a fallback.
            pad_token_id=tokenizer.pad_token_id,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
# Prompts used to sample generations from the current model.
prompts = [
    "Aplikasi wondr by BNI adalah",
    "Nasabah wajib",
    "Dalam hal terjadi kehilangan Smartphone",
    "Pengguna bertanggung jawab atas"
]

separator = "=" * 50
for prompt_text in prompts:
    print(separator)
    print("Prompt:", prompt_text)
    print(generate_text(prompt_text, max_new_tokens=80))