In [1]:
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    print("CUDA:", torch.version.cuda)


CUDA available: True
GPU: NVIDIA L4
CUDA: 12.6


In [3]:
from datasets import load_dataset

ultrachat = load_dataset(
    "BramVanroy/ultrachat_200k_dutch",
    split="train_sft[:1000]"
)

leesplank = load_dataset(
    "UWV/Leesplank_NL_wikipedia_simplifications",
    split="train[:1000]"
)

print("Ultrachat rows:", len(ultrachat))
print("Leesplank rows:", len(leesplank))
print("Ultrachat keys:", ultrachat[0].keys())
print("Leesplank keys:", leesplank[0].keys())


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


README.md: 0.00B [00:00, ?B/s]

data/test_sft-00000-of-00001.parquet:   0%|          | 0.00/48.4M [00:00<?, ?B/s]

data/train_sft-00000-of-00002.parquet:   0%|          | 0.00/217M [00:00<?, ?B/s]

data/train_sft-00001-of-00002.parquet:   0%|          | 0.00/217M [00:00<?, ?B/s]

Generating test_sft split:   0%|          | 0/21424 [00:00<?, ? examples/s]

Generating train_sft split:   0%|          | 0/192598 [00:00<?, ? examples/s]

README.md: 0.00B [00:00, ?B/s]

simplification.jsonl:   0%|          | 0.00/2.08G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2867757 [00:00<?, ? examples/s]

Ultrachat rows: 1000
Leesplank rows: 1000
Ultrachat keys: dict_keys(['prompt', 'prompt_id', 'messages'])
Leesplank keys: dict_keys(['prompt', 'result'])


In [4]:
import json
import random
import re
from pathlib import Path

SYSTEM = (
    "You are a Dutch language tutor. "
    "Reply in simple English. "
    "Correct the Dutch sentence, explain briefly, then give 2 short Dutch examples."
)

out_path = Path("/content/drive/MyDrive/LLM/data/dutch_tutor/train_v2_mix.jsonl")
out_path.parent.mkdir(parents=True, exist_ok=True)

def normalize_ws(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

def write_row(f, user_text: str, assistant_text: str):
    row = {
        "messages": [
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": assistant_text},
        ]
    }
    f.write(json.dumps(row, ensure_ascii=False) + "\n")

# --- 1) Ultrachat: keep as conversation SFT (Dutch). ---
def ultrachat_to_rows(example):
    # example["messages"] is already in chat format; we wrap with our system message
    msgs = example["messages"]
    # Find the first user and first assistant turn to keep it simple
    user = next((m["content"] for m in msgs if m.get("role") == "user" and m.get("content")), None)
    assistant = next((m["content"] for m in msgs if m.get("role") == "assistant" and m.get("content")), None)
    if not user or not assistant:
        return None
    user = normalize_ws(user)
    assistant = normalize_ws(assistant)
    return (user, assistant)

# --- 2) Leesplank: convert "prompt/result" into a tutor-style simplification task. ---
def leesplank_to_rows(example):
    prompt = example.get("prompt", "")
    result = example.get("result", "")
    if not prompt or not result:
        return None
    prompt = normalize_ws(prompt)
    result = normalize_ws(result)

    user = (
        "Simplify this Dutch text for an A2 learner. "
        "Keep the meaning. Use short sentences.\n\n"
        f"Text:\n{prompt}"
    )
    assistant = (
        f"Simplified version:\n{result}\n\n"
        "Explanation (simple English):\n"
        "I used shorter sentences and simpler words.\n"
        "Examples:\n"
        "- Dit is een kort voorbeeld.\n"
        "- Nog een eenvoudig voorbeeld.\n"
    )
    return (user, assistant)

# Build the mix: 80% ultrachat, 20% leesplank (from the 1000-sample subsets you loaded)
ultra_rows = []
for ex in ultrachat:
    row = ultrachat_to_rows(ex)
    if row:
        ultra_rows.append(row)

lees_rows = []
for ex in leesplank:
    row = leesplank_to_rows(ex)
    if row:
        lees_rows.append(row)

print("Ultrachat usable rows:", len(ultra_rows))
print("Leesplank usable rows:", len(lees_rows))

# Mix and write
random.seed(42)
target_total = 2000  # small v2 for now (safe). Increase later.
n_ultra = int(target_total * 0.8)
n_lees = target_total - n_ultra

sample_ultra = random.sample(ultra_rows, min(n_ultra, len(ultra_rows)))
sample_lees = random.sample(lees_rows, min(n_lees, len(lees_rows)))

mixed = sample_ultra + sample_lees
random.shuffle(mixed)

with out_path.open("w", encoding="utf-8") as f:
    for user_text, assistant_text in mixed:
        write_row(f, user_text, assistant_text)

print("Wrote:", out_path)
print("Rows written:", len(mixed))


Ultrachat usable rows: 1000
Leesplank usable rows: 1000
Wrote: /content/drive/MyDrive/LLM/data/dutch_tutor/train_v2_mix.jsonl
Rows written: 1400


In [5]:
from datasets import load_dataset

ds_v2 = load_dataset(
    "json",
    data_files="/content/drive/MyDrive/LLM/data/dutch_tutor/train_v2_mix.jsonl",
    split="train"
)

print("Rows:", len(ds_v2))
print(ds_v2[0]["messages"][0]["role"], ":", ds_v2[0]["messages"][0]["content"][:80], "...")
print(ds_v2[0]["messages"][1]["role"], ":", ds_v2[0]["messages"][1]["content"][:80], "...")
print(ds_v2[0]["messages"][2]["role"], ":", ds_v2[0]["messages"][2]["content"][:80], "...")


Generating train split: 0 examples [00:00, ? examples/s]

Rows: 1400
system : You are a Dutch language tutor. Reply in simple English. Correct the Dutch sente ...
user : Zou u wellicht aanbevelingen kunnen doen met betrekking tot de kruidenafstelling ...
assistant : Voor een authentieke Cajun draai aan uw vleespotpastai, zou ik aanbevelen om uw  ...


In [None]:
%pip install -q -U trl bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h

In [7]:
import torch
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

print("bnb_config ready")


bnb_config ready


In [9]:
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

print("lora_config ready")


lora_config ready


In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from peft import get_peft_model
from trl import SFTTrainer

model_id = "BramVanroy/GEITje-7B-ultra"

tokenizer = AutoTokenizer.from_pretrained(model_id)

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

model_v3 = get_peft_model(base_model, lora_config)

args = TrainingArguments(
    output_dir="/content/drive/MyDrive/LLM/models/dutch_tutor_lora_v3",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=1,
    logging_steps=20,
    save_steps=200,
    report_to=[],
    fp16=False,
    bf16=True,
    optim="paged_adamw_8bit",
)

trainer = SFTTrainer(
    model=model_v3,
    train_dataset=ds_v2,
    args=args,
)

trainer.train()
print("TRAIN V3 DONE")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Tokenizing train dataset:   0%|          | 0/1400 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1400 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Step,Training Loss
20,1.0507
40,0.9343
60,0.8466
80,0.8629
100,0.7845
120,0.7873
140,0.7988
160,0.7657
180,0.7796
200,0.7707


TRAIN V3 DONE


In [11]:
import os, glob

path = "/content/drive/MyDrive/LLM/models/dutch_tutor_lora_v3"
print("Exists:", os.path.exists(path))
print("Files:", sorted([os.path.basename(p) for p in glob.glob(path + "/*")])[:20])


Exists: True
Files: ['README.md', 'checkpoint-200', 'checkpoint-350']


In [None]:
import os, glob, shutil
from pathlib import Path

src = "/content/drive/MyDrive/LLM/models/dutch_tutor_lora_v3"
dst = "/content/drive/MyDrive/LLM/models/dutch_tutor_lora_v3_final"
Path(dst).mkdir(parents=True, exist_ok=True)

# pick the latest checkpoint
ckpts = sorted(glob.glob(os.path.join(src, "checkpoint-*")), key=lambda p: int(p.split("-")[-1]))
latest = ckpts[-1]
print("Latest checkpoint:", latest)

# copy adapter files
copied = []
for name in ["adapter_config.json", "adapter_model.safetensors", "adapter_model.bin"]:
    f = os.path.join(latest, name)
    if os.path.exists(f):
        shutil.copy2(f, os.path.join(dst, name))
        copied.append(name)

print("Copied:", copied)
print("Final path:", dst)
print("Final files:", sorted(os.listdir(dst)))

Latest checkpoint: /content/drive/MyDrive/LLM/models/dutch_tutor_lora_v3/checkpoint-350
Copied: ['adapter_config.json', 'adapter_model.safetensors']
Final path: /content/drive/MyDrive/LLM/models/dutch_tutor_lora_v3_final
Final files: ['adapter_config.json', 'adapter_model.safetensors']


In [15]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

SYSTEM = (
    "You are a Dutch language tutor.\n"
    "Rules:\n"
    "- Reply in simple English.\n"
    "- Correct the Dutch sentence.\n"
    "- Explain the mistake briefly.\n"
    "- Give 2 short Dutch examples.\n"
)

def build_prompt(sentence):
     return f"""
You are a Dutch language tutor.
Reply in simple English.

Example:
Student sentence: Ik heb gisteren naar winkel gaan.
Answer:
- Correct: Ik ben gisteren naar de winkel gegaan.
- Explanation: 'Heb' is not used with movement verbs in the past. Dutch uses 'ben gegaan'.
- Examples:
  - Ik ben naar de supermarkt gegaan.
  - Ik ben gisteren naar huis gegaan.

Now do the same.

Student sentence: {sentence}
Answer:
"""

base_model = "BramVanroy/GEITje-7B-ultra"
adapter_path = "/content/drive/MyDrive/LLM/models/dutch_tutor_lora_v3_final"

tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

model = PeftModel.from_pretrained(model, adapter_path)
model.eval()

prompt = build_prompt("Ik heb gisteren naar winkel gaan.")
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=180,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

text = tokenizer.decode(output[0], skip_special_tokens=True)

if "Answer:" in text:
    text = text.split("Answer:", 1)[-1].strip()

print(text)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



- Correct: Ik ben gisteren naar de winkel gegaan.
- Explanation: 'Heb' is not used with movement verbs in the past. Dutch uses 'ben gegaan'.
- Examples:
  - Ik ben naar de supermarkt gegaan.
  - Ik ben gisteren naar huis gegaan.

Now do the same.

Student sentence: Ik heb gisteren naar winkel gaan.
Answer:
- Correct: Ik ben gisteren naar de winkel gegaan.
- Explanation: 'Heb' is not used with movement verbs in the past. Dutch uses 'ben gegaan'.
- Examples:
  - Ik ben naar de supermarkt gegaan.
  - Ik ben gisteren naar huis gegaan.


```
