In [1]:
import sys, os
import torch

print("Python:", sys.version.split()[0])
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
print("CWD:", os.getcwd())


Python: 3.12.12
CUDA available: True
GPU: NVIDIA L4
CWD: /content/drive/MyDrive/LLM/notebooks


In [2]:
import os
from pathlib import Path

HF_CACHE = "/content/drive/MyDrive/hf_cache"
Path(HF_CACHE).mkdir(parents=True, exist_ok=True)

os.environ["HF_HOME"] = HF_CACHE
os.environ["TRANSFORMERS_CACHE"] = HF_CACHE

print("HF_HOME =", os.environ["HF_HOME"])


HF_HOME = /content/drive/MyDrive/hf_cache


In [3]:
%pip -q install -U transformers datasets accelerate peft bitsandbytes sentencepiece huggingface_hub
print("packages installed")

packages installed


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "BramVanroy/GEITje-7B-ultra"  

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    load_in_4bit=True,  # safer for Colab L4 memory
)

print("MODEL OK:", model_id)



Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

MODEL OK: BramVanroy/GEITje-7B-ultra


In [5]:
import torch
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True)

prompt = (
    "You are a Dutch language tutor.\n"
    "Rules:\n"
    "- Reply in simple English.\n"
    "- Correct the Dutch sentence.\n"
    "- Explain the mistake briefly.\n"
    "- Give 2 short Dutch examples.\n\n"
    "Student sentence: Ik heb gisteren naar winkel gaan.\n"
    "Answer:"
)

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    _ = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
        streamer=streamer,
    )


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Correct Dutch sentence: Ik heb gisteren naar de winkel gegaan.

Mistake: naar winkel gaan.
Explanation: In Dutch, "naar winkel gaan" is "naar de winkel gaan".

Dutch examples:
1. Ik ga naar de supermarkt.
2. Ik ga naar de boekwinkel.

This is an example of how to reply to a Dutch language tutoring question. The tutor will provide a correct Dutch sentence and explain the mistake, along with two short Dutch examples.</s>


In [6]:
TARGET_FORMAT = """
Correct sentence:
Ik ben gisteren naar de winkel gegaan.

Explanation (simple English):
In Dutch, past actions with "gaan" use "zijn" as the auxiliary verb.
"Gaan" is a movement verb, so we say "ik ben gegaan", not "ik heb gegaan".

Examples:
- Ik ben gisteren naar huis gegaan.
- Zij is vorige week naar school gegaan.
"""


In [7]:
import json
from pathlib import Path

out_path = Path("/content/drive/MyDrive/LLM/data/dutch_tutor/train_v0.jsonl")
out_path.parent.mkdir(parents=True, exist_ok=True)

SYSTEM = (
    "You are a Dutch language tutor. "
    "Reply in simple English. "
    "Correct the Dutch sentence, explain briefly, then give 2 short Dutch examples."
)

samples = [
    ("Ik heb gisteren naar winkel gaan.",
     "Correct sentence:\nIk ben gisteren naar de winkel gegaan.\n\n"
     "Explanation (simple English):\nFor movement verbs like 'gaan', Dutch uses 'zijn' in the past: 'ik ben gegaan'.\n\n"
     "Examples:\n- Ik ben gisteren naar huis gegaan.\n- Zij is vorige week naar school gegaan.\n"),
    ("Hij hebben een auto.",
     "Correct sentence:\nHij heeft een auto.\n\n"
     "Explanation (simple English):\nWith 'hij/zij/het' you use 'heeft', not 'hebben'.\n\n"
     "Examples:\n- Hij heeft een fiets.\n- Zij heeft een hond.\n"),
    ("Wij is blij.",
     "Correct sentence:\nWij zijn blij.\n\n"
     "Explanation (simple English):\nWith 'wij' you use 'zijn', not 'is'.\n\n"
     "Examples:\n- Wij zijn thuis.\n- Wij zijn moe.\n"),
    ("Ik woon in Nederland sinds twee jaar.",
     "Correct sentence:\nIk woon al twee jaar in Nederland.\n\n"
     "Explanation (simple English):\nUse 'al' to say how long something has been true up to now.\n\n"
     "Examples:\n- Ik werk al drie maanden hier.\n- Zij leert al een jaar Nederlands.\n"),
    ("Morgen ik ga naar werk.",
     "Correct sentence:\nMorgen ga ik naar het werk.\n\n"
     "Explanation (simple English):\nIn Dutch, the verb usually comes in position 2.\n\n"
     "Examples:\n- Vandaag ga ik naar de winkel.\n- Morgen kom ik later.\n"),
    ("Ik kan niet vind mijn sleutel.",
     "Correct sentence:\nIk kan mijn sleutel niet vinden.\n\n"
     "Explanation (simple English):\nThe infinitive goes to the end, and 'niet' comes before it.\n\n"
     "Examples:\n- Ik kan dat niet begrijpen.\n- Hij kan morgen niet komen.\n"),
    ("Zij gaat naar school met de fiets.",
     "Correct sentence:\nZij gaat met de fiets naar school.\n\n"
     "Explanation (simple English):\nBoth are understandable, but placing 'met de fiets' earlier is more natural.\n\n"
     "Examples:\n- Ik ga met de trein naar Amsterdam.\n- We gaan met de auto naar huis.\n"),
    ("Ik heb honger, ik eet een broodje gisteren.",
     "Correct sentence:\nIk had gisteren honger, dus ik heb een broodje gegeten.\n\n"
     "Explanation (simple English):\nUse past tense for 'yesterday' and a completed action: 'heb gegeten'.\n\n"
     "Examples:\n- Ik heb gisteren pasta gegeten.\n- Hij heeft vorige week pizza gegeten.\n"),
    ("Waar is de station?",
     "Correct sentence:\nWaar is het station?\n\n"
     "Explanation (simple English):\n'Het station' is a 'het' word, so use 'het'.\n\n"
     "Examples:\n- Waar is het toilet?\n- Waar is het museum?\n"),
    ("Ik wil een afspraak maken bij dokter.",
     "Correct sentence:\nIk wil een afspraak maken bij de dokter.\n\n"
     "Explanation (simple English):\nUse 'de dokter' with the article.\n\n"
     "Examples:\n- Ik wil een afspraak maken bij de tandarts.\n- Ik heb een afspraak bij de huisarts.\n"),
]

with out_path.open("w", encoding="utf-8") as f:
    for user_text, assistant_text in samples:
        row = {
            "messages": [
                {"role": "system", "content": SYSTEM},
                {"role": "user", "content": f"Student sentence: {user_text}"},
                {"role": "assistant", "content": assistant_text},
            ]
        }
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print("Wrote:", out_path)
print("Rows:", len(samples))


Wrote: /content/drive/MyDrive/LLM/data/dutch_tutor/train_v0.jsonl
Rows: 10


In [8]:
from datasets import load_dataset

ds = load_dataset(
    "json",
    data_files="/content/drive/MyDrive/LLM/data/dutch_tutor/train_v0.jsonl",
    split="train"
)

print("Rows:", len(ds))
print(ds[0])


Generating train split: 0 examples [00:00, ? examples/s]

Rows: 10
{'messages': [{'role': 'system', 'content': 'You are a Dutch language tutor. Reply in simple English. Correct the Dutch sentence, explain briefly, then give 2 short Dutch examples.'}, {'role': 'user', 'content': 'Student sentence: Ik heb gisteren naar winkel gaan.'}, {'role': 'assistant', 'content': "Correct sentence:\nIk ben gisteren naar de winkel gegaan.\n\nExplanation (simple English):\nFor movement verbs like 'gaan', Dutch uses 'zijn' in the past: 'ik ben gegaan'.\n\nExamples:\n- Ik ben gisteren naar huis gegaan.\n- Zij is vorige week naar school gegaan.\n"}]}


In [9]:
import torch
from peft import LoraConfig
from transformers import BitsAndBytesConfig

# 4-bit quantization config (QLoRA)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

print("QLoRA configs ready")


QLoRA configs ready


In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "BramVanroy/GEITje-7B-ultra"

tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

print("Model loaded in 4-bit")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model loaded in 4-bit


In [11]:
from peft import get_peft_model

model = get_peft_model(model, lora_config)

model.print_trainable_parameters()


trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.0470


In [12]:
%pip -q install -U trl

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer

args = TrainingArguments(
    output_dir="/content/drive/MyDrive/LLM/models/dutch_tutor_lora_v0",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=1,
    logging_steps=1,
    save_steps=10,
    report_to=[],

    fp16=False,
    bf16=True,
    optim="paged_adamw_8bit",
    gradient_checkpointing=False,  
)

trainer = SFTTrainer(
    model=model,
    train_dataset=ds,
    args=args,
)

trainer.train()
print("TRAIN DONE")


Step,Training Loss
1,2.2652
2,1.8031
3,1.5769


TRAIN DONE


In [15]:
from pathlib import Path

save_dir = Path("/content/drive/MyDrive/LLM/models/dutch_tutor_lora_v0/adapter")
save_dir.mkdir(parents=True, exist_ok=True)

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print("SAVED:", save_dir)


SAVED: /content/drive/MyDrive/LLM/models/dutch_tutor_lora_v0/adapter


In [16]:
import torch
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True)

prompt = (
    "You are a Dutch language tutor.\n"
    "Rules:\n"
    "- Reply in simple English.\n"
    "- Correct the Dutch sentence.\n"
    "- Explain the mistake briefly.\n"
    "- Give 2 short Dutch examples.\n\n"
    "Student sentence: Ik heb gisteren naar winkel gaan.\n"
    "Answer:"
)

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    _ = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
        streamer=streamer,
    )



Ik heb gisteren naar winkel gaan. (correct)

Mistake:
"naar winkel gaan" is correct Dutch.

Examples:
1. Ik ga naar de supermarkt.
2. Ik ga naar de boekwinkel.

Explanation:
In Dutch, "naar winkel gaan" is a continuous action (gaan) with an infinitive (naar winkel). This is a common construction in Dutch.

In English, "to go to" is a preposition that is used with a noun, in this case "winkel". In Dutch, we use "naar" with a noun, and "gaan" with an infinitive.

In English, "to" is optional, but in Dutch it is not. "Naar" is the preposition that we use


In [17]:
import json
from pathlib import Path

out_path = Path("/content/drive/MyDrive/LLM/data/dutch_tutor/train_v1.jsonl")

SYSTEM = (
    "You are a Dutch language tutor. "
    "Reply in simple English. "
    "Correct the Dutch sentence, explain briefly, then give 2 short Dutch examples."
)

new_samples = [
    ("Ik heb gisteren naar winkel gaan.",
     "Correct sentence:\nIk ben gisteren naar de winkel gegaan.\n\n"
     "Explanation (simple English):\nWith movement verbs like 'gaan', Dutch uses 'zijn' in the past. Also, 'winkel' needs the article 'de'.\n\n"
     "Examples:\n- Ik ben gisteren naar huis gegaan.\n- Zij is vorige week naar de supermarkt gegaan.\n"),

    ("Hij heb naar huis gegaan.",
     "Correct sentence:\nHij is naar huis gegaan.\n\n"
     "Explanation (simple English):\nMovement verbs use 'zijn' in the past, not 'hebben'.\n\n"
     "Examples:\n- Hij is naar school gegaan.\n- Hij is laat naar huis gegaan.\n"),

    ("Wij hebben naar Amsterdam gegaan.",
     "Correct sentence:\nWij zijn naar Amsterdam gegaan.\n\n"
     "Explanation (simple English):\n'Gaan' is a movement verb, so we use 'zijn'.\n\n"
     "Examples:\n- Wij zijn naar Utrecht gegaan.\n- Wij zijn samen gegaan.\n"),

    ("Ik ben naar winkel gegaan.",
     "Correct sentence:\nIk ben naar de winkel gegaan.\n\n"
     "Explanation (simple English):\nSingular nouns usually need an article like 'de' or 'het'.\n\n"
     "Examples:\n- Ik ga naar de bakker.\n- Ik ga naar de supermarkt.\n"),

    ("Zij is gisteren werken gegaan.",
     "Correct sentence:\nZij is gisteren gaan werken.\n\n"
     "Explanation (simple English):\nWhen two verbs are together, the infinitive usually comes at the end.\n\n"
     "Examples:\n- Ik ben gaan slapen.\n- Hij is gaan studeren.\n"),

    ("Ik heb naar huis gefietst.",
     "Correct sentence:\nIk ben naar huis gefietst.\n\n"
     "Explanation (simple English):\nMovement verbs like 'fietsen' also use 'zijn' in the past.\n\n"
     "Examples:\n- Ik ben naar school gefietst.\n- Zij is snel naar huis gefietst.\n"),

    ("Hij heeft naar kantoor gereden.",
     "Correct sentence:\nHij is naar kantoor gereden.\n\n"
     "Explanation (simple English):\n'Rijden' is a movement verb, so the auxiliary verb is 'zijn'.\n\n"
     "Examples:\n- Hij is naar het werk gereden.\n- Wij zijn samen gereden.\n"),

    ("Ik heb gisteren gekomen.",
     "Correct sentence:\nIk ben gisteren gekomen.\n\n"
     "Explanation (simple English):\n'Komen' always uses 'zijn' in the past.\n\n"
     "Examples:\n- Zij is vroeg gekomen.\n- Hij is later gekomen.\n"),

    ("Wij hebben thuis gebleven.",
     "Correct sentence:\nWij zijn thuis gebleven.\n\n"
     "Explanation (simple English):\n'Blijven' is also a movement/state-change verb and uses 'zijn'.\n\n"
     "Examples:\n- Ik ben thuis gebleven.\n- Zij is een week gebleven.\n"),

    ("Ik ga naar winkel morgen.",
     "Correct sentence:\nMorgen ga ik naar de winkel.\n\n"
     "Explanation (simple English):\nIn Dutch, the verb comes in position 2.\n\n"
     "Examples:\n- Vandaag ga ik werken.\n- Morgen ga ik sporten.\n"),
]

# Append to existing dataset
with open("/content/drive/MyDrive/LLM/data/dutch_tutor/train_v0.jsonl", "r", encoding="utf-8") as f:
    existing = f.readlines()

with out_path.open("w", encoding="utf-8") as f:
    for line in existing:
        f.write(line)
    for user_text, assistant_text in new_samples:
        row = {
            "messages": [
                {"role": "system", "content": SYSTEM},
                {"role": "user", "content": f"Student sentence: {user_text}"},
                {"role": "assistant", "content": assistant_text},
            ]
        }
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print("New dataset written to:", out_path)
print("Added samples:", len(new_samples))
print("Total rows:", len(existing) + len(new_samples))


New dataset written to: /content/drive/MyDrive/LLM/data/dutch_tutor/train_v1.jsonl
Added samples: 10
Total rows: 20


In [18]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from peft import get_peft_model
from trl import SFTTrainer

# 1) Load the new dataset
ds_v1 = load_dataset(
    "json",
    data_files="/content/drive/MyDrive/LLM/data/dutch_tutor/train_v1.jsonl",
    split="train"
)
print("Rows:", len(ds_v1))

# 2) Reload base model in 4-bit (fresh) + attach LoRA
model_id = "BramVanroy/GEITje-7B-ultra"
tokenizer = AutoTokenizer.from_pretrained(model_id)

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

model_v1 = get_peft_model(base_model, lora_config)

# 3) Train (use bf16, fp16 off)
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/LLM/models/dutch_tutor_lora_v1",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=1,
    save_steps=20,
    report_to=[],
    fp16=False,
    bf16=True,
    optim="paged_adamw_8bit",
)

trainer = SFTTrainer(
    model=model_v1,
    train_dataset=ds_v1,
    args=args,
)

trainer.train()
print("TRAIN V1 DONE")


Generating train split: 0 examples [00:00, ? examples/s]

Rows: 20


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Tokenizing train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Step,Training Loss
1,2.3049
2,1.815
3,1.6755
4,1.3998
5,1.2514
6,1.1221
7,0.9178
8,0.9018
9,0.8558
10,0.659


TRAIN V1 DONE


In [20]:
from pathlib import Path

save_dir = Path("/content/drive/MyDrive/LLM/models/dutch_tutor_lora_v1/adapter")
save_dir.mkdir(parents=True, exist_ok=True)

model_v1.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print("SAVED:", save_dir)


SAVED: /content/drive/MyDrive/LLM/models/dutch_tutor_lora_v1/adapter


In [21]:
import torch
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True)

prompt = (
    "You are a Dutch language tutor.\n"
    "Rules:\n"
    "- Reply in simple English.\n"
    "- Correct the Dutch sentence.\n"
    "- Explain the mistake briefly.\n"
    "- Give 2 short Dutch examples.\n\n"
    "Student sentence: Ik heb gisteren naar winkel gaan.\n"
    "Answer:"
)

inputs = tokenizer(prompt, return_tensors="pt").to(model_v1.device)

with torch.no_grad():
    _ = model_v1.generate(
        **inputs,
        max_new_tokens=160,
        do_sample=True,
        temperature=0.2,
        top_p=0.9,
        streamer=streamer,
    )



Correct sentence:
Ik ben gisteren naar de winkel gegaan.

Explanation:
- 'Ik heb' is not correct.
- 'Ik ben' is correct.

Examples:
- Ik ben naar de winkel gegaan.
- Ik ben naar de bioscoop gegaan.

</s>
