In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install unsloth
# Get latest Unsloth
!pip install --upgrade --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install evaluate
!pip install rouge_score

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype          = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit   = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",             # Gemma 2.2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer   = FastLanguageModel.from_pretrained(
    model_name     = "unsloth/mistral-7b-v0.3-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit   = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.5.10: Fast Mistral patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r                          = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules             = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha                 = 16,
    lora_dropout               = 0,         # Supports any, but = 0 is optimized
    bias                       = "none",    # Supports any, but = "none" is optimized

    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state               = 3407,
    use_rslora                 = False,     # We support rank stabilized LoRA
    loftq_config               = None,      # And LoftQ
)

Unsloth 2025.5.10 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
from datasets import load_dataset, concatenate_datasets

dataset_basic  = load_dataset(
    "json",
    data_files = "/content/turkish_c_dataset_20250531_181341.json",
    split      = "train"
)

# Keep only the 'question' and 'answer' fields
dataset_basic  = dataset_basic.remove_columns([col for col in dataset_basic.column_names if col not in ["question", "answer"]])

dataset_expert = load_dataset(
    "json",
    data_files = "/content/expert_only_turkish_c_20250531_141759.json",
    split      = "train"
)

# Keep only the 'question' and 'answer' fields
dataset_expert = dataset_expert.remove_columns([col for col in dataset_expert.column_names if col not in ["question", "answer"]])
dataset        = concatenate_datasets([dataset_basic, dataset_expert])

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# from datasets import train_test_split # This import is not needed

dataset        = dataset.shuffle(seed=42)
split_dataset  = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset  = split_dataset["train"]
test_dataset   = split_dataset["test"]

In [None]:
print(dataset.column_names)

['question', 'answer']


In [None]:
from unsloth import to_sharegpt

dataset_sharegpt       = to_sharegpt(
    dataset,
    merged_prompt      = "Aşağıdaki C programlama sorusunu çözün:\n{question}",  # input
    output_column_name = "answer",  # output
)

Merging columns:   0%|          | 0/896 [00:00<?, ? examples/s]

Converting to ShareGPT:   0%|          | 0/896 [00:00<?, ? examples/s]

In [None]:
from pprint import pprint
pprint(dataset[0])

{'answer': '#include <stdio.h>\n'
           '\n'
           'int main() {\n'
           '    int arr[5];\n'
           '    int sum = 0;\n'
           '    printf("5 adet sayı girin:\\n");\n'
           '    for (int i = 0; i < 5; i++) {\n'
           '        scanf("%d", &arr[i]);\n'
           '    }\n'
           '    for (int i = 0; i < 5; i++) {\n'
           '        sum += *(arr + i); // Pointer aritmetiği kullanarak '
           'dizinin elemanlarına erişiyoruz.\n'
           '    }\n'
           '    printf("Toplam: %d\\n", sum);\n'
           '    return 0;\n'
           '}\n'
           '\n'
           '/* Açıklama: Bu kodda, kullanıcıdan 5 adet sayı alıyoruz ve '
           'bunları bir diziye kaydediyoruz. Daha sonra, pointer aritmetiği '
           'kullanarak dizinin elemanlarına erişip toplamlarını hesaplıyoruz. '
           "'arr + i' ifadesi, dizinin i'inci elemanına işaret eden bir "
           'pointer döner. */',
 'question': 'Aşağıdaki kod parçasında eksik olan k

In [None]:
from unsloth import to_sharegpt, standardize_sharegpt, apply_chat_template

def preprocess(dataset):
    dataset                = to_sharegpt(
        dataset,
        merged_prompt      = "Aşağıdaki C programlama sorusunu çözün:\n{question}",
        output_column_name = "answer"
    )
    dataset                = standardize_sharegpt(dataset)
    return apply_chat_template(
        dataset,
        tokenizer          = tokenizer,
        chat_template      = """Aşağıda bir C programlama sorusu verilmiştir.
Bu soruyu tamamlamak için gerekli olan C kodunu yazın.
>>> Soru:
{INPUT}
>>> Cevap:
{OUTPUT}""",
    )

train_dataset = preprocess(train_dataset)
test_dataset  = preprocess(test_dataset)


Merging columns:   0%|          | 0/806 [00:00<?, ? examples/s]

Converting to ShareGPT:   0%|          | 0/806 [00:00<?, ? examples/s]

Unsloth: Standardizing formats (num_proc=2):   0%|          | 0/806 [00:00<?, ? examples/s]

Unsloth: We automatically added an EOS token to stop endless generations.


Map:   0%|          | 0/806 [00:00<?, ? examples/s]

Merging columns:   0%|          | 0/90 [00:00<?, ? examples/s]

Converting to ShareGPT:   0%|          | 0/90 [00:00<?, ? examples/s]

Unsloth: Standardizing formats (num_proc=2):   0%|          | 0/90 [00:00<?, ? examples/s]

Unsloth: We automatically added an EOS token to stop endless generations.


Map:   0%|          | 0/90 [00:00<?, ? examples/s]

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 1,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        per_device_eval_batch_size = 1,
        gradient_accumulation_steps = 8,
        warmup_steps = 5,
        max_steps = 30,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,  # 🔍 Log every step
        eval_strategy = "steps",
        eval_steps = 1,     # 🔁 Evaluate every step (no metrics)
        save_steps = 15,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        dataloader_pin_memory = False,
        remove_unused_columns = True,
        group_by_length = False,
        report_to = None,
        logging_first_step = True,  # ✅ Show info from the very first step
    ),
)

trainer_stats = trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Unsloth: Tokenizing ["text"]:   0%|          | 0/806 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"]:   0%|          | 0/90 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 806 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 83,886,080/7,000,000,000 (1.20% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
1,0.8257,0.867521
2,0.75,0.826761
3,0.9262,0.765118
4,0.7955,0.713924
5,0.7223,0.662681
6,0.713,0.622059
7,0.7302,0.608082
8,0.5871,0.592497
9,0.5711,0.581999
10,0.6642,0.574837


Unsloth: Not an error, but MistralForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [None]:
model.save_pretrained("mistral_fine_tuned_model") # Local saving
tokenizer.save_pretrained("mistral_fine_tuned_model")

('mistral_fine_tuned_model/tokenizer_config.json',
 'mistral_fine_tuned_model/special_tokens_map.json',
 'mistral_fine_tuned_model/chat_template.jinja',
 'mistral_fine_tuned_model/tokenizer.model',
 'mistral_fine_tuned_model/added_tokens.json',
 'mistral_fine_tuned_model/tokenizer.json')

In [None]:
# Memory and time usage
start_gpu_memory     = torch.cuda.max_memory_reserved()                 / 1024 / 1024 / 1024
max_memory           = torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 / 1024

used_memory          = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage      = round(used_memory / max_memory * 100, 3)
lora_percentage      = round(used_memory_for_lora / max_memory * 100, 3)

print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

2609.9333 seconds used for training.
43.5 minutes used for training.
Peak reserved memory = 6.883 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 46.692 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [None]:
# --- 🧪 Simplified Post-Training Evaluation ---
from tqdm import tqdm
import torch
import re
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Prepare model for inference
FastLanguageModel.for_inference(model)

# Select subset to avoid OOM
subset = test_dataset.select(range(50))

# Generate predictions
predictions = []
references = []

for example in tqdm(subset):
    # Extract question and answer
    question = re.search(r">>> Soru:\n(.*?)\n>>> Cevap:", example["text"], re.DOTALL)
    answer = re.search(r">>> Cevap:\n(.*)", example["text"], re.DOTALL)

    if not question or not answer:
        continue

    # Generate response
    messages = [{"role": "user", "content": question.group(1).strip()}]
    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(input_ids, max_new_tokens=256, pad_token_id=tokenizer.eos_token_id)

    pred = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    ref = answer.group(1).strip()

    predictions.append(pred)
    references.append(ref)

# Calculate all metrics
bleu = evaluate.load("bleu").compute(predictions=predictions, references=[[r] for r in references])
rouge = evaluate.load("rouge").compute(predictions=predictions, references=references)

# Accuracy: exact match
exact_match = sum(p.lower() == r.lower() for p, r in zip(predictions, references)) / len(predictions)

# F1: token overlap
def token_f1(predictions, references):
    scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = set(pred.lower().split())
        ref_tokens = set(ref.lower().split())

        if not ref_tokens:
            scores.append(1.0 if not pred_tokens else 0.0)
            continue

        precision = len(pred_tokens & ref_tokens) / len(pred_tokens) if pred_tokens else 0
        recall = len(pred_tokens & ref_tokens) / len(ref_tokens)
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        scores.append(f1)
    return np.mean(scores)

f1_score = token_f1(predictions, references)

# Perplexity: simplified calculation
def simple_perplexity(model, tokenizer, texts):
    total_loss = 0
    total_count = 0

    for text in tqdm(texts[:10], desc="Calculating perplexity"):  # Use fewer samples
        tokens = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to("cuda")
        with torch.no_grad():
            outputs = model(**tokens, labels=tokens.input_ids)
            total_loss += outputs.loss.item()
            total_count += 1

    return torch.exp(torch.tensor(total_loss / total_count)).item()

perplexity = simple_perplexity(model, tokenizer, [ex["text"] for ex in subset])

100%|██████████| 50/50 [14:12<00:00, 17.04s/it]
Calculating perplexity: 100%|██████████| 10/10 [00:05<00:00,  1.79it/s]


In [None]:
# 🖨️ Print results
print("\n📌 Evaluation Results:")
print(f"BLEU:           {bleu['bleu']:.4f}")
print(f"ROUGE-1:        {rouge['rouge1']:.4f}")
print(f"ROUGE-L:        {rouge['rougeL']:.4f}")
print(f"Exact Match:    {exact_match:.4f}")
print(f"F1 Score:       {f1_score:.4f}")
print(f"Perplexity:     {perplexity:.4f}")
print(f"Examples:       {len(predictions)}")


📌 Evaluation Results:
BLEU:           0.2839
ROUGE-1:        0.4627
ROUGE-L:        0.2947
Exact Match:    0.0000
F1 Score:       0.3531
Perplexity:     1.7124
Examples:       50


In [None]:
# 🧠 Ask your fine-tuned model a custom C programming question

question = """Bir C programı yazın. Bu program kullanıcıdan bir tamsayı almalı ve bu sayının asal olup olmadığını kontrol etmelidir."""

messages = [{"role": "user", "content": question}]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to("cuda")

with torch.no_grad():
    outputs = model.generate(
        input_ids,
        max_new_tokens=256,
        pad_token_id=tokenizer.eos_token_id
    )

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("📥 Soru:\n", question)
print("📤 Model Cevabı:\n", response)

📥 Soru:
 Bir C programı yazın. Bu program kullanıcıdan bir tamsayı almalı ve bu sayının asal olup olmadığını kontrol etmelidir.
📤 Model Cevabı:
 Aşağıda bir C programlama sorusu verilmiştir.
Bu soruyu tamamlamak için gerekli olan C kodunu yazın.
>>> Soru:
Bir C programı yazın. Bu program kullanıcıdan bir tamsayı almalı ve bu sayının asal olup olmadığını kontrol etmelidir.
>>> Cevap:
#include <stdio.h>

int main() {
    int sayi;
    printf("Bir tamsayı girin: ");
    scanf("%d", &sayi);

    if (sayi < 2) {
        printf("%d asal değildir.\n", sayi);
        return 0;
    }

    for (int i = 2; i <= sayi / 2; i++) {
        if (sayi % i == 0) {
            printf("%d asal değildir. %d'in bir tam böleni var.\n", sayi, i);
            return 0;
        }
    }

    printf("%d asal sayıdır.\n", sayi);
    return 0;
}

// Açıklama: Bu program, kullanıcıdan bir tamsayı alır ve bu sayının asal olup olmadığını kontrol eder. Asal bir sayı, sadece 1 ve öz neg
