In [1]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install datasets evaluate rouge_score trl peft accelerate bitsandbytes

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-01y5bkcf/unsloth_43ade61ca84e473cbf0ef9914c867e2a
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-01y5bkcf/unsloth_43ade61ca84e473cbf0ef9914c867e2a
  Resolved https://github.com/unslothai/unsloth.git to commit 378d03c19652f146a69e6015436299c839d841d2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.10.1 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.10.1-py3-none-any.whl.metadata (31 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.gi

In [2]:
import torch
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset
from transformers import TrainingArguments, AutoTokenizer, GenerationConfig
from trl import SFTTrainer
import evaluate
from rouge_score import rouge_scorer

import pandas as pd
import numpy as np
import re
from google.colab import userdata
import random
from tqdm import tqdm


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!


In [3]:
max_seq_length = 2048
dtype = None
load_in_4bit = True # Use 4bit quantization to reduce memory usage (Set True for qLoRA)
model_name = "unsloth/Llama-3.2-3B-Instruct"

# Load Model

In [4]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = userdata.get('HF_TOKEN'),
    output_hidden_states=True,
    output_attentions=False,
)

==((====))==  Unsloth 2025.10.1: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 8,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",

    use_gradient_checkpointing = "unsloth",
    random_state = 1,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2025.10.1 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


# Data Prep

In [6]:
def formatting_prompts_func(examples):
  questions = examples['question']
  answers = examples['correct_answer']
  distractor1 = examples['distractor1']
  distractor2 = examples['distractor2']
  distractor3 = examples['distractor3']
  supports = examples['support']

  texts = []

  system_prompt = 'You are a science teacher. Select the correct choice from the multiple-choice options and provide a supporting explanation.'

  for q, a, d1, d2, d3, s in zip(questions, answers, distractor1, distractor2, distractor3, supports):
    o = [a, d1, d2, d3]
    random.shuffle(o)

    user_message = f'''### Question:
<<{q}>>

### Options:
<<{o[0]}>>
<<{o[1]}>>
<<{o[2]}>>
<<{o[3]}>>
'''

    assistant_response = f'''### Answer:
<<{a}>>

### Support:
<<{s}>>'''

    messages = [
        {"role": "system", "content": system_prompt},
        {'role': 'user', 'content': user_message},
        {'role':'assistant', 'content': assistant_response}
    ]

    formatted_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    texts.append(formatted_text)

  return {'text':texts}

In [7]:
def filter_nonempty_examples(dataset):
    def has_text(example):
        return all(len(example[field].strip()) > 0 for field in ['question', 'correct_answer', 'support'])

    return dataset.filter(has_text)

train_dataset = load_dataset("allenai/sciq", split = "train").shuffle(seed = 1).select(range(2125))
eval_dataset = load_dataset("allenai/sciq", split = "validation").shuffle(seed = 1).select(range(850))
train_dataset[0]

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.99M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/339k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/343k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11679 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'question': 'What layer of soil, essential for farming, has the highest proportion of organic material?',
 'distractor3': 'subsoil',
 'distractor1': 'bedrock',
 'distractor2': 'humus',
 'correct_answer': 'topsoil',
 'support': 'Topsoil has the highest proportion of organic material. Topsoil is essential for farming.'}

In [8]:
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
eval_dataset = eval_dataset.map(formatting_prompts_func, batched=True)

train_dataset[0]

Map:   0%|          | 0/2125 [00:00<?, ? examples/s]

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

{'question': 'What layer of soil, essential for farming, has the highest proportion of organic material?',
 'distractor3': 'subsoil',
 'distractor1': 'bedrock',
 'distractor2': 'humus',
 'correct_answer': 'topsoil',
 'support': 'Topsoil has the highest proportion of organic material. Topsoil is essential for farming.',
 'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 14 Oct 2025\n\nYou are a science teacher. Select the correct choice from the multiple-choice options and provide a supporting explanation.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n### Question:\n<<What layer of soil, essential for farming, has the highest proportion of organic material?>>\n\n### Options:\n<<subsoil>>\n<<topsoil>>\n<<humus>>\n<<bedrock>><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n### Answer:\n<<topsoil>>\n\n### Support:\n<<Topsoil has the highest proportion of organic material. Topsoil is essential for farm

In [9]:
def parse_output(text):
    answer_match = re.findall(r"### Answer:\n<<(.*)>>\n\n", text, flags=re.MULTILINE)
    support_match = re.findall(r"### Support:\n<<(.*)>>", text, flags=re.MULTILINE)


    answer = answer_match[-1].strip() if answer_match else ""
    support = support_match[-1].strip() if support_match else ""

    return {"correct_answer": answer, "support": support}

parse_output(train_dataset[0]['text'])

{'correct_answer': 'topsoil',
 'support': 'Topsoil has the highest proportion of organic material. Topsoil is essential for farming.'}

# FineTuning

In [10]:
training_args = TrainingArguments(
    output_dir = "./results",
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 2,

    learning_rate = 1e-4,
    num_train_epochs = 5,
    lr_scheduler_type = "cosine",
    warmup_ratio=0.1,
    weight_decay=0.05,

    eval_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
    metric_for_best_model = "eval_loss",
    greater_is_better = False,


    logging_strategy="steps",
    logging_steps = 10,

    fp16 = not torch.cuda.is_bf16_supported(),
    report_to = "none",
    seed = 1,
    optim = "adamw_8bit",
    torch_compile=True,
)

In [11]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = True,
)

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/2125 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/850 [00:00<?, ? examples/s]

In [12]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,125 | Num Epochs = 5 | Total steps = 665
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 2 x 1) = 16
 "-____-"     Trainable parameters = 12,156,928 of 3,224,906,752 (0.38% trained)


Unsloth: Enabled auto compiling
Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
1,1.1407,1.113823
2,1.0996,1.091299
3,1.0594,1.090239
4,0.9344,1.110753
5,0.9117,1.125508


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


TrainOutput(global_step=665, training_loss=1.112531642626999, metrics={'train_runtime': 2329.1425, 'train_samples_per_second': 4.562, 'train_steps_per_second': 0.286, 'total_flos': 6.566694749908992e+16, 'train_loss': 1.112531642626999, 'epoch': 5.0})

In [13]:
best_model_path = "./best_model"
trainer.save_model(best_model_path)
tokenizer.save_pretrained(best_model_path)

('./best_model/tokenizer_config.json',
 './best_model/special_tokens_map.json',
 './best_model/chat_template.jinja',
 './best_model/tokenizer.json')

# Evaluate

In [14]:
test_dataset = filter_nonempty_examples(load_dataset("allenai/sciq", split = "test")).shuffle(seed = 1).select(range(850))
test_dataset = test_dataset.map(formatting_prompts_func, batched=True)

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

In [15]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = best_model_path,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2025.10.1: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [16]:
all_predictions = []
all_labels = []

system_prompt = 'You are a science teacher. Select the correct choice from the multiple-choice options and provide a supporting explanation.'


for item in tqdm(test_dataset):
    o = [item['correct_answer'], item['distractor1'], item['distractor2'], item['distractor3']]
    random.shuffle(o)

    prompt = f'''### Question:
<<{item['question']}>>

### Options:
<<{o[0]}>>
<<{o[1]}>>
<<{o[2]}>>
<<{o[3]}>>
'''


    label_text = f'''### Answer:
<<{item['correct_answer']}>>

### Support:
<<{item['support']}>>'''

    messages_inf = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': prompt},
    ]

    prompt_for_model = tokenizer.apply_chat_template(messages_inf, tokenize=False, add_generation_prompt=False)


    inputs = tokenizer(prompt_for_model, return_tensors="pt").to("cuda")


    outputs = model.generate(**inputs, max_new_tokens=256, use_cache=True)
    prediction_text = tokenizer.batch_decode(outputs)[0]


    all_predictions.append(prediction_text)
    all_labels.append(label_text)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 850/850 [48:51<00:00,  3.45s/it]


In [20]:
print('\n\n')
print(all_predictions[75])
print('------')
print(parse_output(all_predictions[75]))
print('------')
print(all_labels[75])
print('------')
print(parse_output(all_labels[75]))
print('\n\n')




<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 14 Oct 2025

You are a science teacher. Select the correct choice from the multiple-choice options and provide a supporting explanation.<|eot_id|><|start_header_id|>user<|end_header_id|>

### Question:
<<A vector is an organism that carries what disease-causing microorganisms from one person or animal to another?>>

### Options:
<<genome>>
<<microbe>>
<<microbe>>
<<pathogen>><|eot_id|><|start_header_id|>assistant<|end_header_id|>

### Answer:
<<pathogen>>

### Support:
<<Vector A vector is an organism that carries a pathogen from one person or animal to another. Vectors are often animals, such as mosquitoes, ticks, or fleas. Pathogens are organisms that cause disease. Vectors and pathogens are shown in Figure below.>><|eot_id|>
------
{'correct_answer': 'pathogen', 'support': 'Vector A vector is an organism that carries a pathogen from one person or animal 

In [22]:
parsed_preds = [parse_output(p) for p in all_predictions]
parsed_labels = [parse_output(l) for l in all_labels]



correct = sum( (pa['correct_answer'].strip().lower() == la['correct_answer'].strip().lower()) for pa, la in zip(parsed_preds, parsed_labels) )
accuracy = correct / len(parsed_preds) if parsed_preds else 0.0
print(f"Accuracy on best model: {accuracy:.4f}")

Accuracy on best model: 0.8941
