### Install dependencies
Install the required libraries to load the base model, run 4-bit quantization on GPU, fine-tune with LoRA (PEFT), and handle the dataset.

In [None]:
!pip -q install -U transformers accelerate bitsandbytes
!pip -q install -U transformers accelerate bitsandbytes peft datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.9/380.9 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h

# Import libraries 

In [None]:
import os
from pathlib import Path
import random
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import json, re, torch
from peft import PeftModel
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling

### Load the dataset
Read the JSON file from the Kaggle input path, load it into memory, and quickly verify the dataset size and schema (keys of the first record).


In [None]:
path = Path("/kaggle/input/chapter1/carey_sundberg_partA_ch1_100_mcq_fa.json")

data = json.loads(path.read_text(encoding="utf-8"))
print("Loaded:", len(data))
print("First keys:", list(data[0].keys()))


Loaded: 100
First keys: ['question', 'options', 'correct', 'reasoning']


### Train/test split (80/20)
Shuffle the dataset with a fixed random seed, then split it into 80% training and 20% testing to evaluate the model before and after fine-tuning.


In [2]:
seed = 1
rng = random.Random(seed)

idx = list(range(len(data)))
rng.shuffle(idx)

n_test = int(0.2 * len(data)) 
test_idx = set(idx[:n_test])

train = [data[i] for i in range(len(data)) if i not in test_idx]
test  = [data[i] for i in range(len(data)) if i in test_idx]

print("Train:", len(train), " Test:", len(test))


Train: 80  Test: 20


### Save split datasets
Write the 80/20 split into `train.json` and `test.json` in the Kaggle working directory so they can be reused for training and evaluation.


In [3]:
out_train = Path("/kaggle/working/train.json")
out_test  = Path("/kaggle/working/test.json")

out_train.write_text(json.dumps(train, ensure_ascii=False, indent=2), encoding="utf-8")
out_test.write_text(json.dumps(test, ensure_ascii=False, indent=2), encoding="utf-8")

print("Saved:", out_train, out_test)


Saved: /kaggle/working/train.json /kaggle/working/test.json


### Load the base model (4-bit)
Load the Qwen2.5-3B-Instruct model with 4-bit NF4 quantization to reduce GPU memory usage, then initialize the tokenizer and set the model to evaluation mode.


In [None]:
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"  

bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb
)
model.eval()

print("Loaded:", MODEL_NAME)
print("Device:", model.device)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

2025-12-28 20:23:36.619085: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766953416.770698      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766953416.814906      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766953417.170908      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766953417.170940      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766953417.170943      55 computation_placer.cc:177] computation placer alr

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Loaded: Qwen/Qwen2.5-3B-Instruct
Device: cuda:0


### Baseline inference (single example)
Run the base model on one test question using a strict multiple-choice prompt, then parse the output to extract a single letter (A/B/C/D) and compare it with the ground-truth answer.


In [None]:


test = json.load(open("/kaggle/working/test.json", "r", encoding="utf-8"))

def build_prompt(item):
    opts = []
    for o in item["options"]:
        opts.append(o.replace("A:", "A)").replace("B:", "B)").replace("C:", "C)").replace("D:", "D)"))
    return (
        "یک سوال چهارگزینه‌ای شیمی آلی به زبان فارسی داریم.\n"
        "فقط حرف گزینهٔ صحیح را از بین A/B/C/D برگردان و هیچ متن دیگری ننویس.\n\n"
        f"سوال:\n{item['question']}\n\n"
        f"گزینه‌ها:\n" + "\n".join(opts) + "\n\n"
        "پاسخ:"
    )

def extract_choice(text):
    m = re.search(r"\b([ABCD])\b", text.upper())
    return m.group(1) if m else None

@torch.no_grad()
def predict_one(item):
    prompt = build_prompt(item)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    out = model.generate(
        **inputs,
        max_new_tokens=5,
        do_sample=False,
        temperature=0.0
    )
    gen = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return gen, extract_choice(gen)

ex = test[5]
gen, pred = predict_one(ex)

print("ID:", ex.get("id"))
print("Gold:", ex["correct"])
print("Model raw output:", repr(gen))
print("Parsed:", pred)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


ID: None
Gold: B
Model raw output: ' D\nD) '
Parsed: D


### Baseline evaluation (full test set)
Evaluate the base model on all test questions, compute baseline accuracy, track any outputs that fail to parse into A/B/C/D, and print a few incorrect examples for quick error analysis.


In [None]:
TEST_PATH = "/kaggle/working/test.json"

test = json.load(open(TEST_PATH, "r", encoding="utf-8"))
print("Test size:", len(test))

def build_prompt(item):
    opts = []
    for o in item["options"]:
        opts.append(o.replace("A:", "A)").replace("B:", "B)").replace("C:", "C)").replace("D:", "D)"))
    return (
        "یک سوال چهارگزینه‌ای شیمی آلی به زبان فارسی داریم.\n"
        "فقط حرف گزینهٔ صحیح را از بین A/B/C/D برگردان و هیچ متن دیگری ننویس.\n\n"
        f"سوال:\n{item['question']}\n\n"
        f"گزینه‌ها:\n" + "\n".join(opts) + "\n\n"
        "پاسخ:"
    )

def extract_choice(text):
    m = re.search(r"\b([ABCD])\b", text.upper())
    return m.group(1) if m else None

@torch.no_grad()
def predict_choice(item):
    prompt = build_prompt(item)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    out = model.generate(
        **inputs,
        max_new_tokens=5,
        do_sample=False,
        temperature=0.0
    )
    gen = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()
    return gen, extract_choice(gen)

correct = 0
none_count = 0
rows = [] 

for ex in test:
    raw, pred = predict_choice(ex)
    gold = ex["correct"]
    ok = (pred == gold)
    if pred is None:
        none_count += 1
    if ok:
        correct += 1

    rows.append({
        "id": ex.get("id"),
        "gold": gold,
        "pred": pred,
        "raw": raw,
        "is_correct": ok
    })

acc = correct / len(test)
print("Baseline Accuracy:", acc)
print("No-parse outputs:", none_count)

wrong = [r for r in rows if not r["is_correct"]]
print("Wrong:", len(wrong))
for r in wrong[:10]:
    print("-"*80)
    print("id:", r["id"])
    print("gold:", r["gold"], " pred:", r["pred"])
    print("raw output:", repr(r["raw"]))


Test size: 20
Baseline Accuracy: 0.75
No-parse outputs: 0
Wrong: 5
--------------------------------------------------------------------------------
id: None
gold: A  pred: B
raw output: 'B\n\nپاسخ'
--------------------------------------------------------------------------------
id: None
gold: A  pred: B
raw output: 'B\nB) آ'
--------------------------------------------------------------------------------
id: None
gold: B  pred: D
raw output: 'D\nD)'
--------------------------------------------------------------------------------
id: None
gold: C  pred: D
raw output: 'D) افزای'
--------------------------------------------------------------------------------
id: None
gold: A  pred: B
raw output: 'B\nB) ا'


### Build the SFT training file (prompt + answer)
Convert `train.json` into a JSONL file where each record contains a single `text` field: a formatted multiple-choice prompt followed by the correct letter. This format is used for supervised fine-tuning (SFT).


In [None]:
TRAIN_PATH = Path("/kaggle/working/train.json")
OUT_PATH = Path("/kaggle/working/train_sft.jsonl")

train = json.loads(TRAIN_PATH.read_text(encoding="utf-8"))
print("Train size:", len(train))

def build_text(item):
    opts = "\n".join([
        o.replace("A:", "A)").replace("B:", "B)").replace("C:", "C)").replace("D:", "D)")
        for o in item["options"]
    ])
    prompt = (
        "### دستورالعمل:\n"
        "یک سوال چهارگزینه‌ای شیمی آلی به زبان فارسی داریم. فقط حرف گزینهٔ صحیح را از بین A/B/C/D برگردان و هیچ متن دیگری ننویس.\n\n"
        "### سوال:\n"
        f"{item['question'].strip()}\n\n"
        "### گزینه‌ها:\n"
        f"{opts}\n\n"
        "### پاسخ:\n"
    )
    return prompt + item["correct"].strip()

OUT_PATH.write_text(
    "\n".join(json.dumps({"id": x.get("id",""), "text": build_text(x)}, ensure_ascii=False) for x in train),
    encoding="utf-8"
)

print("Saved:", OUT_PATH)

sample = json.loads(OUT_PATH.read_text(encoding="utf-8").splitlines()[0])
print("\n--- sample ---\n", sample["text"])


Train size: 80
Saved: /kaggle/working/train_sft.jsonl

--- sample ---
 ### دستورالعمل:
یک سوال چهارگزینه‌ای شیمی آلی به زبان فارسی داریم. فقط حرف گزینهٔ صحیح را از بین A/B/C/D برگردان و هیچ متن دیگری ننویس.

### سوال:
در نظریه اوربیتال مولکولی، برهم‌کنش سازنده (هم‌فاز) بین دو اوربیتال اتمی معمولاً چه نتیجه‌ای دارد؟

### گزینه‌ها:
A) ایجاد اوربیتال پیوندی با انرژی کمتر از اوربیتال‌های اتمی
B) ایجاد اوربیتال ضدپیوندی با انرژی کمتر
C) ایجاد اوربیتال ناپیوندی با انرژی بسیار بالاتر
D) حذف کامل هم‌پوشانی و بدون تغییر انرژی

### پاسخ:
A


### Tokenize the training data
Load the SFT JSONL file, tokenize each training example (with truncation to a maximum sequence length), and create a causal language modeling data collator for efficient batching during fine-tuning.


In [None]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

ds = load_dataset("json", data_files={"train": "/kaggle/working/train_sft.jsonl"})["train"]
print("train records:", len(ds))

MAX_LEN = 512 

def tok(ex):
    return tokenizer(ex["text"], truncation=True, max_length=MAX_LEN, padding=False)

tok_ds = ds.map(tok, remove_columns=ds.column_names)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[?25h

Generating train split: 0 examples [00:00, ? examples/s]

train records: 80


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

### Fine-tune with QLoRA (LoRA adapters)
Attach LoRA adapters to the 4-bit base model and fine-tune only these lightweight parameters using the tokenized training set. Save the trained adapter for later evaluation and reuse.


In [None]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.10,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

training_args = TrainingArguments(
    output_dir="/kaggle/working/carey_qwen3b_qlora_run_v2",
    num_train_epochs=2,
    per_device_train_batch_size=1,      
    gradient_accumulation_steps=16,     
    learning_rate=5e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    max_grad_norm=0.3,
    logging_steps=5,
    save_strategy="epoch",
    fp16=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_ds,
    data_collator=data_collator
)

trainer.train()
trainer.save_model("/kaggle/working/carey_lora_adapter_v2")
print("Saved adapter to /kaggle/working/carey_lora_adapter_v2")



trainable params: 14,966,784 || all params: 3,100,905,472 || trainable%: 0.4827


Step,Training Loss
5,1.8958
10,1.6975


Saved adapter to /kaggle/working/carey_lora_adapter_v2


### Post fine-tuning evaluation
Run the fine-tuned model on the full test set using the same multiple-choice prompt format, extract the predicted option (A/B/C/D), and compute the final accuracy to compare against the baseline.


In [None]:
TEST_PATH = "/kaggle/working/test.json"
ADAPTER_PATH = "/kaggle/working/carey_lora_adapter"

test = json.load(open(TEST_PATH, "r", encoding="utf-8"))
print("Test size:", len(test))

def build_prompt(item):
    opts = []
    for o in item["options"]:
        opts.append(o.replace("A:", "A)").replace("B:", "B)").replace("C:", "C)").replace("D:", "D)"))
    return (
        "یک سوال چهارگزینه‌ای شیمی آلی به زبان فارسی داریم.\n"
        "فقط حرف گزینهٔ صحیح را از بین A/B/C/D برگردان و هیچ متن دیگری ننویس.\n\n"
        f"سوال:\n{item['question']}\n\n"
        f"گزینه‌ها:\n" + "\n".join(opts) + "\n\n"
        "پاسخ:"
    )

def extract_choice(text):
    m = re.search(r"\b([ABCD])\b", text.upper())
    return m.group(1) if m else None

@torch.no_grad()
def predict(item):
    prompt = build_prompt(item)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    out = model.generate(**inputs, max_new_tokens=5, do_sample=False, temperature=0.0)
    gen = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()
    return extract_choice(gen)

correct = 0
for ex in test:
    if predict(ex) == ex["correct"]:
        correct += 1

acc_after = correct / len(test)
print("Accuracy after fine-tune:", acc_after)


Test size: 20
Accuracy after fine-tune: 0.85


Report

### Final report (summary)

In this project, we first generated a Persian multiple-choice (MCQ) dataset based on *Advanced Organic Chemistry (Carey)* using a large language model, then manually reviewed and standardized the questions. Initially, the questions covered the entire book; however, because the dataset was small (100 questions) and the book is very broad, the coverage per topic/chapter was insufficient and the fine-tuning improvements were limited. Therefore, we narrowed the scope to a single chapter to make the training data more focused and consistent.

Next, we split the dataset into **80% training / 20% testing**, loaded **Qwen2.5-3B-Instruct** in **4-bit** mode, and measured baseline performance. We then fine-tuned the model using **QLoRA/LoRA** on the training set and evaluated it again on the held-out test set.

Key challenges included **TRL/SFTTrainer version incompatibilities on Kaggle** (causing repeated argument errors) and the small test set size (20 questions), which makes evaluation more sensitive to small changes. After switching to the stable **Transformers Trainer** workflow and focusing on one chapter, the fine-tuned model improved from **0.75 accuracy before fine-tuning to 0.85 after fine-tuning** (+0.10).
