In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [2]:
from unsloth import FastLanguageModel
import torch

fourbit_models = [
    "unsloth/Qwen3-1.7B-unsloth-bnb-4bit", # Qwen 14B 2x faster
    "unsloth/Qwen3-4B-unsloth-bnb-4bit",
    "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    "unsloth/Qwen3-14B-unsloth-bnb-4bit",
    "unsloth/Qwen3-32B-unsloth-bnb-4bit",

    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/Phi-4",
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit" # [NEW] We support TTS models!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-14B",
    max_seq_length = 2048,   # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # We have full finetuning now!
    # token = "hf_...",      # use one if using gated models
)

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-06-23 14:13:12.628741: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750687992.798167      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750687992.853537      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.5: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 6.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/168k [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/4.67k [00:00<?, ?B/s]

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

Unsloth 2025.6.5 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [4]:
import pandas as pd
df = pd.read_csv("/kaggle/input/intra-cuet-ml-contest/train.csv")

In [5]:
test = pd.read_csv("/kaggle/input/intra-cuet-ml-contest/test.csv")

In [6]:
def generate_mcq_prompt(example):
    question = example["question"]
    options = example["options"]  # Assumes a list like ["A. ...", "B. ...", ...]
    answer = example["answer"]    # Assumes string like "A"

    full_question = f"{question}\n" + "\n".join(options)
    return {
        "conversations": [
            {"role": "user", "content": full_question},
            {"role": "assistant", "content": answer}
        ]
    }


In [7]:
from datasets import Dataset
hf_dataset = Dataset.from_pandas(df)
processed_dataset = hf_dataset.map(generate_mcq_prompt)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [8]:
formatted_data = tokenizer.apply_chat_template(
    processed_dataset["conversations"],
    tokenize = False,
)

In [9]:
train_texts = pd.Series(formatted_data)
train_texts.name = "text"
combined_dataset = Dataset.from_pandas(pd.DataFrame(train_texts))
combined_dataset = combined_dataset.shuffle(seed=3407)

In [10]:
from unsloth import is_bfloat16_supported
from trl import SFTTrainer, SFTConfig

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = combined_dataset,  # YOUR prepared dataset
    eval_dataset = None,
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        bf16 = is_bfloat16_supported(),
        fp16 = not is_bfloat16_supported(),
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 30,  # Increase this if needed
        learning_rate = 2e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none",
    ),
)

trainer_stats = trainer.train()


average_tokens_across_devices is set to True but it is invalid when world size is1. Turn it to False automatically.


Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/1500 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,500 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 128,450,560/14,000,000,000 (0.92% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.0784
2,2.0322
3,2.0048
4,1.7773
5,1.5007
6,1.3975
7,1.4167
8,1.0633
9,1.026
10,0.8853


In [11]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,500 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 128,450,560/14,000,000,000 (0.92% trained)


Step,Training Loss
1,0.5852
2,0.6523
3,0.55
4,0.5265
5,0.5274
6,0.5875
7,0.5911
8,0.5457
9,0.4818
10,0.4762


In [12]:
messages = [
    {"role" : "user", "content" : "Solve (x + 2)^2 = 0."}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True,
    enable_thinking = True,
)

from transformers import TextStreamer
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 1024,
    temperature = 0.1, top_p = 0.95, top_k = 50,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)


<think>

</think>

B<|im_end|>


In [13]:
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/vocab.json',
 'lora_model/merges.txt',
 'lora_model/added_tokens.json',
 'lora_model/tokenizer.json')

In [None]:
from unsloth import FastLanguageModel

base_model_name = "unsloth/Qwen3-14B"  # Same base model you used for training

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/kaggle/working/lora_model",
    max_seq_length = 2048,
    load_in_4bit = True,
    dtype = torch.float16,
)

# Apply the saved LoRA weights
model.load_adapter("/kaggle/working/lora_model")
model.eval()


In [None]:
import pandas as pd

df_test = pd.read_csv("/kaggle/input/your-dataset/test.csv")


In [None]:
def format_mcq_prompt(row):
    options = [
        f"A. {row['option_1']}",
        f"B. {row['option_2']}",
        f"C. {row['option_3']}",
        f"D. {row['option_4']}",
    ]
    full_question = f"{row['question']}\n" + "\n".join(options)
    return [{"role": "user", "content": full_question}]


In [None]:
from transformers import TextStreamer
import torch

predictions = []

for idx, row in df_test.iterrows():
    messages = format_mcq_prompt(row)
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1,
            temperature=0.0,
            do_sample=False,
        )
    prediction = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:]).strip()
    prediction = prediction[0] if prediction and prediction[0] in ["A", "B", "C", "D"] else "A"  # fallback
    predictions.append(prediction)


In [None]:
submission = pd.DataFrame({
    "id": df_test["id"],
    "answer": predictions
})
submission.to_csv("submission.csv", index=False)
