In [1]:
!pip install transformers peft trl bitsandbytes datasets

Collecting trl
  Downloading trl-0.17.0-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (

In [21]:
import os
import gc
import torch

import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
import pandas as pd
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import KTOTrainer
import bitsandbytes as bnb
from datasets import Dataset
import random
from trl import KTOConfig
import numpy as np

In [3]:
random.seed(42)
np.random.seed(42)

In [4]:
model_id = "meta-llama/Llama-3.2-1B"
# token = os.environ('HF_TOKEN')
token = ''

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token, trust_remote_code=True)

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [10]:
pairs_df = pd.read_csv("/content/train_dataset_for_kto.csv")

In [11]:
print(f"Loaded {len(pairs_df)} pairs from dataset")

Loaded 15642 pairs from dataset


In [12]:
user_prompts = [
    "Please create a medical multiple-choice question with four possible answers, only one correct.",
    "Generate a single medical multiple-choice question with exactly one correct answer.",
    "I need a medical multiple-choice question (4 options) with one correct answer.",
    "Write a medical MCQ with four answer choices, only one of which is correct.",
    "Produce a medical multiple-choice question with four options and identify a single correct choice.",
]

In [13]:
eval_size = int(len(pairs_df) * 0.2)

eval_indices = np.random.choice(len(pairs_df), size=eval_size, replace=False)
eval_df = pairs_df.iloc[eval_indices].reset_index(drop=True)
train_df = pairs_df.drop(index=eval_indices).reset_index(drop=True)

print(f"Split dataset into {len(train_df)} training pairs and {len(eval_df)} evaluation pairs")

Split dataset into 12514 training pairs and 3128 evaluation pairs


In [14]:
eval_dataset = Dataset.from_pandas(eval_df)
train_dataset = Dataset.from_pandas(train_df)

In [15]:
def add_prompt(example):
    prompt = random.choice(user_prompts)
    example["prompt"] = prompt
    return example

In [16]:
train_dataset = train_dataset.map(add_prompt)
eval_dataset = eval_dataset.map(add_prompt)

Map:   0%|          | 0/12514 [00:00<?, ? examples/s]

Map:   0%|          | 0/3128 [00:00<?, ? examples/s]

In [17]:
print("\nTraining example:")
print(train_dataset[0])
print("\nEvaluation example:")
print(eval_dataset[0])


Training example:
{'id': 'OIC-155-19-A', 'completion': 'Question: Which of the following situations would necessitate hospitalization for non-necrotizing bacterial dermohypodermatitis treatment?\r\na) A 5-year-old child with a mild infection who lives with supportive family\r\nb) An adult with diabetes and a localized skin infection showing signs of worsening\r\nc) A healthy teenager with a small, uncomplicated bacterial skin infection\r\nd) A 20-year-old individual with penicillin allergy who responds well to clindamycin treatment', 'label': True, 'source': 'gemma_9b_distractor_quality', 'prompt': 'Please create a medical multiple-choice question with four possible answers, only one correct.'}

Evaluation example:
{'id': 'OIC-184-05-A', 'completion': 'Question: When must an employee who has suffered an accident at work report it to the CPAM?\r\na) Within 24 hours\r\nb) Within 48 hours\r\nc) Within 2 years\r\nd) As soon as possible', 'label': False, 'source': 'phi3.5b_distractor_quali

In [18]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules="all-linear")

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16
# )

In [19]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=token,
    torch_dtype=torch.float16,
    load_in_8bit=True,          # This enables 8-bit quantization
    device_map="auto"
)

model.config.use_cache = False
model = prepare_model_for_kbit_training(model)

model = get_peft_model(model, peft_config)

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [20]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

print_trainable_parameters(model)

trainable params: 11272192 || all params: 1247086592 || trainable%: 0.90


In [23]:
new_model = "model"

training_args = KTOConfig(
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    remove_unused_columns=True,
    learning_rate=5.0e-06,
    eval_strategy = "steps",   # run evals by step
    eval_steps = 1000,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    num_train_epochs=8,
    save_strategy="steps",
    save_steps=1000,
    logging_steps=10,
    output_dir=new_model,
    optim="paged_adamw_32bit",
    warmup_steps=10,
    bf16=True,
    report_to="none",
    beta=0.1,
    max_prompt_length=2048,
    max_length=2048,
    load_best_model_at_end=True
)

In [None]:
kto_trainer = KTOTrainer(
    model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    peft_config=peft_config
)



Extracting prompt from train dataset:   0%|          | 0/12514 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/12514 [00:00<?, ? examples/s]

Extracting prompt from eval dataset:   0%|          | 0/3128 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/3128 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/12514 [00:00<?, ? examples/s]

In [None]:
kto_trainer.train()