# Imports

In [1]:
import numpy as np
import pandas as pd
import torch
import datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    EarlyStoppingCallback
)
from transformers.trainer_utils import get_last_checkpoint
from trl import DPOConfig, DPOTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import evaluate
import wandb
from datetime import datetime
import time
from tqdm.auto import tqdm
import sqlite3
import sqlparse
import _config

import os
import psutil
import GPUtil
import gc

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["WANDB_API_KEY"] = _config.WANDB_API_KEY
os.environ["WANDB_PROJECT"] = _config.WANDB_PROJECT

ENABLE_THINKING = False

2026-01-13 10:52:16.776603: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Utils

In [2]:
def get_vm_usage_metrics():
    # CPU usage
    cpu_load = psutil.cpu_percent(interval=1, percpu=True)
    for id, load in enumerate(cpu_load):
        print(f"CPU {id} load: {load:.2f}")
    # RAM usage
    ram = psutil.virtual_memory()
    print(f"RAM Total: {ram.total/(1024**3):.2f} GB, Used: {(ram.used)/(1024**3):.2f} GB")
    # GPU
    if torch.cuda.is_available():
        gpus = GPUtil.getGPUs()
        for gpu in gpus:
            print(f"GPU {gpu.id} ({gpu.name}) load: {gpu.load*100}%")
            print(f"GPU {gpu.id} ({gpu.name}) VRAM Total: {gpu.memoryTotal} MB, Used {gpu.memoryUsed} MB")
    # Disk 
    disk = psutil.disk_usage('/')
    print(f"Disk Total: {disk.total/(1024**3):.2f} GB, Used: {(disk.used)/(1024**3):.2f} GB")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'Device: {device}')
get_vm_usage_metrics()

Device: cuda
CPU 0 load: 0.00
CPU 1 load: 0.00
CPU 2 load: 3.00
CPU 3 load: 0.00
RAM Total: 27.40 GB, Used: 1.96 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 3.0 MB
Disk Total: 60.95 GB, Used: 57.29 GB


In [3]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"Trainable params: {trainable_params} || All params: {all_param} || Trainable %: {100 * trainable_params / all_param:.2f}"
    )

# Data

In [4]:
data = pd.read_csv('preference_data.xlsx')

print(data.shape)
data.head()

(6943, 5)


Unnamed: 0,sql_prompt,sql_context,sql,model_used,completion
0,What is the average moisture level for each cr...,"CREATE TABLE crop_moisture (id INT, crop_id IN...","SELECT type, AVG(moisture) as avg_moisture FRO...",meta-llama/llama-4-maverick-17b-128e-instruct,"SELECT type, moisture as avg_moisture FROM cro..."
1,Add a new job title called 'Data Science Manag...,CREATE TABLE JobTitle (JobTitleID INT PRIMARY ...,"INSERT INTO JobTitle (JobTitleID, JobTitleName...",meta-llama/llama-4-maverick-17b-128e-instruct,"INSERT INTO JobTitel (JobTitleID, JobTitleName..."
2,What is the total number of military equipment...,CREATE TABLE MaintenanceRequests (RequestID IN...,SELECT COUNT(*) FROM MaintenanceRequests WHERE...,meta-llama/llama-4-scout-17b-16e-instruct,SELECT COUNT(*) FROM MaintenanceRequests WHERE...
3,Insert a new record into the 'community_educat...,"CREATE TABLE community_education (id INT, prog...","INSERT INTO community_education (id, program, ...",moonshotai/kimi-k2-instruct,"""INSERT INTO community_education (id, program,..."
4,How many users signed up daily in the 'games' ...,"CREATE TABLE signups (user_id INT, category TE...","SELECT DATE(timestamp) as signup_date, COUNT(D...",moonshotai/kimi-k2-instruct,"SELECT DATE(timestamp) as signup_date, COUNT(u..."


In [5]:
dataset = []
for id in range(data.shape[0]):
    dataset.append({
        'prompt': [{'role': 'user', 'content': data.loc[id, 'sql_prompt']}],
        'chosen': [{'role': 'assistant', 'content': data.loc[id, 'sql']}],
        'rejected': [{'role': 'assistant', 'content': data.loc[id, 'completion']}]
    })
dataset = datasets.Dataset.from_list(dataset)

split = dataset.train_test_split(test_size=0.1, seed=42)
ds_train = split['train']
ds_valid = split['test']

ds_train

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 6248
})

# Models

In [6]:
checkpoint = "Qwen/Qwen3-0.6B"
model_path = "./qlora-final_model_all_linear_r64-output/final/"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=bnb_config
)

# Manually set LoRA parameters as trainable
for name, param in model.named_parameters():
    if "lora" in name:
        param.requires_grad = True


# model = prepare_model_for_kbit_training(model)
model.config.use_cache = False
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

print_trainable_parameters(model)
get_vm_usage_metrics()

Trainable params: 40370176 || All params: 416219136 || Trainable %: 9.70
CPU 0 load: 2.90
CPU 1 load: 1.00
CPU 2 load: 0.00
CPU 3 load: 0.00
RAM Total: 27.40 GB, Used: 2.29 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 1421.0 MB
Disk Total: 60.95 GB, Used: 57.29 GB


# DPO

In [7]:
torch.cuda.empty_cache()

timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
RUN_NAME = f'sft-dpo-qlora-lr1e5-epochs1-{timestamp}'
OUTPUT_DIR = './sft-dpo-output'
RESUME_TRAINING = False

PER_DEVICE_BATCH_SIZE = 2
effective_batch_size = 16
epochs=1
learning_rate = 1e-5
warmup_ratio = 0.1
lora_r = 16*4
lora_alpha = 64*4
lora_dropout = 0.01

gradient_accumulation_steps = int(effective_batch_size / PER_DEVICE_BATCH_SIZE)

wandb.init(
    project=os.environ["WANDB_PROJECT"],
    name=RUN_NAME,
    # id=run_id ,         # resume previous run if available
    # resume="allow",    # allows resuming crashed run
)



training_args = DPOConfig(
    output_dir=OUTPUT_DIR,
    
    num_train_epochs=epochs,
    beta=0.1,
    
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    lr_scheduler_type="cosine",
    warmup_ratio=warmup_ratio,
    save_strategy="steps",
    save_steps=gradient_accumulation_steps*5,
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=gradient_accumulation_steps*5,
    per_device_eval_batch_size=PER_DEVICE_BATCH_SIZE*2,
    eval_accumulation_steps=4,
    logging_strategy="steps",
    logging_steps=gradient_accumulation_steps*5,
    report_to=['wandb'],
    run_name=RUN_NAME,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    max_grad_norm=1,
    load_best_model_at_end=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # generate_during_eval=True
)

# peft_config = LoraConfig(
#     r=lora_r,
#     lora_alpha=lora_alpha,
#     lora_dropout=lora_dropout,
#     bias="none",
#     task_type="CAUSAL_LM",
#     target_modules='all-linear'
# )
# # model.requires_grad_(False)                     # freeze base weights (precautionary)
# model_peft = get_peft_model(model, peft_config) # inject a LoRA adapter

trainer = DPOTrainer(
    processing_class=tokenizer,
    # model=model_peft,
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


# Training setup summary
dataset_size = len(ds_train)
steps_per_epoch = dataset_size // (PER_DEVICE_BATCH_SIZE * gradient_accumulation_steps)
total_steps = steps_per_epoch * epochs
warmup_steps = int(total_steps * warmup_ratio)

print("===== Training Setup Summary =====")
print(f"Num epochs:            {epochs}")
print(f"Effective batch size:  {effective_batch_size}")
print(f"Per-device batch size: {PER_DEVICE_BATCH_SIZE}")
print(f"Gradient accumulation: {gradient_accumulation_steps}")
print(f"Dataset size:          {dataset_size}")
print(f"Steps per epoch:       {steps_per_epoch}")
print(f"Total training steps:  {total_steps}")
print(f"Warmup steps:          {warmup_steps}")
print(f"Logging steps:         {training_args.logging_steps}")
print("===================================")
print(f"Start time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")


# Training
last_checkpoint = None
if RESUME_TRAINING and os.path.isdir(OUTPUT_DIR):
    last_checkpoint = get_last_checkpoint(OUTPUT_DIR)

if last_checkpoint is not None:
    print(f"Resuming training from checkpoint: {last_checkpoint}")
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    print("Starting fresh training run")
    trainer.train()

print(f"End time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")

[34m[1mwandb[0m: Currently logged in as: [33molialeshka[0m ([33molialeshka-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Extracting prompt in train dataset:   0%|          | 0/6248 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/6248 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/6248 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/695 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/695 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/695 [00:00<?, ? examples/s]

/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


===== Training Setup Summary =====
Num epochs:            1
Effective batch size:  16
Per-device batch size: 2
Gradient accumulation: 8
Dataset size:          6248
Steps per epoch:       390
Total training steps:  390
Warmup steps:          39
Logging steps:         40
Start time: 2026-01-13_10-52-41
Starting fresh training run


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
40,0.4028,0.189363,0.387018,-4.877092,0.939655,5.264111,-59.419876,-206.670685,-2.375577,-2.390511
80,0.1515,0.208402,-2.715468,-9.585531,0.928161,6.870064,-90.444725,-253.755081,-2.077241,-2.270951
120,0.1759,0.177546,-6.578011,-14.39365,0.949713,7.815641,-129.07016,-301.836273,-1.994979,-2.188892
160,0.1029,0.147586,-8.777228,-16.922455,0.945402,8.145226,-151.062332,-327.124298,-2.371669,-2.510992
200,0.0887,0.157927,-6.635251,-16.287214,0.945402,9.651963,-129.642563,-320.771881,-2.365505,-2.552347
240,0.1069,0.157995,-6.606999,-16.268812,0.941092,9.661814,-129.360031,-320.587891,-2.471972,-2.644765


FileNotFoundError: [Errno 2] No such file or directory: './sft-dpo-output/checkpoint-160/pytorch_model.bin'

In [8]:
model.save_pretrained(f"{OUTPUT_DIR}/best_model")

# Test

In [10]:
OUTPUT_DIR = './sft-dpo-output'
checkpoint = f"{OUTPUT_DIR}/checkpoint-160/"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype=torch.float16).to(device)
model.eval()

ds = datasets.load_dataset('gretelai/synthetic_text_to_sql', streaming=False)
ds_train, ds_test = ds['train'], ds['test']


def construct_message(prompt, context):
    return [
        {"role": "system", "content": f"The user asks a question. Your task is to generate the SQL query to answer that question. Return SQL query only. The context of the question is the following: '{context}'"},
        {"role": "user", "content": prompt}
    ]

def generate_model_response_batch(messages_list, enable_thinking=True, max_new_tokens=512):
    texts = [
        tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=enable_thinking
        )
        for messages in messages_list
    ]

    model_inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        padding_side='left'
    ).to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens
    )

    responses = []
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids):
        # Slice to get only generated part
        output_only_ids = output_ids[len(input_ids):].tolist()

        # Try to find `</think>` (id 151668)
        try:
            index = len(output_only_ids) - output_only_ids[::-1].index(151668)
        except ValueError:
            index = 0

        if enable_thinking:
            thinking_content = tokenizer.decode(
                output_only_ids[:index],
                skip_special_tokens=True
            ).strip("\n")
            content = tokenizer.decode(
                output_only_ids[index:],
                skip_special_tokens=True
            ).strip("\n")
        else:
            thinking_content = None
            content = tokenizer.decode(
                output_only_ids,
                skip_special_tokens=True
            ).strip("\n")

        responses.append({
            'thinking_content': thinking_content,
            'content': content
        })

    return responses


rouge = evaluate.load("rouge")

def normalize_sql(sql):
    return sqlparse.format(sql, reindent=True, keyword_case='upper').strip()

def compute_rouge(reference, prediction):
    result = rouge.compute(predictions=[prediction], references=[reference])
    return result['rougeL']

def evaluate_sql_response(reference, prediction, sql_context):
    # ROUGE-L
    rouge_score = compute_rouge(reference, prediction)
    
    # execution check
    try:
        conn = sqlite3.connect(":memory:")
        cursor = conn.cursor()
        
        cursor.executescript(sql_context)
        cursor.execute(reference)
        ref_result = cursor.fetchall()
        
        cursor.execute(prediction)
        model_result = cursor.fetchall()
        
        execution_match = ref_result == model_result
    except Exception:
        execution_match = False
    finally:
        conn.close()
    
    # final score
    if execution_match:
        final_score = 1.0
    else:
        final_score = 0.7 * rouge_score

    return {
        "rougeL": round(rouge_score, 4),
        "execution_match": execution_match,
        "final_score": final_score
    }

In [11]:
BATCH_SIZE = 32
ENABLE_THINKING = False
MAX_NEW_TOKENS = 512


prompts = [ds_test[id]['sql_prompt'] for id in range(len(ds_test))]
contexts = [ds_test[id]['sql_context'] for id in range(len(ds_test))]

responses = []
print(f"Start time: {time.ctime(time.time())}")
for i in tqdm(range(0, len(prompts), BATCH_SIZE)):
    batch_prompts = prompts[i : i + BATCH_SIZE]
    batch_contexts = contexts[i : i + BATCH_SIZE]

    messages_list = [
        construct_message(prompt=p, context=c)
        for p, c in zip(batch_prompts, batch_contexts)
    ]

    batch_responses = generate_model_response_batch(messages_list, enable_thinking=ENABLE_THINKING, max_new_tokens=MAX_NEW_TOKENS)

    responses.extend(batch_responses)

print(f"End time: {time.ctime(time.time())}")

Start time: Tue Jan 13 13:02:24 2026


  0%|          | 0/183 [00:00<?, ?it/s]

End time: Tue Jan 13 14:45:10 2026


In [None]:
references = [ds_test[id]['sql'] for id in range(len(ds_test))]
predictions = [responses[id]['content'] for id in range(len(ds_test))]

scores = [
    evaluate_sql_response(
        reference=reference,
        prediction=prediction,
        sql_context=context
    )
    for reference, prediction, context in tqdm(zip(references, predictions, contexts), total=len(ds_test))
]

  0%|          | 0/5851 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [14]:
print(f"Mean test set score: {np.mean([score['final_score'] for score in scores]):.3f}")

Mean test set score: 0.702
