# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import datasets
import evaluate
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, EarlyStoppingCallback
from transformers.trainer_utils import get_last_checkpoint
from trl import SFTTrainer
from peft import PromptTuningConfig, PromptTuningInit, get_peft_model
from accelerate import cpu_offload
import sqlite3
import sqlparse
from tqdm.auto import tqdm
from datetime import datetime
import pickle
import wandb
import psutil
import GPUtil
import os
import gc
import math

import _config

In [2]:
ENABLE_THINKING = False

In [3]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

os.environ["WANDB_API_KEY"] = _config.WANDB_API_KEY
os.environ["WANDB_PROJECT"] = _config.WANDB_PROJECT

# Utils

In [4]:
def get_vm_usage_metrics():
    """
    Print a snapshot of machine utilisation: per-core CPU load, RAM,
    GPUs (only when torch reports CUDA as available) and the root disk.
    """
    bytes_per_gib = 1024**3

    # Per-core CPU load, sampled with interval=1
    per_core_load = psutil.cpu_percent(interval=1, percpu=True)
    for core, pct in enumerate(per_core_load):
        print(f"CPU {core} load: {pct:.2f}")

    # System memory
    mem = psutil.virtual_memory()
    print(f"RAM Total: {mem.total/bytes_per_gib:.2f} GB, Used: {mem.used/bytes_per_gib:.2f} GB")

    # GPUs are listed through GPUtil, and only when CUDA is usable from torch
    cards = GPUtil.getGPUs() if torch.cuda.is_available() else []
    for card in cards:
        print(f"GPU {card.id} ({card.name}) load: {card.load*100}%")
        print(f"GPU {card.id} ({card.name}) VRAM Total: {card.memoryTotal} MB, Used {card.memoryUsed} MB")

    # Root filesystem
    root_fs = psutil.disk_usage('/')
    print(f"Disk Total: {root_fs.total/bytes_per_gib:.2f} GB, Used: {root_fs.used/bytes_per_gib:.2f} GB")

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f'Device: {device}')
get_vm_usage_metrics()

Device: cuda
CPU 0 load: 0.00
CPU 1 load: 0.00
CPU 2 load: 1.00
CPU 3 load: 0.00
RAM Total: 27.41 GB, Used: 1.62 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 3.0 MB
Disk Total: 60.95 GB, Used: 35.30 GB


In [5]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Walks ``model.named_parameters()`` and prints the trainable element count,
    the total element count and the trainable share as a percentage with two
    decimals.
    """
    sizes = [(weight.numel(), weight.requires_grad) for _, weight in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, is_trainable in sizes if is_trainable)
    print(
        f"Trainable params: {trainable_params} || All params: {all_param} || Trainable %: {100 * trainable_params / all_param:.2f}"
    )


def return_num_trainable_parameters(model):
    """
    Count the trainable parameters of the model.

    Returns the summed ``numel()`` of every entry of ``model.named_parameters()``
    whose ``requires_grad`` flag is set.
    """
    return sum(
        weight.numel()
        for _, weight in model.named_parameters()
        if weight.requires_grad
    )

# Data

In [6]:
# Load the text-to-SQL dataset; the published test split is kept as-is.
ds = datasets.load_dataset('gretelai/synthetic_text_to_sql', streaming=False)
ds_test = ds['test']

# Hold out 2.5% of the published train split for validation (fixed seed).
split = ds['train'].train_test_split(test_size=0.025, seed=42)
ds_train, ds_valid = split['train'], split['test']

ds_train

Dataset({
    features: ['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation'],
    num_rows: 97500
})

# Model

In [7]:
# Base checkpoint and tokenizer shared by the rest of the notebook.
checkpoint = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map=device,  # follow the device detected earlier instead of assuming a GPU exists
)
# model = cpu_offload(model)

# Sanity check: parameter counts and resource usage right after loading.
print_trainable_parameters(model)
get_vm_usage_metrics()

Trainable params: 596049920 || All params: 596049920 || Trainable %: 100.00
CPU 0 load: 1.00
CPU 1 load: 0.00
CPU 2 load: 2.90
CPU 3 load: 0.00
RAM Total: 27.41 GB, Used: 1.93 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 2987.0 MB
Disk Total: 60.95 GB, Used: 35.30 GB


In [8]:
def construct_message(prompt, context):
    """
    Build the two-turn chat (system + user) used to ask the model for SQL.

    The system turn carries the task instruction with `context` embedded in
    single quotes; the user turn is `prompt` unchanged. A new list is returned
    on every call.
    """
    system_turn = {
        "role": "system",
        "content": f"The user asks a question. Your task is to generate the SQL query to answer that question. Return SQL query only. The context of the question is the following: '{context}'",
    }
    user_turn = {"role": "user", "content": prompt}
    return [system_turn, user_turn]

In [9]:
def generate_model_response_batch(model, tokenizer, messages_list, enable_thinking=True, max_new_tokens=512, think_end_token_id=151668):
    """
    Generate one chat completion per conversation with a single batched call.

    Args:
        model: Causal LM exposing ``device``, ``eval()`` and ``generate()``.
            Note that it is switched to eval mode as a side effect.
        tokenizer: Tokenizer providing ``apply_chat_template``, ``__call__``
            and ``decode``.
        messages_list: List of conversations, each a list of
            ``{"role": ..., "content": ...}`` dicts.
        enable_thinking: Forwarded to the chat template. When true, the
            generated tokens are additionally split into a reasoning part and
            an answer part at the last occurrence of ``think_end_token_id``.
        max_new_tokens: Maximum number of tokens generated per sequence.
        think_end_token_id: Id of the token that closes the reasoning section
            (151668 is `</think>` for the tokenizer used in this notebook).

    Returns:
        A list, in input order, of dicts with the keys ``'thinking_content'``
        (``None`` when ``enable_thinking`` is false) and ``'content'``.
        An empty ``messages_list`` yields an empty list.
    """
    if not messages_list:
        # Nothing to tokenize or generate; do not hand an empty batch to the tokenizer.
        return []

    texts = [
        tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=enable_thinking
        )
        for messages in messages_list
    ]

    # Left padding keeps every prompt flush against its generated continuation.
    model_inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        padding_side='left'
    ).to(model.device)

    model.eval()
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens
    )

    responses = []
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids):
        # Slice to get only the generated part (the prompt occupies the first len(input_ids) positions)
        output_only_ids = output_ids[len(input_ids):].tolist()

        if not enable_thinking:
            # No reasoning section expected: the whole completion is the answer.
            responses.append({
                'thinking_content': None,
                'content': tokenizer.decode(
                    output_only_ids,
                    skip_special_tokens=True
                ).strip("\n")
            })
            continue

        # Position just after the LAST end-of-thinking token; 0 when it never appears,
        # in which case everything is treated as answer content.
        try:
            index = len(output_only_ids) - output_only_ids[::-1].index(think_end_token_id)
        except ValueError:
            index = 0

        responses.append({
            'thinking_content': tokenizer.decode(
                output_only_ids[:index],
                skip_special_tokens=True
            ).strip("\n"),
            'content': tokenizer.decode(
                output_only_ids[index:],
                skip_special_tokens=True
            ).strip("\n")
        })

    return responses

# Evaluate

In [10]:
rouge = evaluate.load("rouge")

def normalize_sql(sql):
    """Return `sql` formatted by sqlparse (re-indented, upper-cased keywords) with outer whitespace stripped."""
    formatted = sqlparse.format(sql, reindent=True, keyword_case='upper')
    return formatted.strip()

def compute_rouge(reference, prediction):
    """Return the 'rougeL' entry computed by the module-level `rouge` metric for one prediction/reference pair."""
    scores = rouge.compute(references=[reference], predictions=[prediction])
    return scores['rougeL']

def evaluate_sql_response(reference, prediction, sql_context):
    """
    Score a predicted SQL statement against the reference statement.

    Two signals are combined:
      * ROUGE-L between `reference` and `prediction` (via `compute_rouge`);
      * an execution check in SQLite: each statement is run in its OWN fresh
        in-memory database initialised with `sql_context`, and the two runs
        match when both the fetched rows and the resulting database contents
        are equal.

    Running the statements in separate databases matters for data-modifying
    SQL: the prediction is not affected by changes the reference made (e.g. an
    identical INSERT hitting a uniqueness constraint), and two different
    modifications no longer look equal just because both return no rows.

    Args:
        reference: Ground-truth SQL statement.
        prediction: Model-generated SQL statement.
        sql_context: SQL script (schema and data) executed before each statement.

    Returns:
        Dict with ``"rougeL"`` (rounded to 4 decimals), ``"execution_match"``
        (bool) and ``"final_score"`` (1.0 on execution match, otherwise
        0.7 * ROUGE-L).
    """
    # ROUGE-L
    rouge_score = compute_rouge(reference, prediction)

    def _run_in_fresh_db(query):
        # Execute the context and then `query` in a private in-memory database;
        # return the fetched rows plus a dump of the final database state.
        conn = sqlite3.connect(":memory:")
        try:
            conn.executescript(sql_context)
            rows = conn.execute(query).fetchall()
            return rows, list(conn.iterdump())
        finally:
            conn.close()

    # execution check
    try:
        execution_match = _run_in_fresh_db(reference) == _run_in_fresh_db(prediction)
    except Exception:
        # Best effort by design: any failure (invalid SQL, context SQLite
        # cannot run, non-string prediction) simply counts as "no match".
        execution_match = False

    # final score
    if execution_match:
        final_score = 1.0
    else:
        final_score = 0.7 * rouge_score

    return {
        "rougeL": round(rouge_score, 4),
        "execution_match": execution_match,
        "final_score": final_score
    }

# Formatting functions

In [11]:
# used for training
def construct_message_with_assistant_content(example):
    """
    Turn one dataset row into a full training conversation.

    The system/user turns come from `construct_message` (built from the row's
    'sql_prompt' and 'sql_context'); the row's 'sql' is appended as the
    assistant turn. Returns ``{'messages': [...]}``.
    """
    conversation = construct_message(example['sql_prompt'], example['sql_context'])
    assistant_turn = {'role': 'assistant', 'content': example['sql']}
    return {'messages': [*conversation, assistant_turn]}

In [12]:
def formatting_func(example, enable_thinking=ENABLE_THINKING):
    """
    Render one training example's conversation as a single chat-template string.

    Args:
        example: Mapping with a "messages" entry (list of role/content dicts).
        enable_thinking: Forwarded to the tokenizer's chat template. The default
            is the value `ENABLE_THINKING` had when this function was defined.

    Returns:
        The untokenized chat-template text, without a generation prompt.
    """
    return tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False,  # no generation prompt during training
        enable_thinking=enable_thinking,  # honour the argument instead of always reading the global
    )

# Prompt tuning - step 1

In [13]:
# Subset sizes used for the hyperparameter sweep.
TRAIN_SIZE = 4096
VALID_SIZE = 1024

ds_train_sample = ds_train.take(TRAIN_SIZE)
ds_valid_sample = ds_valid.take(VALID_SIZE)

# A bare expression in the middle of a cell is never displayed, so print the sizes explicitly.
print(len(ds_train_sample), len(ds_valid_sample), len(ds_test))

# Attach the chat-formatted conversation (including the assistant answer) to every row.
ds_train_with_assistant_content = ds_train_sample.map(construct_message_with_assistant_content)
ds_valid_with_assistant_content = ds_valid_sample.map(construct_message_with_assistant_content)

In [14]:
get_vm_usage_metrics()

CPU 0 load: 2.00
CPU 1 load: 0.00
CPU 2 load: 2.00
CPU 3 load: 4.00
RAM Total: 27.41 GB, Used: 2.02 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 2987.0 MB
Disk Total: 60.95 GB, Used: 35.22 GB


In [15]:
torch.cuda.empty_cache()

In [None]:
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# W&B sweep definition: Bayesian search minimising the `eval_loss` value that
# sweep_train() logs after each run.
sweep_config = {
    'name': f'sweep-prompt-tuning-step1-epochs1-samples{TRAIN_SIZE}-{timestamp}',
    'method': 'bayes',
    'metric': {
        'name': 'eval_loss',
        'goal': 'minimize'   
    },
    'parameters': {
        'optimizer': {'values': ['adam', 'adamw', 'nadam', 'adamax']},
        'effective_batch_size': {'values': [16, 32, 64, 128, 256, 512]},
        'learning_rate': {'values': [5e-6, 1e-6, 1e-7, 5e-7, 1e-8]}, # lower learning rates
        'weight_decay': {'values': [0.0, 0.01, 0.1]},
        'betas': {'values': [(0.9, 0.999), (0.95, 0.999), (0.9, 0.9999)]},
        'warmup_ratio': {'values': [0.05, 0.1, 0.2]},
        'epochs': {'values': [1]},
        'num_virtual_tokens': {'values': [10, 20, 50, 100]},
    }
}

sweep_id = wandb.sweep(sweep_config)
# sweep_id = '9a4oj3so' # continue the crashed sweep

# Sweep value -> optimizer class
optimizer_map = {
    "adam": torch.optim.Adam,
    "adamw": torch.optim.AdamW,
    "nadam": torch.optim.NAdam,
    "adamax": torch.optim.Adamax
}

def sweep_train():
    """
    Train and evaluate one prompt-tuning configuration chosen by the sweep agent.

    Loads a fresh copy of the base checkpoint, wraps it with a prompt-tuning
    adapter sized by the run's `num_virtual_tokens`, trains on the sampled
    training set and evaluates on the sampled validation set. The evaluation
    loss (plus its perplexity and the run's hyperparameters) is then logged
    under the names the sweep metric expects.

    Model, adapter and trainer are released before returning, also when the
    run fails, so that consecutive runs of the agent do not pile up GPU memory.
    """
    PER_DEVICE_BATCH_SIZE = 2  # higher values --> OOM
    model = model_peft = trainer = None
    try:
        # The context manager finishes the W&B run on exit; no explicit wandb.finish() is needed.
        with wandb.init():
            config = wandb.config
            model = AutoModelForCausalLM.from_pretrained(checkpoint)
            gradient_accumulation_steps = max(1, config.effective_batch_size // PER_DEVICE_BATCH_SIZE)

            training_args = TrainingArguments(
                per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
                gradient_accumulation_steps=gradient_accumulation_steps,
                learning_rate=config.learning_rate,
                num_train_epochs=config.epochs,
                weight_decay=config.weight_decay,
                lr_scheduler_type="cosine",
                warmup_ratio=config.warmup_ratio,
                save_strategy="no",
                eval_strategy="epoch",
                logging_strategy="steps",
                logging_steps=1,
                report_to=['wandb'],
                fp16=True,
                fp16_full_eval=True,
                metric_for_best_model="eval_loss",
                greater_is_better=False,
                max_grad_norm=1,
                # load_best_model_at_end=True
            )

            def build_optimizer(model):
                """Instantiate the optimizer selected by the sweep over the trainable parameters only."""
                optimizer_class = optimizer_map[config.optimizer]
                trainable_parameters = [p for p in model.parameters() if p.requires_grad]
                return optimizer_class(
                    trainable_parameters,
                    lr=config.learning_rate,
                    weight_decay=config.weight_decay,
                    betas=tuple(config.betas)  # the sweep config delivers betas as a list
                )

            peft_config = PromptTuningConfig(
                task_type="CAUSAL_LM",
                num_virtual_tokens=config.num_virtual_tokens,
                prompt_tuning_init=PromptTuningInit.RANDOM,
                tokenizer_name_or_path=checkpoint
            )
            model.requires_grad_(False)                     # freeze base weights (precautionary)
            model_peft = get_peft_model(model, peft_config) # inject prompt tuning parameters

            print_trainable_parameters(model_peft)
            num_trainable_parameters = return_num_trainable_parameters(model_peft)

            trainer = SFTTrainer(
                model=model_peft,
                train_dataset=ds_train_with_assistant_content,
                eval_dataset=ds_valid_with_assistant_content,
                formatting_func=formatting_func,
                args=training_args,
                optimizers=(build_optimizer(model_peft), None),  # (optimizer, scheduler)
                callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
            )

            trainer.train()

            # Re-log the evaluation results under the flat names used by the sweep metric.
            for log in trainer.state.log_history:
                if 'eval_loss' in log:
                    wandb.log({
                        "eval_loss": log['eval_loss'],
                        "eval_perplexity": math.exp(log['eval_loss']),
                        "step": log['step'],
                        "learning_rate": config.learning_rate,
                        "weight_decay": config.weight_decay,
                        "betas": config.betas,
                        "warmup_ratio": config.warmup_ratio,
                        "effective_batch_size": config.effective_batch_size,
                        "optimizer": config.optimizer,
                        "num_trainable_parameters": num_trainable_parameters
                    })
    finally:
        # Drop every reference first, collect, and only then release the cached
        # GPU memory; emptying the cache while the objects are alive frees nothing.
        del trainer, model_peft, model
        gc.collect()
        torch.cuda.empty_cache()

wandb.agent(sweep_id, function=sweep_train, count=30)

Create sweep with ID: lc4wbh65
Sweep URL: https://wandb.ai/olialeshka-none/text-to-sql/sweeps/lc4wbh65


[34m[1mwandb[0m: Agent Starting Run: mxb2punu with config:
[34m[1mwandb[0m: 	betas: [0.9, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 512
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 5e-06
[34m[1mwandb[0m: 	num_virtual_tokens: 50
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.01
[34m[1mwandb[0m: Currently logged in as: [33molialeshka[0m ([33molialeshka-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Trainable params: 51200 || All params: 596101120 || Trainable %: 0.01


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,8.1115,8.258953,3.174531,799399.0,0.174135


0,1
effective_batch_size,▁
eval/entropy,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁

0,1
effective_batch_size,512
eval/entropy,3.17453
eval/loss,8.25895
eval/mean_token_accuracy,0.17414
eval/num_tokens,799399
eval/runtime,51.7755
eval/samples_per_second,19.778
eval/steps_per_second,2.472
eval_loss,8.25895
eval_perplexity,3862.04878


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 4yt3x7uo with config:
[34m[1mwandb[0m: 	betas: [0.9, 0.9999]
[34m[1mwandb[0m: 	effective_batch_size: 256
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 1e-07
[34m[1mwandb[0m: 	num_virtual_tokens: 20
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.1


Trainable params: 20480 || All params: 596070400 || Trainable %: 0.00


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss


# Prompt tuning - step 2 - the final model

In [16]:
# Release cached GPU memory left over from the sweep and show current usage.
torch.cuda.empty_cache()
get_vm_usage_metrics()

# Sizes of the full train / validation / test splits.
print(*(len(part) for part in (ds_train, ds_valid, ds_test)))

# Rebuild the chat-formatted datasets, this time from the full splits.
ds_train_with_assistant_content, ds_valid_with_assistant_content = (
    ds_train.map(construct_message_with_assistant_content),
    ds_valid.map(construct_message_with_assistant_content),
)

CPU 0 load: 2.00
CPU 1 load: 2.00
CPU 2 load: 3.00
CPU 3 load: 3.00
RAM Total: 27.41 GB, Used: 1.99 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 2407.0 MB
Disk Total: 60.95 GB, Used: 35.22 GB
97500 2500 5851


In [None]:
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# resuming the prev run
# timestamp = '2025-08-19_08-33-30'
RUN_NAME = f'prompt-tuning-final-model-{timestamp}'
# run_id = 'imoh6jtd'
wandb.init(
    project=os.environ["WANDB_PROJECT"],
    name=RUN_NAME,
    # id=run_id,         # resume previous run if available
    resume="allow",    # allows resuming crashed run
)


RESUME_TRAINING = True
OUTPUT_DIR = "./prompt-tuning-final_model-output"
PER_DEVICE_BATCH_SIZE = 2  # higher values --> OOM

# Hyperparameters of the final run.
# NOTE(review): the validation loss logged below this cell stays at ~7.22 over
# the first 240 steps with this learning rate - confirm that 1e-8 is intended.
optimizer = 'nadam'
effective_batch_size = 32
learning_rate = 1e-8
weight_decay = 0.01
betas = (0.95, 0.999)
warmup_ratio = 0.2
epochs = 1
gradient_accumulation_steps = max(1, effective_batch_size // PER_DEVICE_BATCH_SIZE)
num_virtual_tokens = 10

# Optimizer name -> optimizer class
optimizer_map = {
    "adam": torch.optim.Adam,
    "adamw": torch.optim.AdamW,
    "nadam": torch.optim.NAdam,
    "adamax": torch.optim.Adamax
}

# Checkpointing, evaluation and logging share the same step interval.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    num_train_epochs=epochs,
    weight_decay=weight_decay,
    lr_scheduler_type="cosine",
    warmup_ratio=warmup_ratio,
    save_strategy="steps",
    save_steps=gradient_accumulation_steps*5,
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=gradient_accumulation_steps*5,
    logging_strategy="steps",
    logging_steps=gradient_accumulation_steps*5,
    report_to=['wandb'],
    run_name=RUN_NAME,
    fp16=True,
    fp16_full_eval=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    max_grad_norm=1,
    load_best_model_at_end=True,
    gradient_checkpointing=True
)

def build_optimizer(model):
    """Instantiate the configured optimizer over the model's trainable parameters only."""
    optimizer_class = optimizer_map[optimizer]
    trainable_parameters = [p for p in model.parameters() if p.requires_grad]
    return optimizer_class(
        trainable_parameters,
        lr=learning_rate,
        weight_decay=weight_decay,
        betas=betas
    )


peft_config = PromptTuningConfig(
    task_type="CAUSAL_LM",
    num_virtual_tokens=num_virtual_tokens,
    prompt_tuning_init=PromptTuningInit.RANDOM,
    tokenizer_name_or_path=checkpoint
)
# Wrap the base model exactly once, so the parameter report below describes
# the very adapter that is handed to the trainer.
model.requires_grad_(False)                     # freeze base weights (precautionary)
model_peft = get_peft_model(model, peft_config) # inject prompt tuning parameters

print_trainable_parameters(model_peft)
num_trainable_parameters = return_num_trainable_parameters(model_peft)

trainer = SFTTrainer(
    model=model_peft,
    train_dataset=ds_train_with_assistant_content,
    eval_dataset=ds_valid_with_assistant_content,
    formatting_func=formatting_func,
    args=training_args,
    optimizers=(build_optimizer(model_peft), None),  # (optimizer, scheduler)
    callbacks=[EarlyStoppingCallback(early_stopping_patience=25)]
)


# Training setup summary (informational estimate computed from the dataset size)
dataset_size = len(ds_train_with_assistant_content)
steps_per_epoch = dataset_size // (PER_DEVICE_BATCH_SIZE * gradient_accumulation_steps)
total_steps = steps_per_epoch * epochs
warmup_steps = int(total_steps * warmup_ratio)

print("===== Training Setup Summary =====")
print(f"Num epochs:            {epochs}")
print(f"Effective batch size:  {effective_batch_size}")
print(f"Per-device batch size: {PER_DEVICE_BATCH_SIZE}")
print(f"Gradient accumulation: {gradient_accumulation_steps}")
print(f"Dataset size:          {dataset_size}")
print(f"Steps per epoch:       {steps_per_epoch}")
print(f"Total training steps:  {total_steps}")
print(f"Warmup steps:          {warmup_steps}")
print(f"Logging steps:         {training_args.logging_steps}")
print("===================================")
print(f"Start time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")


# Training: continue from the newest checkpoint in OUTPUT_DIR when resuming is enabled
last_checkpoint = None
if RESUME_TRAINING and os.path.isdir(OUTPUT_DIR):
    last_checkpoint = get_last_checkpoint(OUTPUT_DIR)

if last_checkpoint is not None:
    print(f"Resuming training from checkpoint: {last_checkpoint}")
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    print("Starting fresh training run")
    trainer.train()

print(f"End time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")


# WandB logging of eval metrics
for log in trainer.state.log_history:
    if 'eval_loss' in log:
        wandb.log({
            "eval_loss": log['eval_loss'],
            "eval_perplexity": math.exp(log['eval_loss']),
            "step": log['step'],
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "betas": betas,
            "warmup_ratio": warmup_ratio,
            "effective_batch_size": effective_batch_size,
            "optimizer": optimizer,
            "num_trainable_parameters": num_trainable_parameters
        })

wandb.finish()  # finish the run

Trainable params: 10240 || All params: 596060160 || Trainable %: 0.00
===== Training Setup Summary =====
Num epochs:            1
Effective batch size:  32
Per-device batch size: 2
Gradient accumulation: 16
Dataset size:          97500
Steps per epoch:       3046
Total training steps:  3046
Warmup steps:          609
Logging steps:         80
Start time: 2025-10-18_16-30-45
Starting fresh training run


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
80,7.2097,7.22197,2.813579,496224.0,0.230796
160,7.161,7.220797,2.813698,999432.0,0.230785
240,7.2111,7.22403,2.814249,1494927.0,0.230727


In [None]:
model_path = os.path.join(OUTPUT_DIR, 'final')
trainer.save_model(model_path)

## Test

In [13]:
torch.cuda.empty_cache()
get_vm_usage_metrics()

CPU 0 load: 0.00
CPU 1 load: 0.00
CPU 2 load: 2.00
CPU 3 load: 0.00
RAM Total: 27.41 GB, Used: 1.93 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 2407.0 MB
Disk Total: 60.95 GB, Used: 35.30 GB


In [14]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

# Training checkpoint evaluated on the test set.
model_path = './prompt-tuning-final_model-output/checkpoint-1680'

# Tokenizer and adapter-wrapped model are both read from the checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoPeftModelForCausalLM.from_pretrained(model_path)
model = model.to(device)
model.eval()

PeftModelForCausalLM(
  (base_model): Qwen3ForCausalLM(
    (model): Qwen3Model(
      (embed_tokens): Embedding(151936, 1024)
      (layers): ModuleList(
        (0-27): 28 x Qwen3DecoderLayer(
          (self_attn): Qwen3Attention(
            (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
            (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
            (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
            (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          )
          (mlp): Qwen3MLP(
            (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
            (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
            (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
            (act_fn): SiLU()
          )
          (input_layernor

In [None]:
# Generation settings for the test-set run.
BATCH_SIZE = 32
ENABLE_THINKING = False
MAX_NEW_TOKENS = 512


# Questions and schema contexts of the test split, in dataset order.
prompts = list(ds_test['sql_prompt'])
contexts = list(ds_test['sql_context'])

# Generate answers batch by batch; `responses` stays aligned with `prompts`.
responses = []
print(f"Start time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")
for start in tqdm(range(0, len(prompts), BATCH_SIZE)):
    stop = start + BATCH_SIZE
    messages_list = [
        construct_message(prompt=question, context=schema)
        for question, schema in zip(prompts[start:stop], contexts[start:stop])
    ]
    responses += generate_model_response_batch(
        model,
        tokenizer,
        messages_list,
        enable_thinking=ENABLE_THINKING,
        max_new_tokens=MAX_NEW_TOKENS,
    )

print(f"End time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")

Start time: 2025-10-19_07-12-59


  0%|          | 0/183 [00:00<?, ?it/s]



In [20]:
# Ground-truth SQL and the generated answers, aligned by test-set position.
references = list(ds_test['sql'])
predictions = [responses[idx]['content'] for idx in range(len(ds_test))]

# Score every (reference, prediction) pair against its schema context.
scores = []
for reference, prediction, context in tqdm(zip(references, predictions, contexts), total=len(ds_test)):
    scores.append(
        evaluate_sql_response(
            reference=reference,
            prediction=prediction,
            sql_context=context
        )
    )

  0%|          | 0/5851 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [19]:
print(f"Mean test set score: {np.mean([score['final_score'] for score in scores]):.3f}")

Mean test set score: 0.052


In [None]:
with open('prompt_tuning_test_predictions.pkl', 'wb') as f:
    pickle.dump(predictions, f)