# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import datasets
import evaluate
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig, EarlyStoppingCallback
from transformers.trainer_utils import get_last_checkpoint
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from accelerate import cpu_offload
import sqlite3
import sqlparse
from tqdm.auto import tqdm
from datetime import datetime
import pickle
import wandb
import psutil
import GPUtil
import os
import gc
import math

import _config

In [2]:
ENABLE_THINKING = False

In [3]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

os.environ["WANDB_API_KEY"] = _config.WANDB_API_KEY
os.environ["WANDB_PROJECT"] = _config.WANDB_PROJECT

# Utils

In [4]:
def get_vm_usage_metrics():
    # CPU usage
    cpu_load = psutil.cpu_percent(interval=1, percpu=True)
    for id, load in enumerate(cpu_load):
        print(f"CPU {id} load: {load:.2f}")
    # RAM usage
    ram = psutil.virtual_memory()
    print(f"RAM Total: {ram.total/(1024**3):.2f} GB, Used: {(ram.used)/(1024**3):.2f} GB")
    # GPU
    if torch.cuda.is_available():
        gpus = GPUtil.getGPUs()
        for gpu in gpus:
            print(f"GPU {gpu.id} ({gpu.name}) load: {gpu.load*100}%")
            print(f"GPU {gpu.id} ({gpu.name}) VRAM Total: {gpu.memoryTotal} MB, Used {gpu.memoryUsed} MB")
    # Disk 
    disk = psutil.disk_usage('/')
    print(f"Disk Total: {disk.total/(1024**3):.2f} GB, Used: {(disk.used)/(1024**3):.2f} GB")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'Device: {device}')
get_vm_usage_metrics()

Device: cuda
CPU 0 load: 0.00
CPU 1 load: 0.00
CPU 2 load: 2.00
CPU 3 load: 0.00
RAM Total: 27.41 GB, Used: 1.62 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 3.0 MB
Disk Total: 60.95 GB, Used: 38.36 GB


In [5]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"Trainable params: {trainable_params} || All params: {all_param} || Trainable %: {100 * trainable_params / all_param:.2f}"
    )

# Data

In [6]:
ds = datasets.load_dataset('gretelai/synthetic_text_to_sql', streaming=False)
ds_train, ds_test = ds['train'], ds['test']

split = ds_train.train_test_split(test_size=0.025, seed=42)
ds_train = split['train']
ds_valid = split['test']

ds_train

Dataset({
    features: ['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation'],
    num_rows: 97500
})

# Model

In [None]:
checkpoint = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="cuda",
)
# model = cpu_offload(model)

print_trainable_parameters(model)
get_vm_usage_metrics()

In [7]:
def construct_message(prompt, context):
    return [
        {"role": "system", "content": f"The user asks a question. Your task is to generate the SQL query to answer that question. Return SQL query only. The context of the question is the following: '{context}'"},
        {"role": "user", "content": prompt}
    ]

In [8]:
def generate_model_response_batch(model, tokenizer, messages_list, enable_thinking=True, max_new_tokens=512):
    texts = [
        tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=enable_thinking
        )
        for messages in messages_list
    ]

    model_inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        padding_side='left'
    ).to(model.device)

    model.eval()
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens
    )

    responses = []
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids):
        # Slice to get only generated part
        output_only_ids = output_ids[len(input_ids):].tolist()

        # Try to find `</think>` (id 151668)
        try:
            index = len(output_only_ids) - output_only_ids[::-1].index(151668)
        except ValueError:
            index = 0

        if enable_thinking:
            thinking_content = tokenizer.decode(
                output_only_ids[:index],
                skip_special_tokens=True
            ).strip("\n")
            content = tokenizer.decode(
                output_only_ids[index:],
                skip_special_tokens=True
            ).strip("\n")
        else:
            thinking_content = None
            content = tokenizer.decode(
                output_only_ids,
                skip_special_tokens=True
            ).strip("\n")

        responses.append({
            'thinking_content': thinking_content,
            'content': content
        })

    return responses

# Evaluate

In [9]:
rouge = evaluate.load("rouge")

def normalize_sql(sql):
    return sqlparse.format(sql, reindent=True, keyword_case='upper').strip()

def compute_rouge(reference, prediction):
    result = rouge.compute(predictions=[prediction], references=[reference])
    return result['rougeL']

def evaluate_sql_response(reference, prediction, sql_context):
    # ROUGE-L
    rouge_score = compute_rouge(reference, prediction)
    
    # execution check
    try:
        conn = sqlite3.connect(":memory:")
        cursor = conn.cursor()
        
        cursor.executescript(sql_context)
        cursor.execute(reference)
        ref_result = cursor.fetchall()
        
        cursor.execute(prediction)
        model_result = cursor.fetchall()
        
        execution_match = ref_result == model_result
    except Exception:
        execution_match = False
    finally:
        conn.close()
    
    # final score
    if execution_match:
        final_score = 1.0
    else:
        final_score = 0.7 * rouge_score

    return {
        "rougeL": round(rouge_score, 4),
        "execution_match": execution_match,
        "final_score": final_score
    }

# Formatting functions

In [10]:
# used for training
def construct_message_with_assistant_content(example):
    messages = construct_message(example['sql_prompt'], example['sql_context'])
    messages.append({
        'role': 'assistant',
        'content': example['sql']
    })
    return {'messages': messages}

In [11]:
def formatting_func(example, enable_thinking=ENABLE_THINKING):
    return tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False, # no generation prompt during training
        enable_thinking=ENABLE_THINKING 
    )

# LoRA- step 1

In [12]:
TRAIN_SIZE = 4096
VALID_SIZE = 1024

ds_train_sample = ds_train.take(TRAIN_SIZE)
ds_valid_sample = ds_valid.take(VALID_SIZE)

len(ds_train_sample), len(ds_valid_sample), len(ds_test)

ds_train_with_assistant_content = ds_train_sample.map(construct_message_with_assistant_content)
ds_valid_with_assistant_content = ds_valid_sample.map(construct_message_with_assistant_content)

In [13]:
get_vm_usage_metrics()

CPU 0 load: 1.00
CPU 1 load: 0.00
CPU 2 load: 1.00
CPU 3 load: 0.00
RAM Total: 27.41 GB, Used: 1.79 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 2987.0 MB
Disk Total: 60.95 GB, Used: 32.28 GB


In [14]:
torch.cuda.empty_cache()

In [None]:
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

sweep_config = {
    'name': f'sweep-lora-step1-epochs1-samples{TRAIN_SIZE}-{timestamp}',
    'method': 'bayes',
    'metric': {
        'name': 'eval_loss',
        'goal': 'minimize'   
    },
    'parameters': {
        'optimizer': {'values': ['adam', 'adamw', 'nadam', 'adamax']},
        'effective_batch_size': {'values': [16, 32, 64, 128, 256, 512]},
        'learning_rate': {'values': [1e-4, 5e-5, 1e-5, 5e-6, 1e-6]},
        'weight_decay': {'values': [0.0, 0.01, 0.1]},
        'betas': {'values': [(0.9, 0.999), (0.95, 0.999), (0.9, 0.9999)]},
        'warmup_ratio': {'values': [0.05, 0.1, 0.2]},
        'epochs': {'values': [1]},
        'lora_r': {'values': [4, 8, 16, 32]},
        'lora_alpha': {'values': [2, 4, 8, 16, 32, 64]},
        'lora_dropout': {'values': [0.01, 0.05, 0.1, 0.2]}
    }
}

# sweep_id = wandb.sweep(sweep_config)
sweep_id = '9a4oj3so' # continue the crashed sweep

optimizer_map = {
    "adam": torch.optim.Adam,
    "adamw": torch.optim.AdamW,
    "nadam": torch.optim.NAdam,
    "adamax": torch.optim.Adamax
}

def sweep_train():
    with wandb.init() as run:
        model = AutoModelForCausalLM.from_pretrained(checkpoint)
        config = wandb.config  
        PER_DEVICE_BATCH_SIZE = 2  # higher values --> OOM
        gradient_accumulation_steps = int(config.effective_batch_size / PER_DEVICE_BATCH_SIZE)
        
        training_args = TrainingArguments(
            per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
            gradient_accumulation_steps=gradient_accumulation_steps,
            learning_rate=config.learning_rate,
            num_train_epochs=config.epochs,
            weight_decay=config.weight_decay,
            lr_scheduler_type="cosine",
            warmup_ratio=config.warmup_ratio,
            save_strategy="no",
            eval_strategy="epoch",
            logging_strategy="steps",
            logging_steps=1,
            report_to=['wandb'],
            fp16=True,
            fp16_full_eval=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            max_grad_norm=1,
            # load_best_model_at_end=True
        )
        
        def build_optimizer(model):
            optimizer_class = optimizer_map[config.optimizer]
            return optimizer_class(
                model.parameters(),
                lr=config.learning_rate,
                weight_decay=config.weight_decay,
                betas=config.betas
            )


        peft_config = LoraConfig(
            r=config.lora_r,
            lora_alpha=config.lora_alpha,
            lora_dropout=config.lora_dropout,
            bias="none",
            task_type="CAUSAL_LM"
        )
        model.requires_grad_(False)                     # freeze base weights (precautionary)
        model_peft = get_peft_model(model, peft_config) # inject a LoRA adapter
        
        trainer = SFTTrainer(
            model=model_peft,
            train_dataset=ds_train_with_assistant_content,
            eval_dataset=ds_valid_with_assistant_content,
            formatting_func=formatting_func,
            args=training_args,
            optimizers=(build_optimizer(model_peft), None),  # (optimizer, scheduler)
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
        )

        trainer.train()

        for log in trainer.state.log_history:
            if 'eval_loss' in log:
                wandb.log({
                    "eval_loss": log['eval_loss'],
                    "eval_perplexity": math.exp(log['eval_loss']),
                    "step": log['step'],
                    "learning_rate": config.learning_rate,
                    "weight_decay": config.weight_decay,
                    "betas": config.betas,
                    "warmup_ratio": config.warmup_ratio,
                    "effective_batch_size": config.effective_batch_size,
                    "optimizer": config.optimizer
                })
        wandb.finish(); # finish the run
        del trainer
        torch.cuda.empty_cache()
        gc.collect()
        
wandb.agent(sweep_id, function=sweep_train, count=60)

[34m[1mwandb[0m: Agent Starting Run: a8dgyigc with config:
[34m[1mwandb[0m: 	betas: [0.9, 0.9999]
[34m[1mwandb[0m: 	effective_batch_size: 16
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.2
[34m[1mwandb[0m: 	lora_r: 8
[34m[1mwandb[0m: 	optimizer: adamw
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.1
[34m[1mwandb[0m: Currently logged in as: [33molialeshka[0m ([33molialeshka-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss


# LoRA - step 2

In [15]:
torch.cuda.empty_cache()
get_vm_usage_metrics()


TRAIN_SIZE = 8192
VALID_SIZE = 2048

ds_train_sample = ds_train.take(TRAIN_SIZE)
ds_valid_sample = ds_valid.take(VALID_SIZE)

print(len(ds_train_sample), len(ds_valid_sample), len(ds_test))

ds_train_with_assistant_content = ds_train_sample.map(construct_message_with_assistant_content)
ds_valid_with_assistant_content = ds_valid_sample.map(construct_message_with_assistant_content)

CPU 0 load: 0.00
CPU 1 load: 0.00
CPU 2 load: 1.00
CPU 3 load: 0.00
RAM Total: 27.41 GB, Used: 1.73 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 2407.0 MB
Disk Total: 60.95 GB, Used: 35.50 GB
8192 2048 5851


Map:   0%|          | 0/8192 [00:00<?, ? examples/s]

Map:   0%|          | 0/2048 [00:00<?, ? examples/s]

In [None]:
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

sweep_config = {
    'name': f'sweep-lora-step2-epochs1-samples{TRAIN_SIZE}-{timestamp}',
    'method': 'grid',
    'metric': {
        'name': 'eval_loss',
        'goal': 'minimize'   
    },
    'parameters': {
        'optimizer': {'values': ['adamw', 'nadam']},
        'effective_batch_size': {'values': [16]},
        'learning_rate': {'values': [1e-5]}, # best results from bayes search have 1e-4 -> setting to a lower value
        'weight_decay': {'values': [0.0]},
        'betas': {'values': [(0.9, 0.9999)]},
        'warmup_ratio': {'values': [0.05, 0.1, 0.2]},
        'epochs': {'values': [1]},
        'lora_r': {'values': [8, 16, 32]},
        'lora_alpha': {'values': [64]},
        'lora_dropout': {'values': [0.01, 0.05, 0.1, 0.2]}
    }
}

sweep_id = wandb.sweep(sweep_config)
# sweep_id = '9a4oj3so' # continue the crashed sweep

optimizer_map = {
    "adam": torch.optim.Adam,
    "adamw": torch.optim.AdamW,
    "nadam": torch.optim.NAdam,
    "adamax": torch.optim.Adamax
}

def sweep_train():
    with wandb.init() as run:
        model = AutoModelForCausalLM.from_pretrained(checkpoint)
        config = wandb.config  
        PER_DEVICE_BATCH_SIZE = 2  # higher values --> OOM
        gradient_accumulation_steps = int(config.effective_batch_size / PER_DEVICE_BATCH_SIZE)
        
        training_args = TrainingArguments(
            per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
            gradient_accumulation_steps=gradient_accumulation_steps,
            learning_rate=config.learning_rate,
            num_train_epochs=config.epochs,
            weight_decay=config.weight_decay,
            lr_scheduler_type="cosine",
            warmup_ratio=config.warmup_ratio,
            save_strategy="no",
            eval_strategy="epoch",
            logging_strategy="steps",
            logging_steps=1,
            report_to=['wandb'],
            fp16=True,
            fp16_full_eval=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            max_grad_norm=1,
            # load_best_model_at_end=True
        )
        
        def build_optimizer(model):
            optimizer_class = optimizer_map[config.optimizer]
            return optimizer_class(
                model.parameters(),
                lr=config.learning_rate,
                weight_decay=config.weight_decay,
                betas=config.betas
            )


        peft_config = LoraConfig(
            r=config.lora_r,
            lora_alpha=config.lora_alpha,
            lora_dropout=config.lora_dropout,
            bias="none",
            task_type="CAUSAL_LM"
        )
        model.requires_grad_(False)                     # freeze base weights (precautionary)
        model_peft = get_peft_model(model, peft_config) # inject a LoRA adapter
        
        trainer = SFTTrainer(
            model=model_peft,
            train_dataset=ds_train_with_assistant_content,
            eval_dataset=ds_valid_with_assistant_content,
            formatting_func=formatting_func,
            args=training_args,
            optimizers=(build_optimizer(model_peft), None),  # (optimizer, scheduler)
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
        )

        trainer.train()

        for log in trainer.state.log_history:
            if 'eval_loss' in log:
                wandb.log({
                    "eval_loss": log['eval_loss'],
                    "eval_perplexity": math.exp(log['eval_loss']),
                    "step": log['step'],
                    "learning_rate": config.learning_rate,
                    "weight_decay": config.weight_decay,
                    "betas": config.betas,
                    "warmup_ratio": config.warmup_ratio,
                    "effective_batch_size": config.effective_batch_size,
                    "optimizer": config.optimizer
                })
        wandb.finish(); # finish the run
        del trainer
        torch.cuda.empty_cache()
        gc.collect()
        
wandb.agent(sweep_id, function=sweep_train)

Create sweep with ID: ub224w5u
Sweep URL: https://wandb.ai/olialeshka-none/text-to-sql/sweeps/ub224w5u


[34m[1mwandb[0m: Agent Starting Run: d1jy1p86 with config:
[34m[1mwandb[0m: 	betas: [0.9, 0.9999]
[34m[1mwandb[0m: 	effective_batch_size: 16
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	lora_alpha: 64
[34m[1mwandb[0m: 	lora_dropout: 0.01
[34m[1mwandb[0m: 	lora_r: 8
[34m[1mwandb[0m: 	optimizer: adamw
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0


[1;34mwandb[0m: 
[1;34mwandb[0m: ðŸš€ View run [33mfloral-sweep-1[0m at: [34mhttps://wandb.ai/olialeshka-none/text-to-sql/runs/r450607u[0m
[1;34mwandb[0m: Find logs at: [1;35mwandb/run-20251004_103745-r450607u/logs[0m


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss


# LoRA - step 3 - the final model

In [12]:
torch.cuda.empty_cache()
get_vm_usage_metrics()

print(len(ds_train), len(ds_valid), len(ds_test))

ds_train_with_assistant_content = ds_train.map(construct_message_with_assistant_content)
ds_valid_with_assistant_content = ds_valid.map(construct_message_with_assistant_content)

CPU 0 load: 0.00
CPU 1 load: 0.00
CPU 2 load: 0.00
CPU 3 load: 0.00
RAM Total: 27.41 GB, Used: 1.84 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 2407.0 MB
Disk Total: 60.95 GB, Used: 32.44 GB
97500 2500 5851


In [None]:
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# resuming the prev run
# timestamp = '2025-08-19_08-33-30'
RUN_NAME = f'lora-final-model-{timestamp}'
# run_id = 'imoh6jtd'
wandb.init(
    project=os.environ["WANDB_PROJECT"],
    name=RUN_NAME,
    # id=run_id,         # resume previous run if available
    resume="allow",    # allows resuming crashed run
)


RESUME_TRAINING = True
OUTPUT_DIR = "./lora-final_model-output"
PER_DEVICE_BATCH_SIZE = 2  # higher values --> OOM

optimizer = 'nadam'
effective_batch_size = 16
learning_rate = 1e-5
weight_decay = 0.0
betas = (0.9, 0.9999)
warmup_ratio = 0.2
epochs = 1
gradient_accumulation_steps = int(effective_batch_size / PER_DEVICE_BATCH_SIZE)
lora_r = 16
lora_alpha = 64
lora_dropout = 0.01

optimizer_map = {
    "adam": torch.optim.Adam,
    "adamw": torch.optim.AdamW,
    "nadam": torch.optim.NAdam,
    "adamax": torch.optim.Adamax
}

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    num_train_epochs=epochs,
    weight_decay=weight_decay,
    lr_scheduler_type="cosine",
    warmup_ratio=warmup_ratio,
    save_strategy="steps",
    save_steps=gradient_accumulation_steps*5,
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=gradient_accumulation_steps*5,
    logging_strategy="steps",
    logging_steps=gradient_accumulation_steps*5,
    report_to=['wandb'],
    run_name=RUN_NAME,
    fp16=True,
    fp16_full_eval=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    max_grad_norm=1,
    load_best_model_at_end=True,
    gradient_checkpointing=True
)

def build_optimizer(model):
    optimizer_class = optimizer_map[optimizer]
    return optimizer_class(
        model.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay,
        betas=betas
    )


peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM"
)
model.requires_grad_(False)                     # freeze base weights (precautionary)
model_peft = get_peft_model(model, peft_config) # inject a LoRA adapter

trainer = SFTTrainer(
    model=model_peft,
    train_dataset=ds_train_with_assistant_content,
    eval_dataset=ds_valid_with_assistant_content,
    formatting_func=formatting_func,
    args=training_args,
    optimizers=(build_optimizer(model_peft), None),  # (optimizer, scheduler)
    callbacks=[EarlyStoppingCallback(early_stopping_patience=25)]
)


# Training setup summary
dataset_size = len(ds_train_with_assistant_content)
steps_per_epoch = dataset_size // (PER_DEVICE_BATCH_SIZE * gradient_accumulation_steps)
total_steps = steps_per_epoch * epochs
warmup_steps = int(total_steps * warmup_ratio)

print("===== Training Setup Summary =====")
print(f"Num epochs:            {epochs}")
print(f"Effective batch size:  {effective_batch_size}")
print(f"Per-device batch size: {PER_DEVICE_BATCH_SIZE}")
print(f"Gradient accumulation: {gradient_accumulation_steps}")
print(f"Dataset size:          {dataset_size}")
print(f"Steps per epoch:       {steps_per_epoch}")
print(f"Total training steps:  {total_steps}")
print(f"Warmup steps:          {warmup_steps}")
print(f"Logging steps:         {training_args.logging_steps}")
print("===================================")
print(f"Start time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")


# Training
last_checkpoint = None
if RESUME_TRAINING and os.path.isdir(OUTPUT_DIR):
    last_checkpoint = get_last_checkpoint(OUTPUT_DIR)

if last_checkpoint is not None:
    print(f"Resuming training from checkpoint: {last_checkpoint}")
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    print("Starting fresh training run")
    trainer.train()

print(f"End time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")


# WandB logging of eval metrics
for log in trainer.state.log_history:
    if 'eval_loss' in log:
        wandb.log({
            "eval_loss": log['eval_loss'],
            "eval_perplexity": math.exp(log['eval_loss']),
            "step": log['step'],
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "betas": betas,
            "warmup_ratio": warmup_ratio,
            "effective_batch_size": effective_batch_size,
            "optimizer": optimizer
        })

wandb.finish()  # finish the run

[34m[1mwandb[0m: Currently logged in as: [33molialeshka[0m ([33molialeshka-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Applying formatting function to train dataset:   0%|          | 0/97500 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/97500 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/97500 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/2500 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2500 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/2500 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


===== Training Setup Summary =====
Num epochs:            1
Effective batch size:  16
Per-device batch size: 2
Gradient accumulation: 8
Dataset size:          97500
Steps per epoch:       6093
Total training steps:  6093
Warmup steps:          1218
Logging steps:         40
Start time: 2025-10-05_06-32-05
Starting fresh training run


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
40,2.2114,2.221138,1.160082,124369.0,0.661763
80,2.1887,2.151783,1.170986,246617.0,0.666132
120,2.0633,2.045051,1.1743,372357.0,0.674203
160,1.9606,1.8985,1.163218,496224.0,0.690094
200,1.7639,1.712549,1.134681,624347.0,0.711704
240,1.5752,1.465905,1.063029,749722.0,0.730832
280,1.2792,1.13172,0.885653,875852.0,0.776071
320,0.9867,0.88548,0.773703,999432.0,0.808772
360,0.8224,0.773341,0.707053,1123822.0,0.836914
400,0.7354,0.721101,0.662706,1249546.0,0.845503


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
model_path = os.path.join(OUTPUT_DIR, 'final')
trainer.save_model(model_path)

## Test

In [15]:
torch.cuda.empty_cache()
get_vm_usage_metrics()

CPU 0 load: 2.00
CPU 1 load: 0.00
CPU 2 load: 1.00
CPU 3 load: 0.00
RAM Total: 27.41 GB, Used: 3.82 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 2515.0 MB
Disk Total: 60.95 GB, Used: 33.67 GB


In [16]:
model_path = './lora-final_model-output/final'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)
model.eval()

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=1024, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.01, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=1024, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): lora.Linear(
            (base_layer): Linear(in_features=1024, out_features=1024

In [None]:
BATCH_SIZE = 32
ENABLE_THINKING = False
MAX_NEW_TOKENS = 512


prompts = [ds_test[id]['sql_prompt'] for id in range(len(ds_test))]
contexts = [ds_test[id]['sql_context'] for id in range(len(ds_test))]

responses = []
print(f"Start time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")
for i in tqdm(range(0, len(prompts), BATCH_SIZE)):
    batch_prompts = prompts[i : i + BATCH_SIZE]
    batch_contexts = contexts[i : i + BATCH_SIZE]

    messages_list = [
        construct_message(prompt=p, context=c)
        for p, c in zip(batch_prompts, batch_contexts)
    ]

    batch_responses = generate_model_response_batch(model, tokenizer, messages_list, enable_thinking=ENABLE_THINKING, max_new_tokens=MAX_NEW_TOKENS)

    responses.extend(batch_responses)

print(f"End time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")

Start time: 2025-10-05_15-15-10


  0%|          | 0/183 [00:00<?, ?it/s]

In [None]:
references = [ds_test[id]['sql'] for id in range(len(ds_test))]
predictions = [responses[id]['content'] for id in range(len(ds_test))]

scores = [
    evaluate_sql_response(
        reference=reference,
        prediction=prediction,
        sql_context=context
    )
    for reference, prediction, context in tqdm(zip(references, predictions, contexts), total=len(ds_test))
]

In [21]:
print(f"Mean test set score: {np.mean([score['final_score'] for score in scores]):.3f}")

Mean test set score: 0.736


In [None]:
with open('lora_test_predictions.pkl', 'wb') as f:
    pickle.dump(predictions, f)

# LoRA - step 4 - the final model (full LoRA)

In [12]:
torch.cuda.empty_cache()
get_vm_usage_metrics()

print(len(ds_train), len(ds_valid), len(ds_test))

ds_train_with_assistant_content = ds_train.map(construct_message_with_assistant_content)
ds_valid_with_assistant_content = ds_valid.map(construct_message_with_assistant_content)

CPU 0 load: 0.00
CPU 1 load: 0.00
CPU 2 load: 0.00
CPU 3 load: 0.00
RAM Total: 27.41 GB, Used: 1.78 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 2407.0 MB
Disk Total: 60.95 GB, Used: 33.68 GB
97500 2500 5851


In [None]:
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# resuming the prev run
# timestamp = '2025-08-19_08-33-30'
RUN_NAME = f'lora-final-model-all-linear-{timestamp}'
# run_id = 'imoh6jtd'
wandb.init(
    project=os.environ["WANDB_PROJECT"],
    name=RUN_NAME,
    # id=run_id,         # resume previous run if available
    resume="allow",    # allows resuming crashed run
)


RESUME_TRAINING = True
OUTPUT_DIR = "./lora-final_model_all_linear-output"
PER_DEVICE_BATCH_SIZE = 2  # higher values --> OOM

optimizer = 'nadam'
effective_batch_size = 16
learning_rate = 1e-5
weight_decay = 0.0
betas = (0.9, 0.9999)
warmup_ratio = 0.2
epochs = 1
gradient_accumulation_steps = int(effective_batch_size / PER_DEVICE_BATCH_SIZE)
lora_r = 16
lora_alpha = 64
lora_dropout = 0.01

optimizer_map = {
    "adam": torch.optim.Adam,
    "adamw": torch.optim.AdamW,
    "nadam": torch.optim.NAdam,
    "adamax": torch.optim.Adamax
}

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    num_train_epochs=epochs,
    weight_decay=weight_decay,
    lr_scheduler_type="cosine",
    warmup_ratio=warmup_ratio,
    save_strategy="steps",
    save_steps=gradient_accumulation_steps*5,
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=gradient_accumulation_steps*5,
    logging_strategy="steps",
    logging_steps=gradient_accumulation_steps*5,
    report_to=['wandb'],
    run_name=RUN_NAME,
    fp16=True,
    fp16_full_eval=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    max_grad_norm=1,
    load_best_model_at_end=True,
    gradient_checkpointing=True
)

def build_optimizer(model):
    optimizer_class = optimizer_map[optimizer]
    return optimizer_class(
        model.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay,
        betas=betas
    )


peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules='all-linear'
)
model.requires_grad_(False)                     # freeze base weights (precautionary)
model_peft = get_peft_model(model, peft_config) # inject a LoRA adapter

trainer = SFTTrainer(
    model=model_peft,
    train_dataset=ds_train_with_assistant_content,
    eval_dataset=ds_valid_with_assistant_content,
    formatting_func=formatting_func,
    args=training_args,
    optimizers=(build_optimizer(model_peft), None),  # (optimizer, scheduler)
    callbacks=[EarlyStoppingCallback(early_stopping_patience=25)]
)


# Training setup summary
dataset_size = len(ds_train_with_assistant_content)
steps_per_epoch = dataset_size // (PER_DEVICE_BATCH_SIZE * gradient_accumulation_steps)
total_steps = steps_per_epoch * epochs
warmup_steps = int(total_steps * warmup_ratio)

print("===== Training Setup Summary =====")
print(f"Num epochs:            {epochs}")
print(f"Effective batch size:  {effective_batch_size}")
print(f"Per-device batch size: {PER_DEVICE_BATCH_SIZE}")
print(f"Gradient accumulation: {gradient_accumulation_steps}")
print(f"Dataset size:          {dataset_size}")
print(f"Steps per epoch:       {steps_per_epoch}")
print(f"Total training steps:  {total_steps}")
print(f"Warmup steps:          {warmup_steps}")
print(f"Logging steps:         {training_args.logging_steps}")
print("===================================")
print(f"Start time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")


# Training
last_checkpoint = None
if RESUME_TRAINING and os.path.isdir(OUTPUT_DIR):
    last_checkpoint = get_last_checkpoint(OUTPUT_DIR)

if last_checkpoint is not None:
    print(f"Resuming training from checkpoint: {last_checkpoint}")
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    print("Starting fresh training run")
    trainer.train()

print(f"End time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")


# WandB logging of eval metrics
for log in trainer.state.log_history:
    if 'eval_loss' in log:
        wandb.log({
            "eval_loss": log['eval_loss'],
            "eval_perplexity": math.exp(log['eval_loss']),
            "step": log['step'],
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "betas": betas,
            "warmup_ratio": warmup_ratio,
            "effective_batch_size": effective_batch_size,
            "optimizer": optimizer
        })

wandb.finish()  # finish the run

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


===== Training Setup Summary =====
Num epochs:            1
Effective batch size:  16
Per-device batch size: 2
Gradient accumulation: 8
Dataset size:          97500
Steps per epoch:       6093
Total training steps:  6093
Warmup steps:          1218
Logging steps:         40
Start time: 2025-10-06_06-59-49
Starting fresh training run


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
40,2.2064,2.196743,1.164162,124369.0,0.664029
80,2.1063,1.990358,1.163912,246617.0,0.681711
120,1.8126,1.678701,1.103828,372357.0,0.717856
160,1.4527,1.201856,0.853513,496224.0,0.779109
200,0.9819,0.82773,0.746682,624347.0,0.818777
240,0.7547,0.71461,0.65208,749722.0,0.845053
280,0.6795,0.675604,0.634108,875852.0,0.849026
320,0.651,0.650463,0.627987,999432.0,0.851038
360,0.6379,0.627056,0.621941,1123822.0,0.853274
400,0.6084,0.603315,0.621214,1249546.0,0.859687


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [17]:
model_path = os.path.join(OUTPUT_DIR, 'final')
trainer.save_model(model_path)

## Test

In [18]:
torch.cuda.empty_cache()
get_vm_usage_metrics()

CPU 0 load: 2.00
CPU 1 load: 1.00
CPU 2 load: 2.00
CPU 3 load: 0.00
RAM Total: 27.41 GB, Used: 2.87 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 2647.0 MB
Disk Total: 60.95 GB, Used: 33.99 GB


In [19]:
model_path = './lora-final_model_all_linear-output/final'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)
model.eval()

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=1024, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.01, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=1024, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=1024, out_features=1024, bias=False)
            (lora_dropout): ModuleDict(
              (default

In [None]:
BATCH_SIZE = 32
ENABLE_THINKING = False
MAX_NEW_TOKENS = 512


prompts = [ds_test[id]['sql_prompt'] for id in range(len(ds_test))]
contexts = [ds_test[id]['sql_context'] for id in range(len(ds_test))]

responses = []
print(f"Start time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")
for i in tqdm(range(0, len(prompts), BATCH_SIZE)):
    batch_prompts = prompts[i : i + BATCH_SIZE]
    batch_contexts = contexts[i : i + BATCH_SIZE]

    messages_list = [
        construct_message(prompt=p, context=c)
        for p, c in zip(batch_prompts, batch_contexts)
    ]

    batch_responses = generate_model_response_batch(model, tokenizer, messages_list, enable_thinking=ENABLE_THINKING, max_new_tokens=MAX_NEW_TOKENS)

    responses.extend(batch_responses)

print(f"End time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")

Start time: 2025-10-06_18-55-47


  0%|          | 0/183 [00:00<?, ?it/s]

In [None]:
references = [ds_test[id]['sql'] for id in range(len(ds_test))]
predictions = [responses[id]['content'] for id in range(len(ds_test))]

scores = [
    evaluate_sql_response(
        reference=reference,
        prediction=prediction,
        sql_context=context
    )
    for reference, prediction, context in tqdm(zip(references, predictions, contexts), total=len(ds_test))
]

In [23]:
print(f"Mean test set score: {np.mean([score['final_score'] for score in scores]):.3f}")

Mean test set score: 0.756


# LoRA - step 5 - higher rank

In [16]:
torch.cuda.empty_cache()
get_vm_usage_metrics()

print(len(ds_train), len(ds_valid), len(ds_test))

ds_train_with_assistant_content = ds_train.map(construct_message_with_assistant_content)
ds_valid_with_assistant_content = ds_valid.map(construct_message_with_assistant_content)

CPU 0 load: 1.00
CPU 1 load: 2.00
CPU 2 load: 0.00
CPU 3 load: 1.00
RAM Total: 27.41 GB, Used: 1.90 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 2407.0 MB
Disk Total: 60.95 GB, Used: 35.11 GB
97500 2500 5851


In [None]:
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# resuming the prev run
timestamp = '2025-10-07_08-21-41'
RUN_NAME = f'lora-final-model-all-linear-r64-{timestamp}'
run_id = 'sla5zrpx'
wandb.init(
    project=os.environ["WANDB_PROJECT"],
    name=RUN_NAME,
    id=run_id,         # resume previous run if available
    resume="allow",    # allows resuming crashed run
)


RESUME_TRAINING = True
OUTPUT_DIR = "./lora-final_model_all_linear_r64-output"
PER_DEVICE_BATCH_SIZE = 2  # higher values --> OOM

optimizer = 'nadam'
effective_batch_size = 16
learning_rate = 1e-5
weight_decay = 0.0
betas = (0.9, 0.9999)
warmup_ratio = 0.2
epochs = 1
gradient_accumulation_steps = int(effective_batch_size / PER_DEVICE_BATCH_SIZE)
lora_r = 16*4
lora_alpha = 64*4
lora_dropout = 0.01

optimizer_map = {
    "adam": torch.optim.Adam,
    "adamw": torch.optim.AdamW,
    "nadam": torch.optim.NAdam,
    "adamax": torch.optim.Adamax
}

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    num_train_epochs=epochs,
    weight_decay=weight_decay,
    lr_scheduler_type="cosine",
    warmup_ratio=warmup_ratio,
    save_strategy="steps",
    save_steps=gradient_accumulation_steps*5,
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=gradient_accumulation_steps*5,
    logging_strategy="steps",
    logging_steps=gradient_accumulation_steps*5,
    report_to=['wandb'],
    run_name=RUN_NAME,
    fp16=True,
    fp16_full_eval=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    max_grad_norm=1,
    load_best_model_at_end=True,
    gradient_checkpointing=True
)

def build_optimizer(model):
    optimizer_class = optimizer_map[optimizer]
    return optimizer_class(
        model.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay,
        betas=betas
    )


peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules='all-linear'
)
model.requires_grad_(False)                     # freeze base weights (precautionary)
model_peft = get_peft_model(model, peft_config) # inject a LoRA adapter

trainer = SFTTrainer(
    model=model_peft,
    train_dataset=ds_train_with_assistant_content,
    eval_dataset=ds_valid_with_assistant_content,
    formatting_func=formatting_func,
    args=training_args,
    optimizers=(build_optimizer(model_peft), None),  # (optimizer, scheduler)
    callbacks=[EarlyStoppingCallback(early_stopping_patience=25)]
)


# Training setup summary
dataset_size = len(ds_train_with_assistant_content)
steps_per_epoch = dataset_size // (PER_DEVICE_BATCH_SIZE * gradient_accumulation_steps)
total_steps = steps_per_epoch * epochs
warmup_steps = int(total_steps * warmup_ratio)

print("===== Training Setup Summary =====")
print(f"Num epochs:            {epochs}")
print(f"Effective batch size:  {effective_batch_size}")
print(f"Per-device batch size: {PER_DEVICE_BATCH_SIZE}")
print(f"Gradient accumulation: {gradient_accumulation_steps}")
print(f"Dataset size:          {dataset_size}")
print(f"Steps per epoch:       {steps_per_epoch}")
print(f"Total training steps:  {total_steps}")
print(f"Warmup steps:          {warmup_steps}")
print(f"Logging steps:         {training_args.logging_steps}")
print("===================================")
print(f"Start time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")


# Training
last_checkpoint = None
if RESUME_TRAINING and os.path.isdir(OUTPUT_DIR):
    last_checkpoint = get_last_checkpoint(OUTPUT_DIR)

if last_checkpoint is not None:
    print(f"Resuming training from checkpoint: {last_checkpoint}")
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    print("Starting fresh training run")
    trainer.train()

print(f"End time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")


# WandB logging of eval metrics
for log in trainer.state.log_history:
    if 'eval_loss' in log:
        wandb.log({
            "eval_loss": log['eval_loss'],
            "eval_perplexity": math.exp(log['eval_loss']),
            "step": log['step'],
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "betas": betas,
            "warmup_ratio": warmup_ratio,
            "effective_batch_size": effective_batch_size,
            "optimizer": optimizer
        })

wandb.finish()  # finish the run

In [14]:
model_path = os.path.join(OUTPUT_DIR, 'final')
trainer.save_model(model_path)

In [None]:
BATCH_SIZE = 32
ENABLE_THINKING = False
MAX_NEW_TOKENS = 512


prompts = [ds_test[id]['sql_prompt'] for id in range(len(ds_test))]
contexts = [ds_test[id]['sql_context'] for id in range(len(ds_test))]

responses = []
print(f"Start time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")
for i in tqdm(range(0, len(prompts), BATCH_SIZE)):
    batch_prompts = prompts[i : i + BATCH_SIZE]
    batch_contexts = contexts[i : i + BATCH_SIZE]

    messages_list = [
        construct_message(prompt=p, context=c)
        for p, c in zip(batch_prompts, batch_contexts)
    ]

    batch_responses = generate_model_response_batch(model, tokenizer, messages_list, enable_thinking=ENABLE_THINKING, max_new_tokens=MAX_NEW_TOKENS)

    responses.extend(batch_responses)

print(f"End time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")

## Test

In [17]:
torch.cuda.empty_cache()
get_vm_usage_metrics()

CPU 0 load: 0.00
CPU 1 load: 3.00
CPU 2 load: 0.00
CPU 3 load: 0.00
RAM Total: 27.41 GB, Used: 1.93 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 2407.0 MB
Disk Total: 60.95 GB, Used: 35.11 GB


In [22]:
model_path = './lora-final_model_all_linear_r64-output/final'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

print_trainable_parameters(model)
model.eval()

Trainable params: 0 || All params: 636420096 || Trainable %: 0.00


Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=1024, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.01, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=1024, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=64, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=1024, out_features=1024, bias=False)
            (lora_dropout): ModuleDict(
              (default

In [14]:
BATCH_SIZE = 32
ENABLE_THINKING = False
MAX_NEW_TOKENS = 512


prompts = [ds_test[id]['sql_prompt'] for id in range(len(ds_test))]
contexts = [ds_test[id]['sql_context'] for id in range(len(ds_test))]

responses = []
print(f"Start time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")
for i in tqdm(range(0, len(prompts), BATCH_SIZE)):
    batch_prompts = prompts[i : i + BATCH_SIZE]
    batch_contexts = contexts[i : i + BATCH_SIZE]

    messages_list = [
        construct_message(prompt=p, context=c)
        for p, c in zip(batch_prompts, batch_contexts)
    ]

    batch_responses = generate_model_response_batch(model, tokenizer, messages_list, enable_thinking=ENABLE_THINKING, max_new_tokens=MAX_NEW_TOKENS)

    responses.extend(batch_responses)

print(f"End time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")

Start time: 2025-10-08_06-35-48


  0%|          | 0/183 [00:00<?, ?it/s]

End time: 2025-10-08_07-25-10


In [15]:
references = [ds_test[id]['sql'] for id in range(len(ds_test))]
predictions = [responses[id]['content'] for id in range(len(ds_test))]

scores = [
    evaluate_sql_response(
        reference=reference,
        prediction=prediction,
        sql_context=context
    )
    for reference, prediction, context in tqdm(zip(references, predictions, contexts), total=len(ds_test))
]

  0%|          | 0/5851 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [16]:
print(f"Mean test set score: {np.mean([score['final_score'] for score in scores]):.3f}")

Mean test set score: 0.774
