# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import datasets
import evaluate
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, EarlyStoppingCallback
from transformers.trainer_utils import get_last_checkpoint
from trl import SFTTrainer
from accelerate import cpu_offload
import sqlite3
import sqlparse
from tqdm.auto import tqdm
from datetime import datetime
import pickle
import wandb
import psutil
import GPUtil
import os
import gc
import math

import _config

In [2]:
ENABLE_THINKING = False

In [3]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

os.environ["WANDB_API_KEY"] = _config.WANDB_API_KEY
os.environ["WANDB_PROJECT"] = _config.WANDB_PROJECT

# Utils

In [4]:
def get_vm_usage_metrics():
    # CPU usage
    cpu_load = psutil.cpu_percent(interval=1, percpu=True)
    for id, load in enumerate(cpu_load):
        print(f"CPU {id} load: {load:.2f}")
    # RAM usage
    ram = psutil.virtual_memory()
    print(f"RAM Total: {ram.total/(1024**3):.2f} GB, Used: {(ram.used)/(1024**3):.2f} GB")
    # GPU
    if torch.cuda.is_available():
        gpus = GPUtil.getGPUs()
        for gpu in gpus:
            print(f"GPU {gpu.id} ({gpu.name}) load: {gpu.load*100}%")
            print(f"GPU {gpu.id} ({gpu.name}) VRAM Total: {gpu.memoryTotal} MB, Used {gpu.memoryUsed} MB")
    # Disk 
    disk = psutil.disk_usage('/')
    print(f"Disk Total: {disk.total/(1024**3):.2f} GB, Used: {(disk.used)/(1024**3):.2f} GB")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'Device: {device}')
get_vm_usage_metrics()

Device: cuda
CPU 0 load: 0.00
CPU 1 load: 0.00
CPU 2 load: 0.00
CPU 3 load: 0.00
RAM Total: 27.41 GB, Used: 1.12 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 3.0 MB
Disk Total: 60.95 GB, Used: 48.28 GB


# Data

In [5]:
ds = datasets.load_dataset('gretelai/synthetic_text_to_sql', streaming=False)
ds_train, ds_test = ds['train'], ds['test']

split = ds_train.train_test_split(test_size=0.025, seed=42)
ds_train = split['train']
ds_valid = split['test']

ds_train

Dataset({
    features: ['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation'],
    num_rows: 97500
})

# Model

In [6]:
checkpoint = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="cuda",
)
# model = cpu_offload(model)
get_vm_usage_metrics()

CPU 0 load: 0.00
CPU 1 load: 2.00
CPU 2 load: 1.00
CPU 3 load: 0.00
RAM Total: 27.41 GB, Used: 1.35 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 2985.0 MB
Disk Total: 60.95 GB, Used: 48.28 GB


In [8]:
def generate_model_response_batch(model, tokenizer, messages_list, enable_thinking=True, max_new_tokens=512):
    texts = [
        tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=enable_thinking
        )
        for messages in messages_list
    ]

    model_inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        padding_side='left'
    ).to(model.device)

    model.eval()
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens
    )

    responses = []
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids):
        # Slice to get only generated part
        output_only_ids = output_ids[len(input_ids):].tolist()

        # Try to find `</think>` (id 151668)
        try:
            index = len(output_only_ids) - output_only_ids[::-1].index(151668)
        except ValueError:
            index = 0

        if enable_thinking:
            thinking_content = tokenizer.decode(
                output_only_ids[:index],
                skip_special_tokens=True
            ).strip("\n")
            content = tokenizer.decode(
                output_only_ids[index:],
                skip_special_tokens=True
            ).strip("\n")
        else:
            thinking_content = None
            content = tokenizer.decode(
                output_only_ids,
                skip_special_tokens=True
            ).strip("\n")

        responses.append({
            'thinking_content': thinking_content,
            'content': content
        })

    return responses

# Evaluate

In [9]:
rouge = evaluate.load("rouge")

def normalize_sql(sql):
    return sqlparse.format(sql, reindent=True, keyword_case='upper').strip()

def compute_rouge(reference, prediction):
    result = rouge.compute(predictions=[prediction], references=[reference])
    return result['rougeL']

def evaluate_sql_response(reference, prediction, sql_context):
    # ROUGE-L
    rouge_score = compute_rouge(reference, prediction)
    
    # execution check
    try:
        conn = sqlite3.connect(":memory:")
        cursor = conn.cursor()
        
        cursor.executescript(sql_context)
        cursor.execute(reference)
        ref_result = cursor.fetchall()
        
        cursor.execute(prediction)
        model_result = cursor.fetchall()
        
        execution_match = ref_result == model_result
    except Exception:
        execution_match = False
    finally:
        conn.close()
    
    # final score
    if execution_match:
        final_score = 1.0
    else:
        final_score = 0.7 * rouge_score

    return {
        "rougeL": round(rouge_score, 4),
        "execution_match": execution_match,
        "final_score": final_score
    }

# Formatting functions

In [10]:
# used for training
def construct_message_with_assistant_content(example):
    messages = construct_message(example['sql_prompt'], example['sql_context'])
    messages.append({
        'role': 'assistant',
        'content': example['sql']
    })
    return {'messages': messages}

In [11]:
def formatting_func(example, enable_thinking=ENABLE_THINKING):
    return tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False, # no generation prompt during training
        enable_thinking=ENABLE_THINKING 
    )

# SFT - step 1

In [12]:
TRAIN_SIZE = 4096
VALID_SIZE = 1024

ds_train_sample = ds_train.take(TRAIN_SIZE)
ds_valid_sample = ds_valid.take(VALID_SIZE)

len(ds_train_sample), len(ds_valid_sample), len(ds_test)

ds_train_with_assistant_content = ds_train_sample.map(construct_message_with_assistant_content)
ds_valid_with_assistant_content = ds_valid_sample.map(construct_message_with_assistant_content)

In [14]:
get_vm_usage_metrics()

CPU 0 load: 0.00
CPU 1 load: 3.00
CPU 2 load: 0.00
CPU 3 load: 0.00
RAM Total: 27.41 GB, Used: 3.07 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 2385.0 MB
Disk Total: 60.95 GB, Used: 31.83 GB


In [15]:
torch.cuda.empty_cache()

In [None]:
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

sweep_config = {
    'name': f'sweep-sft-step1-epochs1-samples{TRAIN_SIZE}-{timestamp}',
    'method': 'bayes',
    'metric': {
        'name': 'eval_loss',
        'goal': 'minimize'   
    },
    'parameters': {
        'optimizer': {'values': ['adam', 'adamw', 'nadam', 'adamax']},
        'effective_batch_size': {'values': [16, 32, 64, 128, 256, 512]},
        'learning_rate': {'values': [1e-4, 5e-5, 1e-5, 5e-6, 1e-6]},
        'weight_decay': {'values': [0.0, 0.01, 0.1]},
        'betas': {'values': [(0.9, 0.999), (0.95, 0.999), (0.9, 0.9999)]},
        'warmup_ratio': {'values': [0.05, 0.1, 0.2]},
        'epochs': {'values': [1]}
    }
}

# sweep_id = wandb.sweep(sweep_config)
sweep_id = 'je9m6in0' # continue the crashed sweep

optimizer_map = {
    "adam": torch.optim.Adam,
    "adamw": torch.optim.AdamW,
    "nadam": torch.optim.NAdam,
    "adamax": torch.optim.Adamax
}

def sweep_train():
    with wandb.init() as run:
        config = wandb.config  
        PER_DEVICE_BATCH_SIZE = 2  # higher values --> OOM
        gradient_accumulation_steps = int(config.effective_batch_size / PER_DEVICE_BATCH_SIZE)
        
        training_args = TrainingArguments(
            per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
            gradient_accumulation_steps=gradient_accumulation_steps,
            learning_rate=config.learning_rate,
            num_train_epochs=config.epochs,
            weight_decay=config.weight_decay,
            lr_scheduler_type="cosine",
            warmup_ratio=config.warmup_ratio,
            save_strategy="no",
            eval_strategy="epoch",
            logging_strategy="steps",
            logging_steps=1,
            report_to=['wandb'],
            fp16=True,
            fp16_full_eval=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            max_grad_norm=1,
            # load_best_model_at_end=True
        )
        
        def build_optimizer(model):
            optimizer_class = optimizer_map[config.optimizer]
            return optimizer_class(
                model.parameters(),
                lr=config.learning_rate,
                weight_decay=config.weight_decay,
                betas=config.betas
            )
            
        trainer = SFTTrainer(
            model=model,
            train_dataset=ds_train_with_assistant_content,
            eval_dataset=ds_valid_with_assistant_content,
            formatting_func=formatting_func,
            args=training_args,
            optimizers=(build_optimizer(model), None),  # (optimizer, scheduler)
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
        )

        trainer.train()

        for log in trainer.state.log_history:
            if 'eval_loss' in log:
                wandb.log({
                    "eval_loss": log['eval_loss'],
                    "eval_perplexity": math.exp(log['eval_loss']),
                    "step": log['step'],
                    "learning_rate": config.learning_rate,
                    "weight_decay": config.weight_decay,
                    "betas": config.betas,
                    "warmup_ratio": config.warmup_ratio,
                    "effective_batch_size": config.effective_batch_size,
                    "optimizer": config.optimizer
                })
        wandb.finish(); # finish the run
        del trainer
        torch.cuda.empty_cache()
        gc.collect()
        
wandb.agent(sweep_id, function=sweep_train, count=20)

[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: br443b1o with config:
[34m[1mwandb[0m: 	betas: [0.9, 0.9999]
[34m[1mwandb[0m: 	effective_batch_size: 64
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 5e-06
[34m[1mwandb[0m: 	optimizer: adamw
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.01
[34m[1mwandb[0m: Currently logged in as: [33molialeshka[0m ([33molialeshka-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.5108,0.537351


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,64
eval/loss,0.53735
eval/mean_token_accuracy,0.86883
eval/num_tokens,799399
eval/runtime,41.8312
eval/samples_per_second,24.479
eval/steps_per_second,3.06
eval_loss,0.53735
eval_perplexity,1.71147
learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: 3z5ja3hb with config:
[34m[1mwandb[0m: 	betas: [0.95, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 64
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.4321,0.465656


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,64
eval/loss,0.46566
eval/mean_token_accuracy,0.87883
eval/num_tokens,799399
eval/runtime,41.7945
eval/samples_per_second,24.501
eval/steps_per_second,3.063
eval_loss,0.46566
eval_perplexity,1.59306
learning_rate,5e-05


[34m[1mwandb[0m: Agent Starting Run: k7whhbck with config:
[34m[1mwandb[0m: 	betas: [0.95, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 128
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: adamax
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.4478,0.472673


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,128
eval/loss,0.47267
eval/mean_token_accuracy,0.87743
eval/num_tokens,799399
eval/runtime,41.7536
eval/samples_per_second,24.525
eval/steps_per_second,3.066
eval_loss,0.47267
eval_perplexity,1.60428
learning_rate,0.0001


[34m[1mwandb[0m: Agent Starting Run: vhwxy94h with config:
[34m[1mwandb[0m: 	betas: [0.9, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 32
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 5e-06
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.1


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.4508,0.477449


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,32
eval/loss,0.47745
eval/mean_token_accuracy,0.87827
eval/num_tokens,799399
eval/runtime,41.7705
eval/samples_per_second,24.515
eval/steps_per_second,3.064
eval_loss,0.47745
eval_perplexity,1.61196
learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: iquhvgo2 with config:
[34m[1mwandb[0m: 	betas: [0.9, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 16
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	optimizer: adamax
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.01


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.5158,0.481466


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,16
eval/loss,0.48147
eval/mean_token_accuracy,0.87842
eval/num_tokens,799399
eval/runtime,41.7977
eval/samples_per_second,24.499
eval/steps_per_second,3.062
eval_loss,0.48147
eval_perplexity,1.61845
learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: 2kdrqip1 with config:
[34m[1mwandb[0m: 	betas: [0.95, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 128
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.1


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.4602,0.483716


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,128
eval/loss,0.48372
eval/mean_token_accuracy,0.87699
eval/num_tokens,799399
eval/runtime,41.5545
eval/samples_per_second,24.642
eval/steps_per_second,3.08
eval_loss,0.48372
eval_perplexity,1.62209
learning_rate,0.0001


[34m[1mwandb[0m: Agent Starting Run: 7vxe2iku with config:
[34m[1mwandb[0m: 	betas: [0.95, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 512
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: adamax
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.1


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.4974,0.515504


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,512
eval/loss,0.5155
eval/mean_token_accuracy,0.87527
eval/num_tokens,799399
eval/runtime,41.6101
eval/samples_per_second,24.609
eval/steps_per_second,3.076
eval_loss,0.5155
eval_perplexity,1.67448
learning_rate,0.0001


[34m[1mwandb[0m: Agent Starting Run: wqtvwgms with config:
[34m[1mwandb[0m: 	betas: [0.9, 0.9999]
[34m[1mwandb[0m: 	effective_batch_size: 32
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	optimizer: adamax
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.01


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.4526,0.482452


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,32
eval/loss,0.48245
eval/mean_token_accuracy,0.87738
eval/num_tokens,799399
eval/runtime,41.5981
eval/samples_per_second,24.616
eval/steps_per_second,3.077
eval_loss,0.48245
eval_perplexity,1.62004
learning_rate,5e-05


[34m[1mwandb[0m: Agent Starting Run: tni7vibm with config:
[34m[1mwandb[0m: 	betas: [0.9, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 64
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.01


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.4691,0.503751


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,64
eval/loss,0.50375
eval/mean_token_accuracy,0.87577
eval/num_tokens,799399
eval/runtime,41.46
eval/samples_per_second,24.699
eval/steps_per_second,3.087
eval_loss,0.50375
eval_perplexity,1.65492
learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: lbipx3vm with config:
[34m[1mwandb[0m: 	betas: [0.9, 0.9999]
[34m[1mwandb[0m: 	effective_batch_size: 128
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: adamax
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.1


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.4919,0.515153


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,128
eval/loss,0.51515
eval/mean_token_accuracy,0.87156
eval/num_tokens,799399
eval/runtime,41.4722
eval/samples_per_second,24.691
eval/steps_per_second,3.086
eval_loss,0.51515
eval_perplexity,1.67389
learning_rate,0.0001


[34m[1mwandb[0m: Agent Starting Run: 49x1p1yo with config:
[34m[1mwandb[0m: 	betas: [0.9, 0.9999]
[34m[1mwandb[0m: 	effective_batch_size: 512
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 1e-06
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.1


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.4982,0.515965


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,512
eval/loss,0.51596
eval/mean_token_accuracy,0.87167
eval/num_tokens,799399
eval/runtime,41.6978
eval/samples_per_second,24.558
eval/steps_per_second,3.07
eval_loss,0.51596
eval_perplexity,1.67525
learning_rate,0.0


[34m[1mwandb[0m: Agent Starting Run: 9xnwc3f2 with config:
[34m[1mwandb[0m: 	betas: [0.9, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 256
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: adamax
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.4939,0.515855


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,256
eval/loss,0.51585
eval/mean_token_accuracy,0.87012
eval/num_tokens,799399
eval/runtime,41.7764
eval/samples_per_second,24.511
eval/steps_per_second,3.064
eval_loss,0.51585
eval_perplexity,1.67507
learning_rate,0.0001


[34m[1mwandb[0m: Agent Starting Run: u8s9fh66 with config:
[34m[1mwandb[0m: 	betas: [0.95, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 512
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 1e-06
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.1


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.4942,0.515453


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,512
eval/loss,0.51545
eval/mean_token_accuracy,0.87024
eval/num_tokens,799399
eval/runtime,41.7769
eval/samples_per_second,24.511
eval/steps_per_second,3.064
eval_loss,0.51545
eval_perplexity,1.6744
learning_rate,0.0


[34m[1mwandb[0m: Agent Starting Run: o86ugvtp with config:
[34m[1mwandb[0m: 	betas: [0.9, 0.9999]
[34m[1mwandb[0m: 	effective_batch_size: 512
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 5e-06
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.01


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.4918,0.512805


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,512
eval/loss,0.5128
eval/mean_token_accuracy,0.87107
eval/num_tokens,799399
eval/runtime,41.7681
eval/samples_per_second,24.516
eval/steps_per_second,3.065
eval_loss,0.5128
eval_perplexity,1.66997
learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: bo4whbdv with config:
[34m[1mwandb[0m: 	betas: [0.9, 0.9999]
[34m[1mwandb[0m: 	effective_batch_size: 512
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.5736,0.581259


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,512
eval/loss,0.58126
eval/mean_token_accuracy,0.86136
eval/num_tokens,799399
eval/runtime,41.8214
eval/samples_per_second,24.485
eval/steps_per_second,3.061
eval_loss,0.58126
eval_perplexity,1.78829
learning_rate,0.0001


[34m[1mwandb[0m: Agent Starting Run: qpnyytus with config:
[34m[1mwandb[0m: 	betas: [0.9, 0.9999]
[34m[1mwandb[0m: 	effective_batch_size: 512
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.01


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.5319,0.555546


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,512
eval/loss,0.55555
eval/mean_token_accuracy,0.8662
eval/num_tokens,799399
eval/runtime,41.8418
eval/samples_per_second,24.473
eval/steps_per_second,3.059
eval_loss,0.55555
eval_perplexity,1.74289
learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: 2kmokyfk with config:
[34m[1mwandb[0m: 	betas: [0.9, 0.9999]
[34m[1mwandb[0m: 	effective_batch_size: 64
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: adamw
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.01


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.4684,0.513289


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,64
eval/loss,0.51329
eval/mean_token_accuracy,0.87167
eval/num_tokens,799399
eval/runtime,41.8303
eval/samples_per_second,24.48
eval/steps_per_second,3.06
eval_loss,0.51329
eval_perplexity,1.67078
learning_rate,0.0001


[34m[1mwandb[0m: Agent Starting Run: h5fs3xss with config:
[34m[1mwandb[0m: 	betas: [0.9, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 256
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.5202,0.547764


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,256
eval/loss,0.54776
eval/mean_token_accuracy,0.86988
eval/num_tokens,799399
eval/runtime,41.8296
eval/samples_per_second,24.48
eval/steps_per_second,3.06
eval_loss,0.54776
eval_perplexity,1.72938
learning_rate,5e-05


[34m[1mwandb[0m: Agent Starting Run: q18i30w8 with config:
[34m[1mwandb[0m: 	betas: [0.9, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 32
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 1e-06
[34m[1mwandb[0m: 	optimizer: adamw
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.5132,0.555168


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,32
eval/loss,0.55517
eval/mean_token_accuracy,0.87162
eval/num_tokens,799399
eval/runtime,41.9199
eval/samples_per_second,24.428
eval/steps_per_second,3.053
eval_loss,0.55517
eval_perplexity,1.74223
learning_rate,0.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: vtp7tczz with config:
[34m[1mwandb[0m: 	betas: [0.95, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 64
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 1e-06
[34m[1mwandb[0m: 	optimizer: adamw
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.1


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss


# SFT - step 2

In [15]:
torch.cuda.empty_cache()
get_vm_usage_metrics()


TRAIN_SIZE = 8192
VALID_SIZE = 2048

ds_train_sample = ds_train.take(TRAIN_SIZE)
ds_valid_sample = ds_valid.take(VALID_SIZE)

print(len(ds_train_sample), len(ds_valid_sample), len(ds_test))

ds_train_with_assistant_content = ds_train_sample.map(construct_message_with_assistant_content)
ds_valid_with_assistant_content = ds_valid_sample.map(construct_message_with_assistant_content)

CPU 0 load: 0.00
CPU 1 load: 2.00
CPU 2 load: 2.00
CPU 3 load: 0.00
RAM Total: 27.41 GB, Used: 2.98 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 2385.0 MB
Disk Total: 60.95 GB, Used: 31.66 GB
8192 2048 5851


In [15]:
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

sweep_config = {
    'name': f'sweep-sft-step2-epochs1-samples{TRAIN_SIZE}-{timestamp}',
    'method': 'grid',
    'metric': {
        'name': 'eval_loss',
        'goal': 'minimize'   
    },
    'parameters': {
        'optimizer': {'values': ['nadam']},
        'effective_batch_size': {'values': [32, 64, 128]},
        'learning_rate': {'values': [1e-4, 5e-5]},
        'weight_decay': {'values': [0.0]},
        'betas': {'values': [(0.95, 0.999)]},
        'warmup_ratio': {'values': [0.05, 0.1]},
        'epochs': {'values': [1]}
    }
}

# sweep_id = wandb.sweep(sweep_config)
sweep_id = 'r7pwu78y' # continue the crashed sweep

optimizer_map = {
    "adam": torch.optim.Adam,
    "adamw": torch.optim.AdamW,
    "nadam": torch.optim.NAdam,
    "adamax": torch.optim.Adamax
}

def sweep_train():
    with wandb.init() as run:
        config = wandb.config  
        PER_DEVICE_BATCH_SIZE = 2  # higher values --> OOM
        gradient_accumulation_steps = int(config.effective_batch_size / PER_DEVICE_BATCH_SIZE)
        
        training_args = TrainingArguments(
            per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
            gradient_accumulation_steps=gradient_accumulation_steps,
            learning_rate=config.learning_rate,
            num_train_epochs=config.epochs,
            weight_decay=config.weight_decay,
            lr_scheduler_type="cosine",
            warmup_ratio=config.warmup_ratio,
            save_strategy="no",
            eval_strategy="epoch",
            logging_strategy="steps",
            logging_steps=1,
            report_to=['wandb'],
            fp16=True,
            fp16_full_eval=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            max_grad_norm=1,
            # load_best_model_at_end=True
        )
        
        def build_optimizer(model):
            optimizer_class = optimizer_map[config.optimizer]
            return optimizer_class(
                model.parameters(),
                lr=config.learning_rate,
                weight_decay=config.weight_decay,
                betas=config.betas
            )
            
        trainer = SFTTrainer(
            model=model,
            train_dataset=ds_train_with_assistant_content,
            eval_dataset=ds_valid_with_assistant_content,
            formatting_func=formatting_func,
            args=training_args,
            optimizers=(build_optimizer(model), None),  # (optimizer, scheduler)
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
        )

        trainer.train()

        for log in trainer.state.log_history:
            if 'eval_loss' in log:
                wandb.log({
                    "eval_loss": log['eval_loss'],
                    "eval_perplexity": math.exp(log['eval_loss']),
                    "step": log['step'],
                    "learning_rate": config.learning_rate,
                    "weight_decay": config.weight_decay,
                    "betas": config.betas,
                    "warmup_ratio": config.warmup_ratio,
                    "effective_batch_size": config.effective_batch_size,
                    "optimizer": config.optimizer
                })
        wandb.finish(); # finish the run
        del trainer
        torch.cuda.empty_cache()
        gc.collect()
        
wandb.agent(sweep_id, function=sweep_train)

[34m[1mwandb[0m: Agent Starting Run: ingi197a with config:
[34m[1mwandb[0m: 	betas: [0.95, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 64
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: Currently logged in as: [33molialeshka[0m ([33molialeshka-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.4383,0.468934


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,64
eval/loss,0.46893
eval/mean_token_accuracy,0.87795
eval/num_tokens,799399
eval/runtime,47.8871
eval/samples_per_second,21.384
eval/steps_per_second,2.673
eval_loss,0.46893
eval_perplexity,1.59829
learning_rate,5e-05


[34m[1mwandb[0m: Agent Starting Run: 6j0f8896 with config:
[34m[1mwandb[0m: 	betas: [0.95, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 64
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.4206,0.457404


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,64
eval/loss,0.4574
eval/mean_token_accuracy,0.88083
eval/num_tokens,799399
eval/runtime,47.4192
eval/samples_per_second,21.595
eval/steps_per_second,2.699
eval_loss,0.4574
eval_perplexity,1.57997
learning_rate,5e-05


[34m[1mwandb[0m: Agent Starting Run: 5h8mck2v with config:
[34m[1mwandb[0m: 	betas: [0.95, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 128
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.4364,0.467642


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,128
eval/loss,0.46764
eval/mean_token_accuracy,0.87933
eval/num_tokens,799399
eval/runtime,47.2382
eval/samples_per_second,21.677
eval/steps_per_second,2.71
eval_loss,0.46764
eval_perplexity,1.59623
learning_rate,0.0001


[34m[1mwandb[0m: Agent Starting Run: ri9qjadh with config:
[34m[1mwandb[0m: 	betas: [0.95, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 128
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.4506,0.481426


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,128
eval/loss,0.48143
eval/mean_token_accuracy,0.8786
eval/num_tokens,799399
eval/runtime,47.649
eval/samples_per_second,21.49
eval/steps_per_second,2.686
eval_loss,0.48143
eval_perplexity,1.61838
learning_rate,0.0001


[34m[1mwandb[0m: Agent Starting Run: 4b99komv with config:
[34m[1mwandb[0m: 	betas: [0.95, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 128
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.4963,0.532538


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,128
eval/loss,0.53254
eval/mean_token_accuracy,0.87625
eval/num_tokens,799399
eval/runtime,47.8094
eval/samples_per_second,21.418
eval/steps_per_second,2.677
eval_loss,0.53254
eval_perplexity,1.70325
learning_rate,5e-05


[34m[1mwandb[0m: Agent Starting Run: 9ks072of with config:
[34m[1mwandb[0m: 	betas: [0.95, 0.999]
[34m[1mwandb[0m: 	effective_batch_size: 128
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,0.5249,0.565132


0,1
effective_batch_size,▁
eval/loss,▁
eval/mean_token_accuracy,▁
eval/num_tokens,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_loss,▁
eval_perplexity,▁
learning_rate,▁

0,1
effective_batch_size,128
eval/loss,0.56513
eval/mean_token_accuracy,0.87427
eval/num_tokens,799399
eval/runtime,47.8987
eval/samples_per_second,21.378
eval/steps_per_second,2.672
eval_loss,0.56513
eval_perplexity,1.75968
learning_rate,5e-05


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


# SFT - step 3 - the final model

In [12]:
torch.cuda.empty_cache()
get_vm_usage_metrics()

print(len(ds_train), len(ds_valid), len(ds_test))

ds_train_with_assistant_content = ds_train.map(construct_message_with_assistant_content)
ds_valid_with_assistant_content = ds_valid.map(construct_message_with_assistant_content)

CPU 0 load: 3.00
CPU 1 load: 0.00
CPU 2 load: 0.00
CPU 3 load: 1.00
RAM Total: 27.41 GB, Used: 1.34 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 2405.0 MB
Disk Total: 60.95 GB, Used: 46.04 GB
97500 2500 5851


In [13]:
# timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# resuming the prev run
timestamp = '2025-08-19_08-33-30'
RUN_NAME = f'sft-final-model-{timestamp}'
run_id = 'imoh6jtd'
wandb.init(
    project=os.environ["WANDB_PROJECT"],
    name=RUN_NAME,
    id=run_id,         # resume previous run if available
    resume="allow",    # allows resuming crashed run
)


RESUME_TRAINING = True
OUTPUT_DIR = "./sft-final_model-output"
PER_DEVICE_BATCH_SIZE = 2  # higher values --> OOM

optimizer = 'nadam'
effective_batch_size = 32
learning_rate = 1e-4
weight_decay = 0.0
betas = (0.95, 0.999)
warmup_ratio = 0.05
epochs = 1
gradient_accumulation_steps = int(effective_batch_size / PER_DEVICE_BATCH_SIZE)

optimizer_map = {
    "adam": torch.optim.Adam,
    "adamw": torch.optim.AdamW,
    "nadam": torch.optim.NAdam,
    "adamax": torch.optim.Adamax
}

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    num_train_epochs=epochs,
    weight_decay=weight_decay,
    lr_scheduler_type="cosine",
    warmup_ratio=warmup_ratio,
    save_strategy="steps",
    save_steps=gradient_accumulation_steps*5,
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=gradient_accumulation_steps*5,
    logging_strategy="steps",
    logging_steps=gradient_accumulation_steps*5,
    report_to=['wandb'],
    run_name=RUN_NAME,
    fp16=True,
    fp16_full_eval=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    max_grad_norm=1,
    load_best_model_at_end=True,
    gradient_checkpointing=True
)

def build_optimizer(model):
    optimizer_class = optimizer_map[optimizer]
    return optimizer_class(
        model.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay,
        betas=betas
    )

trainer = SFTTrainer(
    model=model,
    train_dataset=ds_train_with_assistant_content,
    eval_dataset=ds_valid_with_assistant_content,
    formatting_func=formatting_func,
    args=training_args,
    optimizers=(build_optimizer(model), None),  # (optimizer, scheduler)
    callbacks=[EarlyStoppingCallback(early_stopping_patience=25)]
)



# Training setup summary
dataset_size = len(ds_train_with_assistant_content)
steps_per_epoch = dataset_size // (PER_DEVICE_BATCH_SIZE * gradient_accumulation_steps)
total_steps = steps_per_epoch * epochs
warmup_steps = int(total_steps * warmup_ratio)

print("===== Training Setup Summary =====")
print(f"Num epochs:            {epochs}")
print(f"Effective batch size:  {effective_batch_size}")
print(f"Per-device batch size: {PER_DEVICE_BATCH_SIZE}")
print(f"Gradient accumulation: {gradient_accumulation_steps}")
print(f"Dataset size:          {dataset_size}")
print(f"Steps per epoch:       {steps_per_epoch}")
print(f"Total training steps:  {total_steps}")
print(f"Warmup steps:          {warmup_steps}")
print(f"Logging steps:         {training_args.logging_steps}")
print("===================================")
print(f"Start time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")


# Training
last_checkpoint = None
if RESUME_TRAINING and os.path.isdir(OUTPUT_DIR):
    last_checkpoint = get_last_checkpoint(OUTPUT_DIR)

if last_checkpoint is not None:
    print(f"Resuming training from checkpoint: {last_checkpoint}")
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    print("Starting fresh training run")
    trainer.train()

print(f"End time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")


# WandB logging of eval metrics
for log in trainer.state.log_history:
    if 'eval_loss' in log:
        wandb.log({
            "eval_loss": log['eval_loss'],
            "eval_perplexity": math.exp(log['eval_loss']),
            "step": log['step'],
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "betas": betas,
            "warmup_ratio": warmup_ratio,
            "effective_batch_size": effective_batch_size,
            "optimizer": optimizer
        })

wandb.finish()  # finish the run

[34m[1mwandb[0m: Currently logged in as: [33molialeshka[0m ([33molialeshka-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


===== Training Setup Summary =====
Num epochs:            1
Effective batch size:  32
Per-device batch size: 2
Gradient accumulation: 16
Dataset size:          97500
Steps per epoch:       3046
Total training steps:  3046
Warmup steps:          152
Logging steps:         80
Start time: 2025-08-20_07-22-45
Resuming training from checkpoint: ./sft-final_model-output/checkpoint-2240


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
2320,0.3521,0.35387
2400,0.3498,0.351453
2480,0.3403,0.349289
2560,0.3433,0.347501
2640,0.3455,0.345989
2720,0.345,0.344931
2800,0.3367,0.344186
2880,0.3361,0.343747
2960,0.3381,0.343571
3040,0.3402,0.343535


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


End time: 2025-08-20_08-55-37


0,1
effective_batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/loss,█▆▅▄▃▂▁▁▁▁
eval/mean_token_accuracy,▁▃▄▆▆▇████
eval/num_tokens,▁▂▃▃▄▅▆▆▇█
eval/runtime,█▂▂▁▁▁▂▁▂▂
eval/samples_per_second,▁▇▇███▇█▇▇
eval/steps_per_second,▁▇████▇█▇▇
eval_loss,█▆▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁
eval_perplexity,█▅▇▄▄▄▄▄▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
effective_batch_size,32
eval/loss,0.34353
eval/mean_token_accuracy,0.90365
eval/num_tokens,4979548.0
eval/runtime,101.1352
eval/samples_per_second,24.719
eval/steps_per_second,3.095
eval_loss,0.34353
eval_perplexity,1.40992
learning_rate,0.0001


In [15]:
model_path = os.path.join(OUTPUT_DIR, 'final')
trainer.save_model(model_path)

# Test

In [12]:
torch.cuda.empty_cache()
get_vm_usage_metrics()

CPU 0 load: 1.00
CPU 1 load: 2.00
CPU 2 load: 1.00
CPU 3 load: 0.00
RAM Total: 27.41 GB, Used: 1.40 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 2405.0 MB
Disk Total: 60.95 GB, Used: 48.28 GB


In [13]:
model_path = './sft-final_model-output/final'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)
model.eval()

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwe

In [14]:
BATCH_SIZE = 32
ENABLE_THINKING = False
MAX_NEW_TOKENS = 512


prompts = [ds_test[id]['sql_prompt'] for id in range(len(ds_test))]
contexts = [ds_test[id]['sql_context'] for id in range(len(ds_test))]

responses = []
print(f"Start time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")
for i in tqdm(range(0, len(prompts), BATCH_SIZE)):
    batch_prompts = prompts[i : i + BATCH_SIZE]
    batch_contexts = contexts[i : i + BATCH_SIZE]

    messages_list = [
        construct_message(prompt=p, context=c)
        for p, c in zip(batch_prompts, batch_contexts)
    ]

    batch_responses = generate_model_response_batch(model, tokenizer, messages_list, enable_thinking=ENABLE_THINKING, max_new_tokens=MAX_NEW_TOKENS)

    responses.extend(batch_responses)

print(f"End time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")

Start time: 2025-08-20_19-03-53


  0%|          | 0/183 [00:00<?, ?it/s]

End time: 2025-08-20_19-50-26


In [15]:
references = [ds_test[id]['sql'] for id in range(len(ds_test))]
predictions = [responses[id]['content'] for id in range(len(ds_test))]

scores = [
    evaluate_sql_response(
        reference=reference,
        prediction=prediction,
        sql_context=context
    )
    for reference, prediction, context in tqdm(zip(references, predictions, contexts), total=len(ds_test))
]

print(f"Mean test set score: {np.mean([score['final_score'] for score in scores]):.3f}")

  0%|          | 0/5851 [00:00<?, ?it/s]

Mean test set score: 0.793


In [16]:
with open('sft_test_predictions.pkl', 'wb') as f:
    pickle.dump(predictions, f)