# Imports

In [36]:
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, EarlyStoppingCallback
from transformers.trainer_utils import get_last_checkpoint
from trl import RewardTrainer, RewardConfig
import wandb
from datetime import datetime
import _config

import os
import psutil
import GPUtil
import gc


# Allocator option consumed by PyTorch's CUDA memory allocator.
# NOTE(review): this is assigned after `import torch`; confirm the setting is
# still picked up in this environment.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Weights & Biases settings are copied from the local `_config` module into the
# process environment so they never appear as literals in the notebook.
for wandb_var in ("WANDB_API_KEY", "WANDB_PROJECT"):
    os.environ[wandb_var] = getattr(_config, wandb_var)

# Utils

In [3]:
def get_vm_usage_metrics():
    """Print a snapshot of CPU, RAM, GPU and disk usage for this machine.

    Everything is written to stdout; nothing is returned. CPU load is sampled
    over a one-second interval, so the call blocks for roughly that long. GPU
    lines are printed only when torch reports CUDA as available.
    """
    bytes_per_unit = 1024**3  # divisor used for the "GB" figures below

    # Per-core CPU load.
    per_core_load = psutil.cpu_percent(interval=1, percpu=True)
    for core, pct in enumerate(per_core_load):
        print(f"CPU {core} load: {pct:.2f}")

    # System memory.
    mem = psutil.virtual_memory()
    mem_total = mem.total / bytes_per_unit
    mem_used = mem.used / bytes_per_unit
    print(f"RAM Total: {mem_total:.2f} GB, Used: {mem_used:.2f} GB")

    # GPUs are only queried when CUDA is usable; otherwise the list stays empty.
    visible_gpus = GPUtil.getGPUs() if torch.cuda.is_available() else []
    for card in visible_gpus:
        label = f"GPU {card.id} ({card.name})"
        print(f"{label} load: {card.load*100}%")
        print(f"{label} VRAM Total: {card.memoryTotal} MB, Used {card.memoryUsed} MB")

    # Root filesystem.
    root_fs = psutil.disk_usage('/')
    fs_total = root_fs.total / bytes_per_unit
    fs_used = root_fs.used / bytes_per_unit
    print(f"Disk Total: {fs_total:.2f} GB, Used: {fs_used:.2f} GB")

# Prefer the GPU when torch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f'Device: {device}')

# Report the machine's resource usage before any model is loaded.
get_vm_usage_metrics()

Device: cuda
CPU 0 load: 3.00
CPU 1 load: 1.00
CPU 2 load: 0.00
CPU 3 load: 0.00
RAM Total: 27.40 GB, Used: 1.94 GB
GPU 0 (Tesla T4) load: 0.0%
GPU 0 (Tesla T4) VRAM Total: 16384.0 MB, Used 3.0 MB
Disk Total: 60.95 GB, Used: 37.89 GB


# Data

In [4]:
# Load the raw preference data.
# NOTE(review): the file has an .xlsx extension but is parsed with the CSV
# reader; confirm the file really contains CSV text (and consider renaming it).
data = pd.read_csv('rm_data.xlsx')

# Fail fast with a readable message if a column needed to build the
# prompt / chosen / rejected triples is absent.
required_columns = ['sql_prompt', 'sql', 'completion']
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(f"rm_data.xlsx is missing required columns: {missing_columns}")

print(data.shape)
data.head()

(6943, 5)


Unnamed: 0,sql_prompt,sql_context,sql,model_used,completion
0,What is the average moisture level for each cr...,"CREATE TABLE crop_moisture (id INT, crop_id IN...","SELECT type, AVG(moisture) as avg_moisture FRO...",meta-llama/llama-4-maverick-17b-128e-instruct,"SELECT type, moisture as avg_moisture FROM cro..."
1,Add a new job title called 'Data Science Manag...,CREATE TABLE JobTitle (JobTitleID INT PRIMARY ...,"INSERT INTO JobTitle (JobTitleID, JobTitleName...",meta-llama/llama-4-maverick-17b-128e-instruct,"INSERT INTO JobTitel (JobTitleID, JobTitleName..."
2,What is the total number of military equipment...,CREATE TABLE MaintenanceRequests (RequestID IN...,SELECT COUNT(*) FROM MaintenanceRequests WHERE...,meta-llama/llama-4-scout-17b-16e-instruct,SELECT COUNT(*) FROM MaintenanceRequests WHERE...
3,Insert a new record into the 'community_educat...,"CREATE TABLE community_education (id INT, prog...","INSERT INTO community_education (id, program, ...",moonshotai/kimi-k2-instruct,"""INSERT INTO community_education (id, program,..."
4,How many users signed up daily in the 'games' ...,"CREATE TABLE signups (user_id INT, category TE...","SELECT DATE(timestamp) as signup_date, COUNT(D...",moonshotai/kimi-k2-instruct,"SELECT DATE(timestamp) as signup_date, COUNT(u..."


In [17]:
# Map the raw column names onto the field names used for the preference
# triples: the reference SQL is the preferred answer and the model completion
# is the rejected one.
pair_columns = {
    'sql_prompt': 'prompt',
    'sql': 'chosen',
    'completion': 'rejected',
}

# Build all records in one vectorised pass. Unlike a label-based row loop this
# does not depend on the frame having a 0..n-1 index and does not shadow the
# builtin `id`.
pair_records = (
    data[list(pair_columns)]
    .rename(columns=pair_columns)
    .to_dict(orient='records')
)

dataset = Dataset.from_list(pair_records)

# Fixed seed so the 90/10 train/validation split is reproducible.
split = dataset.train_test_split(test_size=0.1, seed=42)
ds_train = split['train']
ds_valid = split['test']

ds_train

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 6248
})

# Reward model (RM) training

In [24]:
# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# NOTE(review): the "lr1e5-epochs1" tag is hard-coded; keep it in sync with
# `learning_rate` and `epochs` below when either changes.
RUN_NAME = f'rm-lr1e5-epochs1-{timestamp}'
OUTPUT_DIR = './rm-output'
RESUME_TRAINING = False

PER_DEVICE_BATCH_SIZE = 2
effective_batch_size = 16
epochs = 1
learning_rate = 1e-5
warmup_ratio = 0.1

# The effective batch size must be a positive whole multiple of the per-device
# batch size. Truncating a float division would silently train with a smaller
# effective batch than the one configured and reported.
if effective_batch_size <= 0 or effective_batch_size % PER_DEVICE_BATCH_SIZE != 0:
    raise ValueError(
        f"effective_batch_size ({effective_batch_size}) must be a positive "
        f"multiple of PER_DEVICE_BATCH_SIZE ({PER_DEVICE_BATCH_SIZE})"
    )
gradient_accumulation_steps = effective_batch_size // PER_DEVICE_BATCH_SIZE

# A single cadence shared by checkpointing, evaluation and logging. Sharing the
# value keeps save_steps aligned with eval_steps, which
# load_best_model_at_end relies on.
steps_between_evals = gradient_accumulation_steps * 5

# NOTE(review): with one epoch (about 390 steps) and an evaluation every
# `steps_between_evals` steps, only about nine evaluations happen, so a
# patience of 25 can never stop this run early. Lower it if early stopping is
# actually wanted.
EARLY_STOPPING_PATIENCE = 25

wandb.init(
    project=os.environ["WANDB_PROJECT"],
    name=RUN_NAME,
    # id=run_id ,         # resume previous run if available
    # resume="allow",    # allows resuming crashed run
)

# ---------------------------------------------------------------------------
# Model and trainer
# ---------------------------------------------------------------------------
# The sequence-classification head (`score.weight`) is newly initialised on top
# of the pretrained backbone and is learned during reward training.
model = AutoModelForSequenceClassification.from_pretrained(
    "Qwen/Qwen3-0.6B",
    dtype=torch.bfloat16
)

training_args = RewardConfig(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    num_train_epochs=epochs,
    lr_scheduler_type="cosine",
    warmup_ratio=warmup_ratio,
    save_strategy="steps",
    save_steps=steps_between_evals,
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=steps_between_evals,
    logging_strategy="steps",
    logging_steps=steps_between_evals,
    report_to=['wandb'],
    run_name=RUN_NAME,
    # Lower validation loss is better; the best checkpoint is restored at the end.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    max_grad_norm=1,
    load_best_model_at_end=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False}
)

trainer = RewardTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_PATIENCE)]
)

# ---------------------------------------------------------------------------
# Training setup summary
# ---------------------------------------------------------------------------
# These figures are an estimate derived from the raw training-set length. The
# trainer filters over-length examples and does its own step accounting, so
# the numbers it reports may differ slightly.
dataset_size = len(ds_train)
steps_per_epoch = dataset_size // (PER_DEVICE_BATCH_SIZE * gradient_accumulation_steps)
total_steps = steps_per_epoch * epochs
warmup_steps = int(total_steps * warmup_ratio)

print("===== Training Setup Summary =====")
print(f"Num epochs:            {epochs}")
print(f"Effective batch size:  {effective_batch_size}")
print(f"Per-device batch size: {PER_DEVICE_BATCH_SIZE}")
print(f"Gradient accumulation: {gradient_accumulation_steps}")
print(f"Dataset size:          {dataset_size}")
print(f"Steps per epoch:       {steps_per_epoch}")
print(f"Total training steps:  {total_steps}")
print(f"Warmup steps:          {warmup_steps}")
print(f"Logging steps:         {training_args.logging_steps}")
print("===================================")
print(f"Start time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")

# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
# Only look for a checkpoint when resuming was requested and the output
# directory exists; otherwise start from the pretrained weights.
last_checkpoint = None
if RESUME_TRAINING and os.path.isdir(OUTPUT_DIR):
    last_checkpoint = get_last_checkpoint(OUTPUT_DIR)

if last_checkpoint is not None:
    print(f"Resuming training from checkpoint: {last_checkpoint}")
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    print("Starting fresh training run")
    trainer.train()

print(f"End time: {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-0.6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Adding EOS to train dataset:   0%|          | 0/6248 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/6248 [00:00<?, ? examples/s]

Filtering train >1024 tokens:   0%|          | 0/6248 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/695 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/695 [00:00<?, ? examples/s]

Filtering eval >1024 tokens:   0%|          | 0/695 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


===== Training Setup Summary =====
Num epochs:            1
Effective batch size:  16
Per-device batch size: 2
Gradient accumulation: 8
Dataset size:          6248
Steps per epoch:       390
Total training steps:  390
Warmup steps:          39
Logging steps:         40
Start time: 2026-01-11_17-26-21
Starting fresh training run


Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
40,0.6618,0.483437,72963.0,-6.53218,3.340088,10.953869,0.795387,1.083188
80,0.328,0.241077,144365.0,-10.846726,-0.73802,11.013393,0.90253,3.763621
120,0.2137,0.205987,215066.0,-9.399182,3.838975,15.412946,0.915923,6.183353
160,0.1915,0.202793,287460.0,-7.064407,7.215848,16.869048,0.919643,5.630406
200,0.2004,0.179696,359427.0,-10.120164,2.419957,14.953125,0.931548,6.654533
240,0.1424,0.166673,432846.0,-9.952381,3.237289,15.802827,0.938988,7.164189
280,0.2046,0.167084,510075.0,-10.170387,3.341889,16.078125,0.93378,7.472739
320,0.1636,0.158692,585930.0,-10.321057,2.801315,15.734375,0.940476,7.32605
360,0.1693,0.164289,658661.0,-10.338914,2.778903,15.737351,0.93378,7.324001


End time: 2026-01-11_18-15-03


In [33]:
# Export through the trainer rather than the bare model: the trainer holds the
# best checkpoint's weights (load_best_model_at_end=True) and also writes the
# tokenizer it was using, so the export directory can be reloaded on its own.
trainer.save_model(f"{OUTPUT_DIR}/best_model")

In [35]:
# Reload the exported reward model from disk. Device placement is delegated to
# "auto" when CUDA is available and left unset otherwise.
best_model_dir = f"{OUTPUT_DIR}/best_model"
placement = "auto" if torch.cuda.is_available() else None

reward_model = AutoModelForSequenceClassification.from_pretrained(
    best_model_dir,
    dtype=torch.bfloat16,
    device_map=placement,
)