# Requirements
Install the Python dependencies listed in `requirements.txt`.

In [None]:
!pip install requirements.txt

# Configurations

In [None]:
import sys, os, torch
sys.path.append(os.path.abspath(".."))

import torch
from transformers import TrainerCallback, AutoTokenizer, AutoModelForCausalLM, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, load_from_disk
from peft import LoraConfig, TaskType
from trl import SFTTrainer
import gc
from torch.utils.data import DataLoader
from dataset_helper import export_dataset, get_dataset, get_dataset_dir

# The per-process GPU memory cap below is commented out, so report what is
# actually in effect rather than claiming an 80% limit was applied.
if torch.cuda.is_available():
    print("CUDA available; per-process GPU memory cap is NOT applied")
    # torch.cuda.set_per_process_memory_fraction(0.8, device=0)


# --- Dataset ---------------------------------------------------------------
# Number of dataset rows sampled before export (-1 keeps every row).
max_dataset = 1_200_000
# Fraction of the sampled rows assigned to the training split.
train_validation_ratio = 0.9

# --- Trainer settings ------------------------------------------------------
train_batches_per_device = 8   # per-device train batch size
val_batches_per_device = 8     # per-device eval batch size
gradient_accumulation = 8      # micro-batches accumulated per optimizer step
eval_accumulation = 8
num_train_epoch = 1
learning_rate = 5e-5
# The three ratios below are multiplied by the computed number of training
# steps to derive the save / logging / evaluation intervals.
save_ratio = 0.01
log_ratio = 0.005
eval_ratio = 0.25

# --- LoRA hyper-parameters -------------------------------------------------
rank = 32
alpha = 64
dropout = 0.01

# Base checkpoint to fine-tune.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Output folder for checkpoints and the final adapter.
adapter_folder_name = "adapters/qwen2.5_0.5b_lora_half2"
# Destination JSONL files for the exported train / validation splits. Both are
# placed in the directory returned by dataset_helper.get_dataset_dir().
dataset_train_output_path = f"{get_dataset_dir()}/llm_dataset_train.jsonl"
dataset_val_output_path = f"{get_dataset_dir()}/llm_dataset_val.jsonl"

# Load Data

In [None]:
from sklearn.model_selection import train_test_split

# Load the raw dataset. The second return value is not used in this cell; it is
# bound to `dataset_dir` so the built-in `dir` is not shadowed.
df, dataset_dir = get_dataset()

# Optionally down-sample. The request is clamped to the number of available
# rows because sampling without replacement cannot return more rows than exist
# (presumably `df` is a pandas DataFrame -- confirm against dataset_helper).
if max_dataset > -1:
    df = df.sample(min(max_dataset, len(df)))

# Deterministic train / validation split.
df_train, df_val = train_test_split(
    df,
    train_size=train_validation_ratio,
    random_state=42,
    shuffle=True
)

# Export ONLY the training split to the training file. Exporting the full `df`
# here would put every validation row into the training set and make the
# sample count printed below wrong.
export_dataset(
    df_train,
    dataset_train_output_path,
    format="jsonl_state_action",
    completion_mode="short",
    include_pos_rot=False
)

export_dataset(
    df_val,
    dataset_val_output_path,
    format="jsonl_state_action",
    completion_mode="short",
    include_pos_rot=False
)

print(f"Saved {len(df_train)} samples to {dataset_train_output_path}")
print(f"Saved {len(df_val)} samples to {dataset_val_output_path}")

# Fine-tuning with LoRA

In [None]:
# Pick the compute backend in priority order: Apple MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    _backend = "mps"
elif torch.cuda.is_available():
    _backend = "cuda"
else:
    _backend = "cpu"

device = torch.device(_backend)

# For CUDA the map value is the torch.device object (and the map is echoed);
# for MPS / CPU it is the backend name as a string.
if _backend == "cuda":
    device_map = {"": device}
    print(device_map)
else:
    device_map = {"": _backend}

print(f"Using device: {device}")

# FIXED: Proper config for 4-bit quantization
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,  # Changed to True for better memory efficiency
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )

# Tokenizer for the base checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Fixed-length padding in tokenize() needs a pad token; fall back to EOS when
# the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Base model in bfloat16, placed automatically on the available device(s).
# NOTE(review): the `device_map` computed during device detection is not used
# here; "auto" is passed instead -- confirm that is intended.
_model_load_kwargs = {
    "device_map": "auto",
    "torch_dtype": torch.bfloat16,
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(model_name, **_model_load_kwargs)

# LoRA adapter applied to the attention projection layers only.
_attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=rank,
    lora_alpha=alpha,
    lora_dropout=dropout,
    target_modules=_attention_projections,
    bias="none",
)

# On-disk location of the tokenized dataset cache.
processed_path = "processed_sumobot_dataset"

def _example_to_text(example):
    """Render one dataset row as the text to be tokenized.

    Supported row layouts, checked in this order:
      * "messages"         -> rendered with the tokenizer's chat template
      * "text"             -> used verbatim
      * "state" + "action" -> wrapped in a hand-written ChatML-style prompt

    Raises:
        ValueError: if the row matches none of the layouts above.
    """
    if "messages" in example:
        return tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
            add_generation_prompt=False
        )

    if "text" in example:
        return example["text"]

    if "state" in example and "action" in example:
        return f"""<|im_start|>system
You are a Sumobot assistant.<|im_end|>
<|im_start|>user
sumobot state: {example['state']}<|im_end|>
<|im_start|>assistant
action: {example['action']}<|im_end|>"""

    raise ValueError("Example must contain 'messages', 'text', or 'state'+'action'")


def tokenize(example):
    """Tokenize a single dataset row to a fixed length of 256 tokens.

    The row is rendered to text, then truncated / padded to exactly 256
    tokens. "labels" is a copy of "input_ids", so padded positions are part
    of the labels produced here.

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    encoded = tokenizer(
        _example_to_text(example),
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

# Reuse the tokenized dataset cached on disk when it exists; otherwise build it
# from the exported JSONL files and cache it for the next run.
# NOTE(review): the cache is keyed only by its path, so it is reused even when
# the exported JSONL files have changed -- delete the folder to force a rebuild.
if not os.path.exists(processed_path):
    print("üìä Loading and tokenizing dataset...")
    _split_files = {
        "train": dataset_train_output_path,
        "val": dataset_val_output_path
    }
    dataset = load_dataset("json", data_files=_split_files)

    # One row per tokenize() call (batched=False); the `datasets` map cache is
    # bypassed so tokenization is always recomputed here.
    tokenized_datasets = dataset.map(
        tokenize,
        batched=False,
        load_from_cache_file=False,
    )

    print("üíæ Saving tokenized dataset to disk...")
    tokenized_datasets.save_to_disk(processed_path)

    # Drop the reference to the raw dataset and collect garbage.
    del dataset
    gc.collect()
else:
    print(f"‚úÖ Found cached dataset at {processed_path}, loading...")
    tokenized_datasets = load_from_disk(processed_path)

print(f"Train samples: {len(tokenized_datasets['train'])}")
print(f"Val samples: {len(tokenized_datasets['val'])}")

# Batches per epoch: ceil(rows / batch size). This is the same number len()
# reports for a DataLoader that keeps its final partial batch.
_train_rows = len(tokenized_datasets["train"])
train_length = -(-_train_rows // train_batches_per_device)

# Step count used to derive the save / log / eval intervals: whole
# accumulation groups per epoch, times the number of epochs.
train_num = (train_length // gradient_accumulation) * num_train_epoch
print(f"train length: {train_num}")

gc.collect()

# Trainer configuration, built from the values defined in the configuration
# cell and the step count computed above.
training_args = TrainingArguments(
    output_dir=adapter_folder_name,

    # Batch sizes
    per_device_train_batch_size=train_batches_per_device,
    per_device_eval_batch_size=val_batches_per_device,
    gradient_accumulation_steps=gradient_accumulation,
    eval_accumulation_steps=eval_accumulation,

    # Recompute activations during the backward pass to lower peak memory.
    gradient_checkpointing=True,

    # Precision: bf16, matching the dtype the base model is loaded in.
    fp16=False,
    bf16=True,

    # Training schedule
    learning_rate=learning_rate,
    num_train_epochs=num_train_epoch,
    warmup_steps=100,

    # Checkpointing & logging
    save_strategy="steps",
    save_steps=max(50, int(train_num * save_ratio)),
    logging_strategy="steps",
    logging_steps=max(10, int(train_num * log_ratio)),
    logging_dir=f"{adapter_folder_name}/outputs",
    eval_strategy="steps",
    # Clamp to at least 1: for a small train_num the product truncates to 0,
    # which is not a usable evaluation interval.
    eval_steps=max(1, int(train_num * eval_ratio)),

    # Other
    report_to="tensorboard",
    dataloader_num_workers=0,

    # The best checkpoint is not reloaded when training finishes.
    load_best_model_at_end=False,
)

# Collator for causal language modelling (no masked-LM objective).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# The generation KV cache is turned off because gradient checkpointing is
# enabled in the training arguments.
model.config.use_cache = False

# Free as much host / device memory as possible before training starts.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

_footprint_gb = model.get_memory_footprint() / 1e9
print(f"üîß Model memory footprint: {_footprint_gb:.2f} GB")

class CustomEarlyStopping(TrainerCallback):
    """Stop training once `eval_loss` has stopped improving.

    An evaluation counts as an improvement only when its loss is lower than
    the best loss seen so far by more than `min_delta`. After `patience`
    consecutive evaluations without such an improvement,
    `control.should_training_stop` is set to True.

    Args:
        patience: Number of consecutive non-improving evaluations tolerated.
        min_delta: Minimum decrease in eval_loss that counts as an improvement.
    """

    def __init__(self, patience=5, min_delta=0.1):
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = float("inf")  # lowest eval_loss observed so far
        self.counter = 0               # consecutive evaluations without improvement

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Update the patience counter from the latest evaluation metrics.

        Returns `control` unchanged when no `eval_loss` is available.
        """
        # `metrics` defaults to None, so guard before calling .get() on it.
        val_loss = (metrics or {}).get("eval_loss")
        if val_loss is None:
            return control

        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            print(f"‚úÖ Improved eval_loss to {val_loss:.4f}")
        else:
            self.counter += 1
            print(f"‚ö†Ô∏è No significant improvement. Counter {self.counter}/{self.patience}")

        if self.counter >= self.patience:
            print("üõë Early stopping triggered!")
            control.should_training_stop = True

        return control

# Memory monitoring callback
class MemoryMonitorCallback(TrainerCallback):
    """Print host and GPU memory usage on every 50th global step."""

    def on_step_end(self, args, state, control, **kwargs):
        """Report the process RSS and allocated CUDA memory, both in GB."""
        if state.global_step % 50 != 0:
            return control

        # Imported lazily, only on the steps that actually report.
        import psutil

        rss_gb = psutil.Process().memory_info().rss / 1e9
        if torch.cuda.is_available():
            cuda_gb = torch.cuda.memory_allocated() / 1e9
        else:
            cuda_gb = 0
        print(f"üìä Step {state.global_step} | CPU RAM: {rss_gb:.2f}GB | GPU RAM: {cuda_gb:.2f}GB")
        return control

# Callbacks: early stopping on eval_loss plus periodic memory reporting.
_trainer_callbacks = [
    CustomEarlyStopping(patience=10, min_delta=0.01),
    MemoryMonitorCallback(),
]

# Supervised fine-tuning trainer; the LoRA adapter is attached via peft_config.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    peft_config=lora_config,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    data_collator=data_collator,
    callbacks=_trainer_callbacks,
)

def _find_latest_checkpoint(folder):
    """Return the highest-numbered `checkpoint-<step>` directory in `folder`.

    Only sub-directories whose name is exactly `checkpoint-` followed by
    digits are considered, so stray files or directories such as
    `checkpoint-tmp` are ignored instead of raising on the int() conversion.

    Returns:
        The path of the latest checkpoint directory, or None when `folder`
        does not exist or contains no checkpoint.
    """
    if not os.path.isdir(folder):
        return None

    best_step, best_path = -1, None
    for entry in os.listdir(folder):
        prefix, sep, step = entry.partition("-")
        candidate = os.path.join(folder, entry)
        if prefix != "checkpoint" or not sep or not step.isdigit():
            continue
        if not os.path.isdir(candidate):
            continue
        if int(step) > best_step:
            best_step, best_path = int(step), candidate
    return best_path


# Enable resume from checkpoint
latest_ckpt = _find_latest_checkpoint(adapter_folder_name)
if latest_ckpt:
    print(f"üìÇ Resuming from: {latest_ckpt}")

# Run training, resuming from the latest checkpoint when one was found, then
# store the adapter and tokenizer. Any failure is reported with a traceback
# instead of propagating.
print("üöÄ Starting training...")
try:
    # Passing None is the same as calling train() without arguments.
    trainer.train(resume_from_checkpoint=latest_ckpt if latest_ckpt else None)

    # Persist the LoRA adapter and tokenizer side by side.
    print("üíæ Saving model...")
    for _artifact in (trainer.model, tokenizer):
        _artifact.save_pretrained(adapter_folder_name)
    print("‚úÖ Training complete!")

except Exception as e:
    print(f"‚ùå Training failed with error: {e}")
    import traceback
    traceback.print_exc()

# Testing

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Tokenizer of the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Base model with the trained LoRA adapter loaded on top, then merged so the
# result behaves like a plain standalone model.
_base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",   # or "cuda" if you have NVIDIA
    torch_dtype="auto"
)
_adapted_model = PeftModel.from_pretrained(_base_model, adapter_folder_name)
model = _adapted_model.merge_and_unload()

# Single-turn conversation used as a smoke test.
# NOTE(review): the system / user wording here differs from the prompt that
# tokenize() builds for state+action training rows ("You are a Sumobot
# assistant." / "sumobot state: ...") -- confirm the mismatch is intended.
_system_turn = {"role": "system", "content": "You are a Sumobot assistant that decides actions based on game state."}
_user_turn = {"role": "user", "content": "Given this game state: AngleToEnemy=8.11, AngleToEnemyScore=0.99, DistanceToEnemyScore=0.81, NearBorderArenaScore=0.19, FacingToArena=-0.98."}
messages = [_system_turn, _user_turn]

# Render with the tokenizer's chat template, ending with the assistant header
# so the model continues with its reply.
chat_prompt = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

inputs = tokenizer(chat_prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128)

# Decodes the whole sequence (prompt + completion).
_decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(_decoded)

# Save merged model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Tokenizer of the base checkpoint; it is saved next to the merged weights.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reload the base model, attach the trained LoRA adapter and merge it into
# the base weights.
_base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",   # or "cuda" if you have NVIDIA
    torch_dtype="auto"
)
_adapted_model = PeftModel.from_pretrained(_base_model, adapter_folder_name)
model = _adapted_model.merge_and_unload()

# Output folder for the merged model (safetensors) and its tokenizer.
save_path = "qwen2.5-0.5b-instruct-sumobot-merged"

model.save_pretrained(save_path, safe_serialization=True)
tokenizer.save_pretrained(save_path)

# Run Inference

Enhance the dataset by running inference with our fine-tuned model.

In [None]:
import json
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

def _build_inference_prompt(state):
    """Return the generation prompt for one state string.

    The system line and the "sumobot state: " prefix are the same ones
    tokenize() uses for state+action training rows, so inference sees the
    format the model was fine-tuned on. The prompt stops after "action:" so
    the model only has to produce the action text.
    """
    return f"""<|im_start|>system
You are a Sumobot assistant.<|im_end|>
<|im_start|>user
sumobot state: {state}<|im_end|>
<|im_start|>assistant
action:"""


print(f"Loading model: {save_path}...")
tokenizer = AutoTokenizer.from_pretrained(save_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    save_path,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)

input_files = [dataset_train_output_path, dataset_val_output_path]
output_file = "enhanced_dataset.jsonl"

print(f"Processing {len(input_files)} file(s)...\n")
processed = 0
failed = 0  # rows that raised during parsing or generation

# Explicit UTF-8 so the result does not depend on the platform default encoding.
with open(output_file, 'w', encoding="utf-8") as outfile:
    for input_file in input_files:
        if not os.path.exists(input_file):
            print(f"Skipping {input_file} (not found)")
            continue

        with open(input_file, 'r', encoding="utf-8") as infile:
            for line in tqdm(infile, desc=os.path.basename(input_file)):
                if not (line := line.strip()):
                    continue

                # Best effort: a failing row is reported and counted, and the
                # run continues with the next row.
                try:
                    state = json.loads(line).get('state', '')
                    prompt = _build_inference_prompt(state)

                    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
                    outputs = model.generate(
                        **inputs,
                        max_new_tokens=32,
                        temperature=0.1,
                        top_p=0.95,
                        do_sample=True
                    )

                    # Keep only the newly generated tokens (drop the prompt).
                    prompt_len = inputs['input_ids'].shape[1]
                    action = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
                    outfile.write(json.dumps({"state": state, "action": action}) + '\n')
                    processed += 1

                    # Flush periodically so partial results survive an interruption.
                    if processed % 100 == 0:
                        outfile.flush()

                except Exception as e:
                    failed += 1
                    print(f"Error: {e}")

print(f"\nDone! Processed: {processed:,} -> {output_file}")
if failed:
    print(f"Failed rows: {failed:,}")

# Vector Database Workflow

Set up Milvus → Import Data → Query → Spawn API

## Setup Milvus Collection

Milvus Lite runs in-process and stores its data in a local file.
```bash
pip install pymilvus
```

In [None]:
import sys
import os
# Make the parent directory importable, as the configuration cell does.
# Appending the `os.path` module object itself is not a valid sys.path entry.
_parent_dir = os.path.abspath("..")
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility

def detect_gpu():
    """Detect whether an NVIDIA GPU with CUDA is available.

    Two probes are tried in order: NVML (via `pynvml`), then PyTorch's CUDA
    runtime. Each probe is best effort: a missing package or a driver error
    simply moves on to the next probe.

    Returns:
        True if either probe finds a GPU, otherwise False.
    """
    # Probe 1: NVML.
    try:
        import pynvml
        pynvml.nvmlInit()
        try:
            if pynvml.nvmlDeviceGetCount() > 0:
                handle = pynvml.nvmlDeviceGetHandleByIndex(0)
                name = pynvml.nvmlDeviceGetName(handle)
                # Decode in case the binding returns the name as bytes.
                if isinstance(name, bytes):
                    name = name.decode("utf-8", errors="replace")
                print(f"‚úÖ Detected GPU: {name}")
                return True
        finally:
            # Always release NVML, including when no device was found.
            pynvml.nvmlShutdown()
    except Exception:
        # Deliberate best effort; unlike a bare `except:` this no longer
        # swallows KeyboardInterrupt / SystemExit.
        pass

    # Probe 2: PyTorch CUDA runtime.
    try:
        import torch
        if torch.cuda.is_available():
            print(f"‚úÖ Detected GPU via PyTorch: {torch.cuda.get_device_name(0)}")
            return True
    except Exception:
        pass

    print("No NVIDIA GPU detected, using CPU mode")
    return False

def get_index_config(has_gpu=None):
    """Return `(index_params, is_gpu)` for the Milvus vector index.

    Args:
        has_gpu: Whether to use the GPU index. When None, the value is
            obtained from detect_gpu().

    Returns:
        A tuple of the index parameter dict and a bool telling whether the
        GPU variant was chosen. GPU: GPU_IVF_FLAT with nlist=1024;
        CPU: IVF_FLAT with nlist=128. Both use the L2 metric.

    NOTE(review): confirm the Milvus deployment in use accepts a GPU index
    type before relying on the GPU branch.
    """
    use_gpu = detect_gpu() if has_gpu is None else has_gpu
    index_type, nlist = ("GPU_IVF_FLAT", 1024) if use_gpu else ("IVF_FLAT", 128)
    index_params = {
        "index_type": index_type,
        "metric_type": "L2",
        "params": {"nlist": nlist},
    }
    return index_params, bool(use_gpu)

def get_search_params(has_gpu=None):
    """Return the search parameters for the current hardware.

    Args:
        has_gpu: Whether a GPU is used. When None, the value is obtained
            from detect_gpu().

    Returns:
        `{"nprobe": 64}` with a GPU, `{"nprobe": 16}` without one.
    """
    if has_gpu is None:
        has_gpu = detect_gpu()

    nprobe = 16
    if has_gpu:
        nprobe = 64
    return {"nprobe": nprobe}

# Open (or create) the local Milvus Lite database file.
connections.connect(uri="./milvus_sumobot.db")
print("‚úÖ Connected to Milvus Lite")

# Start from a clean slate: drop a previous copy of the collection if present.
col_name = "sumobot_states"
_existing_collections = utility.list_collections()
if col_name in _existing_collections:
    Collection(col_name).drop()
    print(f"üóëÔ∏è  Dropped existing collection")

# Schema: auto-generated primary key, 5-dimensional state vector, action label.
_id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)
_vector_field = FieldSchema(name="state_vec", dtype=DataType.FLOAT_VECTOR, dim=5)
_action_field = FieldSchema(name="action", dtype=DataType.VARCHAR, max_length=64)
fields = [_id_field, _vector_field, _action_field]

schema = CollectionSchema(fields, description="Sumobot state ‚Üí action mapping")
col = Collection(name=col_name, schema=schema)
print(f"‚úÖ Created collection '{col_name}'")

# Build the vector index; the GPU variant is chosen when a GPU is detected.
index_params, is_gpu = get_index_config()
col.create_index(field_name="state_vec", index_params=index_params)
print(f"‚úÖ Created {'GPU' if is_gpu else 'CPU'} index")

# Load the collection so it can be searched.
col.load()
print("üöÄ Collection loaded")

## Import Enhanced Data

In [None]:
import numpy as np

def encode_state(state_str):
    """Convert a textual state into the 5-dimensional float32 search vector.

    The input is a comma-separated list of `Name=value` pairs, optionally
    ending with a period, for example::

        AngleToEnemy=8.11, AngleToEnemyScore=0.99, DistanceToEnemyScore=0.81,
        NearBorderArenaScore=0.19, FacingToArena=-0.98.

    Parsing tolerates surrounding whitespace and any spacing around the
    commas and "=" signs.

    Args:
        state_str: The state description.

    Returns:
        np.ndarray of dtype float32 holding, in order: AngleToEnemy / 180,
        AngleToEnemyScore, DistanceToEnemyScore, NearBorderArenaScore,
        FacingToArena.

    Raises:
        KeyError: if one of the five fields is missing.
        ValueError: if a pair has no "=" or a value is not a number.
    """
    parts = {}
    for item in state_str.strip().strip('.').split(','):
        item = item.strip()
        if not item:
            continue  # tolerate a stray or trailing comma
        key, sep, value = item.partition('=')
        if not sep:
            raise ValueError(f"Malformed state item (expected Name=value): {item!r}")
        parts[key.strip()] = value.strip()

    return np.array([
        float(parts["AngleToEnemy"]) / 180.0,  # scale the angle by 1/180
        float(parts["AngleToEnemyScore"]),
        float(parts["DistanceToEnemyScore"]),
        float(parts["NearBorderArenaScore"]),
        float(parts["FacingToArena"]),
    ], dtype=np.float32)

BATCH_SIZE = 5000
# Byte limit of the "action" VARCHAR field; must match the max_length declared
# in the collection schema.
MAX_ACTION_BYTES = 64

batch_vecs, batch_actions = [], []
skipped = 0  # rows that could not be parsed or do not fit the schema

with open(output_file, "r", encoding="utf-8") as f:
    for line in tqdm(f, desc="Importing to Milvus"):
        line = line.strip()
        if not line:
            continue  # ignore blank lines

        # Validate each row up front so one bad row cannot abort the import
        # or make a whole batch insert fail.
        try:
            item = json.loads(line)
            vec = encode_state(item["state"]).tolist()
            action = item["action"]
        except (KeyError, TypeError, ValueError):
            # json.JSONDecodeError is a ValueError subclass.
            skipped += 1
            continue

        if not isinstance(action, str) or len(action.encode("utf-8")) > MAX_ACTION_BYTES:
            skipped += 1
            continue

        batch_vecs.append(vec)
        batch_actions.append(action)

        if len(batch_vecs) >= BATCH_SIZE:
            col.insert([batch_vecs, batch_actions])
            batch_vecs, batch_actions = [], []

# Insert the final partial batch.
if batch_vecs:
    col.insert([batch_vecs, batch_actions])

col.flush()
print(f"‚úÖ Imported {col.num_entities:,} entities to Milvus")
if skipped:
    print(f"Skipped {skipped:,} invalid rows")

## Test API

In [None]:
import subprocess

import sys
import time
import urllib.error
import urllib.request

# Spawn the API in the background with the same interpreter as this kernel,
# so it runs in the environment where the dependencies were installed.
api_process = subprocess.Popen(
    [sys.executable, "api.py", "--port", "9999", "--workers", "5"]
)
print(f"‚úÖ API spawned (PID: {api_process.pid})")
print("üìö Docs: http://localhost:9999/docs")
print("üîç Health: http://localhost:9999/health")

try:
    # Poll the health endpoint until it answers, the process exits, or 30 s
    # pass. Assumes /health answers a plain GET with HTTP 200 -- TODO confirm
    # against api.py.
    deadline = time.monotonic() + 30
    healthy = False
    while time.monotonic() < deadline and api_process.poll() is None:
        try:
            with urllib.request.urlopen("http://localhost:9999/health", timeout=2) as resp:
                healthy = resp.status == 200
        except (urllib.error.URLError, OSError):
            healthy = False  # server not accepting connections yet
        if healthy:
            break
        time.sleep(0.5)
    print(f"Health check: {'OK' if healthy else 'FAILED'}")
finally:
    # Always stop the server and reap the process so no orphan is left behind.
    api_process.terminate()
    try:
        api_process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        api_process.kill()
        api_process.wait()

## Spawn API Server

Run in terminal:
```bash
python api.py --port 9999 --workers 5
```

## Benchmark Performance

In [None]:
import time

import numpy as np

# Detect the hardware once and derive the matching search parameters.
HAS_GPU = detect_gpu()
SEARCH_PARAMS = get_search_params(HAS_GPU)

_mode_label = 'GPU üöÄ' if HAS_GPU else 'CPU'
print(f"Mode: {_mode_label}")

def query_action(angle, angle_score, dist_score, near_score, facing):
    """Look up the action stored for the nearest state in the collection.

    The five inputs are packed into the same vector layout encode_state()
    produces (the angle is divided by 180), then the single nearest
    neighbour is searched in the "state_vec" field.

    Returns:
        A list with the "action" value of each hit (at most one, since
        limit=1).
    """
    vec = np.array([angle/180.0, angle_score, dist_score, near_score, facing], dtype=np.float32)

    # Collection.search documents `param` as
    # {"metric_type": ..., "params": {...}}. SEARCH_PARAMS is a flat
    # {"nprobe": n} dict, so wrap it unless it already has that shape.
    # "L2" is the metric used by get_index_config().
    if "params" in SEARCH_PARAMS:
        search_param = SEARCH_PARAMS
    else:
        search_param = {"metric_type": "L2", "params": dict(SEARCH_PARAMS)}

    result = col.search(
        data=[vec],
        anns_field="state_vec",
        param=search_param,
        limit=1,
        output_fields=["action"],
    )
    actions = [hit.entity.get("action") for hit in result[0]]
    return actions

# Time a single query. perf_counter() is a monotonic high-resolution clock,
# so the measurement is not affected by system clock adjustments.
test_state = (63.55, 0.45, 0.81, 0.18, -0.48)
start = time.perf_counter()
action = query_action(*test_state)
query_time = (time.perf_counter() - start) * 1000  # milliseconds

print(f"üéÆ Action: {action}")
print(f"‚è±Ô∏è  Time: {query_time:.2f}ms")

In [None]:
def _percentile(sorted_values, pct):
    """Return the value at the `pct`-th percentile of an ascending list.

    The index is `pct * n // 100`, clamped to the last element. For n = 100
    this selects indices 50, 95 and 99 for p50, p95 and p99.
    """
    index = min(len(sorted_values) - 1, (pct * len(sorted_values)) // 100)
    return sorted_values[index]


# Number of timed queries; the percentile indices follow from this value.
NUM_QUERIES = 100

print(f"üî• Running benchmark ({NUM_QUERIES} queries)...")
times = []
for _ in tqdm(range(NUM_QUERIES)):
    # perf_counter() is monotonic and high-resolution, unlike time.time().
    start = time.perf_counter()
    query_action(*test_state)
    times.append((time.perf_counter() - start) * 1000)  # milliseconds

times_sorted = sorted(times)
print(f"Average: {sum(times)/len(times):.2f}ms")
print(f"p50: {_percentile(times_sorted, 50):.2f}ms")
print(f"p95: {_percentile(times_sorted, 95):.2f}ms")
print(f"p99: {_percentile(times_sorted, 99):.2f}ms")

## Ready to be consumed by the Sumobot LLM Agent (Unity)