In [1]:
!pip install -q "transformers>=4.40.0" "accelerate>=0.30.0" "peft>=0.10.0" \
               "bitsandbytes>=0.42.0" "datasets>=2.19.0" "huggingface_hub>=0.23.0" \
               openai

import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from huggingface_hub import login

# Base model
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Sequence length and dataset sizes (we will use more data than before)
MAX_LENGTH = 256
TRAIN_SAMPLES = 5000   # can be increased later if GPU allows
VAL_SAMPLES = 1000

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)


Device: cuda


In [10]:
!pip install --upgrade pip
!pip install "flash-attn>=2.6.0" --no-build-isolation


Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3
Collecting flash-attn>=2.6.0
  Downloading flash_attn-2.8.3.tar.gz (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m36.2 MB/s[0m  [33m0:00:00[0m
[?25h  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (pyproject.toml) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.8.3-cp312-cp312-linux_x86_64.whl size=256040057 sha256=f25da18657a87fc83dc1bfb8b7751b82246e9db355510226b674fd437c34

In [2]:
import os
import json
from huggingface_hub import login
from openai import OpenAI

SECRETS_PATH = "/content/secrets.json"

with open(SECRETS_PATH, "r") as f:
    secrets = json.load(f)

HF_TOKEN = secrets["HF_TOKEN"]
OPENAI_API_KEY = secrets["OPENAI_API_KEY"]

# Login / set env
login(HF_TOKEN)
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

print("Secrets loaded from file and clients configured.")


Secrets loaded from file and clients configured.


In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load Alpaca-style dataset
raw_dataset = load_dataset("tatsu-lab/alpaca")

# Take first TRAIN_SAMPLES + VAL_SAMPLES examples
total_needed = TRAIN_SAMPLES + VAL_SAMPLES
train_raw = raw_dataset["train"].select(range(total_needed))

train_split = train_raw.select(range(TRAIN_SAMPLES))
val_split = train_raw.select(range(TRAIN_SAMPLES, total_needed))

print("Raw train size:", len(raw_dataset["train"]))
print("Used train_split size:", len(train_split))
print("Used val_split size:", len(val_split))

# Load tokenizer (LLaMA 3.1)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=HF_TOKEN,
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

def format_example(example):
    instr = example["instruction"]
    inp = example.get("input", "")
    out = example["output"]

    if inp and len(inp.strip()) > 0:
        prompt = f"Instruction: {instr}\nInput: {inp}\nAnswer:"
    else:
        prompt = f"Instruction: {instr}\nAnswer:"

    full_text = prompt + " " + out
    return full_text

def tokenize_function(example):
    text = format_example(example)
    tokens = tokenizer(
        text,
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_train = train_split.map(
    tokenize_function,
    batched=False,
    remove_columns=train_split.column_names,
)

tokenized_val = val_split.map(
    tokenize_function,
    batched=False,
    remove_columns=val_split.column_names,
)

print("Tokenized train size:", len(tokenized_train))
print("Tokenized val size:", len(tokenized_val))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Raw train size: 52002
Used train_split size: 5000
Used val_split size: 1000


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenized train size: 5000
Tokenized val size: 1000


In [11]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
import torch

def get_base_model():
    """
    Load LLaMA 3.1 in bf16 on a single A100 GPU.
    Try to use FlashAttention2; if not available, fall back to eager attention.
    """
    try:
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            token=HF_TOKEN,
            torch_dtype=torch.bfloat16,      # bf16 for A100
            device_map={"": 0},              # full model on GPU 0
            attn_implementation="flash_attention_2",
        )
        print("[get_base_model] Using flash_attention_2.")
    except Exception as e:
        print("[get_base_model] FlashAttention2 not available, falling back to eager attention.")
        print("  Reason:", repr(e))
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            token=HF_TOKEN,
            torch_dtype=torch.bfloat16,
            device_map={"": 0},
            attn_implementation="eager",
        )

    model.config.use_cache = False
    return model


def make_lora_model(base_model, lora_r=16, lora_alpha=32, lora_dropout=0.1):
    lora_config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
    )
    model = get_peft_model(base_model, lora_config)
    return model

print("Model helpers defined (bf16 + FlashAttention2 when available + LoRA wrapper).")


Model helpers defined (bf16 + FlashAttention2 when available + LoRA wrapper).


In [13]:
import numpy as np
from transformers import TrainingArguments, Trainer, default_data_collator

def train_and_eval(hparams):
    """
    hparams expected keys:
      - lr
      - batch_size
      - epochs
      - warmup_ratio
      - lora_r
      - lora_alpha
      - lora_dropout
      - trial_id
    """

    print(f"[train_and_eval] Starting trial: {hparams['trial_id']}")
    print("[train_and_eval] Hyperparameters:", hparams)

    base_model = get_base_model()
    model = make_lora_model(
        base_model,
        lora_r=hparams["lora_r"],
        lora_alpha=hparams["lora_alpha"],
        lora_dropout=hparams["lora_dropout"],
    )

    training_args = TrainingArguments(
        output_dir=f"./outputs/{hparams['trial_id']}",
        per_device_train_batch_size=hparams["batch_size"],
        per_device_eval_batch_size=hparams["batch_size"],
        num_train_epochs=hparams["epochs"],
        learning_rate=hparams["lr"],
        warmup_ratio=hparams["warmup_ratio"],
        logging_steps=50,
        save_strategy="no",
        evaluation_strategy="steps",
        eval_steps=200,
        bf16=torch.cuda.is_available(),
        gradient_accumulation_steps=1,
        remove_unused_columns=False,
        report_to=[],
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
    )

    train_result = trainer.train()
    eval_metrics = trainer.evaluate()

    eval_loss = float(eval_metrics.get("eval_loss", np.nan))
    perplexity = float(np.exp(eval_loss)) if eval_loss and eval_loss < 20 else float("inf")
    eval_metrics["perplexity"] = perplexity

    print(f"[train_and_eval] Finished trial: {hparams['trial_id']}")
    print("[train_and_eval] eval_loss:", eval_loss)
    print("[train_and_eval] perplexity:", perplexity)

    return eval_metrics


In [14]:
import json
import numpy as np

def _call_agent(system_msg, user_msg, model_name="gpt-4o", temperature=0.4):
    completion = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        temperature=temperature,
    )
    content = completion.choices[0].message.content.strip()
    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        start = content.find("{")
        end = content.rfind("}")
        if start != -1 and end != -1:
            data = json.loads(content[start:end + 1])
        else:
            raise ValueError("Agent reply was not valid JSON: " + content)
    return data


def seal_effect_summary(history):
    seal_steps = [h for h in history if h.get("kind") == "seal" and h.get("seal_info")]
    train_steps = [h for h in history if h.get("kind") == "train" and h.get("metrics")]

    if not seal_steps or not train_steps:
        return "No SEAL runs yet; no refinement effect to analyze."

    last_seal = seal_steps[-1]["seal_info"]
    base_ep = last_seal["base_episode"]
    base_train = None
    for t in train_steps:
        if t["episode"] == base_ep:
            base_train = t
            break

    if base_train is None:
        return "SEAL runs exist but base TRAIN episode was not found in history."

    seal_loss = float(last_seal["seal_metrics"]["eval_loss"])
    seal_ppl = float(last_seal["seal_metrics"]["perplexity"])
    train_loss = float(base_train["metrics"]["eval_loss"])
    train_ppl = float(base_train["metrics"]["perplexity"])

    delta_loss = train_loss - seal_loss
    delta_ppl = train_ppl - seal_ppl

    if delta_loss > 0:
        return (
            f"SEAL improved eval_loss from {train_loss:.4f} to {seal_loss:.4f} "
            f"and perplexity from {train_ppl:.4f} to {seal_ppl:.4f} "
            f"(delta_loss={delta_loss:.4f}, delta_ppl={delta_ppl:.4f})."
        )
    elif delta_loss < 0:
        return (
            f"SEAL worsened eval_loss from {train_loss:.4f} to {seal_loss:.4f} "
            f"and perplexity from {train_ppl:.4f} to {seal_ppl:.4f} "
            f"(delta_loss={delta_loss:.4f}, delta_ppl={delta_ppl:.4f})."
        )
    else:
        return (
            f"SEAL left eval_loss and perplexity unchanged at loss={train_loss:.4f}, "
            f"ppl={train_ppl:.4f} (delta_loss={delta_loss:.4f}, delta_ppl={delta_ppl:.4f})."
        )


def tuner_agent(history, judge_feedback=None, reward_feedback=None, seal_summary=None):
    history_text = json.dumps(history, ensure_ascii=False)
    judge_feedback = judge_feedback or ""
    reward_feedback = reward_feedback or ""
    seal_summary = seal_summary or ""

    system_msg = (
        "You are the main Tuner/Guidance Agent, the lead ML engineer (AI agent in the loop) "
        "responsible for proposing hyperparameters for LLaMA 3.1 LoRA fine-tuning and a SEAL plan. "
        "You must return ONLY a JSON object."
    )

    user_msg = f"""
You are the lead agent coordinating hyperparameter tuning and SEAL configuration.

You receive:
- Full TRAIN + SEAL history (with metrics):
{history_text}

- Judge feedback:
{judge_feedback}

- Reward-Penalty Agent feedback:
{reward_feedback}

- Summary of SEAL effect so far:
{seal_summary}

Your tasks:
1) Comment briefly on how training is going.
2) Propose a TRAIN configuration (hyperparameters) for the next round.
3) Propose a SEAL plan (how many epochs to run in the next SEAL stage).

Constraints for TRAIN:
- lr between 1e-5 and 5e-4
- batch_size in [4,8,12]
- epochs between 1 and 2
- warmup_ratio between 0.0 and 0.2
- lora_r in [8, 16, 32]
- lora_alpha in [16, 32, 64]
- lora_dropout between 0.0 and 0.2

Return ONLY valid JSON with this structure:
{{
  "say": "Short natural language explanation from the Tuner Agent.",
  "hparams": {{
    "lr": 1e-4,
    "batch_size": 2,
    "epochs": 1,
    "warmup_ratio": 0.1,
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.1
  }},
  "seal_plan": {{
    "seal_epochs": 1
  }}
}}
"""

    return _call_agent(system_msg, user_msg)


def judge_agent(history):
    history_text = json.dumps(history, ensure_ascii=False)

    system_msg = (
        "You are the Judge Agent, responsible for deciding whether to continue or stop the tuning "
        "process based on metrics. You must return ONLY JSON."
    )

    user_msg = f"""
You receive the full history of TRAIN and SEAL steps:

{history_text}

Each TRAIN step has metrics including eval_loss and perplexity.
Each SEAL step has refinement metrics.

Your job:
- Decide if we should CONTINUE another round (TRAIN + SEAL) or STOP because the model is good enough
  or metrics have plateaued / started to degrade.
- Provide feedback that the Tuner, Reward-Penalty, and SEAL Expert agents can use.

Hard rule:
- Do NOT recommend stopping before at least 3 episodes, unless there is a clearly strong degradation
  in eval_loss or perplexity.

Return ONLY valid JSON with this structure:
{{
  "say": "Short natural language explanation from the Judge Agent.",
  "continue": true,
  "reason": "Short justification of the decision.",
  "feedback": "Actionable feedback for the other agents."
}}
"""

    return _call_agent(system_msg, user_msg)


def reward_penalty_agent(history):
    train_steps = [h for h in history if h.get("kind") == "train" and h.get("metrics")]
    seal_steps = [h for h in history if h.get("kind") == "seal" and h.get("seal_info")]

    if not train_steps:
        return {
            "say": "No training history yet; neutral reward and no overrides.",
            "reward_signal": {
                "train_improvement": 0.0,
                "seal_improvement": 0.0,
                "total_improvement": 0.0,
                "last_train_loss": None,
                "best_train_loss": None,
            },
            "override": {}
        }

    last_train = train_steps[-1]
    last_loss = float(last_train["metrics"]["eval_loss"])
    last_hparams = last_train["hparams"]

    if len(train_steps) == 1:
        train_improvement = 0.0
    else:
        prev_train = train_steps[-2]
        prev_loss = float(prev_train["metrics"]["eval_loss"])
        train_improvement = prev_loss - last_loss

    if seal_steps:
        last_seal = seal_steps[-1]["seal_info"]
        base_ep = last_seal["base_episode"]
        base_train = None
        for t in train_steps:
            if t["episode"] == base_ep:
                base_train = t
                break
        if base_train is not None:
            base_loss = float(base_train["metrics"]["eval_loss"])
            seal_loss = float(last_seal["seal_metrics"]["eval_loss"])
            seal_improvement = base_loss - seal_loss
        else:
            seal_improvement = 0.0
    else:
        seal_improvement = 0.0

    total_improvement = train_improvement + 0.5 * seal_improvement
    best_loss = min(float(t["metrics"]["eval_loss"]) for t in train_steps)

    reward = max(total_improvement, 0.0)
    penalty = max(-total_improvement, 0.0)

    lr = float(last_hparams["lr"])
    epochs = int(last_hparams["epochs"])
    lora_dropout = float(last_hparams["lora_dropout"])
    lora_r = int(last_hparams["lora_r"])
    lora_alpha = int(last_hparams["lora_alpha"])
    batch_size = int(last_hparams["batch_size"])
    warmup_ratio = float(last_hparams["warmup_ratio"])

    allowed_batch = [4,8,16]
    allowed_r = [8, 16, 32]
    allowed_alpha = [16, 32, 64]

    def step_discrete(value, allowed, direction):
        allowed_sorted = sorted(allowed)
        if value not in allowed_sorted:
            return allowed_sorted[0]
        idx = allowed_sorted.index(value)
        if direction > 0 and idx < len(allowed_sorted) - 1:
            return allowed_sorted[idx + 1]
        if direction < 0 and idx > 0:
            return allowed_sorted[idx - 1]
        return value

    override = {}

    if total_improvement > 0.02:
        new_lr = min(lr * 1.2, 5e-4)
        new_epochs = min(epochs, 2)
        new_dropout = max(lora_dropout * 0.9, 0.0)
        new_r = step_discrete(lora_r, allowed_r, direction=+1)
        new_alpha = step_discrete(lora_alpha, allowed_alpha, direction=+1)
        new_batch = step_discrete(batch_size, allowed_batch, direction=+1)
        new_warmup = max(min(warmup_ratio * 0.8, 0.2), 0.0)

        override["lr"] = new_lr
        override["epochs"] = new_epochs
        override["lora_dropout"] = new_dropout
        override["lora_r"] = new_r
        override["lora_alpha"] = new_alpha
        override["batch_size"] = new_batch
        override["warmup_ratio"] = new_warmup

        say = (
            f"Total improvement is positive (train={train_improvement:.4f}, seal={seal_improvement:.4f}). "
            f"Increasing lr from {lr:.6f} to {new_lr:.6f}, keeping epochs at {new_epochs}, "
            f"reducing dropout to {new_dropout:.3f}, increasing batch_size from {batch_size} to {new_batch}, "
            f"adjusting LoRA r/alpha to r={new_r}, alpha={new_alpha}, and lowering warmup_ratio "
            f"from {warmup_ratio:.3f} to {new_warmup:.3f} to exploit the current region."
        )
    elif total_improvement < -0.02:
        new_lr = max(lr * 0.5, 1e-5)
        new_epochs = min(epochs + 1, 2)
        new_dropout = min(lora_dropout + 0.05, 0.2)
        new_r = step_discrete(lora_r, allowed_r, direction=-1)
        new_alpha = step_discrete(lora_alpha, allowed_alpha, direction=-1)
        new_batch = step_discrete(batch_size, allowed_batch, direction=-1)
        new_warmup = max(min(warmup_ratio + 0.05, 0.2), 0.0)

        override["lr"] = new_lr
        override["epochs"] = new_epochs
        override["lora_dropout"] = new_dropout
        override["lora_r"] = new_r
        override["lora_alpha"] = new_alpha
        override["batch_size"] = new_batch
        override["warmup_ratio"] = new_warmup

        say = (
            f"Total improvement is negative (train={train_improvement:.4f}, seal={seal_improvement:.4f}). "
            f"Reducing lr from {lr:.6f} to {new_lr:.6f}, increasing epochs to {new_epochs}, "
            f"increasing dropout to {new_dropout:.3f}, decreasing batch_size from {batch_size} to {new_batch}, "
            f"adjusting LoRA r/alpha to r={new_r}, alpha={new_alpha}, and increasing warmup_ratio "
            f"from {warmup_ratio:.3f} to {new_warmup:.3f} to stabilize training."
        )
    else:
        new_lr = max(min(lr * 0.9, 5e-4), 1e-5)
        new_epochs = epochs
        new_dropout = lora_dropout
        new_r = lora_r
        new_alpha = lora_alpha
        new_batch = batch_size
        new_warmup = warmup_ratio

        override["lr"] = new_lr
        override["epochs"] = new_epochs
        override["lora_dropout"] = new_dropout
        override["lora_r"] = new_r
        override["lora_alpha"] = new_alpha
        override["batch_size"] = new_batch
        override["warmup_ratio"] = new_warmup

        say = (
            f"Total improvement is small/plateau (train={train_improvement:.4f}, seal={seal_improvement:.4f}). "
            f"Applying mild exploration by reducing lr from {lr:.6f} to {new_lr:.6f} and "
            f"keeping epochs={new_epochs}, batch_size={new_batch}, dropout={new_dropout:.3f}, "
            f"LoRA r={new_r}, alpha={new_alpha}, warmup_ratio={new_warmup:.3f} while observing further episodes."
        )

    return {
        "say": say,
        "reward_signal": {
            "train_improvement": train_improvement,
            "seal_improvement": seal_improvement,
            "total_improvement": total_improvement,
            "reward": reward,
            "penalty": penalty,
            "last_train_loss": last_loss,
            "best_train_loss": best_loss,
        },
        "override": override,
    }


def seal_expert_agent(history, last_train_entry, judge_feedback=None, reward_feedback=None, seal_summary=None):
    history_text = json.dumps(history, ensure_ascii=False)
    last_train_text = json.dumps(last_train_entry, ensure_ascii=False)
    judge_feedback = judge_feedback or ""
    reward_feedback = reward_feedback or ""
    seal_summary = seal_summary or ""

    system_msg = (
        "You are the SEAL Expert Agent, responsible for choosing the SEAL refinement plan "
        "and evaluating the SEAL effect based on metrics. You must return ONLY JSON."
    )

    user_msg = f"""
You are focusing on the SEAL (self-refinement) stage.

Full history:
{history_text}

Most recent TRAIN step:
{last_train_text}

Judge feedback:
{judge_feedback}

Reward-Penalty Agent feedback:
{reward_feedback}

Summary of SEAL effect so far:
{seal_summary}

Your job:
- Decide how many SEAL epochs to run in the NEXT SEAL stage (1 or 2).
- Interpret how SEAL is currently behaving (helping, hurting, or neutral).
- Provide feedback that the Tuner and Reward-Penalty agents can use.

Return ONLY valid JSON with this structure:
{{
  "say": "Short natural language explanation from the SEAL Expert Agent.",
  "seal_epochs": 1,
  "evaluation": "Short explanation of whether SEAL is helping or hurting and what to adjust."
}}
"""

    return _call_agent(system_msg, user_msg)


print("All agents (tuner, judge, reward-penalty, SEAL expert) are defined.")


All agents (tuner, judge, reward-penalty, SEAL expert) are defined.


In [15]:
import random

def run_seal_stage(history, seal_epochs=1):
    train_steps = [h for h in history if h.get("kind") == "train" and h.get("metrics")]
    if not train_steps:
        print("[SEAL-Tool] No TRAIN steps found in history. Skipping SEAL.")
        return None

    best_train = min(
        train_steps,
        key=lambda x: float(x["metrics"].get("eval_loss", float("inf")))
    )

    base_episode = best_train["episode"]
    base_hparams = best_train["hparams"]
    base_loss = float(best_train["metrics"]["eval_loss"])
    base_ppl = float(best_train["metrics"]["perplexity"])

    print(f"[SEAL-Tool] Using best TRAIN episode {base_episode} with eval_loss={base_loss:.4f}")

    sample_indices = random.sample(range(len(train_split)), k=min(2, len(train_split)))
    print("[SEAL-Tool] Sample training examples used in SEAL stage:")
    for idx in sample_indices:
        ex = train_split[idx]
        formatted = format_example(ex)
        preview = formatted[:200].replace("\n", " ")
        print(f"  - example index {idx}: {preview}...")

    seal_hparams = dict(base_hparams)
    seal_hparams["epochs"] = seal_epochs
    seal_hparams["trial_id"] = f"seal_from_ep_{base_episode}"

    print(f"[SEAL-Tool] Starting SEAL training for {seal_epochs} epoch(s) on the SAME dataset.")
    print("[SEAL-Tool] SEAL hyperparameters:")
    print(seal_hparams)

    seal_metrics = train_and_eval(seal_hparams)

    seal_loss = float(seal_metrics.get("eval_loss", float("nan")))
    seal_ppl = float(seal_metrics.get("perplexity", float("nan")))

    print(f"[SEAL-Tool] SEAL eval_loss: {seal_loss}")
    print(f"[SEAL-Tool] SEAL perplexity: {seal_ppl}")

    seal_info = {
        "base_episode": base_episode,
        "base_loss": base_loss,
        "base_perplexity": base_ppl,
        "seal_epochs": seal_epochs,
        "seal_metrics": seal_metrics,
    }

    return seal_info

print("SEAL tool is defined (run_seal_stage).")


SEAL tool is defined (run_seal_stage).


In [16]:
import numpy as np
from transformers import TrainingArguments, Trainer, default_data_collator

def train_and_eval(hparams):
    """
    hparams expected keys:
      - lr
      - batch_size
      - epochs
      - warmup_ratio
      - lora_r
      - lora_alpha
      - lora_dropout
      - trial_id
    """

    print(f"[train_and_eval] Starting trial: {hparams['trial_id']}")
    print("[train_and_eval] Hyperparameters:", hparams)

    base_model = get_base_model()
    model = make_lora_model(
        base_model,
        lora_r=hparams["lora_r"],
        lora_alpha=hparams["lora_alpha"],
        lora_dropout=hparams["lora_dropout"],
    )

    training_args = TrainingArguments(
        output_dir=f"./outputs/{hparams['trial_id']}",
        per_device_train_batch_size=hparams["batch_size"],
        per_device_eval_batch_size=hparams["batch_size"],
        num_train_epochs=hparams["epochs"],
        learning_rate=hparams["lr"],
        warmup_ratio=hparams["warmup_ratio"],
        logging_steps=50,
        save_strategy="no",
        bf16=torch.cuda.is_available(),
        gradient_accumulation_steps=1,
        remove_unused_columns=False,
        report_to=[],
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
    )

    train_result = trainer.train()
    eval_metrics = trainer.evaluate()

    eval_loss = float(eval_metrics.get("eval_loss", np.nan))
    perplexity = float(np.exp(eval_loss)) if eval_loss and eval_loss < 20 else float("inf")
    eval_metrics["perplexity"] = perplexity

    print(f"[train_and_eval] Finished trial: {hparams['trial_id']}")
    print("[train_and_eval] eval_loss:", eval_loss)
    print("[train_and_eval] perplexity:", perplexity)

    return eval_metrics


In [17]:
import json

max_episodes = 4
force_min_episodes = 3
history = []

for episode in range(max_episodes):
    print("\n" + "=" * 20 + f" EPISODE {episode} " + "=" * 20)

    # 1) Judge Agent
    if history:
        judge_res = judge_agent(history)
        judge_say = judge_res.get("say", "")
        judge_continue = bool(judge_res.get("continue", True))
        judge_reason = judge_res.get("reason", "")
        judge_feedback = judge_res.get("feedback", "")
        print(f"[Judge] {judge_say}")
        print(f"[Judge decision (raw)] continue = {judge_continue}")
        print(f"[Judge reason] {judge_reason}")
    else:
        judge_res = {
            "say": "No history yet; allowing first training + SEAL round.",
            "continue": True,
            "reason": "First episode, we need initial data.",
            "feedback": "Start with a stable configuration and observe metrics."
        }
        judge_continue = True
        judge_feedback = judge_res["feedback"]
        print("[Judge] No history yet; allowing first training + SEAL round.")

    if episode < force_min_episodes and not judge_continue:
        print("[Orchestrator] Overriding Judge: enforcing minimum of "
              f"{force_min_episodes} episodes before stopping.")
        judge_continue = True

    if not judge_continue:
        print("[Orchestrator] Judge decided to stop. Ending orchestration.")
        break

    # 2) Reward-Penalty Agent
    rp_res = reward_penalty_agent(history)
    rp_say = rp_res.get("say", "")
    rp_signal = rp_res.get("reward_signal", {})
    rp_override = rp_res.get("override", {})

    print(f"[Reward-Penalty] {rp_say}")
    print(f"[Reward-Penalty signal] {json.dumps(rp_signal, indent=2)}")

    # 3) SEAL effect summary so far
    seal_summary_text = seal_effect_summary(history)
    print(f"[SEAL-Eval] {seal_summary_text}")

    # 4) Tuner / Guidance Agent (main expert)
    tuner_res = tuner_agent(
        history,
        judge_feedback=judge_feedback,
        reward_feedback=rp_say + " | signal=" + json.dumps(rp_signal),
        seal_summary=seal_summary_text,
    )

    tuner_say = tuner_res.get("say", "")
    tuner_hparams = tuner_res.get("hparams", {})
    tuner_seal_plan = tuner_res.get("seal_plan", {"seal_epochs": 1})

    print(f"[Tuner] {tuner_say}")
    print("[Tuner proposal] TRAIN hparams from Tuner Agent:")
    print(tuner_hparams)
    print("[Tuner proposal] SEAL plan from Tuner Agent:")
    print(tuner_seal_plan)

    # 5) Merge Tuner hparams with Reward-Penalty overrides
    merged_hparams = dict(tuner_hparams)
    for k, v in rp_override.items():
        merged_hparams[k] = v

    default_hparams = {
        "lr": 1e-4,
        "batch_size": 4,
        "epochs": 1,
        "warmup_ratio": 0.1,
        "lora_r": 16,
        "lora_alpha": 32,
        "lora_dropout": 0.1,
    }
    for k, v in default_hparams.items():
        if k not in merged_hparams or merged_hparams[k] is None:
            merged_hparams[k] = v

    merged_hparams["trial_id"] = f"train_ep_{episode}"

    print("[Brainstorm] Final agreed TRAIN hyperparameters after applying Reward-Penalty overrides:")
    print(merged_hparams)

    # 6) TRAIN stage
    print("[Orchestrator] Running TRAIN with hyperparameters:")
    print(merged_hparams)

    train_metrics = train_and_eval(merged_hparams)
    train_loss = float(train_metrics.get("eval_loss", float("nan")))
    train_ppl = float(train_metrics.get("perplexity", float("nan")))

    print(f"[TRAIN] eval_loss: {train_loss}")
    print(f"[TRAIN] perplexity: {train_ppl}")

    train_entry = {
        "episode": episode,
        "kind": "train",
        "hparams": merged_hparams,
        "metrics": train_metrics,
    }
    history.append(train_entry)

    # 7) SEAL Expert Agent
    seal_summary_after_train = seal_effect_summary(history)
    seal_expert_res = seal_expert_agent(
        history,
        last_train_entry=train_entry,
        judge_feedback=judge_feedback,
        reward_feedback=rp_say,
        seal_summary=seal_summary_after_train,
    )

    seal_expert_say = seal_expert_res.get("say", "")
    seal_epochs = int(seal_expert_res.get("seal_epochs", 1))
    seal_eval_comment = seal_expert_res.get("evaluation", "")

    print(f"[SEAL Expert] {seal_expert_say}")
    print(f"[SEAL Expert decision] seal_epochs = {seal_epochs}")
    print(f"[SEAL Expert evaluation] {seal_eval_comment}")

    # 8) SEAL stage
    print(f"[Orchestrator] Running SEAL stage for {seal_epochs} epoch(s)...")
    seal_info = run_seal_stage(history, seal_epochs=seal_epochs)

    seal_entry = {
        "episode": episode,
        "kind": "seal",
        "seal_plan": {"seal_epochs": seal_epochs},
        "seal_info": seal_info,
    }
    history.append(seal_entry)

    # 9) Episode summary
    if seal_info is not None:
        seal_loss = float(seal_info["seal_metrics"].get("eval_loss", float("nan")))
        seal_ppl = float(seal_info["seal_metrics"].get("perplexity", float("nan")))
    else:
        seal_loss = float("nan")
        seal_ppl = float("nan")

    print("[Brainstorm] Episode summary:")
    print(f"  - TRAIN eval_loss: {train_loss:.4f} perplexity: {train_ppl:.4f}")
    print(f"  - SEAL  eval_loss: {seal_loss:.4f} perplexity: {seal_ppl:.4f}")

print("\n================ FINAL HISTORY ================")
print(json.dumps(history, indent=2, ensure_ascii=False))



[Judge] No history yet; allowing first training + SEAL round.
[Reward-Penalty] No training history yet; neutral reward and no overrides.
[Reward-Penalty signal] {
  "train_improvement": 0.0,
  "seal_improvement": 0.0,
  "total_improvement": 0.0,
  "last_train_loss": null,
  "best_train_loss": null
}
[SEAL-Eval] No SEAL runs yet; no refinement effect to analyze.
[Tuner] Training has not started yet, so we'll begin with a stable configuration to establish a performance baseline.
[Tuner proposal] TRAIN hparams from Tuner Agent:
{'lr': 0.0003, 'batch_size': 8, 'epochs': 1, 'warmup_ratio': 0.1, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.1}
[Tuner proposal] SEAL plan from Tuner Agent:
{'seal_epochs': 1}
[Brainstorm] Final agreed TRAIN hyperparameters after applying Reward-Penalty overrides:
{'lr': 0.0003, 'batch_size': 8, 'epochs': 1, 'warmup_ratio': 0.1, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.1, 'trial_id': 'train_ep_0'}
[Orchestrator] Running TRAIN with hyperparameters:


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[get_base_model] Using flash_attention_2.


  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss
50,2.5288
100,0.3939
150,0.4231
200,0.4289
250,0.3871
300,0.3928
350,0.4216
400,0.4113
450,0.3885
500,0.4284


[train_and_eval] Finished trial: train_ep_0
[train_and_eval] eval_loss: 0.406671941280365
[train_and_eval] perplexity: 1.5018113425325845
[TRAIN] eval_loss: 0.406671941280365
[TRAIN] perplexity: 1.5018113425325845
[SEAL Expert] We are starting with a stable configuration and observing initial metrics. The perplexity is reasonable, indicating a good starting point.
[SEAL Expert decision] seal_epochs = 1
[SEAL Expert evaluation] SEAL has not been applied yet, so there is no effect to evaluate. Begin with 1 SEAL epoch to gather initial refinement data and observe any changes in metrics.
[Orchestrator] Running SEAL stage for 1 epoch(s)...
[SEAL-Tool] Using best TRAIN episode 0 with eval_loss=0.4067
[SEAL-Tool] Sample training examples used in SEAL stage:
  - example index 912: Instruction: Identify the main idea of the text. Input: Global warming is the term used to describe a gradual increase in the average temperature of the Earth's atmosphere and its oceans, a change tha...
  - example 

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[get_base_model] Using flash_attention_2.


  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss
50,2.5003
100,0.3937
150,0.4229
200,0.4288
250,0.3869
300,0.3926
350,0.4219
400,0.4113
450,0.3883
500,0.4285


[train_and_eval] Finished trial: seal_from_ep_0
[train_and_eval] eval_loss: 0.4066004157066345
[train_and_eval] perplexity: 1.5017039284561475
[SEAL-Tool] SEAL eval_loss: 0.4066004157066345
[SEAL-Tool] SEAL perplexity: 1.5017039284561475
[Brainstorm] Episode summary:
  - TRAIN eval_loss: 0.4067 perplexity: 1.5018
  - SEAL  eval_loss: 0.4066 perplexity: 1.5017

[Judge] The initial results are promising with a slight improvement after the SEAL step.
[Judge decision (raw)] continue = True
[Judge reason] The eval_loss and perplexity have shown slight improvement after the SEAL step, and we have not yet reached the minimum of 3 episodes.
[Reward-Penalty] Total improvement is small/plateau (train=0.0000, seal=0.0001). Applying mild exploration by reducing lr from 0.000300 to 0.000270 and keeping epochs=1, batch_size=8, dropout=0.100, LoRA r=16, alpha=32, warmup_ratio=0.100 while observing further episodes.
[Reward-Penalty signal] {
  "train_improvement": 0.0,
  "seal_improvement": 7.15255737

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[get_base_model] Using flash_attention_2.


  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss
50,2.6922
100,0.3936
150,0.4225
200,0.428
250,0.3861
300,0.3917
350,0.421
400,0.4106
450,0.3878
500,0.428


[train_and_eval] Finished trial: train_ep_1
[train_and_eval] eval_loss: 0.4064937233924866
[train_and_eval] perplexity: 1.501543716735687
[TRAIN] eval_loss: 0.4064937233924866
[TRAIN] perplexity: 1.501543716735687
[SEAL Expert] The SEAL process has shown a minor improvement in both eval_loss and perplexity, indicating a slight positive impact. However, the improvement is minimal, suggesting a potential plateau.
[SEAL Expert decision] seal_epochs = 1
[SEAL Expert evaluation] SEAL is currently helping, but the effect is marginal. Continue with 1 SEAL epoch to gather more data and confirm the trend. Monitor for any signs of plateauing or degradation.
[Orchestrator] Running SEAL stage for 1 epoch(s)...
[SEAL-Tool] Using best TRAIN episode 1 with eval_loss=0.4065
[SEAL-Tool] Sample training examples used in SEAL stage:
  - example index 912: Instruction: Identify the main idea of the text. Input: Global warming is the term used to describe a gradual increase in the average temperature of th

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[get_base_model] Using flash_attention_2.


  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss
50,2.6922
100,0.3936
150,0.4225
200,0.428
250,0.3861


KeyboardInterrupt: 

In [18]:
import json
import numpy as np

max_episodes = 4
force_min_episodes = 3
history = []

# ==================== New: QA Generator Agent ====================

def qa_generation_agent(history, num_pairs=3):
    history_text = json.dumps(history, ensure_ascii=False)

    system_msg = (
        "You are the internal QA-Generator Agent for a fine-tuned LLaMA 3.1 model. "
        "You simulate the model's behavior by proposing realistic user questions and "
        "the model's expected answers, based on the current training and SEAL history. "
        "Return ONLY JSON."
    )

    user_msg = f"""
You receive the full TRAIN + SEAL history (with eval_loss, perplexity, SEAL info, etc.):

{history_text}

Your job:
- Briefly comment on what kind of behavior the model has likely learned so far.
- Generate a small set of test interactions (questions and answers) that the model might produce.
- The questions should cover a mix of reasoning, explanation, and simple tasks.
- The answers should be what you expect the model to answer at this stage of training.

Return ONLY valid JSON with this structure:
{{
  "say": "Short natural language explanation of what the model has learned so far.",
  "pairs": [
    {{
      "question": "User question 1 ...",
      "answer": "Model answer 1 ..."
    }},
    {{
      "question": "User question 2 ...",
      "answer": "Model answer 2 ..."
    }}
  ]
}}
The 'pairs' list should contain {num_pairs} elements.
"""

    return _call_agent(system_msg, user_msg, model_name="gpt-4o-mini", temperature=0.7)


# ==================== New: QA Evaluation Agent ====================

def qa_evaluation_agent(history, qa_data):
    history_text = json.dumps(history, ensure_ascii=False)
    qa_text = json.dumps(qa_data, ensure_ascii=False)

    system_msg = (
        "You are the QA-Evaluator Agent. "
        "You evaluate the quality of the model's behavior as reflected in the generated Q&A pairs. "
        "You must return ONLY JSON with a numeric score and actionable feedback."
    )

    user_msg = f"""
You receive:

1) Full TRAIN + SEAL history with metrics:
{history_text}

2) The QA-Generator Agent's synthetic evaluation Q&A:
{qa_text}

Your tasks:
- Judge how good the model behavior looks based on the Q&A pairs:
  - Are answers correct, coherent, and safe?
  - Is the style aligned with an instruction-following LLaMA 3.1 model?
- Produce a numeric score between 0 and 10 (float allowed).
- Provide short, actionable feedback to guide hyperparameter tuning or SEAL strategy.

Return ONLY valid JSON with this structure:
{{
  "say": "Short natural language explanation of your judgment.",
  "score": 7.5,
  "feedback": "Short actionable feedback for improving training / SEAL / hyperparameters."
}}
"""

    return _call_agent(system_msg, user_msg, model_name="gpt-4o-mini", temperature=0.4)


print("Orchestrator with QA-Generator and QA-Evaluator agents is ready.")


# ==================== Agentic Training + SEAL + QA Loop ====================

for episode in range(max_episodes):
    print("\n" + "=" * 20 + f" EPISODE {episode} " + "=" * 20)

    # 1) Judge Agent
    if history:
        judge_res = judge_agent(history)
        judge_say = judge_res.get("say", "")
        judge_continue = bool(judge_res.get("continue", True))
        judge_reason = judge_res.get("reason", "")
        judge_feedback = judge_res.get("feedback", "")
        print(f"[Judge] {judge_say}")
        print(f"[Judge decision (raw)] continue = {judge_continue}")
        print(f"[Judge reason] {judge_reason}")
    else:
        judge_res = {
            "say": "No history yet; allowing first training + SEAL round.",
            "continue": True,
            "reason": "First episode, we need initial data.",
            "feedback": "Start with a stable configuration and observe metrics."
        }
        judge_continue = True
        judge_feedback = judge_res["feedback"]
        print("[Judge] No history yet; allowing first training + SEAL round.")

    if episode < force_min_episodes and not judge_continue:
        print("[Orchestrator] Overriding Judge: enforcing minimum of "
              f"{force_min_episodes} episodes before stopping.")
        judge_continue = True

    if not judge_continue:
        print("[Orchestrator] Judge decided to stop. Ending orchestration.")
        break

    # 2) Reward-Penalty Agent
    rp_res = reward_penalty_agent(history)
    rp_say = rp_res.get("say", "")
    rp_signal = rp_res.get("reward_signal", {})
    rp_override = rp_res.get("override", {})

    print(f"[Reward-Penalty] {rp_say}")
    print(f"[Reward-Penalty signal] {json.dumps(rp_signal, indent=2)}")

    # 3) SEAL effect summary so far
    seal_summary_text = seal_effect_summary(history)
    print(f"[SEAL-Eval] {seal_summary_text}")

    # 4) Tuner / Guidance Agent (main expert)
    tuner_res = tuner_agent(
        history,
        judge_feedback=judge_feedback,
        reward_feedback=rp_say + " | signal=" + json.dumps(rp_signal),
        seal_summary=seal_summary_text,
    )

    tuner_say = tuner_res.get("say", "")
    tuner_hparams = tuner_res.get("hparams", {})
    tuner_seal_plan = tuner_res.get("seal_plan", {"seal_epochs": 1})

    print(f"[Tuner] {tuner_say}")
    print("[Tuner proposal] TRAIN hparams from Tuner Agent:")
    print(tuner_hparams)
    print("[Tuner proposal] SEAL plan from Tuner Agent:")
    print(tuner_seal_plan)

    # 5) Merge Tuner hparams with Reward-Penalty overrides
    merged_hparams = dict(tuner_hparams)
    for k, v in rp_override.items():
        merged_hparams[k] = v

    default_hparams = {
        "lr": 1e-4,
        "batch_size": 8,   # you already pushed this up
        "epochs": 1,
        "warmup_ratio": 0.1,
        "lora_r": 16,
        "lora_alpha": 32,
        "lora_dropout": 0.1,
    }
    for k, v in default_hparams.items():
        if k not in merged_hparams or merged_hparams[k] is None:
            merged_hparams[k] = v

    merged_hparams["trial_id"] = f"train_ep_{episode}"

    print("[Brainstorm] Final agreed TRAIN hyperparameters after applying Reward-Penalty overrides:")
    print(merged_hparams)

    # 6) TRAIN stage
    print("[Orchestrator] Running TRAIN with hyperparameters:")
    print(merged_hparams)

    train_metrics = train_and_eval(merged_hparams)
    train_loss = float(train_metrics.get("eval_loss", float("nan")))
    train_ppl = float(train_metrics.get("perplexity", float("nan")))

    print(f"[TRAIN] eval_loss: {train_loss}")
    print(f"[TRAIN] perplexity: {train_ppl}")

    train_entry = {
        "episode": episode,
        "kind": "train",
        "hparams": merged_hparams,
        "metrics": train_metrics,
    }
    history.append(train_entry)

    # 7) SEAL Expert Agent
    seal_summary_after_train = seal_effect_summary(history)
    seal_expert_res = seal_expert_agent(
        history,
        last_train_entry=train_entry,
        judge_feedback=judge_feedback,
        reward_feedback=rp_say,
        seal_summary=seal_summary_after_train,
    )

    seal_expert_say = seal_expert_res.get("say", "")
    seal_epochs = int(seal_expert_res.get("seal_epochs", 1))
    seal_eval_comment = seal_expert_res.get("evaluation", "")

    print(f"[SEAL Expert] {seal_expert_say}")
    print(f"[SEAL Expert decision] seal_epochs = {seal_epochs}")
    print(f"[SEAL Expert evaluation] {seal_eval_comment}")

    # 8) SEAL stage
    print(f"[Orchestrator] Running SEAL stage for {seal_epochs} epoch(s)...")
    seal_info = run_seal_stage(history, seal_epochs=seal_epochs)

    seal_entry = {
        "episode": episode,
        "kind": "seal",
        "seal_plan": {"seal_epochs": seal_epochs},
        "seal_info": seal_info,
    }
    history.append(seal_entry)

    if seal_info is not None:
        seal_loss = float(seal_info["seal_metrics"].get("eval_loss", float("nan")))
        seal_ppl = float(seal_info["seal_metrics"].get("perplexity", float("nan")))
    else:
        seal_loss = float("nan")
        seal_ppl = float("nan")

    print("[Brainstorm] Episode summary:")
    print(f"  - TRAIN eval_loss: {train_loss:.4f} perplexity: {train_ppl:.4f}")
    print(f"  - SEAL  eval_loss: {seal_loss:.4f} perplexity: {seal_ppl:.4f}")

    # 9) New: QA Generation + QA Evaluation Agents (AI-in-the-loop instead of manual inspection)
    print("[Orchestrator] Invoking QA-Generator Agent to simulate internal Q&A...")
    qa_gen = qa_generation_agent(history, num_pairs=3)
    qa_say = qa_gen.get("say", "")
    qa_pairs = qa_gen.get("pairs", []) or []

    print(f"[QA-Generator] {qa_say}")
    for i, pair in enumerate(qa_pairs):
        q = pair.get("question", "")
        a = pair.get("answer", "")
        print(f"[QA-Example {i+1}] Q: {q}")
        print(f"[QA-Example {i+1}] A: {a}")

    print("[Orchestrator] Invoking QA-Evaluator Agent to score the model behavior from these Q&A pairs...")
    qa_eval = qa_evaluation_agent(history, qa_gen)
    qa_eval_say = qa_eval.get("say", "")
    qa_score = qa_eval.get("score", None)
    qa_feedback = qa_eval.get("feedback", "")

    print(f"[QA-Evaluator] {qa_eval_say}")
    print(f"[QA-Evaluator score] {qa_score}")
    print(f"[QA-Evaluator feedback] {qa_feedback}")

    qa_entry = {
        "episode": episode,
        "kind": "qa_eval",
        "qa_pairs": qa_pairs,
        "qa_score": qa_score,
        "qa_feedback": qa_feedback,
    }
    history.append(qa_entry)

print("\n================ FINAL HISTORY ================")
print(json.dumps(history, indent=2, ensure_ascii=False))


Orchestrator with QA-Generator and QA-Evaluator agents is ready.

[Judge] No history yet; allowing first training + SEAL round.
[Reward-Penalty] No training history yet; neutral reward and no overrides.
[Reward-Penalty signal] {
  "train_improvement": 0.0,
  "seal_improvement": 0.0,
  "total_improvement": 0.0,
  "last_train_loss": null,
  "best_train_loss": null
}
[SEAL-Eval] No SEAL runs yet; no refinement effect to analyze.
[Tuner] Starting with an initial stable configuration for training as there is no prior history to guide adjustments.
[Tuner proposal] TRAIN hparams from Tuner Agent:
{'lr': 0.0001, 'batch_size': 8, 'epochs': 1, 'warmup_ratio': 0.1, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.1}
[Tuner proposal] SEAL plan from Tuner Agent:
{'seal_epochs': 1}
[Brainstorm] Final agreed TRAIN hyperparameters after applying Reward-Penalty overrides:
{'lr': 0.0001, 'batch_size': 8, 'epochs': 1, 'warmup_ratio': 0.1, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.1, 'trial_id': 

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[get_base_model] Using flash_attention_2.


  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss
50,3.7191
100,0.396
150,0.423
200,0.4274
250,0.386
300,0.3915
350,0.4209
400,0.4101
450,0.3885
500,0.4292


[train_and_eval] Finished trial: train_ep_0
[train_and_eval] eval_loss: 0.4078734815120697
[train_and_eval] perplexity: 1.5036169137971005
[TRAIN] eval_loss: 0.4078734815120697
[TRAIN] perplexity: 1.5036169137971005
[SEAL Expert] We are beginning the SEAL refinement process with no prior SEAL history. Starting with a single SEAL epoch will allow us to observe initial effects without overcommitting.
[SEAL Expert decision] seal_epochs = 1
[SEAL Expert evaluation] SEAL has not been applied yet, so its impact is currently neutral. We should start with one SEAL epoch to gather baseline data and adjust based on observed changes in metrics.
[Orchestrator] Running SEAL stage for 1 epoch(s)...
[SEAL-Tool] Using best TRAIN episode 0 with eval_loss=0.4079
[SEAL-Tool] Sample training examples used in SEAL stage:
  - example index 912: Instruction: Identify the main idea of the text. Input: Global warming is the term used to describe a gradual increase in the average temperature of the Earth's atmo

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[get_base_model] Using flash_attention_2.


  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss
50,3.7423
100,0.3957
150,0.4228
200,0.4272
250,0.3859
300,0.3913
350,0.4207
400,0.41
450,0.3884
500,0.4291


[train_and_eval] Finished trial: seal_from_ep_0
[train_and_eval] eval_loss: 0.40779900550842285
[train_and_eval] perplexity: 1.503504934588279
[SEAL-Tool] SEAL eval_loss: 0.40779900550842285
[SEAL-Tool] SEAL perplexity: 1.503504934588279
[Brainstorm] Episode summary:
  - TRAIN eval_loss: 0.4079 perplexity: 1.5036
  - SEAL  eval_loss: 0.4078 perplexity: 1.5035
[Orchestrator] Invoking QA-Generator Agent to simulate internal Q&A...
[QA-Generator] The model has likely learned to generate coherent and contextually relevant responses with a focus on clarity and accuracy, as indicated by the low perplexity and eval loss metrics.
[QA-Example 1] Q: Can you explain how photosynthesis works?
[QA-Example 1] A: Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy, usually from the sun, into chemical energy stored in glucose. It occurs mainly in the chloroplasts of plant cells, where chlorophyll captures light energy to convert carbon dioxide and water i

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[get_base_model] Using flash_attention_2.


  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss
50,3.8419
100,0.3965
150,0.423
200,0.4275
250,0.3862
300,0.3916
350,0.421
400,0.4103
450,0.3886
500,0.4294


[train_and_eval] Finished trial: train_ep_1
[train_and_eval] eval_loss: 0.40814870595932007
[train_and_eval] perplexity: 1.5040308028846618
[TRAIN] eval_loss: 0.40814870595932007
[TRAIN] perplexity: 1.5040308028846618
[SEAL Expert] The SEAL process is showing a very slight improvement in both eval_loss and perplexity, indicating a neutral to slightly positive effect.
[SEAL Expert decision] seal_epochs = 1
[SEAL Expert evaluation] SEAL is currently providing a marginal improvement, suggesting it is slightly helping. Given the plateau in improvement, maintaining the current SEAL configuration with 1 epoch is advisable to observe further trends without overfitting. Consider small adjustments in learning rate or LoRA parameters in future iterations.
[Orchestrator] Running SEAL stage for 1 epoch(s)...
[SEAL-Tool] Using best TRAIN episode 0 with eval_loss=0.4079
[SEAL-Tool] Sample training examples used in SEAL stage:
  - example index 912: Instruction: Identify the main idea of the text. In

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[get_base_model] Using flash_attention_2.


  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss
50,3.7423
100,0.3957
150,0.4228
200,0.4272
250,0.3859
300,0.3913
350,0.4207
400,0.41
450,0.3884
500,0.4291


[train_and_eval] Finished trial: seal_from_ep_0
[train_and_eval] eval_loss: 0.40779900550842285
[train_and_eval] perplexity: 1.503504934588279
[SEAL-Tool] SEAL eval_loss: 0.40779900550842285
[SEAL-Tool] SEAL perplexity: 1.503504934588279
[Brainstorm] Episode summary:
  - TRAIN eval_loss: 0.4081 perplexity: 1.5040
  - SEAL  eval_loss: 0.4078 perplexity: 1.5035
[Orchestrator] Invoking QA-Generator Agent to simulate internal Q&A...
[QA-Generator] The model has likely learned to provide accurate factual information and perform basic reasoning tasks, demonstrating an understanding of fundamental concepts in science and mathematics. However, it may still benefit from further training to enhance the depth and variety of its responses.
[QA-Example 1] Q: What is the process of evaporation?
[QA-Example 1] A: Evaporation is the process by which liquid water is transformed into water vapor, a gas. This occurs when molecules in the liquid gain enough energy to overcome intermolecular forces and esc

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[get_base_model] Using flash_attention_2.


  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss
50,3.9517
100,0.3974
150,0.4233
200,0.4279
250,0.3865
300,0.3919
350,0.4214
400,0.4106
450,0.3889
500,0.4297


[train_and_eval] Finished trial: train_ep_2
[train_and_eval] eval_loss: 0.40844765305519104
[train_and_eval] perplexity: 1.504480495739122
[TRAIN] eval_loss: 0.40844765305519104
[TRAIN] perplexity: 1.504480495739122
[SEAL Expert] The SEAL process is showing slight improvements in eval_loss and perplexity, indicating a positive but minimal impact.
[SEAL Expert decision] seal_epochs = 1
[SEAL Expert evaluation] SEAL is helping slightly by improving metrics, but the effect is minimal. Continue with 1 SEAL epoch to monitor for consistent improvements, while considering further adjustments to learning rate or training epochs if improvements plateau.
[Orchestrator] Running SEAL stage for 1 epoch(s)...
[SEAL-Tool] Using best TRAIN episode 0 with eval_loss=0.4079
[SEAL-Tool] Sample training examples used in SEAL stage:
  - example index 912: Instruction: Identify the main idea of the text. Input: Global warming is the term used to describe a gradual increase in the average temperature of the E

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[get_base_model] Using flash_attention_2.


  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss
50,3.7423
100,0.3957
150,0.4228
200,0.4272
250,0.3859
300,0.3913
350,0.4207
400,0.41
450,0.3884
500,0.4291


[train_and_eval] Finished trial: seal_from_ep_0
[train_and_eval] eval_loss: 0.40779900550842285
[train_and_eval] perplexity: 1.503504934588279
[SEAL-Tool] SEAL eval_loss: 0.40779900550842285
[SEAL-Tool] SEAL perplexity: 1.503504934588279
[Brainstorm] Episode summary:
  - TRAIN eval_loss: 0.4084 perplexity: 1.5045
  - SEAL  eval_loss: 0.4078 perplexity: 1.5035
[Orchestrator] Invoking QA-Generator Agent to simulate internal Q&A...
[QA-Generator] The model has learned to provide clear and concise answers to factual questions, along with basic explanations and problem-solving for simple mathematical tasks. It exhibits a reasonable understanding of various topics but may still benefit from additional training to enhance complexity and depth in its responses.
[QA-Example 1] Q: What is the process of photosynthesis?
[QA-Example 1] A: Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose. It mainly occurs in th

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[get_base_model] Using flash_attention_2.


  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss
50,4.1173
100,0.3983
150,0.4238
200,0.4283
250,0.3869
300,0.3923
350,0.4218
400,0.411
450,0.3891
500,0.4301


[train_and_eval] Finished trial: train_ep_3
[train_and_eval] eval_loss: 0.4088570773601532
[train_and_eval] perplexity: 1.505096592734355
[TRAIN] eval_loss: 0.4088570773601532
[TRAIN] perplexity: 1.505096592734355
[SEAL Expert] SEAL is currently providing a slight improvement in eval_loss and perplexity, indicating it is helping, albeit modestly. To explore further improvements, we should continue with 1 SEAL epoch.
[SEAL Expert decision] seal_epochs = 1
[SEAL Expert evaluation] SEAL is helping, but the improvement is minimal. Maintain the current SEAL plan with 1 epoch while focusing on adjusting learning rates and potentially increasing training epochs to enhance model performance.
[Orchestrator] Running SEAL stage for 1 epoch(s)...
[SEAL-Tool] Using best TRAIN episode 0 with eval_loss=0.4079
[SEAL-Tool] Sample training examples used in SEAL stage:
  - example index 912: Instruction: Identify the main idea of the text. Input: Global warming is the term used to describe a gradual incr

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[get_base_model] Using flash_attention_2.


  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss
50,3.7423
100,0.3957
150,0.4228
200,0.4272
250,0.3859
300,0.3913
350,0.4207
400,0.41
450,0.3884
500,0.4291


[train_and_eval] Finished trial: seal_from_ep_0
[train_and_eval] eval_loss: 0.40779900550842285
[train_and_eval] perplexity: 1.503504934588279
[SEAL-Tool] SEAL eval_loss: 0.40779900550842285
[SEAL-Tool] SEAL perplexity: 1.503504934588279
[Brainstorm] Episode summary:
  - TRAIN eval_loss: 0.4089 perplexity: 1.5051
  - SEAL  eval_loss: 0.4078 perplexity: 1.5035
[Orchestrator] Invoking QA-Generator Agent to simulate internal Q&A...
[QA-Generator] The model has learned to provide accurate answers to factual questions, perform simple mathematical tasks, and explain basic scientific concepts. However, it may still lack some depth and variety in its responses, suggesting that further training could enhance its performance.
[QA-Example 1] Q: What is the boiling point of water?
[QA-Example 1] A: The boiling point of water is 100 degrees Celsius (212 degrees Fahrenheit) at standard atmospheric pressure.
[QA-Example 2] Q: Can you explain the law of supply and demand?
[QA-Example 2] A: The law of 