### Data Preparation - Raw conversation -> Training/evaluation format data

In [None]:
!unzip '/content/v3.zip'

Archive:  /content/v3.zip
   creating: v3/
  inflating: __MACOSX/._v3           
  inflating: v3/20251124_152107__os_to_004__persona_008.json  
  inflating: __MACOSX/v3/._20251124_152107__os_to_004__persona_008.json  
  inflating: v3/20251124_124851__ca_sm_001__persona_002.json  
  inflating: __MACOSX/v3/._20251124_124851__ca_sm_001__persona_002.json  
  inflating: v3/20251124_144854__os_co_006__persona_053.json  
  inflating: __MACOSX/v3/._20251124_144854__os_co_006__persona_053.json  
  inflating: v3/20251124_151057__os_ro_006__persona_024.json  
  inflating: __MACOSX/v3/._20251124_151057__os_ro_006__persona_024.json  
  inflating: v3/20251124_145517__os_co_010__persona_049.json  
  inflating: __MACOSX/v3/._20251124_145517__os_co_010__persona_049.json  
  inflating: v3/20251124_144650__os_co_003__persona_050.json  
  inflating: __MACOSX/v3/._20251124_144650__os_co_003__persona_050.json  
  inflating: v3/20251124_145657__os_ro_002__persona_032.json  
  inflating: __MACOSX/v3/._2025112

In [None]:
import json
import os
from glob import glob

# ------------------------------------------------------------
# 1. LOAD CONVERSATION FILES
# ------------------------------------------------------------

CONV_DIR = "/content/v3/*.json"   # <-- change path as needed
OUTPUT_PATH = "test_dataset.jsonl"

SYSTEM_INSTRUCTION = """
You are a helpful multi-turn dialogue assistant capable of leveraging tool calls to solve user tasks and provide structured chat responses.

**Steps for Each Turn**
1. Think
2. Decide on tool usage
3. Respond

**Output Format**
<think> ... </think>
<tool_call>
{"name": "...", "parameters": {...}}
</tool_call>
<response> ... </response>

**Important Notes**
- Always include <think>.
- Use <tool_call> OR <response> OR both.
- Maintain full history context.
""".strip()


# ------------------------------------------------------------
# 2. Extract ALL assistant steps cleanly (tool calls + say)
# ------------------------------------------------------------

def extract_assistant_output(msg):
    """
    Build the training target ("output") for one assistant message.

    Every step's ``output_raw`` is taken in order, whitespace-trimmed, and
    steps whose text is missing/None/blank are dropped. The surviving
    pieces are joined with a blank line between them.

    Returns an empty string when the message has no usable step output.
    """
    trimmed = (
        (step.get("output_raw") or "").strip()
        for step in msg.get("steps", [])
    )
    return "\n\n".join(text for text in trimmed if text).strip()


# ------------------------------------------------------------
# 3. Build history up to this user turn
# ------------------------------------------------------------

def build_dialogue_history(messages, upto_turn):
    """
    Render the dialogue context that precedes the assistant reply of
    turn `upto_turn`.

    Walks `messages` in the given order and stops at the first message
    whose ``turn_id`` is greater than `upto_turn`. Emitted blocks:

    - user messages          -> ``<user> text </user>``
    - assistant step outputs -> the step's trimmed ``output_raw`` (if any)
    - step observations      -> ``<obs> raw </obs>`` (if ``observation.raw`` is truthy)

    The assistant message belonging to `upto_turn` itself is left out,
    because it is the prediction target. Messages with any other role
    are ignored. Blocks are joined with a blank line.
    """
    blocks = []

    for message in messages:
        turn_id, role = message["turn_id"], message["role"]

        # Stop at the first message from a later turn.
        if turn_id > upto_turn:
            break

        # The reply we want the model to produce must not leak into its input.
        if role == "assistant" and turn_id == upto_turn:
            continue

        if role == "user":
            text = (message.get("output_raw") or "").strip()
            blocks.append(f"<user> {text} </user>")
            continue

        if role != "assistant":
            continue

        for step in message.get("steps", []):
            step_text = (step.get("output_raw") or "").strip()
            if step_text:
                blocks.append(step_text)

            # A tool result, when present, directly follows the step that produced it.
            observation = step.get("observation")
            raw_obs = observation.get("raw") if observation else None
            if raw_obs:
                blocks.append(f"<obs> {raw_obs} </obs>")

    return "\n\n".join(blocks).strip()


# ------------------------------------------------------------
# 4. CONVERT EACH CONVERSATION INTO MULTIPLE DATAPOINTS
# ------------------------------------------------------------

# One {"instruction", "input", "output"} record is collected per usable user turn.
dataset = []

conv_files = sorted(glob(CONV_DIR))
print(f"Found {len(conv_files)} conversation files!")

for path in conv_files:
    with open(path, "r", encoding="utf-8") as f:
        conv = json.load(f)

    messages = conv.get("messages", [])
    if not messages:
        continue

    # Every user turn is a candidate training example.
    user_turns = sorted(m["turn_id"] for m in messages if m["role"] == "user")

    for turn in user_turns:
        # Model input: everything said so far, ending with this turn's user message.
        history = build_dialogue_history(messages, upto_turn=turn)
        if not history:
            continue

        # Model target: the first assistant message that answers this turn.
        assistant_msg = next(
            (m for m in messages if m["turn_id"] == turn and m["role"] == "assistant"),
            None,
        )
        assistant_output = (
            extract_assistant_output(assistant_msg) if assistant_msg is not None else ""
        )
        # Turns without an assistant reply (or with an empty one) yield no example.
        if not assistant_output:
            continue

        full_input = "**Dialogue Records History**\n" + history
        datapoint = {
            "instruction": SYSTEM_INSTRUCTION,
            "input": full_input,
            "output": assistant_output,
        }
        dataset.append(datapoint)


# ------------------------------------------------------------
# 5. SAVE JSONL
# ------------------------------------------------------------

# Serialize one JSON object per line; non-ASCII text is written as-is (UTF-8).
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(dp, ensure_ascii=False) + "\n" for dp in dataset)

print(f"\nSUCCESS → Created dataset with {len(dataset)} datapoints → {OUTPUT_PATH}")


Found 10 conversation files!

SUCCESS → Created dataset with 31 datapoints → test_dataset.jsonl


### SFT


In [48]:
!pip install -U transformers datasets peft accelerate bitsandbytes trl

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.25.1-py3-none-any.whl.metadata (11 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.25.1-py3-none-any.whl (465 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.5/465.5 kB[0m [31m10.8

In [None]:
# sft_rlla_hf.py
# pip install -U transformers datasets peft accelerate bitsandbytes trl

%%writefile sft_rlla_hf.py
import argparse, json, random, math, torch, os
from typing import Dict, List
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

SPECIAL_TOKENS = ["<think>", "<tool_call>", "<response>", "<obs>"]
IGNORE_INDEX = -100

def build_source(ex: Dict[str, str]) -> str:
    """
    Render the prompt (the non-trained part) for one example.

    The prompt has three markdown-style sections: the trimmed
    ``instruction``, the trimmed ``input`` (dialogue context), and a
    final header after which the model is expected to write its turn.
    """
    sections = [
        "### Instruction\n",
        f"{ex['instruction'].strip()}\n\n",
        "### Dialogue Context\n",
        f"{ex['input'].strip()}\n\n",
        "### You must produce the final assistant turn below.\n",
    ]
    return "".join(sections)

def build_target(ex: Dict[str, str]) -> str:
    """Return the text the model is trained to generate: the trimmed ``output`` field."""
    target_text = ex["output"]
    return target_text.strip()

def load_json_array(path: str) -> List[Dict]:
    """
    Load training examples from a JSON file.

    Two on-disk layouts are accepted:

    - a single JSON document (normally an array of objects), returned as parsed;
    - JSON Lines (one JSON object per line, blank lines ignored).

    Args:
        path: Path to a UTF-8 encoded ``.json`` or ``.jsonl`` file.

    Returns:
        The list of parsed records. A file holding exactly one JSON object
        (e.g. a one-line JSONL file) is returned as a one-element list.

    Raises:
        json.JSONDecodeError: if the content is neither valid JSON nor valid JSONL.
    """
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # More than one top-level value -> treat as JSONL.
        # Split on "\n" only: str.splitlines() would also split on characters
        # such as U+2028, which may legally appear raw inside JSON strings.
        return [json.loads(line) for line in text.split("\n") if line.strip()]

    if isinstance(data, dict):
        return [data]
    return data

# def load_json_array(path: str) -> List[Dict]:
#     """
#     Load JSONL: one JSON object per line.
#     Returns a list of dicts.
#     """
#     items = []
#     with open(path, "r", encoding="utf-8") as f:
#         for line in f:
#             line = line.strip()
#             if not line:
#                 continue
#             items.append(json.loads(line))
#     return items

def tokenize_and_mask(example, tokenizer, max_len):
    """
    Tokenize one example and attach prompt-masked ``labels``.

    The full sequence is ``prompt + target + eos`` truncated to `max_len`.
    The prompt is tokenized a second time on its own to learn how many
    leading positions belong to it; those positions get ``IGNORE_INDEX``
    in ``labels`` so only the target (and EOS) contribute to the loss.

    Returns the tokenizer output for the full sequence with an added
    ``labels`` entry of the same length as ``input_ids``.
    """
    prompt = build_source(example)
    completion = build_target(example)

    encoded = tokenizer(
        prompt + completion + tokenizer.eos_token,
        max_length=max_len,
        truncation=True,
    )
    prompt_len = len(tokenizer(prompt, max_length=max_len, truncation=True)["input_ids"])

    token_ids = encoded["input_ids"]
    # After truncation the prompt alone may already fill the whole sequence.
    n_masked = min(prompt_len, len(token_ids))
    encoded["labels"] = [IGNORE_INDEX] * n_masked + list(token_ids[n_masked:])
    return encoded

def main():
    """
    Command-line entry point: LoRA / QLoRA supervised fine-tuning.

    Pipeline:
      1. Parse CLI arguments.
      2. Load the examples (instruction / input / output), shuffle them with
         the given seed and split off a validation part.
      3. Load tokenizer and model (optionally 4-bit quantized), add the
         project's special tokens and resize the embeddings.
      4. Wrap the model with LoRA adapters.
      5. Tokenize with prompt masking and train with the HF ``Trainer``.
      6. Save the model and the tokenizer to ``--output_dir``.

    Raises:
        ValueError: if the data cannot be split into a non-empty train and
            a non-empty validation set.
    """
    # Imported here (not at file top) so this function is self-contained.
    from transformers import DataCollatorForSeq2Seq

    ap = argparse.ArgumentParser()
    ap.add_argument("--data_path", type=str, required=True)
    ap.add_argument("--model_name", type=str, required=True)
    ap.add_argument("--output_dir", type=str, required=True)
    ap.add_argument("--epochs", type=int, default=2)
    ap.add_argument("--batch_size", type=int, default=2)
    ap.add_argument("--grad_accum", type=int, default=8)
    ap.add_argument("--lr", type=float, default=2e-4)
    ap.add_argument("--max_len", type=int, default=4096)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--lora_r", type=int, default=16)
    ap.add_argument("--lora_alpha", type=int, default=32)
    ap.add_argument("--lora_dropout", type=float, default=0.05)
    ap.add_argument("--qlora", action="store_true", help="Enable 4-bit QLoRA")
    ap.add_argument("--eval_ratio", type=float, default=0.05)
    args = ap.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)
    random.seed(args.seed)

    # Load data (records with instruction/input/output)
    raw = load_json_array(args.data_path)
    random.shuffle(raw)
    n = len(raw)
    # At least one validation example is always held out.
    val_n = max(1, int(args.eval_ratio * n))
    val_split, train_split = raw[:val_n], raw[val_n:]
    if not train_split or not val_split:
        raise ValueError(
            f"Need at least one train and one validation example; got {n} records "
            f"with eval_ratio={args.eval_ratio}."
        )

    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True, trust_remote_code=True)
    # Add special tokens (only those the vocabulary does not already contain)
    tokens_to_add = [t for t in SPECIAL_TOKENS if t not in tokenizer.get_vocab()]
    if tokens_to_add:
        tokenizer.add_special_tokens({"additional_special_tokens": tokens_to_add})
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Model (FP16/BF16 or 4-bit QLoRA)
    if args.qlora:
        bnb_cfg = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        model = AutoModelForCausalLM.from_pretrained(
            args.model_name,
            quantization_config=bnb_cfg,
            device_map="auto",
            trust_remote_code=True,
        )
    else:
        dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
        model = AutoModelForCausalLM.from_pretrained(
            args.model_name,
            torch_dtype=dtype,
            device_map="auto",
            trust_remote_code=True,
        )

    # Resize embeddings if we added tokens
    model.resize_token_embeddings(len(tokenizer))

    # Apply LoRA
    if args.qlora:
        model = prepare_model_for_kbit_training(model)
    lora_cfg = LoraConfig(
        r=args.lora_r,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        bias="none",
        target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_cfg)

    # Build datasets -> tokenize + mask
    def _map_fn(ex): return tokenize_and_mask(ex, tokenizer, args.max_len)
    ds_train = Dataset.from_list(train_split).map(_map_fn, remove_columns=list(train_split[0].keys()))
    ds_val   = Dataset.from_list(val_split).map(_map_fn,   remove_columns=list(val_split[0].keys()))
    dset = DatasetDict({"train": ds_train, "validation": ds_val})

    # Trainer
    fp16 = torch.cuda.is_available() and not torch.cuda.is_bf16_supported()
    bf16 = torch.cuda.is_bf16_supported()
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=max(1, args.batch_size),
        gradient_accumulation_steps=args.grad_accum,
        learning_rate=args.lr,
        warmup_ratio=0.03,
        lr_scheduler_type="cosine",
        logging_steps=25,
        # NOTE(review): the evaluation strategy is left disabled below, so
        # eval_steps presumably has no effect until it is re-enabled.
        #evaluation_strategy="steps",
        eval_steps=200,
        save_strategy="steps",
        save_steps=200,
        save_total_limit=2,
        fp16=fp16,
        bf16=bf16,
        gradient_checkpointing=True,
        report_to="none",
    )
    # The collator must keep the labels produced by tokenize_and_mask.
    # DataCollatorForLanguageModeling(mlm=False) rebuilds labels from
    # input_ids (so the prompt would be trained on) and masks every pad-id
    # position, which also hides EOS when pad_token == eos_token.
    # DataCollatorForSeq2Seq pads input_ids/attention_mask through the
    # tokenizer and pads the existing labels with IGNORE_INDEX instead.
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        padding=True,
        label_pad_token_id=IGNORE_INDEX,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dset["train"],
        eval_dataset=dset["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    trainer.train()
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)

    print("Saved to:", args.output_dir)

if __name__ == "__main__":
    main()


Writing sft_rlla_hf.py


In [None]:
!python sft_rlla_hf.py \
  --data_path /content/rlla_rl.json \
  --model_name Qwen/Qwen2.5-32B-Instruct \
  --output_dir ./opensource-qwen2.5-32b \
  --epochs 1 \
  --batch_size 1 \
  --grad_accum 2 \
  --max_len 4096 \
  --lr 2e-4 \
  --qlora

2025-11-29 03:00:26.222408: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-29 03:00:26.241173: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764385226.262763    2625 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764385226.269426    2625 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764385226.286381    2625 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [53]:
from huggingface_hub import login

login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [54]:
from huggingface_hub import HfApi
from pathlib import Path

# Publish a local checkpoint folder to the Hugging Face Hub.
# Requires a prior successful `login()` (or another configured token).
api = HfApi()

# Target repository in "<user>/<name>" form.
repo_id = "ajChakrarborty/custom-data-qwen2.5-7b-instruct-ft-rl-1"   # <-- change this

# exist_ok=True makes this cell re-runnable: an existing repo is not an error.
api.create_repo(repo_id, exist_ok=True)

# Uploads every file found in the folder to the repo root (path_in_repo="").
# NOTE(review): a Trainer checkpoint folder also holds optimizer/scheduler/RNG
# state files; they are uploaded too — confirm that is intended.
api.upload_folder(
    repo_id=repo_id,
    folder_path="/content/grpo-tool-agent/checkpoint-200",   # your model output directory
    path_in_repo="",                     # upload all files
)


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...kpoint-200/tokenizer.json:   0%|          | 27.8kB / 11.4MB            

  ...adapter_model.safetensors:   0%|          | 16.7MB / 4.51GB            

  ...eckpoint-200/optimizer.pt:   1%|          |  15.0B / 1.66kB            

  ...ckpoint-200/rng_state.pth:   1%|          |   139B / 14.6kB            

  ...eckpoint-200/scheduler.pt:   1%|          |  14.0B / 1.47kB            

  ...int-200/training_args.bin:   1%|          |  69.0B / 7.25kB            

CommitInfo(commit_url='https://huggingface.co/ajChakrarborty/custom-data-qwen2.5-7b-instruct-ft-rl-1/commit/672c84ffb89f9630b76f6f2524d9c0748577a499', commit_message='Upload folder using huggingface_hub', commit_description='', oid='672c84ffb89f9630b76f6f2524d9c0748577a499', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ajChakrarborty/custom-data-qwen2.5-7b-instruct-ft-rl-1', endpoint='https://huggingface.co', repo_type='model', repo_id='ajChakrarborty/custom-data-qwen2.5-7b-instruct-ft-rl-1'), pr_revision=None, pr_num=None)

### Inference - SFT, Base

In [1]:
!pip install transformers accelerate bitsandbytes datasets

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch

# Hub identifiers used by this cell:
#   ft_model   - repo the LoRA adapter is loaded from (step 4)
#   sft_model  - repo the tokenizer is loaded from (step 1)
#   base_model - underlying model the adapter is applied to (step 2)
# NOTE(review): tokenizer and adapter come from two different repos; this
# presumably relies on both having the same added tokens — confirm.
ft_model = "ajChakrarborty/custom-data-qwen2.5-7b-instruct-ft-rl-1"
sft_model = "ajChakrarborty/custom-qwen2.5-7b-instruct-ft-1"
base_model = "Qwen/Qwen2.5-7B-Instruct"

# 1. Load tokenizer from SFT checkpoint (has added tokens)
tokenizer = AutoTokenizer.from_pretrained(
    sft_model,
    trust_remote_code=True
)

# 2. Load base model in 4-bit
# NF4 quantization with double quantization; matmuls are computed in bfloat16.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_cfg,
    device_map="auto",
    trust_remote_code=True,
)

# 3. Resize embeddings to match tokenizer *before* loading LoRA
# (the embedding matrix must have the shape the adapter checkpoint was saved with)
model.resize_token_embeddings(len(tokenizer))

# 4. Load the LoRA adapter
# After this call `model` is the PEFT-wrapped model, not the bare base model.
model = PeftModel.from_pretrained(
    model,
    ft_model,
    device_map="auto",
)

print("Model loaded successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/669 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/753 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/4.51G [00:00<?, ?B/s]

Model loaded successfully!


In [2]:
# -------------------------
# BASE MODEL (no LoRA)
# -------------------------
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Baseline for comparison: the untouched instruct model, without any adapter.
# NOTE(review): this is the 32B model, while the LoRA cell in this notebook
# uses a 7B base — confirm which size the comparison is meant to use.
base_model_name = "Qwen/Qwen2.5-32B-Instruct"

# The base model keeps its own tokenizer (no project-specific added tokens).
tokenizer_base = AutoTokenizer.from_pretrained(
    base_model_name,
    trust_remote_code=True
)

# Same 4-bit NF4 configuration as used for the fine-tuned model.
# This rebinds the notebook-level name `bnb_cfg`.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Stored under separate names (model_base / tokenizer_base) so it does not
# overwrite the `model` / `tokenizer` variables of the fine-tuned setup.
model_base = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_cfg,
    trust_remote_code=True,
    device_map="auto",
)

print("✓ Base model loaded successfully")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 17 files:   0%|          | 0/17 [00:00<?, ?it/s]

model-00002-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00005-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00006-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00008-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00004-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00007-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00001-of-00017.safetensors:   0%|          | 0.00/3.92G [00:00<?, ?B/s]

model-00003-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00009-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00010-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00011-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00012-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00013-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00014-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00015-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00016-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00017-of-00017.safetensors:   0%|          | 0.00/3.10G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

✓ Base model loaded successfully


In [None]:
model.eval()

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch
import json
from tqdm import tqdm

In [5]:
# Load the evaluation set: one JSON object per line (JSONL).
test_samples = []
# The file is written as UTF-8 with ensure_ascii=False, so read it as UTF-8
# explicitly instead of relying on the platform default encoding.
with open("/content/test_dataset.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline added by an editor).
        if line.strip():
            test_samples.append(json.loads(line))

print("Loaded", len(test_samples), "samples.")

Loaded 31 samples.


Pre-Act/Re-Act Prompts



In [8]:
import json
import glob
import os

# Merge the "tools" mappings of every toolset JSON file into one unified toolset.

# Where the individual toolset files live, and where the merged result goes.
TOOLSET_GLOB = "/content/*.json"
UNIFIED_TOOLSET_PATH = "unified_toolset.json"

# The merged file may be written into the very directory being scanned, so a
# re-run would otherwise read its own previous output and report every tool
# as a duplicate. Sorting makes "last file wins" deterministic.
_output_abspath = os.path.abspath(UNIFIED_TOOLSET_PATH)
json_files = sorted(
    p for p in glob.glob(TOOLSET_GLOB) if os.path.abspath(p) != _output_abspath
)

merged_tools = {}

for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Other JSON files (e.g. datasets stored as a top-level array) may share
    # the folder; anything that is not an object with a "tools" key is skipped.
    if not isinstance(data, dict) or "tools" not in data:
        print(f"WARNING: No 'tools' key in {file_path}")
        continue

    for tool_name, tool_def in data["tools"].items():
        if tool_name in merged_tools:
            print(f"WARNING: Duplicate tool name '{tool_name}' found in {file_path}. Overwriting.")
        merged_tools[tool_name] = tool_def

# Build final unified toolset
UNIFIED_TOOLSET = {
    "id": "toolset.unified.v1",
    "tools": merged_tools
}

# Save merged toolset
with open(UNIFIED_TOOLSET_PATH, "w", encoding="utf-8") as f:
    json.dump(UNIFIED_TOOLSET, f, indent=2)

print("Merged toolset saved to unified_toolset.json")


Merged toolset saved to unified_toolset.json


In [7]:
import json
import torch
from tqdm import tqdm
# ----------------------------------------------
# 2. SYSTEM PROMPT WITH TOOLSET INSERTED
# ----------------------------------------------
# SYSTEM_PROMPT_BASE = """
# You are a reasoning-centric, tool-using agent that follows the ReAct methodology:
# you think step-by-step, observe, reason, and act in small increments.

# Your job is to help the user complete their task by interleaving:
# 1. Thought (reasoning / reflection)
# 2. Action (tool use OR user-facing reply)

# You do NOT need to produce long global plans before acting.
# Instead, you produce short, incremental reasoning based on the latest observation.

# ============================================================
# REACT PRINCIPLES YOU MUST FOLLOW
# ============================================================

# 1. **Reason step-by-step in <think>**
#    - Explain what you understand from the user's request
#    - Interpret the last observation or tool result
#    - Decide what the next best immediate step is (not the whole workflow)
#    - Keep reasoning grounded in evidence

# 2. **Incremental Plans in <plan>**
#    - Keep plans short, local, 1–3 steps max
#    - Only describe what you will do *next*, not an entire pipeline
#    - Example good plan:
#         <plan>
#         1. Call find_order() to identify the correct order
#         2. After seeing results, decide next tool
#         </plan>

# 3. **One Action at a Time**
#    - After thinking and planning, ALWAYS output exactly one action:
#         <action type="tool" name="...">{JSON}</action>
#        OR
#         <action type="say">some message</action>
#    - Never output more than one <action>.
#    - Never mix a tool and a say in one message.

# 4. **State-Driven Behavior**
#    - Observe the conversation so far and previous tool results
#    - Adjust reasoning after every tool output
#    - Do not hallucinate missing parameters — ask or use tools to retrieve them.

# ============================================================
# OUTPUT FORMAT
# ============================================================

# Every message must contain:

# <think>
# Non-empty reasoning (2–5 sentences). Never leave this blank.
# </think>

# <plan>
# Short-term, local next steps ONLY (1–3 bullet points).
# Always included on first step after user message.
# Omit only when responding to a tool result *within the same turn*.
# </plan>

# <action type="say">...</action>
# OR
# <action type="tool" name="...">{ ... }</action>

# RULES:
# - EXACTLY ONE <action> block.
# - Action must be last.
# - JSON must be valid.
# - Never produce text outside these blocks.
# - Never explain the ReAct rules to the user.
# - Never show your reasoning.

# ============================================================
# STRICT GUARANTEES
# ============================================================
# - <think> must NEVER be empty.
# - <plan> must contain at least 1 numbered step when included.
# - Use tools only when you have the required parameters.
# - Ask for missing details using a say() action.
# - When a tool returns an error or empty result, think → plan → ask or retry.
# - Do not produce tool calls without valid, non-fabricated parameters.

# ============================================================
# TOOLSET
# ============================================================
# {tool_list_json}

# You now begin when the user speaks.
# """

SYSTEM_PROMPT_BASE = """
You are a smart and helpful system agent in a multi-turn, task-oriented conversation. Your job is to help the user complete their task by reasoning step-by-step, asking good questions, and using the tools available to you.

Your primary objective is to execute the user's request by calling the appropriate tools. Planning and discussion are steps toward execution, but the task is not complete until you have actually called the tools to perform the required actions. You act as the user's guide through this process, maintaining memory of what has already been said and done, and always responding with purpose. When tools are needed, you call them. When the user needs to clarify, you prompt them. At every step, you make progress toward the final goal.

TOOL USAGE:
- Before calling any tool, carefully review the available tools and their descriptions.
- If a tool requires a parameter (like an ID) that you don't have, check if another tool provides it. For example, if you need a card_id but only have a card description, look for a tool that lists accounts/cards to get the ID first.
- Plan the sequence of tool calls needed: identify what information you need, which tools provide it, and the order in which to call them.
- Never assume you can proceed without the required parameters - if a tool needs an ID and you don't have it, find the tool that provides IDs first.

REQUIRED FORMAT - Every single output must follow this structure:

<think>
Internal reasoning. Think through the current state, what just happened, and what's needed next. ALWAYS include this block in every output, even after tool calls.

CRITICAL: This block must NEVER be empty. Always write at least 2-3 sentences explaining:
- What information you currently have
- What the user just said or what tool result you just received
- What you're deciding to do next and why
- Any important considerations or constraints

Examples of good reasoning:
- "The user wants to book a restaurant for Friday evening. I need to search for restaurants matching their cuisine preference, then check availability for their party size."
- "The search returned 3 restaurants. I should present these options to the user, highlighting key differences like price range and ratings to help them choose."
- "The tool call failed because the restaurant_id was invalid. I need to ask the user to clarify which restaurant they meant, or search again with different criteria."
</think>
<plan>
A short-term plan, written as a numbered list, outlining the major remaining steps to complete the user's goal. You MUST include this block on your FIRST output after receiving a user message (i.e., the first step of each new turn). Omit it on subsequent steps within the same turn (after tool results).

CRITICAL: ALWAYS include at least one item in the plan, even if the task is practically complete. If the task is finished, include an item like "1. Notify the user that the task is complete" or "1. Confirm completion with the user". Never omit the plan block entirely.

IMPORTANT: You MUST include <plan> even when you are about to call a tool immediately. The plan describes what you will do, including tool calls. For example, if you're about to call find_meeting, your plan might be:
1. Search for the meeting using find_meeting tool
2. Review the results and identify the correct meeting
3. Update the meeting using update_meeting tool
4. Confirm the changes to the user

Do NOT skip the plan just because you're calling a tool - the plan is required BEFORE the tool call.
</plan>

After </think> (and </plan> if included), output exactly ONE <action>...</action> block:

• To communicate with the user:
  <action type="say">Your user-facing message here</action>

• To use a tool (JSON args only; double-quoted keys/strings):
  <action type="tool" name="tool_name">
    { "arg1": "...", "arg2": 123 }
  </action>

CRITICAL: You must output EXACTLY ONE <action> block per turn. Do NOT combine tool and say. Do NOT include any code, imports, assignments, or extra text outside of <think>, <plan> (required on first step of each turn), and your single <action> block. Put any calculations in <plan>.

IMPORTANT: Always include the closing </action> tag for your action block.

FORMAT RULES:
- ALWAYS start with <think>...</think> - this is required for EVERY output and MUST contain actual reasoning text (never empty)
- ALWAYS include <plan>...</plan> on your FIRST output after a user message (first step of each new turn) - this is REQUIRED, not optional. The plan must contain at least one item, even if it's just "1. Notify the user that the task is complete"
- CRITICAL: Include <plan> even when you're about to call a tool immediately - the plan describes your intended actions including tool calls
- Omit <plan> on subsequent steps within the same turn (after tool results)
- Output exactly ONE <action>...</action> per message
- Your message must end with </action> (no text or whitespace after it)
- For <action type="say">: body must contain actual text (not empty or only whitespace). Multi-line is allowed. Close the tag.
- For <action type="tool" ...>: body must be valid JSON (double-quoted keys/strings). Close the tag.

SELF-CHECK BEFORE EVERY OUTPUT:
- Did I include <think> with actual content (NOT empty)? (REQUIRED ALWAYS - must contain at least 2-3 sentences of reasoning)
- Did I include <plan> with at least one item? (REQUIRED if this is the first step of a new turn after a user message - even if task is complete or I'm about to call a tool, include "1. Notify user task is complete" or describe the tool calls I'll make)
- One <action> only?
- Closing </action> present and last in message?
- If say: Does the text contain actual content (not empty or only whitespace)?
- If tool: JSON is valid?

IMPORTANT: If your <think> block is empty or contains only whitespace, you MUST add reasoning before sending your response. Empty reasoning blocks are not acceptable.

---

STRICT RULES:

- Do **not** output anything outside of the structure above. No markdown, lists, or freeform notes.
- You must output exactly ONE <action> per turn: either type="say" OR type="tool", never both.
- The <think> and <plan> sections are invisible to the user. Write them only for internal use.
- If you cannot proceed without more information, use <action type="say">...</action> to ask the user for clarification.
- If a tool call fails (e.g., invalid inputs, no results, or error), reflect on what went wrong in <think> (with detailed reasoning, never empty) and revise your <plan> accordingly.
- Never guess tool arguments. If you're missing required inputs, first check if another tool can provide them; if not, use say() to ask the user.
- Before calling a tool that requires an ID or identifier, check if you need to call another tool first to obtain it. Review tool descriptions to understand parameter dependencies.

- If the user gives contradictory or confusing input, use say() to clarify instead of making assumptions.
- Always keep track of what information you've already gathered, what tools have been used, and what the next step is.
- If the user corrects something, pause and reassess before responding.
- You must never invent tool outputs or pretend something worked when it didn't.
- Before claiming a tool did something, verify you can see its result in the conversation history; if not, call the tool instead of guessing.

TONE AND STYLE:

- Your `say()` message should be friendly, clear, and purposeful. Always move the task forward.
- Be concise for straightforward turns. Be more detailed when a complex decision is involved.
- Never use flattery. Don't thank or compliment the user unless they've done something notable.
- Don't explain your internal reasoning unless the user asks.
- Never apologize unless you've made an actual error.

REFUSALS AND ERRORS:

- If something cannot be done, say so directly and offer an alternative.
- Don't lecture or speculate about why a request is problematic. Be brief and redirect.
- If a request would cause failure (e.g., missing inputs), explain what's needed instead.

You are now in an active conversation. Begin when the user speaks.



=== AVAILABLE TOOLS ===
{tool_list_json}

You now begin when the user speaks.
"""

# Read the unified tool catalogue and splice its pretty-printed JSON into the
# `{tool_list_json}` slot of the prompt template.
with open("unified_toolset.json") as toolset_file:
    TOOLSET = json.load(toolset_file)

tool_list_json = json.dumps(TOOLSET, indent=2)
SYSTEM_PROMPT = SYSTEM_PROMPT_BASE.replace("{tool_list_json}", tool_list_json)



# ----------------------------------------------
# 3. BUILD PROMPT FOR EACH TEST SAMPLE
# ----------------------------------------------
def build_prompt(example):
    """
    Assemble the full inference prompt for one test sample.

    The pieces are concatenated in this order:
    - the system prompt wrapped in <system> tags
    - the sample's `instruction` (stripped)
    - the sample's `input`, i.e. the dialogue history (stripped)
    - a closing cue asking for the final assistant turn
    """
    instruction = example['instruction'].strip()
    dialogue = example['input'].strip()
    sections = [
        f"<system>\n{SYSTEM_PROMPT}\n</system>\n\n",
        "### Instruction\n",
        f"{instruction}\n\n",
        "### Dialogue Context\n",
        f"{dialogue}\n\n",
        "### Produce the final assistant turn below, following ALL rules.\n",
    ]
    return "".join(sections)


# ----------------------------------------------
# 4. RUN MODEL GENERATION
# ----------------------------------------------
def generate_assistant_only(prompt, max_new_tokens=512, temperature=0.0):
    """
    Generate a continuation for `prompt` and return only the newly generated text.

    Args:
        prompt: Full prompt string fed to the module-level `tokenizer`.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. Values <= 0 select greedy decoding.

    Returns:
        The decoded continuation (prompt tokens removed). Special tokens are
        kept in the text because decoding uses skip_special_tokens=False.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    sampling = temperature > 0
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": sampling,
        "pad_token_id": tokenizer.eos_token_id,
    }
    # Only forward `temperature` when sampling: 0.0 is not a valid sampling
    # temperature and greedy decoding ignores the flag anyway.
    if sampling:
        gen_kwargs["temperature"] = temperature

    with torch.no_grad():
        output_ids = model.generate(**inputs, **gen_kwargs)

    # Drop the echoed prompt so only the assistant continuation remains.
    gen_ids = output_ids[0, prompt_len:]
    return tokenizer.decode(gen_ids, skip_special_tokens=False)

In [8]:
# Parse the JSONL test set, ignoring blank lines.
with open("/content/test_dataset.jsonl") as f:
    test_samples = [json.loads(line) for line in f if line.strip()]

print("Loaded samples:", len(test_samples))
print("Example:", test_samples[0])

Loaded samples: 31
Example: {'instruction': 'You are a helpful multi-turn dialogue assistant capable of leveraging tool calls to solve user tasks and provide structured chat responses.\n\n**Steps for Each Turn**\n1. Think\n2. Decide on tool usage\n3. Respond\n\n**Output Format**\n<think> ... </think>\n<tool_call>\n{"name": "...", "parameters": {...}}\n</tool_call>\n<response> ... </response>\n\n**Important Notes**\n- Always include <think>.\n- Use <tool_call> OR <response> OR both.\n- Maintain full history context.', 'input': '**Dialogue Records History**\n<user> hey! can you help me set up a meeting w/ sarah sometime next week? </user>', 'output': '<think>\nNeed details to schedule: Sarah’s email, user’s, date range “next week”, duration, time preferences, location, etc. No tool call yet. Keep it brief and ask targeted questions. </think>\n<plan>\n1. Clarify key meeting details: which Sarah, emails, duration, preferred days/times, and location/online.\n2. Once provided, search for mut

In [9]:
# One record per test sample: the original fields plus the model prediction.
results = []

for ex in tqdm(test_samples, desc="Running inference", ncols=90):
    prompt = build_prompt(ex)
    pred = generate_assistant_only(prompt)
    record = dict(
        instruction=ex["instruction"],
        input=ex["input"],
        gold_output=ex.get("output"),
        prediction=pred,
    )
    results.append(record)


Running inference:   0%|                                           | 0/31 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Running inference: 100%|██████████████████████████████████| 31/31 [27:31<00:00, 53.28s/it]


In [10]:
results[6]

{'instruction': 'You are a helpful multi-turn dialogue assistant capable of leveraging tool calls to solve user tasks and provide structured chat responses.\n\n**Steps for Each Turn**\n1. Think\n2. Decide on tool usage\n3. Respond\n\n**Output Format**\n<think> ... </think>\n<tool_call>\n{"name": "...", "parameters": {...}}\n</tool_call>\n<response> ... </response>\n\n**Important Notes**\n- Always include <think>.\n- Use <tool_call> OR <response> OR both.\n- Maintain full history context.',
 'input': '**Dialogue Records History**\n<user> Need to cancel an order I placed for a desk lamp yesterday. </user>',
 'gold_output': '<think>\n</think>\n<plan>\n1. Ask the user for information needed to look up their order (order ID or identifying details).\n2. Once provided, use find_orders to locate the order.\n3. Fetch full order details and cancellation options.\n4. If eligible, confirm what exactly the user wants to cancel and proceed with cancel_order.\n5. Share the result and any refund infor

In [11]:
# Persist the predictions as JSONL (one JSON object per line).
with open("predictions-custom-7b-rl-pre-act.jsonl", "w") as f:
    f.writelines(json.dumps(r) + "\n" for r in results)

print("DONE!")

DONE!


### L-1 Eval


In [19]:
import json

# Print the raw gold/prediction strings of the first 3 records for inspection.
with open("/content/predictions-pre-trained-32b-re-act.jsonl") as f:
    for i, line in enumerate(f):
        if i >= 3:
            break
        ex = json.loads(line)
        print(f"\n==== SAMPLE {i} ====\n")
        print("GOLD RAW:\n", repr(ex["gold_output"]))
        print("\nPRED RAW:\n", repr(ex["prediction"]))



==== SAMPLE 0 ====

GOLD RAW:
 '<think>\nNeed details to schedule: Sarah’s email, user’s, date range “next week”, duration, time preferences, location, etc. No tool call yet. Keep it brief and ask targeted questions. </think>\n<plan>\n1. Clarify key meeting details: which Sarah, emails, duration, preferred days/times, and location/online.\n2. Once provided, search for mutual availability next week using the scheduling tools.\n3. Propose a few concrete time options to the user and confirm one.\n4. Create the meeting and confirm details back to the user.\n</plan>\n<action type="say">\nI can set that up. I just need a few details:\n\n1) Which Sarah (full name and email)?\n2) Your email (so I include you as an attendee)?\n3) How long should the meeting be (e.g., 30 or 60 minutes)?\n4) Any preferred days/times next week (and your time zone)?\n5) Location: in-person (where?) or video (Zoom/Meet/etc.), and any title/agenda for the invite?\n\nOnce I have these, I’ll find a few times next week

In [None]:
import re, json, html
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F

# ======================================================
# 0. MODELS FOR TEXT SIMILARITY
# ======================================================
SIM_MODEL = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

def semantic_similarity(a, b):
    """
    Cosine similarity between the SIM_MODEL embeddings of two strings.

    Returns 0.0 without encoding anything when either string is empty or
    whitespace-only; otherwise returns the similarity as a Python float.
    """
    if a.strip() and b.strip():
        vec_a = SIM_MODEL.encode(a, convert_to_tensor=True)
        vec_b = SIM_MODEL.encode(b, convert_to_tensor=True)
        score = F.cosine_similarity(vec_a, vec_b, dim=0)
        return float(score.item())
    return 0.0

# ======================================================
# 1. CLEANING RAW TEXT (fix escaped tags)
# ======================================================
def clean_text(raw):
    r"""
    Normalise raw gold/predicted text before action extraction.

    Steps:
      1. Interpret literal backslash escapes (e.g. a backslash followed by
         ``n``, or ``\uXXXX``) via the ``unicode_escape`` codec.
      2. Decode HTML entities such as ``&lt;`` / ``&gt;``.
      3. Turn any remaining literal backslash-n into a real newline.

    Args:
        raw: The text to clean, or None.

    Returns:
        The cleaned string; "" when `raw` is None.
    """
    if raw is None:
        return ""
    try:
        # Encode with latin-1 + backslashreplace (not UTF-8) so non-ASCII
        # characters such as curly quotes survive the unicode_escape round
        # trip: chars <= 0xFF map to the same byte, everything else becomes
        # a \uXXXX escape that the decoder restores. A plain UTF-8 encode
        # would come back as mojibake.
        txt = raw.encode("latin-1", "backslashreplace").decode("unicode_escape")
    except UnicodeError:
        # Malformed escape (e.g. a trailing backslash): keep the text as-is.
        txt = raw
    txt = html.unescape(txt)
    # NOTE(review): unescaping also rewrites escapes inside JSON tool bodies,
    # which can make them unparseable later - confirm this is intended.
    return txt.replace("\\n", "\n")

# ======================================================
# 2. REGEX FOR <action> and <tool_call>
# ======================================================
# <action type="..." [name="..."]> body </action>
ACTION_RE = re.compile(
    r"<action\s+type=\"(?P<type>[a-zA-Z]+)\"(?:\s+name=\"(?P<name>[^\"]+)\")?>\s*(?P<body>.*?)</action>",
    re.DOTALL
)

# <tool_call> { ...json... } </tool_call>
TOOL_CALL_RE = re.compile(
    r"<tool_call>\s*(?P<body>{.*?})\s*</tool_call>",
    re.DOTALL
)

# ======================================================
# 3. EXTRACT ALL ACTIONS (NOT JUST FINAL)
# ======================================================
def extract_all_actions(text):
    """
    Extract every action from `text`, in the order they appear.

    Two syntaxes are recognised:
      - <action type="tool" name="..."> JSON </action> and
        <action type="say"> text </action> (any type other than "tool" is
        treated as "say");
      - <tool_call> {"name": ..., "parameters"/"arguments": ...} </tool_call>.

    Returns:
        A list of dicts with keys `action_type` ("tool" or "say"),
        `tool_name`, `params` (None when the JSON body cannot be parsed) and
        `text`. The list is sorted by position in `text`, so the last element
        is the final action even when both syntaxes are mixed.
    """
    found = []  # (start offset in text, action dict)

    # --- A. Normal <action> blocks ---
    for m in ACTION_RE.finditer(text):
        body = m.group("body").strip()

        if m.group("type") == "tool":
            try:
                params = json.loads(body)
            except ValueError:  # json.JSONDecodeError is a ValueError
                params = None
            action = {
                "action_type": "tool",
                "tool_name": m.group("name"),
                "params": params,
                "text": None
            }
        else:
            action = {
                "action_type": "say",
                "tool_name": None,
                "params": None,
                "text": body
            }
        found.append((m.start(), action))

    # --- B. Qwen-style <tool_call> blocks ---
    for m in TOOL_CALL_RE.finditer(text):
        try:
            obj = json.loads(m.group("body").strip())
        except ValueError:
            obj = {}
        # Accept "arguments" as a fallback for "parameters".
        params = obj.get("parameters")
        if params is None:
            params = obj.get("arguments")
        found.append((m.start(), {
            "action_type": "tool",
            "tool_name": obj.get("name"),
            "params": params,
            "text": None
        }))

    # Restore document order across both syntaxes.
    found.sort(key=lambda pair: pair[0])
    return [action for _, action in found]

# ======================================================
# 4. TOOL SEQUENCE EVALUATION
# ======================================================
def evaluate_tool_sequence(gold_actions, pred_actions):
    """
    Compare the gold and predicted tool-call sequences position by position.

    Only actions with action_type == "tool" are considered; the i-th gold
    call is paired with the i-th predicted call.

    Returns a dict with:
      - tool_name_f1: F1 of positional tool-name matches
        (precision = matches / #pred, recall = matches / #gold). Equals the
        plain match rate when both sequences have the same length.
      - params_match_full: fraction of gold calls whose paired prediction
        has identical params (unpaired gold calls count as misses).
      - tool_recall: fraction of gold calls that have a paired prediction,
        always within [0, 1].
    All three are None when the gold sequence contains no tool calls.
    """
    gold_tools = [a for a in gold_actions if a["action_type"] == "tool"]
    pred_tools = [a for a in pred_actions if a["action_type"] == "tool"]

    if not gold_tools:
        return {
            "tool_name_f1": None,
            "params_match_full": None,
            "tool_recall": None
        }

    n_gold, n_pred = len(gold_tools), len(pred_tools)
    paired = list(zip(gold_tools, pred_tools))  # truncates to the shorter list

    name_hits = sum(1 for g, p in paired if g["tool_name"] == p["tool_name"])
    param_hits = sum(1 for g, p in paired if g["params"] == p["params"])

    # Divide by the full sequence lengths so that missing or extra calls
    # lower the score instead of silently dropping out of the averages.
    return {
        "tool_name_f1": 2.0 * name_hits / (n_gold + n_pred),
        "params_match_full": param_hits / n_gold,
        "tool_recall": min(n_pred, n_gold) / n_gold
    }

# ======================================================
# 5. FINAL ANSWER F1 (token overlap)
# ======================================================
def token_f1(a, b):
    """
    Case-insensitive F1 over the sets of whitespace-separated tokens.

    `a` supplies the precision denominator and `b` the recall denominator;
    duplicates within a string are ignored. Returns 0.0 when no token is shared.
    """
    a_vocab = set(a.lower().split())
    b_vocab = set(b.lower().split())
    shared = len(a_vocab & b_vocab)
    if shared == 0:
        return 0.0
    precision = shared / len(a_vocab)
    recall = shared / len(b_vocab)
    return 2 * precision * recall / (precision + recall)

# ======================================================
# 6. MAIN EVALUATION LOOP
# ======================================================
# One metrics row per prediction record.
results = []

with open("/content/predictions-custom-7b-rl-pre-act.jsonl") as f:
    for line in tqdm(f):
        item = json.loads(line)

        # Parse both sides into ordered action lists.
        gold_actions = extract_all_actions(clean_text(item["gold_output"]))
        pred_actions = extract_all_actions(clean_text(item["prediction"]))

        # 1. Step-by-step comparison of the tool calls.
        tool_scores = evaluate_tool_sequence(gold_actions, pred_actions)

        # 2. Does the final action have the same type (say vs. tool)?
        gold_final = gold_actions[-1] if gold_actions else None
        pred_final = pred_actions[-1] if pred_actions else None
        same_type = bool(
            gold_final
            and pred_final
            and gold_final["action_type"] == pred_final["action_type"]
        )
        action_recall = 1.0 if same_type else 0.0

        # 3. Final-answer quality, only defined when the gold final action is
        #    a "say"; a non-"say" prediction scores 0 in that case.
        fa_f1 = None
        fa_sim = None
        if gold_final and gold_final["action_type"] == "say":
            fa_f1 = 0.0
            fa_sim = 0.0
            if pred_final and pred_final["action_type"] == "say":
                fa_f1 = token_f1(pred_final["text"], gold_final["text"])
                fa_sim = semantic_similarity(pred_final["text"], gold_final["text"])

        row = {"action_recall": action_recall}
        row.update(tool_scores)
        row["final_answer_f1"] = fa_f1
        row["final_answer_sim"] = fa_sim
        results.append(row)

# ======================================================
# 7. AGGREGATE STATISTICS
# ======================================================
def avg(key):
    """Mean of `key` over the rows of `results` where it is present and not None (None if no such row)."""
    vals = []
    for row in results:
        if key in row and row[key] is not None:
            vals.append(row[key])
    if not vals:
        return None
    return float(np.mean(vals))

summary = {
    metric: avg(metric)
    for metric in (
        "action_recall",
        "tool_name_f1",
        "params_match_full",
        "tool_recall",
        "final_answer_f1",
        "final_answer_sim",
    )
}

summary


### RLVR

In [1]:
!pip install -q "transformers>=4.44.0" "trl>=0.9.6" peft bitsandbytes datasets accelerate

import os, re, json
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)
from peft import PeftModel
from trl import GRPOConfig, GRPOTrainer


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.5/465.5 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
ft_model  = "ajChakrarborty/custom-qwen2.5-32b-instruct-ft-1"   # your SFT LoRA
base_model = "Qwen/Qwen2.5-32B-Instruct"                        # frozen base

# The tokenizer comes from the fine-tuned repo, not from the base model.
tokenizer = AutoTokenizer.from_pretrained(
    ft_model,
    trust_remote_code=True,
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantisation with double quantisation and bf16 compute.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

base = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_cfg,
    device_map="auto",
    trust_remote_code=True,
)

# Match the embedding table to the fine-tuned tokenizer's vocabulary size.
base.resize_token_embeddings(len(tokenizer))
base.config.use_cache = False  # important for training

# is_trainable=True is required to keep training a loaded adapter: by default
# PEFT loads adapters in inference mode with their weights frozen, which
# would leave GRPO with nothing to optimise.
model = PeftModel.from_pretrained(
    base,
    ft_model,
    device_map="auto",
    is_trainable=True,
)

print("✅ SFT LoRA model loaded for GRPO.")


In [42]:
import json

# Read the unified tool catalogue from disk ...
with open("unified_toolset.json") as toolset_file:
    TOOLSET = json.load(toolset_file)

# ... and render it as indented JSON so it stays readable inside the prompt.
tool_list_json = json.dumps(TOOLSET, indent=2)

# ==============================
# Your structured RL system prompt
# ==============================
SYSTEM_PROMPT_BASE = """
You are a helpful multi-turn assistant that solves user tasks by thinking step-by-step and calling tools when needed.
Every response must follow the exact structured format found in the training data.

=============================
REQUIRED OUTPUT FORMAT
=============================

<think>
Your internal reasoning.
This MUST contain text (never empty).
</think>

<plan>
A numbered list describing your intended next steps.
This MUST contain at least one step.
</plan>

Then output EXACTLY ONE of the following:

------------------------------------------
1) TOOL CALL
------------------------------------------
<action type="tool" name="TOOL_NAME">
  { JSON_PARAMETERS }
</action>

------------------------------------------
2) ASSISTANT RESPONSE
------------------------------------------
<action type="say">
Your natural-language reply to the user.
</action>

------------------------------------------
MULTI-STEP TOOL USE
------------------------------------------
You may repeat (think→plan→action) multiple times inside the same turn,
as long as the FINAL block is either a tool call or a say-action.

=============================
ADDITIONAL RULES
=============================

• ALWAYS include <think> AND <plan> before every <action>.
• NEVER output anything outside <think>, <plan>, and <action> blocks.
• NEVER invent tool names or parameters not in the tool list.
• If required parameters are missing, ask the user using <action type="say">.
• Maintain full history context including previous tool outputs.

=============================
AVAILABLE TOOLS
=============================
{tool_list_json}

You now begin when the user speaks.
"""

# Inject the tool JSON into the prompt.
# str.replace is used instead of str.format so that the literal braces in the
# template (e.g. "{ JSON_PARAMETERS }") are left untouched.
SYSTEM_PROMPT = SYSTEM_PROMPT_BASE.replace("{tool_list_json}", tool_list_json)


In [43]:
from datasets import load_dataset

DATA_PATH = "/content/final_dataset.jsonl"

# Load the JSONL file as a single "train" split.
dataset_dict = load_dataset("json", data_files={"train": DATA_PATH})
raw_ds = dataset_dict["train"]

# Expected columns: instruction, input, output
print(raw_ds[0].keys())


dict_keys(['instruction', 'input', 'output'])


In [44]:
def build_prompt(example):
    """
    Render the GRPO prompt for one dataset row.

    Layout: the RL system prompt inside <system> tags, the row's `input`
    under "### Dialogue", then an open "### Assistant Response" header that
    the model is expected to complete with its structured output.
    """
    header = f"<system>\n{SYSTEM_PROMPT}\n</system>\n\n"
    dialogue = f"### Dialogue\n{example['input']}\n\n"
    return header + dialogue + "### Assistant Response\n"

def _to_rl_record(e):
    """Map one raw row to the two columns used during GRPO training."""
    return {
        "prompt": build_prompt(e),
        "reference_output": e["output"],
    }

rl_dataset = raw_ds.map(_to_rl_record)

# Drop every column except the prompt and its gold reference.
_RL_COLUMNS = ["prompt", "reference_output"]
_extra_columns = [col for col in rl_dataset.column_names if col not in _RL_COLUMNS]
rl_dataset = rl_dataset.remove_columns(_extra_columns)

print(rl_dataset[0])




In [45]:
len(rl_dataset)

324

In [18]:
import json as pyjson

ACTION_RE = re.compile(
    r"<action\s+type=\"(?P<type>[a-zA-Z]+)\"(?:\s+name=\"(?P<name>[^\"]+)\")?>\s*(?P<body>.*?)</action>",
    re.DOTALL
)

def extract_final_action(text: str):
    """
    Parse the last <action>...</action> block found in `text`.

    Returns None when there is no such block, otherwise a dict with:
      - action_type: 'tool' when type="tool", 'say' for any other type
      - tool_name: the name attribute for tool actions, else None
      - params: the parsed JSON body for tool actions (None if it does not
        parse), else None
      - text: the stripped body for say actions, else None
    """
    last = None
    for last in ACTION_RE.finditer(text):
        pass  # keep only the final match
    if last is None:
        return None

    payload = last.group("body").strip()

    if last.group("type") != "tool":
        return {
            "action_type": "say",
            "tool_name": None,
            "params": None,
            "text": payload,
        }

    try:
        parsed = pyjson.loads(payload)
    except Exception:
        parsed = None
    return {
        "action_type": "tool",
        "tool_name": last.group("name"),
        "params": parsed,
        "text": None,
    }

def safe_string(x):
    """Return `x` unchanged if it is a str, "" if it is None, otherwise str(x)."""
    if isinstance(x, str):
        return x
    return "" if x is None else str(x)



In [12]:
def extract_gold_from_prompt(prompt: str):
    """Return the stripped text after the first "### Assistant Response" marker ("" if absent)."""
    _, marker, tail = prompt.partition("### Assistant Response")
    # partition() yields an empty separator when the marker is not present.
    return tail.strip() if marker else ""

def token_f1(a, b):
    """
    Case-insensitive F1 over the sets of whitespace-separated tokens.

    Overlap, precision and recall are all computed on the unique tokens of
    each string, so two identical strings always score 1.0 even when they
    repeat words. `a` supplies the precision denominator and `b` the recall
    denominator.

    Returns:
        A float in [0, 1]; 0.0 when the strings share no token.
    """
    a_set = set(a.lower().split())
    b_set = set(b.lower().split())
    overlap = len(a_set & b_set)
    if overlap == 0:
        return 0.0
    # Denominators must use the same unit (unique tokens) as the overlap.
    precision = overlap / len(a_set)
    recall = overlap / len(b_set)
    return 2 * precision * recall / (precision + recall)



In [25]:
def reward_action_type(prompts, completions, completion_ids, **kwargs):
    """
    Reward 1.0 when the completion's final action has the same type
    ('say' vs 'tool') as the reference's final action, else 0.0.

    Completions are plain strings; the gold references are read from
    kwargs["reference_output"]. A missing action on either side scores 0.0.
    """
    def _score(response, gold):
        g = extract_final_action(gold)
        p = extract_final_action(response)
        if g and p and g["action_type"] == p["action_type"]:
            return 1.0
        return 0.0

    references = kwargs["reference_output"]
    return [_score(completion, gold) for completion, gold in zip(completions, references)]

def reward_tool_exact(prompts, completions, completion_ids, **kwargs):
    """
    Reward 1.0 when the completion's final action is a tool call with exactly
    the same tool name and parameters as the reference's final action.

    Scores 0.0 when either final action is missing or not a tool call, when
    the tool names differ, or when the predicted JSON body could not be
    parsed. Gold references are read from kwargs["reference_output"].
    """
    gold_outputs = kwargs["reference_output"]
    scores = []

    for completion, gold in zip(completions, gold_outputs):
        g = extract_final_action(gold)
        p = extract_final_action(completion)

        both_tools = (
            bool(g) and g["action_type"] == "tool"
            and bool(p) and p["action_type"] == "tool"
        )
        if not both_tools or g["tool_name"] != p["tool_name"]:
            scores.append(0.0)
            continue

        # params is None when the JSON body failed to parse; never reward
        # that, even if the reference body is unparseable too (None == None).
        if p["params"] is None:
            scores.append(0.0)
            continue

        scores.append(1.0 if g["params"] == p["params"] else 0.0)

    return scores

def reward_final_answer_f1(prompts, completions, completion_ids, **kwargs):
    """
    Token-F1 reward between the completion's and the reference's final
    'say' message.

    Scores 0.0 unless both final actions exist and are of type 'say'.
    Gold references are read from kwargs["reference_output"].
    """
    def _score(response, gold):
        g = extract_final_action(gold)
        p = extract_final_action(response)
        gold_is_say = bool(g) and g["action_type"] == "say"
        pred_is_say = bool(p) and p["action_type"] == "say"
        if not (gold_is_say and pred_is_say):
            return 0.0
        return token_f1(p["text"], g["text"])

    references = kwargs["reference_output"]
    return [_score(completion, gold) for completion, gold in zip(completions, references)]


In [15]:
#############################
# FIX GRPO FP16/BF16 ERRORS
#############################

print("Casting model to fp32 for GRPO...")

# nn.Module.float() converts floating-point parameters and buffers only.
model.float()  # convert all params to fp32 at runtime
model.config.torch_dtype = torch.float32

# Cast Linear layers explicitly, but only tensors that are already floating
# point. Quantized layers may subclass nn.Linear and keep packed integer
# weights; calling .float() on those would turn the packed bytes into
# meaningless fp32 values.
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear):
        if module.weight.is_floating_point():
            module.weight.data = module.weight.data.float()
        if module.bias is not None and module.bias.is_floating_point():
            module.bias.data = module.bias.data.float()

print("✓ Model is now in fp32 compute mode")


Casting model to fp32 for GRPO...
✓ Model is now in fp32 compute mode


In [34]:
# GRPO hyper-parameters. Mixed precision is disabled (bf16 and fp16 are False).
training_args = GRPOConfig(
    output_dir="./grpo-tool-agent",
    learning_rate=1e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,   # effective batch = 4
    # NOTE(review): the prompt embeds the full tool list JSON; confirm it fits
    # in max_prompt_length, otherwise part of the prompt is truncated.
    max_prompt_length=2048,
    max_completion_length=512,
    generation_batch_size=8,
    num_generations=8,               # K = 8 completions sampled per prompt
    optim="adamw_torch",
    num_train_epochs=1,
    bf16=False,
    fp16=False,
    remove_unused_columns=False,     # reward functions read the extra `reference_output` column
    logging_steps=1,
    save_steps=100,
    report_to="none",
)

print(training_args)

GRPOConfig(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=True,
batch_eval_metrics=False,
beta=0.0,
bf16=False,
bf16_full_eval=False,
cache_implementation=None,
cast_lm_head_to_fp32=False,
chat_template_kwargs=None,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
delta=None,
disable_dropout=False,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
ds3_gather_for_generation=True,
epsilon=0.2,
epsilon_high

In [46]:
# Keep only the first 50 examples for a quick GRPO run
# (select raises if the dataset has fewer than 50 rows).
rl_dataset = rl_dataset.select(range(50))

In [47]:
rl_dataset

Dataset({
    features: ['prompt', 'reference_output'],
    num_rows: 50
})

In [48]:
print(len(rl_dataset))

50


In [None]:
# The three reward functions are combined by the trainer into one reward per
# completion. The tokenizer is passed explicitly so generation and scoring
# use the same tokenizer that was used to resize the model's embeddings,
# rather than one the trainer would load on its own from the model config.
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[
        reward_action_type,
        reward_tool_exact,
        reward_final_answer_f1,
    ],
    args=training_args,
    train_dataset=rl_dataset,
    processing_class=tokenizer,
)

print("✅ GRPOTrainer initialized. Starting training...")
trainer.train()
print("✅ GRPO training done. Saving model ...")
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)


### Debugging: inspect the mapped RL dataset

In [21]:
# Print every column of the first mapped example.
first_row = rl_dataset[0]
for k, v in first_row.items():
    print(k, "→", v)


prompt → <system>

You are a helpful multi-turn assistant that solves user tasks by thinking step-by-step and calling tools when needed.  
Every response must follow the exact structured format found in the training data.

REQUIRED OUTPUT FORMAT

<think>
Your internal reasoning.  
This MUST contain text (never empty).  
</think>

<plan>
A numbered list describing your intended next steps.  
This MUST contain at least one step.  
</plan>

Then output EXACTLY ONE of the following:

------------------------------------------
1) TOOL CALL
------------------------------------------
<action type="tool" name="TOOL_NAME">
  { JSON_PARAMETERS }
</action>

------------------------------------------
2) ASSISTANT RESPONSE
------------------------------------------
<action type="say">
Your natural-language reply to the user.
</action>

------------------------------------------
MULTI-STEP TOOL USE
------------------------------------------
You may repeat (think→plan→action) multiple times inside th

In [22]:
# Show the untouched source row next to its mapped RL counterpart to check
# that the prompt/reference mapping produced the expected columns.
print("--- RAW ENTRY ---")
print(raw_ds[0])

print("\n--- MAPPED ENTRY ---")
print(rl_dataset[0])


--- RAW ENTRY ---
{'instruction': 'You are a helpful multi-turn dialogue assistant capable of leveraging tool calls to solve user tasks and provide structured chat responses.\n\n**Steps for Each Turn**\n1. Think\n2. Decide on tool usage\n3. Respond\n\n**Output Format**\n<think> ... </think>\n<tool_call>\n{"name": "...", "parameters": {...}}\n</tool_call>\n<response> ... </response>\n\n**Important Notes**\n- Always include <think>.\n- Use <tool_call> OR <response> OR both.\n- Maintain full history context.', 'input': '**Dialogue Records History**\n<user> yo need a flight home asap from anchorage to denver, everything looks sold out for like a week  \n\ncan you find *anything* that gets me back sooner, don’t care about time, layovers, whatever, just cheap-ish and fast </user>', 'output': '<think>\n</think>\n<plan>\n1. Confirm key details: exact dates, passenger count, and flexibility range.\n2. Search for flights from Anchorage to Denver with maximum flexibility and broad filters.\n3. Re