In [None]:
import os
import re
import copy
import json
import torch
from math import ceil
import pandas as pd
from tqdm.notebook import tqdm
from typing import List, Optional
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from financial_tools_complex import run_tool
from typing import Any, Dict, Iterable, List, Optional, Tuple

In [None]:
system_prompt = \
"""You are a financial planning assistant with tools for portfolio allocation, mortgage affordability, tax optimization, retirement readiness, debt payoff strategies, insurance needs, education funding, and currency exchange. Analyze user requests and call the appropriate tool with all required parameters extracted from their query. Return concise answers with key metrics. Do not ask for clarification - use reasonable defaults if needed.

You are provided with function signatures within <tools> </tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: 

<tools> 
[
{"name": "calculate_portfolio_allocation", "description": "Calculate optimal portfolio allocation based on risk tolerance, age, time horizon, and liquidity needs.", "parameters": {"total_investment": {"type": "float"}, "risk_tolerance": {"type": "str", "enum": ["conservative", "moderate_conservative", "moderate", "moderate_aggressive", "aggressive"]}, "time_horizon_years": {"type": "int", "range": "1-50"}, "current_age": {"type": "int", "range": "18-100"}, "retirement_age": {"type": "int"}, "income_need_percentage": {"type": "float", "range": "0.0-0.10"}, "existing_allocations": {"type": "Dict[str, float]"}, "esg_preference": {"type": "bool"}, "tax_loss_harvesting": {"type": "bool"}, "rebalancing_frequency": {"type": "str", "enum": ["monthly", "quarterly", "semi_annual", "annual", "threshold_based"]}, "inflation_protection": {"type": "bool"}, "liquidity_requirement": {"type": "str", "enum": ["immediate", "short_term", "medium_term", "long_term"]}}}, 
{"name": "calculate_mortgage_affordability", "description": "Calculate max home price and monthly payment based on income, debts, credit score, and loan type.", "parameters": {"annual_income": {"type": "float"}, "monthly_debts": {"type": "float"}, "down_payment": {"type": "float"}, "credit_score": {"type": "int", "range": "300-850"}, "loan_type": {"type": "str", "enum": ["conventional", "fha", "va", "jumbo"]}, "property_state": {"type": "str"}, "property_tax_rate": {"type": "float", "range": "0.003-0.025"}, "hoa_fees": {"type": "float"}, "homeowners_insurance": {"type": "float"}, "pmi_required": {"type": "bool"}, "interest_rate_override": {"type": "Optional[float]"}, "loan_term_years": {"type": "int", "enum": [15, 20, 30]}}}, 
{"name": "optimize_tax_strategy", "description": "Calculate tax liability with optimization suggestions based on income, deductions, and investments.", "parameters": {"gross_income": {"type": "float"}, "filing_status": {"type": "str", "enum": ["single", "married", "head"]}, "state": {"type": "str"}, "retirement_contributions": {"type": "Dict[str, float]"}, "capital_gains_short": {"type": "float"}, "capital_gains_long": {"type": "float"}, "dividend_income_qualified": {"type": "float"}, "dividend_income_ordinary": {"type": "float"}, "itemized_deductions": {"type": "float"}, "dependents": {"type": "int", "range": "0-10"}, "self_employment_income": {"type": "float"}, "rental_income": {"type": "float"}}}, {"name": "calculate_retirement_readiness", "description": "Assess retirement readiness by projecting savings vs income needs.", "parameters": {"current_age": {"type": "int", "range": "18-80"}, "retirement_age": {"type": "int", "max": 75}, "current_savings": {"type": "float"}, "annual_contribution": {"type": "float"}, "employer_match_percent": {"type": "float", "range": "0.0-1.0"}, "expected_return": {"type": "float", "range": "0.03-0.12"}, "inflation_rate": {"type": "float", "range": "0.01-0.05"}, "desired_retirement_income": {"type": "float"}, "social_security_estimate": {"type": "float"}, "pension_income": {"type": "float"}, "healthcare_cost_annual": {"type": "float"}, "life_expectancy": {"type": "int"}}}, 
{"name": "analyze_debt_payoff_strategy", "description": "Analyze optimal debt payoff using avalanche, snowball, or hybrid strategies.", "parameters": {"debts": {"type": "List[Dict]", "schema": {"balance": "float", "rate": "float", "minimum": "float", "type": "str"}}, "monthly_payment_budget": {"type": "float"}, "strategy": {"type": "str", "enum": ["avalanche", "snowball", "hybrid"]}, "extra_payment_allocation": {"type": "str", "enum": ["single_focus", "proportional", "high_interest_only"]}, "interest_rate_threshold": {"type": "float", "range": "0.05-0.25"}, "consolidation_available": {"type": "bool"}, "consolidation_rate": {"type": "float", "range": "0.04-0.15"}, "balance_transfer_fee": {"type": "float", "range": "0.0-0.05"}, "credit_score_impact_weight": {"type": "float", "range": "0.0-1.0"}}}, 
{"name": "calculate_insurance_needs", "description": "Calculate life, disability, and LTC insurance needs based on income and dependents.", "parameters": {"age": {"type": "int", "range": "18-80"}, "annual_income": {"type": "float"}, "dependents": {"type": "int", "range": "0-10"}, "mortgage_balance": {"type": "float"}, "other_debts": {"type": "float"}, "existing_coverage": {"type": "Dict[str, float]", "keys": ["life", "disability", "ltc"]}, "health_status": {"type": "str", "enum": ["excellent", "good", "fair", "poor"]}, "occupation_risk": {"type": "str", "enum": ["low", "medium", "high"]}, "years_until_retirement": {"type": "int", "range": "1-40"}, "spouse_income": {"type": "float"}, "college_funding_need": {"type": "float"}, "final_expenses": {"type": "float"}}}, 
{"name": "calculate_education_funding", "description": "Calculate college funding needs and savings gap.", "parameters": {"child_current_age": {"type": "int", "range": "0-17"}, "college_start_age": {"type": "int"}, "years_of_college": {"type": "int", "range": "2-6"}, "current_annual_cost": {"type": "float"}, "education_inflation_rate": {"type": "float", "range": "0.03-0.08"}, "current_savings": {"type": "float"}, "monthly_contribution": {"type": "float"}, "expected_return": {"type": "float", "range": "0.04-0.10"}, "financial_aid_expected": {"type": "float"}, "student_contribution_percent": {"type": "float", "range": "0.0-0.50"}, "state_residency": {"type": "str"}, "school_type": {"type": "str", "enum": ["public_instate", "public_outstate", "private", "community"]}}}, 
{"name": "calculate_currency_exchange_arbitrage", "description": "Calculate optimal currency exchange with fees, hedging, and arbitrage options.", "parameters": {"base_currency": {"type": "str"}, "target_currency": {"type": "str"}, "amount": {"type": "float"}, "exchange_method": {"type": "str", "enum": ["bank", "forex_broker", "crypto_bridge", "wire"]}, "transfer_fee_percent": {"type": "float", "range": "0.0-0.05"}, "transfer_fee_fixed": {"type": "float"}, "intermediate_currency": {"type": "Optional[str]"}, "spot_rate_override": {"type": "Optional[float]"}, "forward_contract_months": {"type": "int", "range": "0-24"}, "hedging_strategy": {"type": "str", "enum": ["none", "forward", "option", "collar"]}, "tax_reporting_required": {"type": "bool"}}}
] 
</tools> 

Use the following pydantic model JSON schema for each tool call you will make: {"properties": {"arguments": {"title": "Arguments", "type": "object"}, "name": {"title": "Name", "type": "string"}}, "required": ["arguments", "name"], "title": "FunctionCall", "type": "object"} For each function call return a json object with function name and arguments within <tool_call> </tool_call> XML tags as follows:
<tool_call>
{"name": <function-name>, "arguments": <args-dict>}
</tool_call>

You are a financial planning assistant. Analyze user requests and call the appropriate tool with all required parameters. Return concise answers. Use reasonable defaults if needed.
"""

In [None]:
# Load the validation set from a local JSON-lines file.
# NOTE(review): split='train' is presumably used because the "json" builder places
# files passed via `data_files` (without an explicit split mapping) under "train" —
# confirm against the installed `datasets` version.
dataset = load_dataset('json', data_files="financial_val_v4.jsonl", split='train')

In [None]:
# Turn every dataset row into an evaluation sample: a two-turn chat (shared system
# prompt + the row's user turn) together with the ground-truth answer and tool call.
# NOTE(review): assumes row["prompt"][1] is the user message — verify against the data.
valid_samples = [
    {
        "messages": [
            {"content": system_prompt, "role": "system"},
            {"content": record["prompt"][1]["content"], "role": "user"},
        ],
        "gt_answer": record["answer"],
        "gt_tool": record["ground_truth"],
    }
    for record in tqdm(dataset, total=len(dataset))
]

In [None]:
# Pattern for a <tool_call>...</tool_call> block. Compiled once at module level so
# it is not rebuilt on every call; DOTALL lets the captured payload span newlines.
TOOL_CALL_RE = re.compile(r"<tool_call>\s*(.*?)\s*</tool_call>", re.DOTALL)

# Token id treated as the end-of-thinking marker (</think>) when splitting generated ids.
# NOTE(review): hard-coded for the Qwen tokenizer — confirm it still matches the
# tokenizer's id for "</think>" if the model family changes.
THINK_END_TOKEN_ID = 151668

# Default number of prompts generated together in one batch.
BATCH_SIZE = 50


def _safe_get_prompt(messages: List[Dict[str, Any]]) -> str:
    """Best-effort: return user prompt content if present."""
    if len(messages) > 1 and isinstance(messages[1], dict):
        return str(messages[1].get("content", ""))
    return ""


def _split_thinking_by_token_id(
    output_ids: List[int],
    tokenizer,
    think_end_token_id: int = THINK_END_TOKEN_ID,
) -> Tuple[str, str]:
    """
    Decode generated ids into a (thinking, content) pair.

    The split point is just after the LAST occurrence of `think_end_token_id`;
    everything up to and including that token is decoded as thinking, the rest
    as content. When the token never occurs, thinking is "" and content is the
    whole decoded output. Both strings are stripped of surrounding whitespace.
    """
    # Scan from the end so the last marker wins; 0 means "no marker found".
    split_at = 0
    for pos in range(len(output_ids) - 1, -1, -1):
        if output_ids[pos] == think_end_token_id:
            split_at = pos + 1
            break

    reasoning_text = tokenizer.decode(output_ids[:split_at], skip_special_tokens=True)
    answer_text = tokenizer.decode(output_ids[split_at:], skip_special_tokens=True)
    return reasoning_text.strip(), answer_text.strip()


def _extract_tool_call(content: str) -> Optional[Dict[str, Any]]:
    """
    Return the first <tool_call>...</tool_call> payload decoded from JSON.

    None is returned when there is no such block, when the payload is not valid
    JSON, or when the decoded value is not a dict. A None/empty `content` is
    treated as an empty string.
    """
    found = TOOL_CALL_RE.search(content or "")
    if found is None:
        return None

    try:
        decoded = json.loads(found.group(1))
    except Exception:
        return None

    if isinstance(decoded, dict):
        return decoded
    return None


@torch.inference_mode()
def inference_batched(
    inst_model,
    inst_tokenizer,
    valid_messages_samples,
    *,
    batch_size: int = BATCH_SIZE,
    max_new_tokens: int = 1024,
    temperature=0.01,
    top_p=0.99,
    think_end_token_id: int = THINK_END_TOKEN_ID,
):
    """
    Generate a response for every sample, in batches, and run the predicted tool.

    Args:
        inst_model: Causal LM exposing `.generate()` and `.device`.
        inst_tokenizer: Tokenizer matching the model. The prompt is stripped from
            each output by slicing off the padded input length, which is only
            correct when the tokenizer pads on the LEFT.
        valid_messages_samples: Sequence of dicts with a "messages" chat list and
            optional "gt_answer" / "gt_tool" entries.
        batch_size: Number of prompts generated together; must be >= 1.
        max_new_tokens, temperature, top_p: Forwarded to `generate()`.
        think_end_token_id: Token id marking the end of the thinking section.

    Returns:
        A list with one dict per sample (input order preserved) with keys
        "prompt", "gt_answer", "gt_tool", "predict_tool" (raw decoded content),
        "predict_tool_parsed" (dict or None), "reasoning" and "predicted_answer"
        (None when no tool call was parsed or the tool raised).

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        # Fail loudly: 0 used to surface as ZeroDivisionError and negative values
        # silently produced an empty result list.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    responses = []
    total = len(valid_messages_samples)

    for start in tqdm(range(0, total, batch_size)):
        batch = valid_messages_samples[start:start + batch_size]

        # ---- Build prompts
        prompts = [
            inst_tokenizer.apply_chat_template(
                row["messages"],
                tokenize=False,
                add_generation_prompt=True,
                enable_thinking=True,
            )
            for row in batch
        ]

        model_inputs = inst_tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
        ).to(inst_model.device)

        # ---- One generate() call for the whole batch (up to `batch_size` prompts)
        generated_ids = inst_model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p
        )

        # Every row shares the same padded prompt length, so compute it once.
        prompt_len = model_inputs.input_ids.shape[1]

        for i, row in enumerate(batch):
            output_ids = generated_ids[i, prompt_len:].tolist()

            thinking_content, content = _split_thinking_by_token_id(
                output_ids, inst_tokenizer, think_end_token_id
            )

            parsed_tool_call = _extract_tool_call(content)

            predicted_answer = None
            if parsed_tool_call is not None:
                try:
                    predicted_answer = run_tool(parsed_tool_call)
                except Exception:
                    # Deliberate best-effort: a tool call the tool cannot execute
                    # is recorded as "no answer" instead of aborting the run.
                    predicted_answer = None

            responses.append({
                "prompt": _safe_get_prompt(row["messages"]),
                "gt_answer": row.get("gt_answer"),
                "gt_tool": row.get("gt_tool"),
                "predict_tool": content,
                "predict_tool_parsed": parsed_tool_call,
                "reasoning": thinking_content,
                "predicted_answer": predicted_answer,
            })

    return responses
    

def response_validity(row):
    """
    Binary answer validity scorer.

    Rules (checked in this order):
    - If ground-truth answer starts with 'Error:' → return None (row ignored).
    - If predicted answer is missing (None or NaN) → return 0.
    - If predicted answer starts with 'Error:' → return 0.
    - Otherwise → return 1.

    A missing prediction is recorded when no tool call could be parsed or the
    tool raised; it must count as a failure. Stringifying it (to "None"/"nan")
    would otherwise make it pass the 'Error:' prefix check and score 1.

    Returns:
        1     → usable answer
        0     → model failure
        None  → invalid ground-truth (excluded from evaluation)
    """

    gt = str(row["gt_answer"])
    pred = row["predicted_answer"]

    if gt.startswith("Error:"):
        return None

    # `pred != pred` is True only for NaN, which pandas may substitute for None.
    if pred is None or (isinstance(pred, float) and pred != pred):
        return 0

    if str(pred).startswith("Error:"):
        return 0

    return 1


def exact_answer_match(row):
    """
    Whitespace-insensitive exact comparison of predicted vs ground-truth answer.

    Both answers are converted with str(); leading/trailing whitespace is dropped
    and every internal whitespace run is collapsed to a single space before the
    comparison.

    Returns:
        None → ground-truth starts with 'Error:' (row excluded)
        1    → normalized strings are identical
        0    → normalized strings differ
    """
    expected = str(row["gt_answer"])
    predicted = str(row["predicted_answer"])

    if expected.startswith("Error:"):
        return None

    # str.split() with no argument already ignores leading/trailing whitespace.
    expected_norm = " ".join(expected.split())
    predicted_norm = " ".join(predicted.split())

    return int(expected_norm == predicted_norm)


def normalized_tool_call_score(gt: dict, pred) -> float:
    """
    Score a predicted tool call against the ground truth. Final score ∈ [0,1].

    Weighting:
      - 0.3 → correct tool name
      - 0.7 → fraction of GT arguments matched (recursive for nested dicts)

    Leaf comparison rules (driven by the GT value's type):
      - dict  → recurse; a non-dict prediction scores 0
      - None  → prediction must be None
      - bool  → prediction must be a bool with the same value (checked BEFORE the
                numeric rule because bool is a subclass of int; otherwise "1" or
                1 would be accepted for True)
      - int/float → equal after rounding both sides to 2 decimals; numeric
                strings are coerced, boolean predictions are rejected
      - anything else (strings, lists, ...) → strict equality

    Notes:
      - Only keys present in GT are scored; extra predicted keys are ignored.
      - If pred is None / NaN / not a dict → tool_score=0 and arg_score=0
        (unless GT has no args, in which case the argument component is 1).

    Args:
        gt: Ground-truth call with "name" and "arguments" entries.
        pred: Parsed predicted call (dict) or any non-dict value when parsing failed.

    Returns:
        tool_score + 0.7 * argument_score as a float.
    """

    # Coerce bad preds to empty dict
    if not isinstance(pred, dict):
        pred = {}

    def value_match(gt_val, pred_val):
        # Dict → recurse
        if isinstance(gt_val, dict):
            if not isinstance(pred_val, dict):
                return 0
            return dict_score(gt_val, pred_val)

        # None leaf
        if gt_val is None:
            return 1 if pred_val is None else 0

        # Bool leaf: must precede the numeric branch (isinstance(True, int) is True)
        if isinstance(gt_val, bool):
            return 1 if isinstance(pred_val, bool) and pred_val == gt_val else 0

        # Numeric leaf (round to 2 decimals)
        if isinstance(gt_val, (int, float)):
            # A boolean is not an acceptable stand-in for a number.
            if pred_val is None or isinstance(pred_val, bool):
                return 0
            try:
                return 1 if round(float(gt_val), 2) == round(float(pred_val), 2) else 0
            except (TypeError, ValueError, OverflowError):
                # Prediction is not convertible to a float (e.g. text, list).
                return 0

        # Fallback strict equality (strings, lists, etc.)
        return 1 if gt_val == pred_val else 0

    def dict_score(gt_dict, pred_dict):
        total = len(gt_dict)
        if total == 0:
            return 1.0

        correct = 0
        for k, gt_val in gt_dict.items():
            if k not in pred_dict:
                continue
            correct += value_match(gt_val, pred_dict[k])

        return correct / total

    # Tool name component; a missing predicted name never earns the credit,
    # even when the GT name is also missing.
    pred_name = pred.get("name")
    tool_score = 0.3 if pred_name is not None and gt.get("name") == pred_name else 0.0

    gt_args = gt.get("arguments", {}) or {}
    pred_args = pred.get("arguments")
    if not isinstance(pred_args, dict):
        pred_args = {}

    # Argument correctness (recursive)
    arg_component = dict_score(gt_args, pred_args) if gt_args else 1.0

    return tool_score + 0.7 * arg_component


def tool_call_schema_match(row) -> float:
    """
    Score the row's parsed prediction against its ground-truth tool call.

    The ground truth is rebuilt from row["gt_tool"] keeping only top-level
    arguments whose value is not None (0 and False are kept), then handed to
    `normalized_tool_call_score` together with row["predict_tool_parsed"].
    """
    gt_tool = row["gt_tool"] or {}
    raw_args = gt_tool.get("arguments") or {}

    # Drop only None-valued arguments; falsy values such as 0 / False stay.
    kept_args = {}
    for arg_name, arg_value in raw_args.items():
        if arg_value is not None:
            kept_args[arg_name] = arg_value

    gt = {"name": gt_tool.get("name"), "arguments": kept_args}

    # Rows may be mapping-like (with .get) or only support item access.
    if hasattr(row, "get"):
        pred = row.get("predict_tool_parsed", None)
    else:
        pred = row["predict_tool_parsed"]

    return normalized_tool_call_score(gt=gt, pred=pred)


def compute_individual_metrics(df):
    """
    Add one score column per metric to `df` (in place) and return their means.

    Each scorer is applied row-wise; the returned dict maps the metric name to
    the column mean rounded to 3 decimals.
    """
    scorers = (
        ("response_validity", response_validity),
        ("exact_answer_match", exact_answer_match),
        ("tool_call_schema_match", tool_call_schema_match),
    )

    for column, scorer in scorers:
        df[column] = df.apply(scorer, axis=1)

    return {column: df[column].mean().round(3) for column, _ in scorers}

### Qwen 0.6B Fine-tuned

In [None]:
tuned_checkpoint_path_qwen06 = "/path/to/checkpoint/0.6B/"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(tuned_checkpoint_path_qwen06, padding_side="left")

In [None]:
# tuned
model_trained_qwen06 = AutoModelForCausalLM.from_pretrained(
    tuned_checkpoint_path_qwen06,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map="auto",
    attn_implementation="kernels-community/vllm-flash-attn3",
)

In [None]:
trained_responses_qwen06 = inference_batched(
    inst_model=model_trained_qwen06, 
    inst_tokenizer=tokenizer, 
    valid_messages_samples=valid_samples
)

In [None]:
df_trained_qwen06 = pd.DataFrame(trained_responses_qwen06)
trained_metrics_qwen06 = compute_individual_metrics(df_trained_qwen06)

In [None]:
trained_metrics_qwen06

In [None]:
df_trained_qwen06

In [None]:
del model_trained_qwen06
torch.cuda.empty_cache()

### Base Model for Qwen 0.6B

In [None]:
# base
model_base_qwen06 = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-0.6B",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map="auto",
    attn_implementation="kernels-community/vllm-flash-attn3",
)

In [None]:
base_responses_qwen06 = inference_batched(
    inst_model=model_base_qwen06, 
    inst_tokenizer=tokenizer, 
    valid_messages_samples=valid_samples
)

In [None]:
df_base_qwen06 = pd.DataFrame(base_responses_qwen06)
base_metrics_qwen06 = compute_individual_metrics(df_base_qwen06)

In [None]:
base_metrics_qwen06

In [None]:
del model_base_qwen06
torch.cuda.empty_cache()

### Qwen 1.7B Fine-tuned

In [None]:
tuned_checkpoint_path_qwen17 = "/path/to/checkpoint/1.7B/"

In [None]:
tokenizer_17B = AutoTokenizer.from_pretrained(tuned_checkpoint_path_qwen17, padding_side="left")

In [None]:
# tuned
model_trained_qwen17 = AutoModelForCausalLM.from_pretrained(
    tuned_checkpoint_path_qwen17,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map="auto",
    attn_implementation="kernels-community/vllm-flash-attn3",
)

In [None]:
trained_responses_qwen17 = inference_batched(
    inst_model=model_trained_qwen17, 
    inst_tokenizer=tokenizer_17B, 
    valid_messages_samples=valid_samples
)

In [None]:
df_trained_qwen17 = pd.DataFrame(trained_responses_qwen17)
trained_metrics_qwen17 = compute_individual_metrics(df_trained_qwen17)

In [None]:
trained_metrics_qwen17

In [None]:
del model_trained_qwen17
torch.cuda.empty_cache()

### Base Model for Qwen 1.7B

In [None]:
# base
model_base_qwen17 = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-1.7B",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map="auto",
    attn_implementation="kernels-community/vllm-flash-attn3",
)

In [None]:
base_responses_qwen17 = inference_batched(
    inst_model=model_base_qwen17, 
    inst_tokenizer=tokenizer_17B, 
    valid_messages_samples=valid_samples
)

In [None]:
df_base_qwen17 = pd.DataFrame(base_responses_qwen17)
base_metrics_qwen17 = compute_individual_metrics(df_base_qwen17)

In [None]:
base_metrics_qwen17

In [None]:
del model_base_qwen17
torch.cuda.empty_cache()

### Evaluate Foundation Models

In [None]:
import os
import time
from litellm import completion

In [None]:
# AWS credentials for the Bedrock calls below. Fill the values in locally and never
# commit real secrets. Empty placeholders are skipped so that credentials already
# present in the process environment are not overwritten with "".
_AWS_ENV_OVERRIDES = {
    "AWS_ACCESS_KEY_ID": "",
    "AWS_SECRET_ACCESS_KEY": "",
    "AWS_REGION_NAME": "",
    "AWS_SESSION_TOKEN": "",
}

for _env_name, _env_value in _AWS_ENV_OVERRIDES.items():
    if _env_value:
        os.environ[_env_name] = _env_value

In [None]:
results_for_models = {
    "Qwen/Qwen3-0.6B (rl-tuned)": trained_metrics_qwen06,
    "Qwen/Qwen3-0.6B (base)": base_metrics_qwen06,
    "Qwen/Qwen3-1.7B (rl-tuned)": trained_metrics_qwen17,
    "Qwen/Qwen3-1.7B (base)": base_metrics_qwen17,
}

In [None]:
results_for_models

In [None]:
# Bedrock model ids (as passed to litellm) mapped to the display names used as keys
# in `results_for_models` and `dicts_of_dfs`.
model_br_mappings = {
    "bedrock/openai.gpt-oss-20b-1:0": "openai/gpt-oss-20b",
    "bedrock/openai.gpt-oss-120b-1:0": "openai/gpt-oss-120b",
    "bedrock/us.amazon.nova-lite-v1:0": "amazon/nova-lite",
    "bedrock/us.amazon.nova-pro-v1:0": "amazon/nova-pro",
    "bedrock/us.anthropic.claude-sonnet-4-5-20250929-v1:0": "anthropic/claude-sonnet-4.5"
}

# `reasoning_effort` per model. None means the request is sent without that
# argument and with an explicit low temperature instead.
model_reasoning_mappings = {
    "bedrock/openai.gpt-oss-20b-1:0": "low",
    "bedrock/openai.gpt-oss-120b-1:0": "low",
    "bedrock/us.amazon.nova-lite-v1:0": None,
    "bedrock/us.amazon.nova-pro-v1:0": None,
    "bedrock/us.anthropic.claude-sonnet-4-5-20250929-v1:0": "low"
}

# Scored per-sample responses for each evaluated model, keyed by display name.
dicts_of_dfs = {}

for model_key, display_name in model_br_mappings.items():
    responses_from_prop_llm = []
    failed_requests = 0
    last_error = None
    _samples = valid_samples

    # Request arguments shared by every sample of this model.
    reasoning_effort = model_reasoning_mappings[model_key]
    kwargs = {"model": model_key}
    if reasoning_effort:
        kwargs["reasoning_effort"] = reasoning_effort
    else:
        kwargs["temperature"] = 0.01

    # Pause between successful requests; longer for the Claude endpoint.
    pause_seconds = 1 if 'claude' in model_key else 0.2

    for row in tqdm(_samples, total=len(_samples)):
        messages = row["messages"]

        try:
            response = completion(messages=messages, **kwargs)
        except Exception as e:
            # A failed request is still excluded from scoring, but it is counted
            # and reported below so a shrinking denominator is visible.
            failed_requests += 1
            last_error = e
            continue

        message = response.choices[0].message
        # Guard against a missing (None) content so parsing cannot raise.
        content = message.content or ""
        # getattr: do not assume the attribute exists on every response message.
        thinking_content = (
            getattr(message, "reasoning_content", None) if reasoning_effort else None
        )

        # Same parsing/execution rules as `inference_batched`, so the metrics are
        # comparable: a parsed call is kept even when executing it fails.
        tool_call = _extract_tool_call(content)
        answer = None
        if tool_call is not None:
            try:
                answer = run_tool(tool_call)
            except Exception:
                # Best-effort: an unexecutable call is recorded as "no answer".
                answer = None

        responses_from_prop_llm.append({
            "prompt": messages[1]["content"],
            "gt_answer": row["gt_answer"],
            "gt_tool": row["gt_tool"],
            "predict_tool": content,
            "predict_tool_parsed": tool_call,
            "reasoning": thinking_content,
            "predicted_answer": answer
        })

        time.sleep(pause_seconds)

    if failed_requests:
        print(
            f"{model_key}: {failed_requests}/{len(_samples)} requests failed and were "
            f"excluded from scoring (last error: {last_error!r})"
        )

    if not responses_from_prop_llm:
        # Nothing to score (e.g. every request failed); skip instead of crashing
        # inside the metric computation on an empty frame.
        print(f"No responses collected for {model_key}; skipping metrics.")
        continue

    df_frontier = pd.DataFrame(responses_from_prop_llm)
    frontier_metrics = compute_individual_metrics(df_frontier)

    print(f"Accuracy for {model_key} model: {frontier_metrics}")
    results_for_models[display_name] = frontier_metrics
    dicts_of_dfs[display_name] = df_frontier

In [None]:
results_for_models

### Visualization of Results

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def plot_three_benchmark_charts(results_for_models: dict):
    """
    Draw one bar chart per metric, comparing all models.

    Charts (in order):
      1. Response Validity        (`response_validity`)
      2. Exact Answer Match       (`exact_answer_match`)
      3. Tool Call Schema Match   (`tool_call_schema_match`)

    Each chart:
      - Is sorted by metric value (descending)
      - Uses the same color for a given model across all charts
      - Is saved to '<metric_key>.png' in the current working directory and
        then shown

    Args:
        results_for_models: Mapping of model display name → dict holding the
            three metric keys listed above.
    """

    # ---- Build DataFrame ----
    df = pd.DataFrame.from_dict(results_for_models, orient="index").reset_index()
    df = df.rename(columns={"index": "Model"})

    # ---- Static color map (consistent across all charts) ----
    palette = sns.color_palette("muted", len(df))
    color_map = dict(zip(df["Model"], palette))

    metrics = [
        ("response_validity", "Response Validity"),
        ("exact_answer_match", "Exact Answer Match"),
        ("tool_call_schema_match", "Tool Call Schema Match"),
    ]

    sns.set_theme(style="whitegrid", font_scale=1.05)

    for metric_key, metric_title in metrics:

        plot_df = df.sort_values(metric_key, ascending=False)

        fig, ax = plt.subplots(figsize=(12, 5))
        # Passing `palette` without `hue` is deprecated in recent seaborn, so the
        # model is also mapped to hue; dodge=False keeps one full-width bar per
        # model on versions that would otherwise offset bars by hue level.
        sns.barplot(
            data=plot_df,
            x="Model",
            y=metric_key,
            hue="Model",
            palette=color_map,
            dodge=False,
            ax=ax,
        )
        # The hue legend would only repeat the x tick labels.
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()

        ax.set_ylim(0, 1.05)
        ax.set_ylabel("Score")
        ax.set_xlabel("")
        ax.set_title(metric_title, fontsize=14, pad=10)
        plt.setp(ax.get_xticklabels(), rotation=35, ha="right")

        # ---- Value labels ----
        for p in ax.patches:
            h = p.get_height()
            # Skip placeholder patches (zero width or NaN height) that some
            # seaborn versions add when hue is used.
            if p.get_width() == 0 or h != h:
                continue
            ax.text(
                p.get_x() + p.get_width() / 2,
                h + 0.01,
                f"{h:.2f}",
                ha="center",
                va="bottom",
                fontsize=9
            )

        fig.tight_layout()
        fig.savefig(f'{metric_key}.png')
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)


In [None]:
plot_three_benchmark_charts(results_for_models)

In [None]:
results_for_models