In [13]:
import script_utils as utils

# Result sets to compare. Commented-out names are runs that are currently
# excluded from the comparison.
MODELS = [
    # "claude-haiku-4.5",
    # "skill-claude-haiku-4.5",
    # "custom-gpt-5-nano",
    # "skill-gpt-5-nano",
    "custom-3",
    "skill-agent-2-no-learning",
]

# Dataset split handed to utils.get_tasks.
TEST_SET = "test"

# model name -> tasks loaded for that model.
# NOTE(review): with remove_corrupted=True the loader logs
# "Removed corrupted task dir: ..." lines, so it appears to delete result
# directories on disk -- confirm this side effect is intended before re-running.
tasks_set = {
    name: utils.get_tasks(name, remove_corrupted=True, dataset=TEST_SET)
    for name in MODELS
}



Removed corrupted task dir: D:\Projects\OSWorld-MA\results\pyautogui\screenshot\skill-agent-2-no-learning\libreoffice_impress\a53f80cd-4a90-4490-8310-097b011433f6
Removed corrupted task dir: D:\Projects\OSWorld-MA\results\pyautogui\screenshot\skill-agent-2-no-learning\libreoffice_writer\e528b65e-1107-4b8c-8988-490e4fece599
Removed corrupted task dir: D:\Projects\OSWorld-MA\results\pyautogui\screenshot\skill-agent-2-no-learning\multi_apps\09a37c51-e625-49f4-a514-20a773797a8a
Removed corrupted task dir: D:\Projects\OSWorld-MA\results\pyautogui\screenshot\skill-agent-2-no-learning\multi_apps\42d25c08-fb87-4927-8b65-93631280a26f
Removed corrupted task dir: D:\Projects\OSWorld-MA\results\pyautogui\screenshot\skill-agent-2-no-learning\thunderbird\9b7bc335-06b5-4cd3-9119-1a649c478509
Removed corrupted task dir: D:\Projects\OSWorld-MA\results\pyautogui\screenshot\skill-agent-2-no-learning\vlc\9195653c-f4aa-453d-aa95-787f6ccfaae9
Removed corrupted task dir: D:\Projects\OSWorld-MA\results\pyauto



In [14]:

# Find overlapping tasks across all models.
# A task is identified by its (domain, task_id) pair.
task_keys = {
    model: {(t["domain"], t["task_id"]) for t in tasks}
    for model, tasks in tasks_set.items()
}

# Intersection: only tasks present in every model's results.
# set.intersection() called with no sets at all raises TypeError, so an empty
# model list is treated as "no overlapping tasks" instead of crashing.
overlapping_keys = set.intersection(*task_keys.values()) if task_keys else set()

print("Tasks per model:")
for model, keys in task_keys.items():
    print(f"  {model}: {len(keys)}")
print(f"\nOverlapping tasks (present in all models): {len(overlapping_keys)}")

# Filter each model's task list to overlapping tasks only, so every model is
# scored on exactly the same set of tasks.
filtered_tasks = {
    model: [t for t in tasks if (t["domain"], t["task_id"]) in overlapping_keys]
    for model, tasks in tasks_set.items()
}


Tasks per model:
  custom-3: 167
  skill-agent-2-no-learning: 168

Overlapping tasks (present in all models): 162


In [15]:
import pandas as pd
import matplotlib.pyplot as plt

# Maps a task domain (lower-case) to the reporting category it is counted
# under. Written grouped by category so each bucket's members are visible at a
# glance; the resulting dict is keyed by domain.
CATEGORY_MAPPING = {
    domain: category
    for category, domains in (
        ('OS', ('os',)),
        ('Office', ('libreoffice_calc', 'libreoffice_writer', 'libreoffice_impress')),
        ('Daily', ('chrome', 'vlc', 'thunderbird')),
        ('Professional', ('vs_code', 'gimp')),
        ('Workflow', ('multi_apps',)),
    )
    for domain in domains
}

def build_category_df(tasks, model_name, category_mapping=None):
    """Summarise one model's task results per category.

    Args:
        tasks: Sequence of task records; each record must provide "domain"
            and "success". "success" is coerced with ``astype(bool)``.
        model_name: Label used for the success-rate column, ``"<model_name> (%)"``.
        category_mapping: Optional ``{lower-case domain: category}`` dict.
            Defaults to the notebook-level ``CATEGORY_MAPPING``.

    Returns:
        DataFrame indexed by "Category" (categories in sorted order followed by
        a final "TOTAL" row) with columns "Total", "Successful" and
        ``"<model_name> (%)"`` (success rate in percent, rounded to 2 decimals).
        For an empty ``tasks`` only the "TOTAL" row is returned, with zero
        counts and a NaN rate.
    """
    mapping = CATEGORY_MAPPING if category_mapping is None else category_mapping
    rate_col = f"{model_name} (%)"

    df = pd.DataFrame(tasks)
    if df.empty:
        # No records means no "success"/"domain" columns to aggregate; return a
        # well-formed table instead of failing with a KeyError.
        return pd.DataFrame(
            {"Total": [0], "Successful": [0], rate_col: [float("nan")]},
            index=pd.Index(["TOTAL"], name="Category"),
        )

    df["success_bool"] = df["success"].astype(bool)
    # Domains missing from the mapping go into an explicit "Other" bucket.
    # Leaving them as NaN would make groupby drop them silently, so the
    # per-category rows would no longer add up to the TOTAL row.
    df["category"] = df["domain"].str.lower().map(
        lambda domain: mapping.get(domain, "Other")
    )

    cat = (
        df.groupby("category")["success_bool"]
        .agg(total="count", successful="sum", success_rate="mean")
        .reset_index()
    )

    totals = pd.DataFrame([{
        "category": "TOTAL",
        "total": len(df),
        "successful": int(df["success_bool"].sum()),
        "success_rate": df["success_bool"].mean(),
    }])
    cat = pd.concat([cat, totals], ignore_index=True)
    cat["success_rate"] = (cat["success_rate"] * 100).round(2)

    # Rename by column name rather than by position so the labels cannot drift
    # if the aggregation above is ever reordered.
    cat = cat.rename(columns={
        "category": "Category",
        "total": "Total",
        "successful": "Successful",
        "success_rate": rate_col,
    })
    return cat.set_index("Category")


# Build per-model tables and merge on Category
dfs = [build_category_df(filtered_tasks[m], m) for m in MODELS]

# Start with first model's Total, then join success-rate columns
combined = dfs[0][["Total", f"{MODELS[0]} (%)"]].copy()
for df in dfs[1:]:
    model_col = [c for c in df.columns if c.endswith("(%)")][0]
    combined = combined.join(df[[model_col]], how="outer")

# An outer join sorts the index lexicographically, which places the TOTAL row
# between "Professional" and "Workflow". Move it back to the bottom so the
# summary row reads as a footer.
category_rows = [label for label in combined.index if label != "TOTAL"]
total_rows = [label for label in combined.index if label == "TOTAL"]
combined = combined.loc[category_rows + total_rows]

display(combined)


Unnamed: 0_level_0,Total,custom-3 (%),skill-agent-2-no-learning (%)
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Daily,32,90.62,90.62
OS,11,81.82,72.73
Office,56,75.0,66.07
Professional,22,72.73,81.82
TOTAL,162,80.25,75.93
Workflow,41,82.93,75.61


In [16]:
import json

# Pick the two runs to compare: the first MODELS entry whose name contains
# "skill", and the first one whose name does not.
skill_model = next((m for m in MODELS if "skill" in m), None)
base_model = next((m for m in MODELS if "skill" not in m), None)

if not (skill_model and base_model):
    # Say which side of the comparison is actually missing; the message is
    # unchanged for the "no skill model" case.
    missing_kind = "with" if not skill_model else "without"
    print(f"No model {missing_kind} 'skill' in name found — skipping analysis.")
else:
    skill_lookup = {(t["domain"], t["task_id"]): t for t in filtered_tasks[skill_model]}
    base_lookup  = {(t["domain"], t["task_id"]): t for t in filtered_tasks[base_model]}

    wins   = []  # skill ✓, base ✗
    losses = []  # base ✓, skill ✗

    # Only tasks where the two runs disagree are collected; ties are ignored.
    for key in overlapping_keys:
        s = skill_lookup[key]["success"]
        b = base_lookup[key]["success"]
        if s and not b:
            wins.append(key)
        elif b and not s:
            losses.append(key)

    def used_read_skills(task):
        """Return True if any trajectory step's response contains a read_skills tool call."""
        # NOTE(review): the recorded output reports 0 read_skills calls for
        # every win and loss -- confirm this marker matches the format actually
        # written to the trajectory responses.
        return any(
            # `or ""` also covers steps whose "response" is present but None.
            "Tool Call: `read_skills`" in (step.get("response") or "")
            for step in task["trajectories"]
        )

    wins_used   = [(k, used_read_skills(skill_lookup[k])) for k in wins]
    losses_used = [(k, used_read_skills(skill_lookup[k])) for k in losses]

    print(f"Skill model : {skill_model}")
    print(f"Base model  : {base_model}")
    print(f"\n{'─'*50}")

    # Same three-line report for both outcome groups.
    for header, outcome_keys, usage in (
        ("Wins   (skill ✓, base ✗)", wins, wins_used),
        ("\nLosses (base ✓, skill ✗)", losses, losses_used),
    ):
        n_total = len(outcome_keys)
        n_called = sum(flag for _, flag in usage)
        print(f"{header} : {n_total}")
        print(f"  read_skills called     : {n_called} / {n_total}")
        print(f"  read_skills NOT called : {n_total - n_called} / {n_total}")



Skill model : skill-agent-2-no-learning
Base model  : custom-3

──────────────────────────────────────────────────
Wins   (skill ✓, base ✗) : 11
  read_skills called     : 0 / 11
  read_skills NOT called : 11 / 11

Losses (base ✓, skill ✗) : 18
  read_skills called     : 0 / 18
  read_skills NOT called : 18 / 18


In [17]:
def build_metrics_df(tasks, model_name, category_mapping=None):
    """Compute per-category success rate and average steps of successful runs.

    Args:
        tasks: Sequence of task records; each record must provide "domain",
            "success" and "steps". "success" is coerced with ``astype(bool)``.
        model_name: Label used as the second level of the column MultiIndex.
        category_mapping: Optional ``{lower-case domain: category}`` dict.
            Defaults to the notebook-level ``CATEGORY_MAPPING``.

    Returns:
        DataFrame indexed by "Category" (categories in sorted order followed by
        a final "TOTAL" row) with MultiIndex columns
        ``("Performance (%)", model_name)`` and
        ``("Avg Steps (successful)", model_name)``, both rounded to 2 decimals.
        The average is NaN for a group without any successful task. For an
        empty ``tasks`` only a "TOTAL" row of NaNs is returned.
    """
    mapping = CATEGORY_MAPPING if category_mapping is None else category_mapping

    # MultiIndex columns: (metric, model_name)
    columns = pd.MultiIndex.from_tuples([
        ("Performance (%)", model_name),
        ("Avg Steps (successful)", model_name),
    ])

    df = pd.DataFrame(tasks)
    if df.empty:
        # No records means no columns to aggregate; return a well-formed table
        # instead of failing with a KeyError.
        return pd.DataFrame(
            [[float("nan"), float("nan")]],
            index=pd.Index(["TOTAL"], name="Category"),
            columns=columns,
        )

    df["success_bool"] = df["success"].astype(bool)
    # Domains missing from the mapping go into an explicit "Other" bucket;
    # as NaN they would be dropped silently by groupby.
    df["category"] = df["domain"].str.lower().map(
        lambda domain: mapping.get(domain, "Other")
    )
    # Mask the step count of failed runs to NaN so that a plain "mean" (which
    # skips NaN) averages over successful runs only.
    df["steps_if_success"] = df["steps"].where(df["success_bool"])

    per_category = df.groupby("category").agg(
        success_rate=("success_bool", "mean"),
        avg_steps_successful=("steps_if_success", "mean"),
    )
    overall = pd.DataFrame(
        {
            "success_rate": [df["success_bool"].mean()],
            "avg_steps_successful": [df["steps_if_success"].mean()],
        },
        index=["TOTAL"],
    )

    cat = pd.concat([per_category, overall])
    cat["success_rate"] = cat["success_rate"] * 100
    cat = cat.round(2)
    cat.index.name = "Category"
    cat.columns = columns
    return cat


# One two-column metrics table per model, in MODELS order.
metric_dfs = [build_metrics_df(filtered_tasks[name], name) for name in MODELS]

# Concatenating side by side yields model-major columns; reindex to
# metric-major order so all models sit next to each other under each metric.
metrics = ["Performance (%)", "Avg Steps (successful)"]
metric_major_columns = pd.MultiIndex.from_product([metrics, MODELS])
multi = pd.concat(metric_dfs, axis=1).reindex(columns=metric_major_columns)

display(multi)


Unnamed: 0_level_0,Performance (%),Performance (%),Avg Steps (successful),Avg Steps (successful)
Unnamed: 0_level_1,custom-3,skill-agent-2-no-learning,custom-3,skill-agent-2-no-learning
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Daily,90.62,90.62,18.86,14.52
OS,81.82,72.73,5.44,4.25
Office,75.0,66.07,11.86,15.03
Professional,72.73,81.82,11.06,11.28
Workflow,82.93,75.61,11.09,14.0
TOTAL,80.25,75.93,12.68,13.4
