# Notebook: Parsing

This notebook parses the experiment logs and saves the extracted results as data files (via `save_dataframe`) for later analysis.

In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd

from utils import get_logs, save_dataframe, get_calls

# Index into the `models` list; sections 1 and 2 select their model with models[i]
i = 1
# Forwarded to the log loaders (get_logs / get_calls) and, when True, each
# section also displays a two-row preview of the DataFrame it loads back
verbose = False

## 1. Reasoning models
Parsing the results for the reasoning model experiments.

### 1.0 Namespace + Loading the logs

In [2]:
# Reasoning models the experiments were run on
models = [
    "Qwen3-235B-A22B-Thinking-2507",
    "gpt-oss-120b",
    "DeepSeek-R1",
    "Llama-4-Maverick-17B-128E-Instruct-FP8",
]

# Experiment setups:
#   - simple:  Standard evaluation without any special prompting strategies
#   - repeats: Same as simple, but the experiment is repeated multiple times
experiments = ["simple", "repeats"]

# Benchmarks the experiments were run on
benchmarks = [
    "game24",
    "hle",
    "hotpotqa",
    "humaneval",
    "matharena",
    "scibench",
    "sonnetwriting",
]

# Selection for this run
model = models[i]            # `i` is the model index set in the first code cell
experiment = experiments[1]  # index 1 -> "repeats"

# Fetch the logs matching the selection above
logs = get_logs(
    logs_path="../logs",
    experiment=experiment,
    model=model,
    benchmarks=benchmarks,
    verbose=verbose,
)

### 1.1 Parsing the logs to load the important information we'd like to save

In [3]:
# Each entry in data corresponds to a single experiment
data = []

for log in logs:
    # log: dict    --> The log for a single experiment
    # record: dict --> The extracted/processed fields of that log
    record = {}

    try:
        # Metadata
        record.update(log["General information"])
        # record.update(log["Method Configuration"])  # currently disabled
        record.update(log["LLM Information"])
        record["log"] = log["Log path"]

        # Per-tab API usage: looked up once and reused for every column below
        tabs = log["API Detailed Information (per tab)"].values()

        # Cost per sample
        record["costs"] = [tab["Cost (total)"]["total"] for tab in tabs]
        record["costs_in"] = [tab["Cost (total)"]["in"] for tab in tabs]
        record["costs_out"] = [tab["Cost (total)"]["out"] for tab in tabs]

        # Tokens per sample
        record["tokens_in"] = [tab["Tokens (total)"]["in"] for tab in tabs]
        record["tokens_out"] = [tab["Tokens (total)"]["out"] for tab in tabs]

        # Quality per sample
        record["scores"] = log["Quality"]["Correct"]
    except KeyError as e:
        # Best-effort parsing: an incomplete log is dropped rather than aborting
        # the whole run. Name the log so the skipped file can be traced.
        log_path = log.get("Log path", "<unknown path>")
        print(f"Skipping log {log_path!r} due to missing key: {e}")
    else:
        # Only fully parsed logs become rows
        data.append(record)

### 1.2 Loading the data in a DataFrame

In [4]:
# Load the data into DataFrame. Each row corresponds to a single experiment
df = pd.DataFrame(data)

# Save the DataFrame for future reference
# (file name/format are decided inside utils.save_dataframe — not visible here)
save_dataframe(
    df=df,
    data_path="../data/models",
    experiment=experiment,
    model=model
)

# Loading the data back (example)
# NOTE: this rebinds `df` to the contents of the example file, so the frame
# built from `data` above is no longer reachable through `df` after this line.
df = pd.read_parquet("../data/examples/models.parquet")
if verbose:
    # Preview only the first two rows to keep the cell output small
    display(df.head(2))

## 2. Reasoning strategies
Parsing the results for the reasoning strategy experiments.

### 2.0 Namespace + Loading the logs

In [3]:
# Models the reasoning-strategy experiments were run on
models = [
    "gpt-4.1-nano",
    "gpt-4.1-mini",
    "gpt-5-nano",
    "llama-4-scout-17b-16e-instruct",
    "qwen3-32b",
]

# Experiment setups:
#   - simple:  Standard evaluation without any special prompting strategies
#   - repeats: Same as simple, but the experiment is repeated multiple times
experiments = ["simple", "repeats"]

# Benchmarks the experiments were run on
benchmarks = [
    "game24",
    "hle",
    "hotpotqa",
    "humaneval",
    "matharena",
    "scibench",
    "sonnetwriting",
]

# Reasoning strategies that were employed
# NOTE(review): this list is not passed to the get_logs call below — confirm
# whether the loader is expected to filter by strategy.
strategies = [
    "io",
    "cot",
    "cot_sc",
    "foa",
    "tot_bfs",
    "tot_dfs",
    "got",
    "react",
    "reflection",
    "rap",
    "mcts",
]

# Selection for this run
model = models[i]            # `i` is the model index set in the first code cell
experiment = experiments[1]  # index 1 -> "repeats"

# Fetch the logs matching the selection above
logs = get_logs(
    logs_path="../logs",
    experiment=experiment,
    model=model,
    benchmarks=benchmarks,
    verbose=verbose,
)

### 2.1 Parsing the logs to load the important information we'd like to save

In [4]:
# Each entry in data corresponds to a single experiment
data = []

for log in logs:
    # log: dict    --> The log for a single experiment
    # record: dict --> The extracted/processed fields of that log
    record = {}

    try:
        # Metadata
        record.update(log["General information"])
        # record.update(log["Method Configuration"])  # currently disabled
        record.update(log["LLM Information"])
        record["log"] = log["Log path"]

        # Per-tab API usage: looked up once and reused for every column below
        tabs = log["API Detailed Information (per tab)"].values()

        # Cost per sample
        record["costs"] = [tab["Cost (total)"]["total"] for tab in tabs]
        record["costs_in"] = [tab["Cost (total)"]["in"] for tab in tabs]
        record["costs_out"] = [tab["Cost (total)"]["out"] for tab in tabs]

        # Tokens per sample
        record["tokens_in"] = [tab["Tokens (total)"]["in"] for tab in tabs]
        record["tokens_out"] = [tab["Tokens (total)"]["out"] for tab in tabs]

        # Quality per sample
        record["scores"] = log["Quality"]["Correct"]
    except KeyError as e:
        # Best-effort parsing: an incomplete log is dropped rather than aborting
        # the whole run. Name the log so the skipped file can be traced.
        log_path = log.get("Log path", "<unknown path>")
        print(f"Skipping log {log_path!r} due to missing key: {e}")
    else:
        # Only fully parsed logs become rows
        data.append(record)

### 2.2 Loading the data in a DataFrame

In [5]:
# Load the data into DataFrame. Each row corresponds to a single experiment
df = pd.DataFrame(data)

# Save the DataFrame for future reference
# (file name/format are decided inside utils.save_dataframe — not visible here)
save_dataframe(
    df=df,
    data_path="../data/strategies",
    experiment=experiment,
    model=model
)

# Loading the data back (example)
# NOTE: this rebinds `df` to the contents of the example file, so the frame
# built from `data` above is no longer reachable through `df` after this line.
# NOTE(review): this section saves under ../data/strategies but reads back the
# *models* example file — possibly a copy-paste leftover; confirm the intended path.
df = pd.read_parquet("../data/examples/models.parquet")
if verbose:
    # Preview only the first two rows to keep the cell output small
    display(df.head(2))

## 3. Traces
Parsing the actual reasoning traces. These cells are mostly used for the reasoning models, but they can also parse the corresponding logs of the reasoning-strategy experiments.

### 3.0 Namespace + Loading the logs

In [None]:
# Reasoning models the experiments were run on
models = [
    "Qwen3-235B-A22B-Thinking-2507",
    "gpt-oss-120b",
    "DeepSeek-R1",
    "Llama-4-Maverick-17B-128E-Instruct-FP8",
]

# Experiment setups:
#   - simple:  Standard evaluation without any special prompting strategies
#   - repeats: Same as simple, but the experiment is repeated multiple times
experiments = ["simple", "repeats"]

# Benchmarks the experiments were run on
benchmarks = [
    "game24",
    "hle",
    "hotpotqa",
    "humaneval",
    "matharena",
    "scibench",
    "sonnetwriting",
]

# Selection for this run
model = models[0]            # fixed index 0 here (not the shared index `i`)
experiment = experiments[1]  # index 1 -> "repeats"

# Fetch the recorded calls matching the selection above
calls = get_calls(
    logs_path="../logs",
    experiment=experiment,
    model=model,
    benchmarks=benchmarks,
    verbose=verbose,
)

### 3.1 Parsing the logs + metadata

In [9]:
# Flatten the call logs: each entry in data corresponds to a single response sample.
#   - log: dict      --> The log for a single experiment (keys are: benchmark, calls, path)
#   - call: dict     --> The call information
#   - response: str  --> One of the response samples of the call
data = [
    {
        "benchmark": log["benchmark"],
        "user": call["user_message"],
        "response": response,
        "path": log["path"],
    }
    for log in calls
    for call in log["calls"]
    for response in call["responses"]
]

### 3.2 Loading the data in a DataFrame

In [10]:
# Load the data into DataFrame. Each row corresponds to a single response sample
df = pd.DataFrame(data)

# Save the DataFrame for future reference
# (file name/format are decided inside utils.save_dataframe — not visible here)
save_dataframe(
    df=df,
    data_path="../data/calls",
    experiment=experiment,
    model=model
)

# Loading the data back (example)
# NOTE: this rebinds `df` to the contents of the example file, so the frame
# built from `data` above is no longer reachable through `df` after this line.
df = pd.read_parquet("../data/examples/calls.parquet")
if verbose:
    # Preview only the first two rows to keep the cell output small
    display(df.head(2))