### Basic Multi-Turn Analysis

Example notebook showing how to build your own analysis of multi-turn runs.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, sys
import json

import caesar.analysis.analysis_utils as analysis_utils

from caesar.utils import check_result_exists

In [7]:
# Top-level repo directory, taken as two levels above the current working
# directory (assumes the notebook is run from a folder two levels below the repo root).
PATH_TO_REPO_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
from monkeys.problems.kernelbench_gen_level_1 import (
    DATASET as KERNELBENCH_LEVEL_1_DATASET,
    SUBSET_DATASET as KERNELBENCH_LEVEL_1_SUBSET_DATASET,
)
from monkeys.problems.kernelbench_gen_level_2 import (
    DATASET as KERNELBENCH_LEVEL_2_DATASET,
    SUBSET_DATASET as KERNELBENCH_LEVEL_2_SUBSET_DATASET
) 
from monkeys.problems.kernelbench_gen_level_3 import (
    DATASET as KERNELBENCH_LEVEL_3_DATASET,
    SUBSET_DATASET as KERNELBENCH_LEVEL_3_SUBSET_DATASET    
) 

from monkeys.problems.problem_utils import KernelBenchDataset

# One row per KernelBench level: (dataset name, full dataset, subset dataset).
# Both lookup tables below are derived from this table so their keys always agree.
_KERNELBENCH_LEVELS = (
    ("KernelBench/level1", KERNELBENCH_LEVEL_1_DATASET, KERNELBENCH_LEVEL_1_SUBSET_DATASET),
    ("KernelBench/level2", KERNELBENCH_LEVEL_2_DATASET, KERNELBENCH_LEVEL_2_SUBSET_DATASET),
    ("KernelBench/level3", KERNELBENCH_LEVEL_3_DATASET, KERNELBENCH_LEVEL_3_SUBSET_DATASET),
)

# dataset name -> full dataset
dataset_name_to_dataset = {name: full for name, full, _ in _KERNELBENCH_LEVELS}

# dataset name -> subset dataset
dataset_name_to_subset_dataset = {name: subset for name, _, subset in _KERNELBENCH_LEVELS}


# Base directory under which the multi-turn run logs are looked up
# (absolute path; point it at wherever your own runs are logged).
MULTI_TURN_BASE_LOG_DIR = "/matx/u/simonguo/kernel_multi_turn"


In [12]:
# Which run group, dataset and level to analyze
ex_run_group = "trial_level1_reflection_all_prev_deepseek"
dataset_name = "KernelBench/level1"
level = 1
use_subset = False

# With greedy, we use a single sample per problem: num_sample = 1.
# NOTE: get_run_data reads num_sample as a notebook-level global.
num_sample = 1


In [None]:
# Construct the dataset object for the dataset name / level chosen above
dataset = KernelBenchDataset(
    dataset_name=dataset_name,    
    level=level, 
    use_subset=use_subset,  # notebook-level flag set above (NOTE(review): the old comment mentioned a "checkbox value"; no checkbox is visible here)
    dataset=dataset_name_to_dataset[dataset_name], 
    subset_dataset=dataset_name_to_subset_dataset[dataset_name]
    )

# To inspect the problem ids: dataset.get_problem_ids()


In [21]:
# Name of the run inside ex_run_group to analyze (passed to get_run_data as run_name)
ex_run_name = "trial_run_turns_3"

In [None]:
# List all the runs available under the run group's log directory
runs = analysis_utils.get_available_runs(os.path.join(MULTI_TURN_BASE_LOG_DIR, ex_run_group))

# Bare expression so the notebook displays the result
runs

In [47]:
def get_run_data(run_group, run_name, dataset: KernelBenchDataset):
    """Collect per-problem, per-sample results of one multi-turn run.

    For every problem id returned by ``dataset.get_problem_ids()`` and every
    sample index below the notebook-level ``num_sample``, this reads
    ``config.json`` and ``log.json`` from
    ``MULTI_TURN_BASE_LOG_DIR/<run_group>/<run_name>/problem_<id>/sample_<id>/``.
    Samples for which ``check_result_exists`` reports no result are announced
    with a printed message and skipped.

    Args:
        run_group: Name of the run group directory.
        run_name: Name of the run directory inside the run group.
        dataset: Dataset object providing the problem ids to iterate over.

    Returns:
        A list with one dict per sample found, holding ``problem_id``,
        ``sample_id``, ``num_rounds``, ``final_result`` and the three per-turn
        trajectories (compile, correct, runtime). The list can be handed
        directly to ``pd.DataFrame``.
    """
    records = []

    for problem_id in dataset.get_problem_ids():  # logical ID
        for sample_id in range(num_sample):
            # Directory holding both log.json and config.json for this sample.
            sample_dir = os.path.join(
                MULTI_TURN_BASE_LOG_DIR,
                run_group,
                run_name,
                f"problem_{problem_id}",
                f"sample_{sample_id}",
            )

            has_result = check_result_exists(
                log_dir_prefix=MULTI_TURN_BASE_LOG_DIR,
                run_group=run_group,
                run_name=run_name,
                problem_id=problem_id,
                sample_id=sample_id,
            )
            if not has_result:
                print(f"Result not found for {run_name} {problem_id} {sample_id}")
                continue

            # The config is loaded but not used yet: hook for config-dependent analysis.
            config_data = analysis_utils.load_run_data(os.path.join(sample_dir, "config.json"))

            # Log data for this particular problem and sample.
            log_data = analysis_utils.load_run_data(os.path.join(sample_dir, "log.json"))
            num_rounds = log_data["metadata"]["num_rounds"]

            # Trajectories of compilation, correctness and runtime over the turns.
            # (This is where you could pick the best turn instead of the last one.)
            trajectories = analysis_utils.get_turn_trajectory_overviews(log_data)
            compile_trajectory, correct_trajectory, runtime_trajectory = trajectories

            # The top-level result is the final one, i.e. that of the last turn.
            final_result = log_data["result"]["eval_result"]

            # One flat record per sample, so the list converts cleanly to a DataFrame.
            records.append(
                dict(
                    problem_id=problem_id,
                    sample_id=sample_id,
                    num_rounds=num_rounds,
                    final_result=final_result,
                    turn_compile_trajectory=compile_trajectory,
                    turn_correct_trajectory=correct_trajectory,
                    turn_runtime_trajectory=runtime_trajectory,
                )
            )

    return records



In [49]:
# Gather the per-problem, per-sample records for the example run
run_data = get_run_data(run_group=ex_run_group, run_name=ex_run_name, dataset=dataset)


In [None]:
# Convert the run data list (one dict per record) to a pandas DataFrame
run_data_df = pd.DataFrame(run_data)

# Display the DataFrame
run_data_df

# This is particular to this config: num_rounds and the final result



In [53]:
# I have a helper function for you to load the baseline;
# you can compute the score here