In [207]:
import wandb
import pandas as pd
import os
from  tqdm import tqdm

In [208]:
api = wandb.Api(timeout=600)


In [209]:
# get all runs for a project
runs = api.runs("inchangbaek4907/pcgrl-llm")
len(runs)

143

In [210]:
# Specify cache directory
cache_dir = "./wandb_cache"
os.makedirs(cache_dir, exist_ok=True)

dfs = []
skipped_runs = []  # List to store IDs of skipped runs

for run in tqdm(runs):
    # Skip the run if it's not finished
    if run.state != "finished":
        print(f"Skipping run {run.id} (state: {run.state})")
        skipped_runs.append(run.id)
        continue

    # Define cache filename based on run ID
    cache_file = os.path.join(cache_dir, f"{run.id}.csv")
    
    # Check if cached file exists
    if os.path.exists(cache_file):
        # Load cached DataFrame
        df = pd.read_csv(cache_file)
    else:
        # Retrieve history data for Evaluation/similarity and diversity
        sim = run.history(keys=["Evaluation/similarity", "Evaluation/llm_iteration"])
        div = run.history(keys=["Evaluation/diversity", "Evaluation/llm_iteration"])
        llm_sim = run.history(keys=["Evaluation/llm/similarity", "Evaluation/llm_iteration"])
        llm_div = run.history(keys=["Evaluation/llm/diversity", "Evaluation/llm_iteration"])
        
        # Sequentially merge the DataFrames on "Evaluation/llm_iteration"
        df = pd.merge(sim, div, on="Evaluation/llm_iteration", how="outer")
        df = df.drop(columns=["_step_x", "_step_y"], errors="ignore")
        if 'Evaluation/llm_iteration' in llm_sim.columns:
            df = pd.merge(df, llm_sim, on="Evaluation/llm_iteration", how="outer")
            df = df.drop(columns=["_step_x", "_step_y"], errors="ignore")
        if 'Evaluation/llm_iteration' in llm_div.columns:
            df = pd.merge(df, llm_div, on="Evaluation/llm_iteration", how="outer")
            df = df.drop(columns=["_step_x", "_step_y"], errors="ignore")
            
        # Now, 'df' contains all the merged data based on "Evaluation/llm_iteration"

        df = df.drop(columns=["_step_x", "_step_y"], errors="ignore")

        # Add run config to DataFrame with prefix 'config.'
        for key, value in run.config.items():
            if isinstance(value, list):
                value = ",".join(map(str, value))  # Convert list to comma-separated string
            df[key] = value

        # Filter columns
        key_filter = ['target_character', 'pe', 'branch_factor', 'exp_name', 'evaluator', 'total_iterations', 
                      'reward_feature', 'fewshot', 'problem', 'seed', 'Evaluation/llm_iteration', 
                      'Evaluation/similarity', 'Evaluation/diversity']
        auxiliary_key_filter = ['Evaluation/llm/similarity', 'Evaluation/llm/diversity']
        
        try:
            df = df[key_filter + auxiliary_key_filter]
        except KeyError:
            df = df[key_filter]
        
        # Save DataFrame to cache as CSV
        df.to_csv(cache_file, index=False)
    
    # Append DataFrame to list
    dfs.append(df)

# Concatenate all DataFrames
df = pd.concat(dfs, ignore_index=True)
df.head()  # Display the first few rows for inspection

# Print summary of skipped runs
print("\nSummary of Skipped Runs:")
print(f"Total skipped runs: {len(skipped_runs)}")
print("Skipped run IDs:", skipped_runs)

 64%|██████▎   | 91/143 [01:08<01:06,  1.29s/it] 

Skipping run pe-got_it-6_fit-llm_exp-def_chr-D_s-7 (state: crashed)
Skipping run pe-got_it-6_fit-llm_exp-def_chr-M_s-7 (state: crashed)


 83%|████████▎ | 119/143 [01:41<00:26,  1.10s/it]

Skipping run pe-got_it-6_fit-vit_exp-def_fs_chr-D_s-1 (state: running)
Skipping run pe-got_it-6_fit-vit_exp-def_fs_chr-N_s-2 (state: running)
Skipping run pe-got_it-6_fit-vit_exp-def_fs_chr-M_s-2 (state: running)
Skipping run pe-got_it-6_fit-vit_exp-def_fs_chr-F_s-2 (state: running)
Skipping run pe-got_it-6_fit-vit_exp-def_fs_chr-C_s-2 (state: running)
Skipping run pe-got_it-6_fit-vit_exp-def_fs_chr-D_s-2 (state: running)
Skipping run pe-cot_it-6_fit-vit_exp-def_fs_chr-N_s-2 (state: running)
Skipping run pe-cot_it-6_fit-vit_exp-def_fs_chr-M_s-2 (state: running)
Skipping run pe-cot_it-6_fit-vit_exp-def_fs_chr-F_s-2 (state: running)
Skipping run pe-cot_it-6_fit-vit_exp-def_fs_chr-C_s-2 (state: running)


100%|██████████| 143/143 [01:42<00:00,  1.40it/s]

Skipping run pe-cot_it-6_fit-vit_exp-def_fs_chr-N_s-3 (state: running)
Skipping run pe-cot_it-6_fit-vit_exp-def_fs_chr-M_s-3 (state: running)
Skipping run pe-got_it-6_fit-vit_exp-def_fs_chr-N_s-3 (state: running)
Skipping run pe-got_it-6_fit-vit_exp-def_fs_chr-M_s-3 (state: running)
Skipping run pe-cot_it-6_fit-vit_exp-def_fs_chr-F_s-3 (state: running)
Skipping run pe-got_it-6_fit-vit_exp-def_fs_chr-F_s-3 (state: running)
Skipping run pe-cot_it-6_fit-vit_exp-def_fs_chr-C_s-3 (state: running)
Skipping run pe-got_it-6_fit-vit_exp-def_fs_chr-C_s-3 (state: running)
Skipping run pe-cot_it-6_fit-vit_exp-def_fs_chr-D_s-3 (state: running)
Skipping run pe-got_it-6_fit-vit_exp-def_fs_chr-D_s-3 (state: running)
Skipping run pe-tot_it-6_fit-vit_exp-def_fs_chr-N_s-1 (state: running)
Skipping run pe-tot_it-6_fit-vit_exp-def_fs_chr-F_s-1 (state: running)
Skipping run pe-tot_it-6_fit-vit_exp-def_fs_chr-M_s-1 (state: running)

Summary of Skipped Runs:
Total skipped runs: 25
Skipped run IDs: ['pe-got_it




In [None]:
# save to csv with time

In [277]:
time_str = pd.Timestamp.now().strftime("%Y-%m-%d-%H-%M-%S")

In [None]:
df.to_csv(f"wandb_output_{time_str}.csv", index=False)

In [None]:
# load the csv
# df = pd.read_csv("wandb_output_2024-11-05-16-46-41.csv")

In [None]:
pd = pd.read_csv("wandb_output_{time_str.csv")

In [None]:

df['seed'] = df['seed'] % 3
df

In [227]:
# remove pe == cotsc
df = df[df['pe'] != 'cotsc']

# if the exname is not def make the pe to 'pe+exp_name'
df['pe'] = df.apply(lambda x: x['pe'] + '-' +  x['exp_name'] if x['exp_name'] != 'def' else x['pe'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pe'] = df.apply(lambda x: x['pe'] + '-' +  x['exp_name'] if x['exp_name'] != 'def' else x['pe'], axis=1)


In [228]:
df.groupby(['pe', 'evaluator', 'fewshot', 'seed']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,target_character,branch_factor,exp_name,total_iterations,reward_feature,problem,Evaluation/llm_iteration,Evaluation/similarity,Evaluation/diversity,Evaluation/llm/similarity,Evaluation/llm/diversity
pe,evaluator,fewshot,seed,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cot,vit,False,0,30,30,30,30,30,30,30,30,30,0,0
cot,vit,False,1,30,30,30,30,30,30,30,30,30,0,0
cot,vit,False,2,30,30,30,30,30,30,30,30,30,0,0
cot,vit,True,1,30,30,30,30,30,30,30,30,30,0,0
cot,vit,True,2,6,6,6,6,6,6,6,6,6,0,0
got,llm,False,0,30,30,30,30,30,30,30,30,30,30,30
got,llm,False,1,18,18,18,18,18,18,18,18,18,18,18
got,llm,False,2,30,30,30,30,30,30,30,30,30,30,30
got,vit,False,0,30,30,30,30,30,30,30,30,30,0,0
got,vit,False,1,30,30,30,30,30,30,30,30,30,0,0


In [230]:
# Min-max Normalize Evaluation/similarity and Evaluation/diversity for each 'target_character' and add a new column 'Evaluation/similarity/norm' and 'Evaluation/diversity/norm'
df['Evaluation/similarity/norm'] = df.groupby('target_character')['Evaluation/similarity'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df['Evaluation/diversity/norm'] = df.groupby('target_character')['Evaluation/diversity'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Evaluation/similarity/norm'] = df.groupby('target_character')['Evaluation/similarity'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Evaluation/diversity/norm'] = df.groupby('target_character')['Evaluation/diversity'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))


Unnamed: 0,target_character,pe,branch_factor,exp_name,evaluator,total_iterations,reward_feature,fewshot,problem,seed,Evaluation/llm_iteration,Evaluation/similarity,Evaluation/diversity,Evaluation/llm/similarity,Evaluation/llm/diversity,Evaluation/similarity/norm,Evaluation/diversity/norm
0,N,io,2,def,vit,1,array,False,binary,1,1,0.018449,0.595029,,,0.011181,1.000000
1,M,io,2,def,vit,1,array,False,binary,1,1,0.045062,0.147950,,,0.044626,0.338434
2,F,io,2,def,vit,1,array,False,binary,1,1,0.799518,0.000000,,,0.950594,0.000000
3,D,io,2,def,vit,1,array,False,binary,1,1,0.032404,0.137656,,,0.031829,0.285646
4,C,io,2,def,vit,1,array,False,binary,1,1,0.765589,0.000347,,,0.947941,0.000740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
623,D,cot,2,def,vit,6,array,True,binary,2,2,0.098399,0.182682,,,0.115904,0.379077
624,D,cot,2,def,vit,6,array,True,binary,2,3,0.037271,0.187888,,,0.038029,0.389880
625,D,cot,2,def,vit,6,array,True,binary,2,4,0.049118,0.144898,,,0.053122,0.300673
626,D,cot,2,def,vit,6,array,True,binary,2,5,0.043938,0.161179,,,0.046523,0.334458


In [247]:
# 각 실험마다 마지막 Iteration의 Similarity와 Diversity를 가져옴, 또한 각 실험의 최대 Similarity와 최소 Diversity를 가져옴
# Update the function to include calculations based on normalized similarity and diversity
experiment_cols = ['pe', 'evaluator', 'fewshot', 'seed']  # Assuming these are the experiment identifiers as per user's requirement
experiment_cols_without_seed = ['pe', 'evaluator', 'fewshot']

In [None]:

def extract_metrics_with_norm(group):
    # Maximum similarity iteration row
    max_sim_row = group.loc[group['Evaluation/similarity'].idxmax()]
    max_sim_norm_row = group.loc[group['Evaluation/similarity/norm'].idxmax()]
    # Final iteration row
    final_iteration_row = group.loc[group['Evaluation/llm_iteration'].idxmax()]
    
    return pd.Series({
        'Max Similarity Iteration': max_sim_row['Evaluation/llm_iteration'],
        'Max Similarity': max_sim_row['Evaluation/similarity'],
        'Diversity at Max Similarity': max_sim_row['Evaluation/diversity'],
        
        'Max Normalized Similarity Iteration': max_sim_norm_row['Evaluation/llm_iteration'],
        'Max Normalized Similarity': max_sim_norm_row['Evaluation/similarity/norm'],
        'Diversity at Max Normalized Similarity': max_sim_norm_row['Evaluation/diversity/norm'],
        
        'Final Iteration': final_iteration_row['Evaluation/llm_iteration'],
        'Final Similarity': final_iteration_row['Evaluation/similarity'],
        'Final Diversity': final_iteration_row['Evaluation/diversity'],
        'Final Normalized Similarity': final_iteration_row['Evaluation/similarity/norm'],
        'Final Normalized Diversity': final_iteration_row['Evaluation/diversity/norm']
    })

# Apply the updated function to each experiment group
result = df.groupby(experiment_cols).apply(extract_metrics_with_norm).reset_index()

# Display the enhanced results to the user
result

In [270]:
plotter = {'Max Similarity': ['count', 'mean', 'std'], 'Diversity at Max Similarity': ['mean', 'std'], 'Final Similarity': ['mean', 'std'], 'Final Diversity': ['mean', 'std'], 'Max Normalized Similarity': ['mean', 'std'], 'Diversity at Max Normalized Similarity': ['mean', 'std'], 'Final Normalized Similarity': ['mean', 'std'], 'Final Normalized Diversity': ['mean', 'std']}

In [252]:
result.groupby(['pe', 'evaluator', 'fewshot']).agg(plotter)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Max Similarity,Max Similarity,Max Similarity,Diversity at Max Similarity,Diversity at Max Similarity,Final Similarity,Final Similarity,Final Diversity,Final Diversity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,mean,std,mean,std,mean,std
pe,evaluator,fewshot,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
cot,vit,False,3,0.812065,0.007922,0.000109,0.000109,0.289378,0.449083,0.151174,0.13117
cot,vit,True,2,0.429855,0.457044,0.0008,0.001131,0.062615,0.062311,0.181092,0.256103
got,llm,False,3,0.776426,0.061534,0.00446,0.007658,0.02842,0.008687,0.071873,0.109656
got,vit,False,3,0.773417,0.058699,0.001101,0.001907,0.163894,0.166634,0.079401,0.112436
got,vit,True,1,0.821257,,0.000215,,0.063482,,0.176587,
got-ech1,vit,False,3,0.787539,0.033311,0.000212,0.000364,0.028848,0.009827,0.15118,0.059444
io,vit,False,3,0.413541,0.376244,0.03352,0.05676,0.153185,0.208424,0.231863,0.31827
tot,llm,False,3,0.723668,0.131964,0.041872,0.072513,0.361961,0.351741,0.028496,0.049357
tot,vit,False,3,0.799876,0.010191,1.1e-05,1.8e-05,0.291032,0.388777,0.209669,0.190927


## Exp 1. Reasoning performance of prompt engineering

In [271]:
# Get the fewshot=False and evaluator=vit
exp1_df = result[(result['fewshot'] == False) & (result['evaluator'] == 'vit')]
exp1_df.groupby(experiment_cols_without_seed).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,seed,Max Similarity Iteration,Max Similarity,Diversity at Max Similarity,Max Normalized Similarity Iteration,Max Normalized Similarity,Diversity at Max Normalized Similarity,Final Iteration,Final Similarity,Final Diversity,Final Normalized Similarity,Final Normalized Diversity
pe,evaluator,fewshot,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
got,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
got-ech1,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
io,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
tot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3


In [272]:
exp1_df.groupby(experiment_cols_without_seed).agg(plotter)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Max Similarity,Max Similarity,Max Similarity,Diversity at Max Similarity,Diversity at Max Similarity,Final Similarity,Final Similarity,Final Diversity,Final Diversity,Max Normalized Similarity,Max Normalized Similarity,Diversity at Max Normalized Similarity,Diversity at Max Normalized Similarity,Final Normalized Similarity,Final Normalized Similarity,Final Normalized Diversity,Final Normalized Diversity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
pe,evaluator,fewshot,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
cot,vit,False,3,0.812065,0.007922,0.000109,0.000109,0.289378,0.449083,0.151174,0.13117,0.977408,0.021693,0.000295,0.000294,0.348972,0.559869,0.345809,0.30005
got,vit,False,3,0.773417,0.058699,0.001101,0.001907,0.163894,0.166634,0.079401,0.112436,0.93742,0.067197,0.002969,0.005143,0.188903,0.194463,0.208493,0.307147
got-ech1,vit,False,3,0.787539,0.033311,0.000212,0.000364,0.028848,0.009827,0.15118,0.059444,0.963297,0.063571,0.000572,0.000983,0.022552,0.017818,0.367761,0.187001
io,vit,False,3,0.413541,0.376244,0.03352,0.05676,0.153185,0.208424,0.231863,0.31827,0.494246,0.45146,0.076373,0.130097,0.181108,0.263215,0.409706,0.523341
tot,vit,False,3,0.799876,0.010191,1.1e-05,1.8e-05,0.291032,0.388777,0.209669,0.190927,0.993386,0.011346,1.8e-05,3.1e-05,0.340686,0.465956,0.479615,0.436744


## Exp. 2: LLM and ViT-based node evaluation 

In [259]:
exp2_df = result[(result['fewshot'] == False)]
exp2_df.groupby(experiment_cols_without_seed).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,seed,Max Similarity Iteration,Max Similarity,Diversity at Max Similarity,Max Normalized Similarity Iteration,Max Normalized Similarity,Diversity at Max Normalized Similarity,Final Iteration,Final Similarity,Final Diversity,Final Normalized Similarity,Final Normalized Diversity
pe,evaluator,fewshot,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
got,llm,False,3,3,3,3,3,3,3,3,3,3,3,3
got,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
got-ech1,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
io,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
tot,llm,False,3,3,3,3,3,3,3,3,3,3,3,3
tot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3


In [273]:
# get only cot and tot
exp2_df = exp2_df[exp2_df['pe'].isin(['tot', 'got'])]
exp2_df.groupby(experiment_cols_without_seed).agg(plotter)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Max Similarity,Max Similarity,Max Similarity,Diversity at Max Similarity,Diversity at Max Similarity,Final Similarity,Final Similarity,Final Diversity,Final Diversity,Max Normalized Similarity,Max Normalized Similarity,Diversity at Max Normalized Similarity,Diversity at Max Normalized Similarity,Final Normalized Similarity,Final Normalized Similarity,Final Normalized Diversity,Final Normalized Diversity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
pe,evaluator,fewshot,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
got,llm,False,3,0.776426,0.061534,0.00446,0.007658,0.02842,0.008687,0.071873,0.109656,0.945378,0.064425,0.000555,0.000848,0.021981,0.016972,0.190554,0.298241
got,vit,False,3,0.773417,0.058699,0.001101,0.001907,0.163894,0.166634,0.079401,0.112436,0.93742,0.067197,0.002969,0.005143,0.188903,0.194463,0.208493,0.307147
tot,llm,False,3,0.723668,0.131964,0.041872,0.072513,0.361961,0.351741,0.028496,0.049357,0.883238,0.181279,0.112928,0.195576,0.422695,0.422602,0.065185,0.112903
tot,vit,False,3,0.799876,0.010191,1.1e-05,1.8e-05,0.291032,0.388777,0.209669,0.190927,0.993386,0.011346,1.8e-05,3.1e-05,0.340686,0.465956,0.479615,0.436744


## Exp 3: Controlling the first iteration reward function

In [268]:
# Get the fewshot=True and evaluator=vit
exp3_df = result[(result['pe'] != 'io') & (result['evaluator'] == 'vit')]
exp3_df.groupby(experiment_cols_without_seed).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,seed,Max Similarity Iteration,Max Similarity,Diversity at Max Similarity,Max Normalized Similarity Iteration,Max Normalized Similarity,Diversity at Max Normalized Similarity,Final Iteration,Final Similarity,Final Diversity,Final Normalized Similarity,Final Normalized Diversity
pe,evaluator,fewshot,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
cot,vit,True,2,2,2,2,2,2,2,2,2,2,2,2
got,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
got,vit,True,1,1,1,1,1,1,1,1,1,1,1,1
got-ech1,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
tot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3


In [274]:
exp3_df.groupby(experiment_cols_without_seed).agg(plotter)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Max Similarity,Max Similarity,Max Similarity,Diversity at Max Similarity,Diversity at Max Similarity,Final Similarity,Final Similarity,Final Diversity,Final Diversity,Max Normalized Similarity,Max Normalized Similarity,Diversity at Max Normalized Similarity,Diversity at Max Normalized Similarity,Final Normalized Similarity,Final Normalized Similarity,Final Normalized Diversity,Final Normalized Diversity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
pe,evaluator,fewshot,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
cot,vit,False,3,0.812065,0.007922,0.000109,0.000109,0.289378,0.449083,0.151174,0.13117,0.977408,0.021693,0.0002947187,0.000294,0.348972,0.559869,0.345809,0.30005
cot,vit,True,2,0.429855,0.457044,0.0008,0.001131,0.062615,0.062311,0.181092,0.256103,0.510316,0.542872,0.002157177,0.003051,0.06888,0.081413,0.304342,0.430404
got,vit,False,3,0.773417,0.058699,0.001101,0.001907,0.163894,0.166634,0.079401,0.112436,0.93742,0.067197,0.002969049,0.005143,0.188903,0.194463,0.208493,0.307147
got,vit,True,1,0.821257,,0.000215,,0.063482,,0.176587,,1.0,,6.411457e-07,,0.067325,,0.29677,
got-ech1,vit,False,3,0.787539,0.033311,0.000212,0.000364,0.028848,0.009827,0.15118,0.059444,0.963297,0.063571,0.0005723859,0.000983,0.022552,0.017818,0.367761,0.187001
tot,vit,False,3,0.799876,0.010191,1.1e-05,1.8e-05,0.291032,0.388777,0.209669,0.190927,0.993386,0.011346,1.767303e-05,3.1e-05,0.340686,0.465956,0.479615,0.436744


## Appendix: Additional Analysis