In [1]:
import wandb
import pandas as pd
import os
from tqdm import tqdm

from table_plotter import print_result_table

In [2]:
# Client for the W&B public API, used below to list runs and download their histories.
# timeout=600 is forwarded to wandb.Api (presumably seconds, i.e. 10 minutes — confirm in the wandb docs).
api = wandb.Api(timeout=600)


In [3]:
# Fetch the runs of the W&B project "inchangbaek4907/pcgrl-llm".
runs = api.runs("inchangbaek4907/pcgrl-llm")
# Cell output: how many runs the project returned.
len(runs)

195

In [4]:
# Directory holding one cached CSV per run, so re-running this cell does not
# download again the histories that were already fetched from W&B.
cache_dir = "./wandb_cache"
os.makedirs(cache_dir, exist_ok=True)

# History key used to align the separately downloaded metric histories.
iteration_key = "Evaluation/llm_iteration"

# Columns kept for every run: selected config values plus the core metrics.
key_filter = ['target_character', 'pe', 'branch_factor', 'exp_name', 'evaluator', 'total_iterations',
              'reward_feature', 'fewshot', 'problem', 'seed', 'Evaluation/llm_iteration',
              'Evaluation/similarity', 'Evaluation/diversity']
# Metrics that a run may or may not have logged; each one is kept when present.
auxiliary_key_filter = ['Evaluation/llm/similarity', 'Evaluation/llm/diversity']
kept_columns = set(key_filter + auxiliary_key_filter)

dfs = []
skipped_runs = []  # IDs of the runs that were not in the "finished" state

for run in tqdm(runs):
    # Only finished runs are analysed; remember the others for the summary below.
    if run.state != "finished":
        print(f"Skipping run {run.id} (state: {run.state})")
        skipped_runs.append(run.id)
        continue

    cache_file = os.path.join(cache_dir, f"{run.id}.csv")

    if os.path.exists(cache_file):
        # Cache hit: reuse the CSV written by an earlier execution of this cell.
        run_df = pd.read_csv(cache_file)
    else:
        # Download each metric together with the iteration key. The W&B "_step"
        # column is dropped right away so that the merges below never produce
        # suffixed "_step_x"/"_step_y" duplicates.
        sim, div, llm_sim, llm_div = [
            run.history(keys=[metric, iteration_key]).drop(columns=["_step"], errors="ignore")
            for metric in ("Evaluation/similarity", "Evaluation/diversity",
                           "Evaluation/llm/similarity", "Evaluation/llm/diversity")
        ]

        # The two core metrics are always merged; the LLM-evaluator metrics are
        # merged only when the run returned them (i.e. the merge key is present).
        run_df = pd.merge(sim, div, on=iteration_key, how="outer")
        for optional_history in (llm_sim, llm_div):
            if iteration_key in optional_history.columns:
                run_df = pd.merge(run_df, optional_history, on=iteration_key, how="outer")

        # Broadcast the run's config values as constant columns (no prefix is
        # added to the key). Only the keys that survive the column filter are
        # written, so unrelated config entries never touch the frame.
        for key, value in run.config.items():
            if key not in kept_columns:
                continue
            if isinstance(value, list):
                value = ",".join(map(str, value))  # lists become comma-separated strings
            run_df[key] = value

        # Keep the required columns plus whichever auxiliary metrics exist.
        # Selecting them one by one keeps an auxiliary column even when the
        # other one is missing. A missing required column still raises KeyError.
        present_auxiliary = [col for col in auxiliary_key_filter if col in run_df.columns]
        run_df = run_df[key_filter + present_auxiliary]

        run_df.to_csv(cache_file, index=False)

    dfs.append(run_df)

# One long frame with a row per (run, llm_iteration).
df = pd.concat(dfs, ignore_index=True)

# Print summary of skipped runs
print("\nSummary of Skipped Runs:")
print(f"Total skipped runs: {len(skipped_runs)}")
print("Skipped run IDs:", skipped_runs)

100%|██████████| 195/195 [00:02<00:00, 80.93it/s]


Summary of Skipped Runs:
Total skipped runs: 0
Skipped run IDs: []





In [5]:
# Timestamp of the current moment formatted as YYYY-MM-DD-HH-MM-SS, used to build a unique export file name.
time_str = pd.Timestamp.now().strftime("%Y-%m-%d-%H-%M-%S")

In [6]:
# Write the current contents of df to a timestamped CSV in the working directory (row index omitted).
df.to_csv(f"wandb_output_{time_str}.csv", index=False)

In [7]:

# Replace each seed by its remainder modulo 3, so seeds collapse into the labels 0, 1 and 2.
# NOTE(review): presumably there are three seeds per configuration — confirm.
df['seed'] = df['seed'].mod(3)
df

Unnamed: 0,target_character,pe,branch_factor,exp_name,evaluator,total_iterations,reward_feature,fewshot,problem,seed,Evaluation/llm_iteration,Evaluation/similarity,Evaluation/diversity,Evaluation/llm/similarity,Evaluation/llm/diversity
0,N,io,2,def,vit,1,array,False,binary,1,1,0.018449,0.595029,,
1,M,io,2,def,vit,1,array,False,binary,1,1,0.045062,0.147950,,
2,F,io,2,def,vit,1,array,False,binary,1,1,0.799518,0.000000,,
3,D,io,2,def,vit,1,array,False,binary,1,1,0.032404,0.137656,,
4,C,io,2,def,vit,1,array,False,binary,1,1,0.765589,0.000347,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1122,M,got,2,longiter,vit,9,array,False,binary,0,5,0.049230,0.167293,,
1123,M,got,2,longiter,vit,9,array,False,binary,0,6,0.051012,0.167400,,
1124,M,got,2,longiter,vit,9,array,False,binary,0,7,0.202214,0.204327,,
1125,M,got,2,longiter,vit,9,array,False,binary,0,8,0.608841,0.016238,,


In [8]:
# Drop the rows whose pe is 'cotsc'. The explicit copy makes df an independent
# frame, so the column assignment below does not write into a slice of the
# previous df (which triggers pandas' SettingWithCopyWarning).
df = df[df['pe'] != 'cotsc'].copy()

# Rows of the default experiment ('def') keep their pe label; every other row
# is relabelled '<pe>-<exp_name>'. Vectorized equivalent of a row-wise apply.
# NOTE: this reads and overwrites 'pe', so running the cell a second time
# appends the suffix again (e.g. 'cot-encfb-encfb'); re-run from the load cell.
df['pe'] = df['pe'].where(df['exp_name'] == 'def', df['pe'] + '-' + df['exp_name'])

In [9]:
# Count the non-null values of every column per (pe, evaluator, fewshot, seed) group.
df.groupby(['pe', 'evaluator', 'fewshot', 'seed']).count() 

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,target_character,branch_factor,exp_name,total_iterations,reward_feature,problem,Evaluation/llm_iteration,Evaluation/similarity,Evaluation/diversity,Evaluation/llm/similarity,Evaluation/llm/diversity
pe,evaluator,fewshot,seed,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cot,vit,False,0,24,24,24,24,24,24,24,24,24,0,0
cot,vit,False,1,36,36,36,36,36,36,36,36,36,0,0
cot,vit,False,2,30,30,30,30,30,30,30,30,30,0,0
cot,vit,True,0,30,30,30,30,30,30,30,30,30,0,0
cot,vit,True,1,30,30,30,30,30,30,30,30,30,0,0
cot,vit,True,2,30,30,30,30,30,30,30,30,30,0,0
cot-encfb,vit,False,0,30,30,30,30,30,30,30,30,30,0,0
cot-encfb,vit,False,1,30,30,30,30,30,30,30,30,30,0,0
cot-encfb,vit,False,2,29,29,29,29,29,29,29,29,29,0,0
got,llm,False,0,30,30,30,30,30,30,30,30,30,30,30


In [10]:
def _min_max_normalize(values):
    """Rescale a Series linearly so its minimum maps to 0 and its maximum to 1.

    A Series whose values are all equal has a zero range; dividing by it would
    turn every entry into NaN (0/0), so such a Series is mapped to 0.0 instead.
    Missing values stay missing.
    """
    low = values.min()
    span = values.max() - low
    if span == 0:
        return (values - low).astype(float)
    return (values - low) / span


# Min-max normalize similarity and diversity within each 'target_character'
# and store the results in new '<metric>/norm' columns.
for metric in ('Evaluation/similarity', 'Evaluation/diversity'):
    df[f'{metric}/norm'] = df.groupby('target_character')[metric].transform(_min_max_normalize)
df

Unnamed: 0,target_character,pe,branch_factor,exp_name,evaluator,total_iterations,reward_feature,fewshot,problem,seed,Evaluation/llm_iteration,Evaluation/similarity,Evaluation/diversity,Evaluation/llm/similarity,Evaluation/llm/diversity,Evaluation/similarity/norm,Evaluation/diversity/norm
0,N,io,2,def,vit,1,array,False,binary,1,1,0.018449,0.595029,,,0.010878,1.000000
1,M,io,2,def,vit,1,array,False,binary,1,1,0.045062,0.147950,,,0.045574,0.315946
2,F,io,2,def,vit,1,array,False,binary,1,1,0.799518,0.000000,,,0.950696,0.000000
3,D,io,2,def,vit,1,array,False,binary,1,1,0.032404,0.137656,,,0.034002,0.247650
4,C,io,2,def,vit,1,array,False,binary,1,1,0.765589,0.000347,,,0.947774,0.000649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1122,M,got-longiter,2,longiter,vit,9,array,False,binary,0,5,0.049230,0.167293,,,0.050856,0.357254
1123,M,got-longiter,2,longiter,vit,9,array,False,binary,0,6,0.051012,0.167400,,,0.053114,0.357481
1124,M,got-longiter,2,longiter,vit,9,array,False,binary,0,7,0.202214,0.204327,,,0.244729,0.436339
1125,M,got-longiter,2,longiter,vit,9,array,False,binary,0,8,0.608841,0.016238,,,0.760041,0.034675


In [11]:
# The cells below summarise each experiment by its final-iteration similarity/diversity
# and by the row where its similarity peaks (raw and normalized).
# Columns identifying an experiment across seeds:
experiment_cols_without_seed = ['pe', 'evaluator', 'fewshot']
# Columns identifying a single seeded experiment:
experiment_cols = experiment_cols_without_seed + ['seed']

In [12]:

def extract_metrics_with_norm(group):
    """Summarise one experiment group as a single Series of metrics.

    Three rows of ``group`` are picked: the row with the highest raw
    similarity, the row with the highest normalized similarity, and the row
    with the highest 'Evaluation/llm_iteration'. Ties resolve to whichever
    row ``idxmax`` returns.

    Args:
        group: DataFrame with the columns 'Evaluation/llm_iteration',
            'Evaluation/similarity', 'Evaluation/diversity',
            'Evaluation/similarity/norm' and 'Evaluation/diversity/norm'.

    Returns:
        pd.Series with the iteration, similarity and diversity read from the
        selected rows, in raw and normalized form.
    """
    iteration = 'Evaluation/llm_iteration'
    similarity = 'Evaluation/similarity'
    diversity = 'Evaluation/diversity'
    similarity_norm = 'Evaluation/similarity/norm'
    diversity_norm = 'Evaluation/diversity/norm'

    def _row_at_max(column):
        # Row of `group` at the index label where `column` is largest.
        return group.loc[group[column].idxmax()]

    best = _row_at_max(similarity)
    best_norm = _row_at_max(similarity_norm)
    last = _row_at_max(iteration)

    metrics = {}

    # Row with the best raw similarity.
    metrics['Max Similarity Iteration'] = best[iteration]
    metrics['Max Similarity'] = best[similarity]
    metrics['Diversity at Max Similarity'] = best[diversity]

    # Row with the best normalized similarity.
    metrics['Max Normalized Similarity Iteration'] = best_norm[iteration]
    metrics['Max Normalized Similarity'] = best_norm[similarity_norm]
    metrics['Diversity at Max Normalized Similarity'] = best_norm[diversity_norm]

    # Row of the last iteration.
    metrics['Final Iteration'] = last[iteration]
    metrics['Final Similarity'] = last[similarity]
    metrics['Final Diversity'] = last[diversity]
    metrics['Final Normalized Similarity'] = last[similarity_norm]
    metrics['Final Normalized Diversity'] = last[diversity_norm]

    return pd.Series(metrics)

# Collapse every experiment (one group per experiment_cols combination) into a
# single row of summary metrics.
# include_groups=False keeps the grouping columns out of the frame handed to
# extract_metrics_with_norm, which never reads them; this avoids the pandas
# deprecation warning about apply operating on the grouping columns. The group
# keys still come back as regular columns through reset_index().
result = (
    df.groupby(experiment_cols)
    .apply(extract_metrics_with_norm, include_groups=False)
    .reset_index()
)

# Display the per-experiment summary
result

  result = df.groupby(experiment_cols).apply(extract_metrics_with_norm).reset_index()


Unnamed: 0,pe,evaluator,fewshot,seed,Max Similarity Iteration,Max Similarity,Diversity at Max Similarity,Max Normalized Similarity Iteration,Max Normalized Similarity,Diversity at Max Normalized Similarity,Final Iteration,Final Similarity,Final Diversity,Final Normalized Similarity,Final Normalized Diversity
0,cot,vit,False,0,3.0,0.804452,0.000218,3.0,0.95667,0.000451237,6.0,0.029134,0.218648,0.025388,0.46692
1,cot,vit,False,1,3.0,0.81148,0.0,3.0,0.991102,0.0,6.0,0.807933,3e-06,0.986718,5e-06
2,cot,vit,False,2,6.0,0.831062,4.8e-05,5.0,1.0,7.423716e-06,6.0,0.031065,0.234873,0.027835,0.50157
3,cot,vit,True,0,6.0,0.796896,5.9e-05,6.0,0.973075,9.890662e-05,6.0,0.796896,5.9e-05,0.973075,9.9e-05
4,cot,vit,True,1,1.0,0.753034,0.0016,1.0,0.894402,0.003307376,6.0,0.018554,0.362184,0.011008,0.608684
5,cot,vit,True,2,2.0,0.736431,0.0,2.0,0.874295,0.0,6.0,0.02574,0.160454,0.01989,0.269657
6,cot-encfb,vit,False,0,3.0,0.811961,8.6e-05,3.0,0.965764,0.0001776709,6.0,0.005653,0.003451,0.0,0.006208
7,cot-encfb,vit,False,1,2.0,0.796314,0.000151,2.0,0.946816,0.0003115341,6.0,0.081575,0.470034,0.090597,0.879005
8,cot-encfb,vit,False,2,3.0,0.695398,0.005089,3.0,0.824603,0.01052144,6.0,0.015744,0.388934,0.0081,0.72734
9,got,llm,False,0,3.0,0.808905,5e-05,3.0,0.962063,0.000102353,6.0,0.031173,0.0,0.027971,0.0


In [13]:
# Aggregation spec for DataFrame.agg: mean and standard deviation of each summary column.
plotter = {
    column: ['mean', 'std']
    for column in (
        'Max Normalized Similarity',
        'Diversity at Max Normalized Similarity',
        'Max Similarity Iteration',
        'Final Normalized Similarity',
        'Final Normalized Diversity',
    )
}

In [14]:
# Mean/std of the summary metrics over the rows of each (pe, evaluator, fewshot) group.
result.groupby(experiment_cols_without_seed).agg(plotter)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Max Normalized Similarity,Max Normalized Similarity,Diversity at Max Normalized Similarity,Diversity at Max Normalized Similarity,Max Similarity Iteration,Max Similarity Iteration,Final Normalized Similarity,Final Normalized Similarity,Final Normalized Diversity,Final Normalized Diversity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
pe,evaluator,fewshot,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
cot,vit,False,0.982591,0.022884,0.000153,0.000258,4.0,1.732051,0.346647,0.554319,0.322831,0.280113
cot,vit,True,0.913924,0.052203,0.001135,0.001882,3.0,2.645751,0.334658,0.552904,0.292813,0.304952
cot-encfb,vit,False,0.912394,0.076618,0.00367,0.005934,2.666667,0.57735,0.032899,0.050132,0.537518,0.466335
got,llm,False,0.942574,0.060667,0.000477,0.000739,1.666667,1.154701,0.021929,0.014777,0.147448,0.227586
got,vit,False,0.937551,0.06706,0.002276,0.003942,2.666667,2.081666,0.188936,0.196033,0.162179,0.233817
got,vit,True,0.929675,0.055938,0.003333,0.005544,2.333333,2.309401,0.258092,0.389669,0.110371,0.161636
got-encfb,vit,False,0.986836,0.009218,2e-06,3e-06,1.333333,0.57735,0.358125,0.320487,0.403314,0.389071
got-longiter,vit,False,0.885514,0.099275,0.085865,0.145016,4.0,3.0,0.233946,0.179967,0.359351,0.125487
io,vit,False,0.494916,0.451791,0.071354,0.121405,1.0,0.0,0.181643,0.264923,0.404687,0.52604
tot,llm,False,0.880511,0.1781,0.086572,0.149927,2.0,1.0,0.423527,0.422219,0.060853,0.105401


## Exp 1: Reasoning performance of prompt engineering

In [15]:
# Exp 1 subset: rows with fewshot == False and evaluator == 'vit'.
exp1_df = result.loc[result['fewshot'].eq(False) & result['evaluator'].eq('vit')]
# Sanity check: number of rows available per (pe, evaluator, fewshot).
exp1_df.groupby(experiment_cols_without_seed).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,seed,Max Similarity Iteration,Max Similarity,Diversity at Max Similarity,Max Normalized Similarity Iteration,Max Normalized Similarity,Diversity at Max Normalized Similarity,Final Iteration,Final Similarity,Final Diversity,Final Normalized Similarity,Final Normalized Diversity
pe,evaluator,fewshot,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
cot-encfb,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
got,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
got-encfb,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
got-longiter,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
io,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
tot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
tot-encfb,vit,False,3,3,3,3,3,3,3,3,3,3,3,3


In [16]:
# Mean/std of the summary metrics for each Exp 1 group (grouped by experiment_cols_without_seed).
exp1_df.groupby(experiment_cols_without_seed).agg(plotter)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Max Normalized Similarity,Max Normalized Similarity,Diversity at Max Normalized Similarity,Diversity at Max Normalized Similarity,Max Similarity Iteration,Max Similarity Iteration,Final Normalized Similarity,Final Normalized Similarity,Final Normalized Diversity,Final Normalized Diversity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
pe,evaluator,fewshot,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
cot,vit,False,0.982591,0.022884,0.000153,0.000258,4.0,1.732051,0.346647,0.554319,0.322831,0.280113
cot-encfb,vit,False,0.912394,0.076618,0.00367,0.005934,2.666667,0.57735,0.032899,0.050132,0.537518,0.466335
got,vit,False,0.937551,0.06706,0.002276,0.003942,2.666667,2.081666,0.188936,0.196033,0.162179,0.233817
got-encfb,vit,False,0.986836,0.009218,2e-06,3e-06,1.333333,0.57735,0.358125,0.320487,0.403314,0.389071
got-longiter,vit,False,0.885514,0.099275,0.085865,0.145016,4.0,3.0,0.233946,0.179967,0.359351,0.125487
io,vit,False,0.494916,0.451791,0.071354,0.121405,1.0,0.0,0.181643,0.264923,0.404687,0.52604
tot,vit,False,0.987594,0.014482,1.8e-05,3.1e-05,4.0,2.645751,0.341474,0.46553,0.447746,0.407723
tot-encfb,vit,False,0.967582,0.020409,0.000292,0.000334,4.666667,0.57735,0.032472,0.018474,0.296309,0.404674


In [17]:
# Render the Exp 1 results with the project helper from table_plotter, one row per 'pe' value.
print_result_table(exp1_df, category_columns=['pe'])

Iteration Type,Best Similarity Iteration (Mean ± Std),Best Similarity Iteration (Mean ± Std),Best Similarity Iteration (Mean ± Std),Final Iteration (Mean ± Std),Final Iteration (Mean ± Std)
Metric,Iteration,Similarity,Diversity,Similarity,Diversity
pe,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
io,1.000 ± 0.000,0.495 ± 0.452,0.071 ± 0.121,0.182 ± 0.265,0.405 ± 0.526
cot,4.000 ± 1.732,0.983 ± 0.023,0.000 ± 0.000,0.347 ± 0.554,0.323 ± 0.280
cot-encfb,2.667 ± 0.577,0.912 ± 0.077,0.004 ± 0.006,0.033 ± 0.050,0.538 ± 0.466
tot,4.000 ± 2.646,0.988 ± 0.014,0.000 ± 0.000,0.341 ± 0.466,0.448 ± 0.408
tot-encfb,4.667 ± 0.577,0.968 ± 0.020,0.000 ± 0.000,0.032 ± 0.018,0.296 ± 0.405
got,2.667 ± 2.082,0.938 ± 0.067,0.002 ± 0.004,0.189 ± 0.196,0.162 ± 0.234
got-encfb,1.333 ± 0.577,0.987 ± 0.009,0.000 ± 0.000,0.358 ± 0.320,0.403 ± 0.389
got-longiter,4.000 ± 3.000,0.886 ± 0.099,0.086 ± 0.145,0.234 ± 0.180,0.359 ± 0.125


## Exp 2: LLM and ViT-based node evaluation

In [18]:
# Exp 2 subset: rows with fewshot == False whose pe is 'tot' or 'got' (all evaluators are kept).
exp2_df = result.loc[result['fewshot'].eq(False) & result['pe'].isin(['tot', 'got'])]
# Sanity check: number of rows available per (pe, evaluator, fewshot).
exp2_df.groupby(experiment_cols_without_seed).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,seed,Max Similarity Iteration,Max Similarity,Diversity at Max Similarity,Max Normalized Similarity Iteration,Max Normalized Similarity,Diversity at Max Normalized Similarity,Final Iteration,Final Similarity,Final Diversity,Final Normalized Similarity,Final Normalized Diversity
pe,evaluator,fewshot,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
got,llm,False,3,3,3,3,3,3,3,3,3,3,3,3
got,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
tot,llm,False,3,3,3,3,3,3,3,3,3,3,3,3
tot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3


In [19]:
# exp2_df holds only the 'tot' and 'got' methods (filtered when it was built);
# report mean/std of the summary metrics per (pe, evaluator, fewshot).
exp2_df.groupby(experiment_cols_without_seed).agg(plotter)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Max Normalized Similarity,Max Normalized Similarity,Diversity at Max Normalized Similarity,Diversity at Max Normalized Similarity,Max Similarity Iteration,Max Similarity Iteration,Final Normalized Similarity,Final Normalized Similarity,Final Normalized Diversity,Final Normalized Diversity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
pe,evaluator,fewshot,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
got,llm,False,0.942574,0.060667,0.000477,0.000739,1.666667,1.154701,0.021929,0.014777,0.147448,0.227586
got,vit,False,0.937551,0.06706,0.002276,0.003942,2.666667,2.081666,0.188936,0.196033,0.162179,0.233817
tot,llm,False,0.880511,0.1781,0.086572,0.149927,2.0,1.0,0.423527,0.422219,0.060853,0.105401
tot,vit,False,0.987594,0.014482,1.8e-05,3.1e-05,4.0,2.645751,0.341474,0.46553,0.447746,0.407723


In [20]:
# Render the Exp 2 results grouped by evaluator and pe.
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'tuple' object has no attribute 'split'". The cause lies inside
# table_plotter.print_result_table, which is not visible here. The call with
# category_columns=['pe', 'fewshot'] further down succeeds, so the helper possibly
# expects 'pe' as the first category column — TODO confirm and fix before sharing.
print_result_table(exp2_df, category_columns=['evaluator', 'pe'])

AttributeError: 'tuple' object has no attribute 'split'

## Exp 3: Controlling the first iteration reward function

In [21]:
# Exp 3 subset: every pe except 'io', restricted to evaluator == 'vit'.
# 'fewshot' is not filtered here, so both fewshot=True and fewshot=False rows are kept.
exp3_df = result[(result['pe'] != 'io') & (result['evaluator'] == 'vit')]
# Sanity check: number of rows available per (pe, evaluator, fewshot).
exp3_df.groupby(experiment_cols_without_seed).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,seed,Max Similarity Iteration,Max Similarity,Diversity at Max Similarity,Max Normalized Similarity Iteration,Max Normalized Similarity,Diversity at Max Normalized Similarity,Final Iteration,Final Similarity,Final Diversity,Final Normalized Similarity,Final Normalized Diversity
pe,evaluator,fewshot,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
cot,vit,True,3,3,3,3,3,3,3,3,3,3,3,3
cot-encfb,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
got,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
got,vit,True,3,3,3,3,3,3,3,3,3,3,3,3
got-encfb,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
got-longiter,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
tot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
tot,vit,True,3,3,3,3,3,3,3,3,3,3,3,3
tot-encfb,vit,False,3,3,3,3,3,3,3,3,3,3,3,3


In [22]:
# Mean/std of the summary metrics for each Exp 3 group (grouped by experiment_cols_without_seed).
exp3_df.groupby(experiment_cols_without_seed).agg(plotter)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Max Normalized Similarity,Max Normalized Similarity,Diversity at Max Normalized Similarity,Diversity at Max Normalized Similarity,Max Similarity Iteration,Max Similarity Iteration,Final Normalized Similarity,Final Normalized Similarity,Final Normalized Diversity,Final Normalized Diversity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
pe,evaluator,fewshot,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
cot,vit,False,0.982591,0.022884,0.000153,0.000258,4.0,1.732051,0.346647,0.554319,0.322831,0.280113
cot,vit,True,0.913924,0.052203,0.001135,0.001882,3.0,2.645751,0.334658,0.552904,0.292813,0.304952
cot-encfb,vit,False,0.912394,0.076618,0.00367,0.005934,2.666667,0.57735,0.032899,0.050132,0.537518,0.466335
got,vit,False,0.937551,0.06706,0.002276,0.003942,2.666667,2.081666,0.188936,0.196033,0.162179,0.233817
got,vit,True,0.929675,0.055938,0.003333,0.005544,2.333333,2.309401,0.258092,0.389669,0.110371,0.161636
got-encfb,vit,False,0.986836,0.009218,2e-06,3e-06,1.333333,0.57735,0.358125,0.320487,0.403314,0.389071
got-longiter,vit,False,0.885514,0.099275,0.085865,0.145016,4.0,3.0,0.233946,0.179967,0.359351,0.125487
tot,vit,False,0.987594,0.014482,1.8e-05,3.1e-05,4.0,2.645751,0.341474,0.46553,0.447746,0.407723
tot,vit,True,0.921507,0.065687,0.000145,0.000251,1.0,0.0,0.035927,0.049807,0.403783,0.082175
tot-encfb,vit,False,0.967582,0.020409,0.000292,0.000334,4.666667,0.57735,0.032472,0.018474,0.296309,0.404674


In [23]:
# Sort the dataframe by 'fewshot' in ascending order, which places fewshot=False rows before fewshot=True rows.
# NOTE(review): the earlier comment asked for fewshot=True on top; that would need ascending=False — confirm the intent.
exp3_df = exp3_df.sort_values(by='fewshot', ascending=True)
# Render the Exp 3 results with one row per (pe, fewshot) combination.
print_result_table(exp3_df, category_columns=['pe', 'fewshot'])

Unnamed: 0_level_0,Iteration Type,Best Similarity Iteration (Mean ± Std),Best Similarity Iteration (Mean ± Std),Best Similarity Iteration (Mean ± Std),Final Iteration (Mean ± Std),Final Iteration (Mean ± Std)
Unnamed: 0_level_1,Metric,Iteration,Similarity,Diversity,Similarity,Diversity
pe,fewshot,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
cot,False,4.000 ± 1.732,0.983 ± 0.023,0.000 ± 0.000,0.347 ± 0.554,0.323 ± 0.280
cot,True,3.000 ± 2.646,0.914 ± 0.052,0.001 ± 0.002,0.335 ± 0.553,0.293 ± 0.305
tot,False,4.000 ± 2.646,0.988 ± 0.014,0.000 ± 0.000,0.341 ± 0.466,0.448 ± 0.408
tot,True,1.000 ± 0.000,0.922 ± 0.066,0.000 ± 0.000,0.036 ± 0.050,0.404 ± 0.082
got,False,2.667 ± 2.082,0.938 ± 0.067,0.002 ± 0.004,0.189 ± 0.196,0.162 ± 0.234
got,True,2.333 ± 2.309,0.930 ± 0.056,0.003 ± 0.006,0.258 ± 0.390,0.110 ± 0.162
cot-encfb,False,2.667 ± 0.577,0.912 ± 0.077,0.004 ± 0.006,0.033 ± 0.050,0.538 ± 0.466
got-encfb,False,1.333 ± 0.577,0.987 ± 0.009,0.000 ± 0.000,0.358 ± 0.320,0.403 ± 0.389
got-longiter,False,4.000 ± 3.000,0.886 ± 0.099,0.086 ± 0.145,0.234 ± 0.180,0.359 ± 0.125
tot-encfb,False,4.667 ± 0.577,0.968 ± 0.020,0.000 ± 0.000,0.032 ± 0.018,0.296 ± 0.405


## Appendix: Additional Analysis