In [2]:
import wandb
import pandas as pd
import os
from tqdm import tqdm

from table_plotter import print_result_table

In [3]:
api = wandb.Api(timeout=600)


In [4]:
# get all runs for a project
runs = api.runs("inchangbaek4907/pcgrl-llm")
len(runs)

195

In [5]:
# Specify cache directory
cache_dir = "./wandb_cache"
os.makedirs(cache_dir, exist_ok=True)

dfs = []
skipped_runs = []  # List to store IDs of skipped runs

for run in tqdm(runs):
    # Skip the run if it's not finished
    if run.state != "finished":
        print(f"Skipping run {run.id} (state: {run.state})")
        skipped_runs.append(run.id)
        continue

    # Define cache filename based on run ID
    cache_file = os.path.join(cache_dir, f"{run.id}.csv")
    
    # Check if cached file exists
    if os.path.exists(cache_file):
        # Load cached DataFrame
        df = pd.read_csv(cache_file)
    else:
        # Retrieve history data for Evaluation/similarity and diversity
        sim = run.history(keys=["Evaluation/similarity", "Evaluation/llm_iteration"])
        div = run.history(keys=["Evaluation/diversity", "Evaluation/llm_iteration"])
        llm_sim = run.history(keys=["Evaluation/llm/similarity", "Evaluation/llm_iteration"])
        llm_div = run.history(keys=["Evaluation/llm/diversity", "Evaluation/llm_iteration"])
        
        # Sequentially merge the DataFrames on "Evaluation/llm_iteration"
        df = pd.merge(sim, div, on="Evaluation/llm_iteration", how="outer")
        df = df.drop(columns=["_step_x", "_step_y"], errors="ignore")
        if 'Evaluation/llm_iteration' in llm_sim.columns:
            df = pd.merge(df, llm_sim, on="Evaluation/llm_iteration", how="outer")
            df = df.drop(columns=["_step_x", "_step_y"], errors="ignore")
        if 'Evaluation/llm_iteration' in llm_div.columns:
            df = pd.merge(df, llm_div, on="Evaluation/llm_iteration", how="outer")
            df = df.drop(columns=["_step_x", "_step_y"], errors="ignore")
            
        # Now, 'df' contains all the merged data based on "Evaluation/llm_iteration"

        df = df.drop(columns=["_step_x", "_step_y"], errors="ignore")

        # Add run config to DataFrame with prefix 'config.'
        for key, value in run.config.items():
            if isinstance(value, list):
                value = ",".join(map(str, value))  # Convert list to comma-separated string
            df[key] = value

        # Filter columns
        key_filter = ['target_character', 'pe', 'branch_factor', 'exp_name', 'evaluator', 'total_iterations', 
                      'reward_feature', 'fewshot', 'problem', 'seed', 'Evaluation/llm_iteration', 
                      'Evaluation/similarity', 'Evaluation/diversity']
        auxiliary_key_filter = ['Evaluation/llm/similarity', 'Evaluation/llm/diversity']
        
        try:
            df = df[key_filter + auxiliary_key_filter]
        except KeyError:
            df = df[key_filter]
        
        # Save DataFrame to cache as CSV
        df.to_csv(cache_file, index=False)
    
    # Append DataFrame to list
    dfs.append(df)

# Concatenate all DataFrames
df = pd.concat(dfs, ignore_index=True)
df.head()  # Display the first few rows for inspection

# Print summary of skipped runs
print("\nSummary of Skipped Runs:")
print(f"Total skipped runs: {len(skipped_runs)}")
print("Skipped run IDs:", skipped_runs)

100%|██████████| 195/195 [07:17<00:00,  2.24s/it]


Summary of Skipped Runs:
Total skipped runs: 0
Skipped run IDs: []





In [6]:
time_str = pd.Timestamp.now().strftime("%Y-%m-%d-%H-%M-%S")

In [7]:
df.to_csv(f"wandb_output_{time_str}.csv", index=False)

In [8]:

df['seed'] = df['seed'] % 3
df

Unnamed: 0,target_character,pe,branch_factor,exp_name,evaluator,total_iterations,reward_feature,fewshot,problem,seed,Evaluation/llm_iteration,Evaluation/similarity,Evaluation/diversity,Evaluation/llm/similarity,Evaluation/llm/diversity
0,N,io,2,def,vit,1,array,False,binary,1,1,0.018449,0.595029,,
1,M,io,2,def,vit,1,array,False,binary,1,1,0.045062,0.147950,,
2,F,io,2,def,vit,1,array,False,binary,1,1,0.799518,0.000000,,
3,D,io,2,def,vit,1,array,False,binary,1,1,0.032404,0.137656,,
4,C,io,2,def,vit,1,array,False,binary,1,1,0.765589,0.000347,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1122,M,got,2,longiter,vit,9,array,False,binary,0,5,0.049230,0.167293,,
1123,M,got,2,longiter,vit,9,array,False,binary,0,6,0.051012,0.167400,,
1124,M,got,2,longiter,vit,9,array,False,binary,0,7,0.202214,0.204327,,
1125,M,got,2,longiter,vit,9,array,False,binary,0,8,0.608841,0.016238,,


In [9]:
# remove pe == cotsc
df = df[df['pe'] != 'cotsc']

# if the exname is not def make the pe to 'pe+exp_name'
df['pe'] = df.apply(lambda x: x['pe'] + '-' +  x['exp_name'] if x['exp_name'] != 'def' else x['pe'], axis=1)

In [10]:
df.groupby(['pe', 'evaluator', 'fewshot', 'seed']).count() 

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,target_character,branch_factor,exp_name,total_iterations,reward_feature,problem,Evaluation/llm_iteration,Evaluation/similarity,Evaluation/diversity,Evaluation/llm/similarity,Evaluation/llm/diversity
pe,evaluator,fewshot,seed,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cot,vit,False,0,24,24,24,24,24,24,24,24,24,0,0
cot,vit,False,1,36,36,36,36,36,36,36,36,36,0,0
cot,vit,False,2,30,30,30,30,30,30,30,30,30,0,0
cot,vit,True,0,30,30,30,30,30,30,30,30,30,0,0
cot,vit,True,1,30,30,30,30,30,30,30,30,30,0,0
cot,vit,True,2,30,30,30,30,30,30,30,30,30,0,0
cot-encfb,vit,False,0,30,30,30,30,30,30,30,30,30,0,0
cot-encfb,vit,False,1,30,30,30,30,30,30,30,30,30,0,0
cot-encfb,vit,False,2,29,29,29,29,29,29,29,29,29,0,0
got,llm,False,0,30,30,30,30,30,30,30,30,30,30,30


In [11]:
# Min-max Normalize Evaluation/similarity and Evaluation/diversity for each 'target_character' and add a new column 'Evaluation/similarity/norm' and 'Evaluation/diversity/norm'
df['Evaluation/similarity/norm'] = df.groupby('target_character')['Evaluation/similarity'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df['Evaluation/diversity/norm'] = df.groupby('target_character')['Evaluation/diversity'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df

Unnamed: 0,target_character,pe,branch_factor,exp_name,evaluator,total_iterations,reward_feature,fewshot,problem,seed,Evaluation/llm_iteration,Evaluation/similarity,Evaluation/diversity,Evaluation/llm/similarity,Evaluation/llm/diversity,Evaluation/similarity/norm,Evaluation/diversity/norm
0,N,io,2,def,vit,1,array,False,binary,1,1,0.018449,0.595029,,,0.010878,1.000000
1,M,io,2,def,vit,1,array,False,binary,1,1,0.045062,0.147950,,,0.045574,0.315946
2,F,io,2,def,vit,1,array,False,binary,1,1,0.799518,0.000000,,,0.950696,0.000000
3,D,io,2,def,vit,1,array,False,binary,1,1,0.032404,0.137656,,,0.034002,0.247650
4,C,io,2,def,vit,1,array,False,binary,1,1,0.765589,0.000347,,,0.947774,0.000649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1122,M,got-longiter,2,longiter,vit,9,array,False,binary,0,5,0.049230,0.167293,,,0.050856,0.357254
1123,M,got-longiter,2,longiter,vit,9,array,False,binary,0,6,0.051012,0.167400,,,0.053114,0.357481
1124,M,got-longiter,2,longiter,vit,9,array,False,binary,0,7,0.202214,0.204327,,,0.244729,0.436339
1125,M,got-longiter,2,longiter,vit,9,array,False,binary,0,8,0.608841,0.016238,,,0.760041,0.034675


In [12]:
# 각 실험마다 마지막 Iteration의 Similarity와 Diversity를 가져옴, 또한 각 실험의 최대 Similarity와 최소 Diversity를 가져옴
# Update the function to include calculations based on normalized similarity and diversity
experiment_cols = ['target_character', 'pe', 'evaluator', 'fewshot', 'seed']  # Assuming these are the experiment identifiers as per user's requirement
experiment_cols_without_seed = ['target_character', 'pe', 'evaluator', 'fewshot']

In [13]:

def extract_metrics_with_norm(group):
    # Maximum similarity iteration row
    max_sim_row = group.loc[group['Evaluation/similarity'].idxmax()]
    max_sim_norm_row = group.loc[group['Evaluation/similarity/norm'].idxmax()]
    # Final iteration row
    final_iteration_row = group.loc[group['Evaluation/llm_iteration'].idxmax()]
    
    return pd.Series({
        'Max Similarity Iteration': max_sim_row['Evaluation/llm_iteration'],
        'Max Similarity': max_sim_row['Evaluation/similarity'],
        'Diversity at Max Similarity': max_sim_row['Evaluation/diversity'],
        
        'Max Normalized Similarity Iteration': max_sim_norm_row['Evaluation/llm_iteration'],
        'Max Normalized Similarity': max_sim_norm_row['Evaluation/similarity/norm'],
        'Diversity at Max Normalized Similarity': max_sim_norm_row['Evaluation/diversity/norm'],
        
        'Final Iteration': final_iteration_row['Evaluation/llm_iteration'],
        'Final Similarity': final_iteration_row['Evaluation/similarity'],
        'Final Diversity': final_iteration_row['Evaluation/diversity'],
        'Final Normalized Similarity': final_iteration_row['Evaluation/similarity/norm'],
        'Final Normalized Diversity': final_iteration_row['Evaluation/diversity/norm']
    })

# Apply the updated function to each experiment group
result = df.groupby(experiment_cols).apply(extract_metrics_with_norm).reset_index()

# Display the enhanced results to the user
result

  result = df.groupby(experiment_cols).apply(extract_metrics_with_norm).reset_index()


Unnamed: 0,target_character,pe,evaluator,fewshot,seed,Max Similarity Iteration,Max Similarity,Diversity at Max Similarity,Max Normalized Similarity Iteration,Max Normalized Similarity,Diversity at Max Normalized Similarity,Final Iteration,Final Similarity,Final Diversity,Final Normalized Similarity,Final Normalized Diversity
0,C,cot,vit,False,1,5.0,0.199735,0.068222,5.0,0.238671,0.127580,6.0,0.017137,0.226798,0.009846,0.424131
1,C,cot,vit,False,2,3.0,0.165821,0.000000,3.0,0.196170,0.000000,6.0,0.019047,0.124546,0.012240,0.232912
2,C,cot,vit,True,0,1.0,0.171087,0.022992,1.0,0.202770,0.042997,6.0,0.171087,0.022992,0.202770,0.042997
3,C,cot,vit,True,1,6.0,0.483973,0.026739,6.0,0.594866,0.050004,6.0,0.483973,0.026739,0.594866,0.050004
4,C,cot,vit,True,2,4.0,0.239788,0.126869,4.0,0.288863,0.237256,6.0,0.129345,0.016808,0.150460,0.031432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,N,tot,vit,True,1,1.0,0.048219,0.081245,1.0,0.047675,0.136539,6.0,0.027921,0.000000,0.022586,0.000000
190,N,tot,vit,True,2,2.0,0.040261,0.260393,2.0,0.037839,0.437613,6.0,0.026634,0.124258,0.020995,0.208827
191,N,tot-encfb,vit,False,0,5.0,0.811480,0.000000,5.0,0.991102,0.000000,6.0,0.807737,0.000029,0.986475,0.000049
192,N,tot-encfb,vit,False,1,1.0,0.269528,0.000000,1.0,0.321224,0.000000,6.0,0.027921,0.000000,0.022586,0.000000


In [14]:
plotter = {'Max Normalized Similarity': ['mean', 'std'], 'Diversity at Max Normalized Similarity': ['mean', 'std'], 'Max Similarity Iteration': ['mean', 'std'], 'Final Normalized Similarity': ['mean', 'std'], 'Final Normalized Diversity': ['mean', 'std']}

In [16]:
result.groupby(['pe', 'evaluator', 'fewshot', 'target_character']).agg(plotter)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Max Normalized Similarity,Max Normalized Similarity,Diversity at Max Normalized Similarity,Diversity at Max Normalized Similarity,Max Similarity Iteration,Max Similarity Iteration,Final Normalized Similarity,Final Normalized Similarity,Final Normalized Diversity,Final Normalized Diversity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
pe,evaluator,fewshot,target_character,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
cot,vit,False,C,0.217420,0.030053,0.063790,0.090213,4.000000,1.414214,0.011043,0.001692,0.328522,0.135212
cot,vit,False,D,0.079968,0.056442,0.186997,0.180431,2.333333,0.577350,0.021111,0.010805,0.478923,0.201545
cot,vit,False,F,0.872418,0.174582,0.003149,0.004980,3.333333,2.516611,0.654593,0.527796,0.218882,0.377809
cot,vit,False,M,0.275086,0.377989,0.329053,0.285080,3.333333,2.516611,0.028172,0.002966,0.584972,0.175323
cot,vit,False,N,0.679753,0.546996,0.090928,0.157485,4.666667,1.527525,0.677202,0.544791,0.090928,0.157485
...,...,...,...,...,...,...,...,...,...,...,...,...,...
tot-encfb,vit,False,C,0.743260,0.383682,0.056582,0.097812,2.333333,2.309401,0.681669,0.357878,0.031042,0.048991
tot-encfb,vit,False,D,0.287982,0.164980,0.048940,0.083838,1.666667,1.154701,0.074294,0.071816,0.435207,0.356720
tot-encfb,vit,False,F,0.873448,0.138057,0.007603,0.012478,4.333333,0.577350,0.479885,0.439822,0.104899,0.180922
tot-encfb,vit,False,M,0.305698,0.451319,0.059236,0.099586,3.000000,1.732051,0.146978,0.200315,0.223871,0.281759


## Exp 1. Reasoning performance of prompt engineering

In [17]:
# Get the fewshot=False and evaluator=vit
exp1_df = result[(result['fewshot'] == False) & (result['evaluator'] == 'vit')]
exp1_df.groupby(experiment_cols_without_seed).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,seed,Max Similarity Iteration,Max Similarity,Diversity at Max Similarity,Max Normalized Similarity Iteration,Max Normalized Similarity,Diversity at Max Normalized Similarity,Final Iteration,Final Similarity,Final Diversity,Final Normalized Similarity,Final Normalized Diversity
target_character,pe,evaluator,fewshot,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
C,cot,vit,False,2,2,2,2,2,2,2,2,2,2,2,2
C,cot-encfb,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
C,got,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
C,got-encfb,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
C,got-longiter,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
C,io,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
C,tot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
C,tot-encfb,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
D,cot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
D,cot-encfb,vit,False,3,3,3,3,3,3,3,3,3,3,3,3


In [18]:
exp1_df.groupby(experiment_cols_without_seed).agg(plotter)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Max Normalized Similarity,Max Normalized Similarity,Diversity at Max Normalized Similarity,Diversity at Max Normalized Similarity,Max Similarity Iteration,Max Similarity Iteration,Final Normalized Similarity,Final Normalized Similarity,Final Normalized Diversity,Final Normalized Diversity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
target_character,pe,evaluator,fewshot,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
C,cot,vit,False,0.21742,0.030053,0.06379,0.090213,4.0,1.414214,0.011043,0.001692,0.328522,0.135212
C,cot-encfb,vit,False,0.442607,0.216077,0.31334,0.224293,2.0,0.0,0.03814,0.045588,0.630183,0.309074
C,got,vit,False,0.383675,0.535422,0.018558,0.032139,3.666667,2.516611,0.148749,0.217651,0.202153,0.303208
C,got-encfb,vit,False,0.216658,0.022243,0.348646,0.357226,2.333333,1.527525,0.079299,0.090455,0.118912,0.130616
C,got-longiter,vit,False,0.815252,0.023105,0.200678,0.176071,2.333333,1.527525,0.118894,0.118153,0.291035,0.255039
C,io,vit,False,0.338068,0.528421,0.353759,0.336647,1.0,0.0,0.338068,0.528421,0.353759,0.336647
C,tot,vit,False,0.32998,0.259856,0.315919,0.086091,1.666667,1.154701,0.046946,0.065029,0.357465,0.141628
C,tot-encfb,vit,False,0.74326,0.383682,0.056582,0.097812,2.333333,2.309401,0.681669,0.357878,0.031042,0.048991
D,cot,vit,False,0.079968,0.056442,0.186997,0.180431,2.333333,0.57735,0.021111,0.010805,0.478923,0.201545
D,cot-encfb,vit,False,0.144626,0.119443,0.425544,0.398965,2.333333,1.527525,0.020214,0.019701,0.325349,0.280424


In [19]:
print_result_table(exp1_df, category_columns=['pe', 'target_character'])

AttributeError: 'tuple' object has no attribute 'split'

## Exp. 2: LLM and ViT-based node evaluation 

In [18]:
exp2_df = result[(result['fewshot'] == False)]
exp2_df = exp2_df[exp2_df['pe'].isin(['tot', 'got'])]
exp2_df.groupby(experiment_cols_without_seed).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,seed,Max Similarity Iteration,Max Similarity,Diversity at Max Similarity,Max Normalized Similarity Iteration,Max Normalized Similarity,Diversity at Max Normalized Similarity,Final Iteration,Final Similarity,Final Diversity,Final Normalized Similarity,Final Normalized Diversity
pe,evaluator,fewshot,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
got,llm,False,3,3,3,3,3,3,3,3,3,3,3,3
got,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
tot,llm,False,3,3,3,3,3,3,3,3,3,3,3,3
tot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3


In [19]:
# get only cot and tot
exp2_df.groupby(experiment_cols_without_seed).agg(plotter)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Max Normalized Similarity,Max Normalized Similarity,Diversity at Max Normalized Similarity,Diversity at Max Normalized Similarity,Max Similarity Iteration,Max Similarity Iteration,Final Normalized Similarity,Final Normalized Similarity,Final Normalized Diversity,Final Normalized Diversity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
pe,evaluator,fewshot,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
got,llm,False,0.942574,0.060667,0.000477,0.000739,1.666667,1.154701,0.021929,0.014777,0.147448,0.227586
got,vit,False,0.937551,0.06706,0.002276,0.003942,2.666667,2.081666,0.188936,0.196033,0.162179,0.233817
tot,llm,False,0.880511,0.1781,0.086572,0.149927,2.0,1.0,0.423527,0.422219,0.060853,0.105401
tot,vit,False,0.987594,0.014482,1.8e-05,3.1e-05,4.0,2.645751,0.341474,0.46553,0.447746,0.407723


In [20]:
print_result_table(exp2_df, category_columns=['evaluator', 'pe'])

AttributeError: 'tuple' object has no attribute 'split'

## Exp 3: Controlling the first iteration reward function

In [21]:
# Get the fewshot=True and evaluator=vit
exp3_df = result[(result['pe'] != 'io') & (result['evaluator'] == 'vit')]
exp3_df.groupby(experiment_cols_without_seed).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,seed,Max Similarity Iteration,Max Similarity,Diversity at Max Similarity,Max Normalized Similarity Iteration,Max Normalized Similarity,Diversity at Max Normalized Similarity,Final Iteration,Final Similarity,Final Diversity,Final Normalized Similarity,Final Normalized Diversity
pe,evaluator,fewshot,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
cot,vit,True,3,3,3,3,3,3,3,3,3,3,3,3
cot-encfb,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
got,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
got,vit,True,3,3,3,3,3,3,3,3,3,3,3,3
got-encfb,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
got-longiter,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
tot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
tot,vit,True,3,3,3,3,3,3,3,3,3,3,3,3
tot-encfb,vit,False,3,3,3,3,3,3,3,3,3,3,3,3


In [22]:
exp3_df.groupby(experiment_cols_without_seed).agg(plotter)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Max Normalized Similarity,Max Normalized Similarity,Diversity at Max Normalized Similarity,Diversity at Max Normalized Similarity,Max Similarity Iteration,Max Similarity Iteration,Final Normalized Similarity,Final Normalized Similarity,Final Normalized Diversity,Final Normalized Diversity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
pe,evaluator,fewshot,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
cot,vit,False,0.982591,0.022884,0.000153,0.000258,4.0,1.732051,0.346647,0.554319,0.322831,0.280113
cot,vit,True,0.913924,0.052203,0.001135,0.001882,3.0,2.645751,0.334658,0.552904,0.292813,0.304952
cot-encfb,vit,False,0.912394,0.076618,0.00367,0.005934,2.666667,0.57735,0.032899,0.050132,0.537518,0.466335
got,vit,False,0.937551,0.06706,0.002276,0.003942,2.666667,2.081666,0.188936,0.196033,0.162179,0.233817
got,vit,True,0.929675,0.055938,0.003333,0.005544,2.333333,2.309401,0.258092,0.389669,0.110371,0.161636
got-encfb,vit,False,0.986836,0.009218,2e-06,3e-06,1.333333,0.57735,0.358125,0.320487,0.403314,0.389071
got-longiter,vit,False,0.885514,0.099275,0.085865,0.145016,4.0,3.0,0.233946,0.179967,0.359351,0.125487
tot,vit,False,0.987594,0.014482,1.8e-05,3.1e-05,4.0,2.645751,0.341474,0.46553,0.447746,0.407723
tot,vit,True,0.921507,0.065687,0.000145,0.000251,1.0,0.0,0.035927,0.049807,0.403783,0.082175
tot-encfb,vit,False,0.967582,0.020409,0.000292,0.000334,4.666667,0.57735,0.032472,0.018474,0.296309,0.404674


In [23]:
# Short the dataframe with fewshot True is on top
exp3_df = exp3_df.sort_values(by='fewshot', ascending=True)
print_result_table(exp3_df, category_columns=['pe', 'fewshot'])

Unnamed: 0_level_0,Iteration Type,Best Similarity Iteration (Mean ± Std),Best Similarity Iteration (Mean ± Std),Best Similarity Iteration (Mean ± Std),Final Iteration (Mean ± Std),Final Iteration (Mean ± Std)
Unnamed: 0_level_1,Metric,Iteration,Similarity,Diversity,Similarity,Diversity
pe,fewshot,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
cot,False,4.000 ± 1.732,0.983 ± 0.023,0.000 ± 0.000,0.347 ± 0.554,0.323 ± 0.280
cot,True,3.000 ± 2.646,0.914 ± 0.052,0.001 ± 0.002,0.335 ± 0.553,0.293 ± 0.305
tot,False,4.000 ± 2.646,0.988 ± 0.014,0.000 ± 0.000,0.341 ± 0.466,0.448 ± 0.408
tot,True,1.000 ± 0.000,0.922 ± 0.066,0.000 ± 0.000,0.036 ± 0.050,0.404 ± 0.082
got,False,2.667 ± 2.082,0.938 ± 0.067,0.002 ± 0.004,0.189 ± 0.196,0.162 ± 0.234
got,True,2.333 ± 2.309,0.930 ± 0.056,0.003 ± 0.006,0.258 ± 0.390,0.110 ± 0.162
cot-encfb,False,2.667 ± 0.577,0.912 ± 0.077,0.004 ± 0.006,0.033 ± 0.050,0.538 ± 0.466
got-encfb,False,1.333 ± 0.577,0.987 ± 0.009,0.000 ± 0.000,0.358 ± 0.320,0.403 ± 0.389
got-longiter,False,4.000 ± 3.000,0.886 ± 0.099,0.086 ± 0.145,0.234 ± 0.180,0.359 ± 0.125
tot-encfb,False,4.667 ± 0.577,0.968 ± 0.020,0.000 ± 0.000,0.032 ± 0.018,0.296 ± 0.405


## Appendix: Additional Analysis