In [104]:
import wandb
import pandas as pd
import os
from tqdm import tqdm

from table_plotter import print_result_table

In [105]:
api = wandb.Api(timeout=600)


In [106]:
# get all runs for a project
runs = api.runs("inchangbaek4907/pcgrl-llm")
len(runs)

140

In [107]:
# Specify cache directory
cache_dir = "./wandb_cache"
os.makedirs(cache_dir, exist_ok=True)

dfs = []
skipped_runs = []  # List to store IDs of skipped runs

for run in tqdm(runs):
    # Skip the run if it's not finished
    if run.state != "finished":
        print(f"Skipping run {run.id} (state: {run.state})")
        skipped_runs.append(run.id)
        continue

    # Define cache filename based on run ID
    cache_file = os.path.join(cache_dir, f"{run.id}.csv")
    
    # Check if cached file exists
    if os.path.exists(cache_file):
        # Load cached DataFrame
        df = pd.read_csv(cache_file)
    else:
        # Retrieve history data for Evaluation/similarity and diversity
        sim = run.history(keys=["Evaluation/similarity", "Evaluation/llm_iteration"])
        div = run.history(keys=["Evaluation/diversity", "Evaluation/llm_iteration"])
        llm_sim = run.history(keys=["Evaluation/llm/similarity", "Evaluation/llm_iteration"])
        llm_div = run.history(keys=["Evaluation/llm/diversity", "Evaluation/llm_iteration"])
        
        # Sequentially merge the DataFrames on "Evaluation/llm_iteration"
        df = pd.merge(sim, div, on="Evaluation/llm_iteration", how="outer")
        df = df.drop(columns=["_step_x", "_step_y"], errors="ignore")
        if 'Evaluation/llm_iteration' in llm_sim.columns:
            df = pd.merge(df, llm_sim, on="Evaluation/llm_iteration", how="outer")
            df = df.drop(columns=["_step_x", "_step_y"], errors="ignore")
        if 'Evaluation/llm_iteration' in llm_div.columns:
            df = pd.merge(df, llm_div, on="Evaluation/llm_iteration", how="outer")
            df = df.drop(columns=["_step_x", "_step_y"], errors="ignore")
            
        # Now, 'df' contains all the merged data based on "Evaluation/llm_iteration"

        df = df.drop(columns=["_step_x", "_step_y"], errors="ignore")

        # Add run config to DataFrame with prefix 'config.'
        for key, value in run.config.items():
            if isinstance(value, list):
                value = ",".join(map(str, value))  # Convert list to comma-separated string
            df[key] = value

        # Filter columns
        key_filter = ['target_character', 'pe', 'branch_factor', 'exp_name', 'evaluator', 'total_iterations', 
                      'reward_feature', 'fewshot', 'problem', 'seed', 'Evaluation/llm_iteration', 
                      'Evaluation/similarity', 'Evaluation/diversity']
        auxiliary_key_filter = ['Evaluation/llm/similarity', 'Evaluation/llm/diversity']
        
        try:
            df = df[key_filter + auxiliary_key_filter]
        except KeyError:
            df = df[key_filter]
        
        # Save DataFrame to cache as CSV
        df.to_csv(cache_file, index=False)
    
    # Append DataFrame to list
    dfs.append(df)

# Concatenate all DataFrames
df = pd.concat(dfs, ignore_index=True)
df.head()  # Display the first few rows for inspection

# Print summary of skipped runs
print("\nSummary of Skipped Runs:")
print(f"Total skipped runs: {len(skipped_runs)}")
print("Skipped run IDs:", skipped_runs)

100%|██████████| 140/140 [00:00<00:00, 168.94it/s]


Summary of Skipped Runs:
Total skipped runs: 0
Skipped run IDs: []





In [108]:
time_str = pd.Timestamp.now().strftime("%Y-%m-%d-%H-%M-%S")

In [109]:
df.to_csv(f"wandb_output_{time_str}.csv", index=False)

In [110]:

df['seed'] = df['seed'] % 3
df

Unnamed: 0,target_character,pe,branch_factor,exp_name,evaluator,total_iterations,reward_feature,fewshot,problem,seed,Evaluation/llm_iteration,Evaluation/similarity,Evaluation/diversity,Evaluation/llm/similarity,Evaluation/llm/diversity
0,N,io,2,def,vit,1,array,False,binary,1,1,0.018449,0.595029,,
1,M,io,2,def,vit,1,array,False,binary,1,1,0.045062,0.147950,,
2,F,io,2,def,vit,1,array,False,binary,1,1,0.799518,0.000000,,
3,D,io,2,def,vit,1,array,False,binary,1,1,0.032404,0.137656,,
4,C,io,2,def,vit,1,array,False,binary,1,1,0.765589,0.000347,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,C,tot,2,def,vit,6,array,True,binary,0,2,0.044665,0.000000,,
756,C,tot,2,def,vit,6,array,True,binary,0,3,0.193625,0.153446,,
757,C,tot,2,def,vit,6,array,True,binary,0,4,0.210502,0.075594,,
758,C,tot,2,def,vit,6,array,True,binary,0,5,0.031002,0.000000,,


In [111]:
# remove pe == cotsc
df = df[df['pe'] != 'cotsc']

# if the exname is not def make the pe to 'pe+exp_name'
df['pe'] = df.apply(lambda x: x['pe'] + '-' +  x['exp_name'] if x['exp_name'] != 'def' else x['pe'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pe'] = df.apply(lambda x: x['pe'] + '-' +  x['exp_name'] if x['exp_name'] != 'def' else x['pe'], axis=1)


In [112]:
df.groupby(['pe', 'evaluator', 'fewshot', 'seed']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,target_character,branch_factor,exp_name,total_iterations,reward_feature,problem,Evaluation/llm_iteration,Evaluation/similarity,Evaluation/diversity,Evaluation/llm/similarity,Evaluation/llm/diversity
pe,evaluator,fewshot,seed,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cot,vit,False,0,30,30,30,30,30,30,30,30,30,0,0
cot,vit,False,1,30,30,30,30,30,30,30,30,30,0,0
cot,vit,False,2,30,30,30,30,30,30,30,30,30,0,0
cot,vit,True,0,30,30,30,30,30,30,30,30,30,0,0
cot,vit,True,1,30,30,30,30,30,30,30,30,30,0,0
cot,vit,True,2,30,30,30,30,30,30,30,30,30,0,0
got,llm,False,0,30,30,30,30,30,30,30,30,30,30,30
got,llm,False,1,30,30,30,30,30,30,30,30,30,30,30
got,llm,False,2,30,30,30,30,30,30,30,30,30,30,30
got,vit,False,0,30,30,30,30,30,30,30,30,30,0,0


In [113]:
# Min-max Normalize Evaluation/similarity and Evaluation/diversity for each 'target_character' and add a new column 'Evaluation/similarity/norm' and 'Evaluation/diversity/norm'
df['Evaluation/similarity/norm'] = df.groupby('target_character')['Evaluation/similarity'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df['Evaluation/diversity/norm'] = df.groupby('target_character')['Evaluation/diversity'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Evaluation/similarity/norm'] = df.groupby('target_character')['Evaluation/similarity'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Evaluation/diversity/norm'] = df.groupby('target_character')['Evaluation/diversity'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))


Unnamed: 0,target_character,pe,branch_factor,exp_name,evaluator,total_iterations,reward_feature,fewshot,problem,seed,Evaluation/llm_iteration,Evaluation/similarity,Evaluation/diversity,Evaluation/llm/similarity,Evaluation/llm/diversity,Evaluation/similarity/norm,Evaluation/diversity/norm
0,N,io,2,def,vit,1,array,False,binary,1,1,0.018449,0.595029,,,0.009704,1.000000
1,M,io,2,def,vit,1,array,False,binary,1,1,0.045062,0.147950,,,0.044978,0.338434
2,F,io,2,def,vit,1,array,False,binary,1,1,0.799518,0.000000,,,0.950696,0.000000
3,D,io,2,def,vit,1,array,False,binary,1,1,0.032404,0.137656,,,0.031829,0.265148
4,C,io,2,def,vit,1,array,False,binary,1,1,0.765589,0.000347,,,0.947774,0.000649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,C,tot,2,def,vit,6,array,True,binary,0,2,0.044665,0.000000,,,0.044342,0.000000
756,C,tot,2,def,vit,6,array,True,binary,0,3,0.193625,0.153446,,,0.231013,0.286957
757,C,tot,2,def,vit,6,array,True,binary,0,4,0.210502,0.075594,,,0.252163,0.141367
758,C,tot,2,def,vit,6,array,True,binary,0,5,0.031002,0.000000,,,0.027221,0.000000


In [114]:
# 각 실험마다 마지막 Iteration의 Similarity와 Diversity를 가져옴, 또한 각 실험의 최대 Similarity와 최소 Diversity를 가져옴
# Update the function to include calculations based on normalized similarity and diversity
experiment_cols = ['pe', 'evaluator', 'fewshot', 'seed']  # Assuming these are the experiment identifiers as per user's requirement
experiment_cols_without_seed = ['pe', 'evaluator', 'fewshot']

In [115]:

def extract_metrics_with_norm(group):
    # Maximum similarity iteration row
    max_sim_row = group.loc[group['Evaluation/similarity'].idxmax()]
    max_sim_norm_row = group.loc[group['Evaluation/similarity/norm'].idxmax()]
    # Final iteration row
    final_iteration_row = group.loc[group['Evaluation/llm_iteration'].idxmax()]
    
    return pd.Series({
        'Max Similarity Iteration': max_sim_row['Evaluation/llm_iteration'],
        'Max Similarity': max_sim_row['Evaluation/similarity'],
        'Diversity at Max Similarity': max_sim_row['Evaluation/diversity'],
        
        'Max Normalized Similarity Iteration': max_sim_norm_row['Evaluation/llm_iteration'],
        'Max Normalized Similarity': max_sim_norm_row['Evaluation/similarity/norm'],
        'Diversity at Max Normalized Similarity': max_sim_norm_row['Evaluation/diversity/norm'],
        
        'Final Iteration': final_iteration_row['Evaluation/llm_iteration'],
        'Final Similarity': final_iteration_row['Evaluation/similarity'],
        'Final Diversity': final_iteration_row['Evaluation/diversity'],
        'Final Normalized Similarity': final_iteration_row['Evaluation/similarity/norm'],
        'Final Normalized Diversity': final_iteration_row['Evaluation/diversity/norm']
    })

# Apply the updated function to each experiment group
result = df.groupby(experiment_cols).apply(extract_metrics_with_norm).reset_index()

# Display the enhanced results to the user
result

  result = df.groupby(experiment_cols).apply(extract_metrics_with_norm).reset_index()


Unnamed: 0,pe,evaluator,fewshot,seed,Max Similarity Iteration,Max Similarity,Diversity at Max Similarity,Max Normalized Similarity Iteration,Max Normalized Similarity,Diversity at Max Normalized Similarity,Final Iteration,Final Similarity,Final Diversity,Final Normalized Similarity,Final Normalized Diversity
0,cot,vit,False,0,3.0,0.804452,0.000218,3.0,0.95667,0.0005519364,6.0,0.029134,0.218648,0.024706,0.500154
1,cot,vit,False,1,3.0,0.81148,0.0,3.0,0.999873,0.0,6.0,0.807933,3e-06,0.995445,5e-06
2,cot,vit,False,2,2.0,0.820263,0.00011,2.0,0.975819,0.0002771142,6.0,0.031065,0.234873,0.027164,0.53727
3,cot,vit,True,0,6.0,0.796896,5.9e-05,6.0,0.981664,9.890662e-05,6.0,0.796896,5.9e-05,0.981664,9.9e-05
4,cot,vit,True,1,1.0,0.753034,0.0016,1.0,0.894402,0.00404546,6.0,0.018554,0.362184,0.009836,0.608684
5,cot,vit,True,2,2.0,0.736431,0.0,2.0,0.874295,0.0,6.0,0.02574,0.160454,0.018807,0.269657
6,got,llm,False,0,3.0,0.808905,5e-05,3.0,0.962063,0.0001251945,6.0,0.031173,0.0,0.0273,0.0
7,got,llm,False,1,1.0,0.814916,2.8e-05,5.0,0.999873,0.0,6.0,0.035396,0.017532,0.032728,0.032786
8,got,llm,False,2,1.0,0.705457,0.013302,5.0,0.874275,0.001421834,6.0,0.01869,0.198087,0.005088,0.500958
9,got,vit,False,0,2.0,0.840231,0.0,2.0,1.0,0.0,6.0,0.103562,0.000323,0.122481,0.000621


In [116]:
plotter = {'Max Normalized Similarity': ['mean', 'std'], 'Diversity at Max Normalized Similarity': ['mean', 'std'], 'Max Similarity Iteration': ['mean', 'std'], 'Final Normalized Similarity': ['mean', 'std'], 'Final Normalized Diversity': ['mean', 'std']}

In [117]:
result.groupby(['pe', 'evaluator', 'fewshot']).agg(plotter)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Max Normalized Similarity,Max Normalized Similarity,Diversity at Max Normalized Similarity,Diversity at Max Normalized Similarity,Max Similarity Iteration,Max Similarity Iteration,Final Normalized Similarity,Final Normalized Similarity,Final Normalized Diversity,Final Normalized Diversity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
pe,evaluator,fewshot,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
cot,vit,False,0.977454,0.021648,0.000276,0.000276,2.666667,0.57735,0.349105,0.559748,0.345809,0.30005
cot,vit,True,0.916787,0.057077,0.001381,0.002308,3.0,2.645751,0.336769,0.558513,0.292813,0.304952
got,llm,False,0.945404,0.064435,0.000516,0.000787,1.666667,1.154701,0.021705,0.014645,0.177915,0.280243
got,vit,False,0.937511,0.067052,0.002784,0.004822,2.666667,2.081666,0.18828,0.19636,0.194201,0.288931
got,vit,True,0.933643,0.060379,0.003584,0.005928,2.333333,2.309401,0.259426,0.393623,0.110371,0.161636
io,vit,False,0.495079,0.45218,0.076373,0.130097,1.0,0.0,0.181415,0.266251,0.409706,0.523341
tot,llm,False,0.883435,0.180874,0.105889,0.183387,2.0,1.0,0.423316,0.422508,0.065185,0.112903
tot,vit,False,0.993376,0.011363,1.8e-05,3.1e-05,4.0,2.645751,0.341161,0.465834,0.479615,0.436744
tot,vit,True,0.922666,0.067673,0.000177,0.000307,1.0,0.0,0.035057,0.050085,0.457707,0.131691


## Exp 1. Reasoning performance of prompt engineering

In [118]:
# Get the fewshot=False and evaluator=vit
exp1_df = result[(result['fewshot'] == False) & (result['evaluator'] == 'vit')]
exp1_df.groupby(experiment_cols_without_seed).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,seed,Max Similarity Iteration,Max Similarity,Diversity at Max Similarity,Max Normalized Similarity Iteration,Max Normalized Similarity,Diversity at Max Normalized Similarity,Final Iteration,Final Similarity,Final Diversity,Final Normalized Similarity,Final Normalized Diversity
pe,evaluator,fewshot,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
got,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
io,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
tot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3


In [119]:
exp1_df.groupby(experiment_cols_without_seed).agg(plotter)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Max Normalized Similarity,Max Normalized Similarity,Diversity at Max Normalized Similarity,Diversity at Max Normalized Similarity,Max Similarity Iteration,Max Similarity Iteration,Final Normalized Similarity,Final Normalized Similarity,Final Normalized Diversity,Final Normalized Diversity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
pe,evaluator,fewshot,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
cot,vit,False,0.977454,0.021648,0.000276,0.000276,2.666667,0.57735,0.349105,0.559748,0.345809,0.30005
got,vit,False,0.937511,0.067052,0.002784,0.004822,2.666667,2.081666,0.18828,0.19636,0.194201,0.288931
io,vit,False,0.495079,0.45218,0.076373,0.130097,1.0,0.0,0.181415,0.266251,0.409706,0.523341
tot,vit,False,0.993376,0.011363,1.8e-05,3.1e-05,4.0,2.645751,0.341161,0.465834,0.479615,0.436744


In [120]:
print_result_table(exp1_df, category_columns=['pe'])

Iteration Type,Best Similarity Iteration (Mean ± Std),Best Similarity Iteration (Mean ± Std),Best Similarity Iteration (Mean ± Std),Final Iteration (Mean ± Std),Final Iteration (Mean ± Std)
Metric,Iteration,Similarity,Diversity,Similarity,Diversity
pe,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
io,1.000 ± 0.000,0.495 ± 0.452,0.076 ± 0.130,0.181 ± 0.266,0.410 ± 0.523
cot,2.667 ± 0.577,0.977 ± 0.022,0.000 ± 0.000,0.349 ± 0.560,0.346 ± 0.300
tot,4.000 ± 2.646,0.993 ± 0.011,0.000 ± 0.000,0.341 ± 0.466,0.480 ± 0.437
got,2.667 ± 2.082,0.938 ± 0.067,0.003 ± 0.005,0.188 ± 0.196,0.194 ± 0.289


## Exp. 2: LLM and ViT-based node evaluation 

In [121]:
exp2_df = result[(result['fewshot'] == False)]
exp2_df = exp2_df[exp2_df['pe'].isin(['tot', 'got'])]
exp2_df.groupby(experiment_cols_without_seed).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,seed,Max Similarity Iteration,Max Similarity,Diversity at Max Similarity,Max Normalized Similarity Iteration,Max Normalized Similarity,Diversity at Max Normalized Similarity,Final Iteration,Final Similarity,Final Diversity,Final Normalized Similarity,Final Normalized Diversity
pe,evaluator,fewshot,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
got,llm,False,3,3,3,3,3,3,3,3,3,3,3,3
got,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
tot,llm,False,3,3,3,3,3,3,3,3,3,3,3,3
tot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3


In [122]:
# get only cot and tot
exp2_df.groupby(experiment_cols_without_seed).agg(plotter)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Max Normalized Similarity,Max Normalized Similarity,Diversity at Max Normalized Similarity,Diversity at Max Normalized Similarity,Max Similarity Iteration,Max Similarity Iteration,Final Normalized Similarity,Final Normalized Similarity,Final Normalized Diversity,Final Normalized Diversity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
pe,evaluator,fewshot,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
got,llm,False,0.945404,0.064435,0.000516,0.000787,1.666667,1.154701,0.021705,0.014645,0.177915,0.280243
got,vit,False,0.937511,0.067052,0.002784,0.004822,2.666667,2.081666,0.18828,0.19636,0.194201,0.288931
tot,llm,False,0.883435,0.180874,0.105889,0.183387,2.0,1.0,0.423316,0.422508,0.065185,0.112903
tot,vit,False,0.993376,0.011363,1.8e-05,3.1e-05,4.0,2.645751,0.341161,0.465834,0.479615,0.436744


In [123]:
print_result_table(exp2_df, category_columns=['evaluator', 'pe'])

Unnamed: 0_level_0,Iteration Type,Best Similarity Iteration (Mean ± Std),Best Similarity Iteration (Mean ± Std),Best Similarity Iteration (Mean ± Std),Final Iteration (Mean ± Std),Final Iteration (Mean ± Std)
Unnamed: 0_level_1,Metric,Iteration,Similarity,Diversity,Similarity,Diversity
evaluator,pe,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
llm,tot,2.000 ± 1.000,0.883 ± 0.181,0.106 ± 0.183,0.423 ± 0.423,0.065 ± 0.113
llm,got,1.667 ± 1.155,0.945 ± 0.064,0.001 ± 0.001,0.022 ± 0.015,0.178 ± 0.280
vit,tot,4.000 ± 2.646,0.993 ± 0.011,0.000 ± 0.000,0.341 ± 0.466,0.480 ± 0.437
vit,got,2.667 ± 2.082,0.938 ± 0.067,0.003 ± 0.005,0.188 ± 0.196,0.194 ± 0.289


## Exp 3: Controlling the first iteration reward function

In [124]:
# Get the fewshot=True and evaluator=vit
exp3_df = result[(result['pe'] != 'io') & (result['evaluator'] == 'vit')]
exp3_df.groupby(experiment_cols_without_seed).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,seed,Max Similarity Iteration,Max Similarity,Diversity at Max Similarity,Max Normalized Similarity Iteration,Max Normalized Similarity,Diversity at Max Normalized Similarity,Final Iteration,Final Similarity,Final Diversity,Final Normalized Similarity,Final Normalized Diversity
pe,evaluator,fewshot,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
cot,vit,True,3,3,3,3,3,3,3,3,3,3,3,3
got,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
got,vit,True,3,3,3,3,3,3,3,3,3,3,3,3
tot,vit,False,3,3,3,3,3,3,3,3,3,3,3,3
tot,vit,True,3,3,3,3,3,3,3,3,3,3,3,3


In [125]:
exp3_df.groupby(experiment_cols_without_seed).agg(plotter)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Max Normalized Similarity,Max Normalized Similarity,Diversity at Max Normalized Similarity,Diversity at Max Normalized Similarity,Max Similarity Iteration,Max Similarity Iteration,Final Normalized Similarity,Final Normalized Similarity,Final Normalized Diversity,Final Normalized Diversity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
pe,evaluator,fewshot,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
cot,vit,False,0.977454,0.021648,0.000276,0.000276,2.666667,0.57735,0.349105,0.559748,0.345809,0.30005
cot,vit,True,0.916787,0.057077,0.001381,0.002308,3.0,2.645751,0.336769,0.558513,0.292813,0.304952
got,vit,False,0.937511,0.067052,0.002784,0.004822,2.666667,2.081666,0.18828,0.19636,0.194201,0.288931
got,vit,True,0.933643,0.060379,0.003584,0.005928,2.333333,2.309401,0.259426,0.393623,0.110371,0.161636
tot,vit,False,0.993376,0.011363,1.8e-05,3.1e-05,4.0,2.645751,0.341161,0.465834,0.479615,0.436744
tot,vit,True,0.922666,0.067673,0.000177,0.000307,1.0,0.0,0.035057,0.050085,0.457707,0.131691


In [127]:
# Short the dataframe with fewshot True is on top
exp3_df = exp3_df.sort_values(by='fewshot', ascending=True)
print_result_table(exp3_df, category_columns=['pe', 'fewshot'])

Unnamed: 0_level_0,Iteration Type,Best Similarity Iteration (Mean ± Std),Best Similarity Iteration (Mean ± Std),Best Similarity Iteration (Mean ± Std),Final Iteration (Mean ± Std),Final Iteration (Mean ± Std)
Unnamed: 0_level_1,Metric,Iteration,Similarity,Diversity,Similarity,Diversity
pe,fewshot,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
cot,False,2.667 ± 0.577,0.977 ± 0.022,0.000 ± 0.000,0.349 ± 0.560,0.346 ± 0.300
cot,True,3.000 ± 2.646,0.917 ± 0.057,0.001 ± 0.002,0.337 ± 0.559,0.293 ± 0.305
tot,False,4.000 ± 2.646,0.993 ± 0.011,0.000 ± 0.000,0.341 ± 0.466,0.480 ± 0.437
tot,True,1.000 ± 0.000,0.923 ± 0.068,0.000 ± 0.000,0.035 ± 0.050,0.458 ± 0.132
got,False,2.667 ± 2.082,0.938 ± 0.067,0.003 ± 0.005,0.188 ± 0.196,0.194 ± 0.289
got,True,2.333 ± 2.309,0.934 ± 0.060,0.004 ± 0.006,0.259 ± 0.394,0.110 ± 0.162


## Appendix: Additional Analysis