Success Rate Numbers for Blocksworld (cumulative % of problems solved after each round) - rows for configurations with no result files are all 0s

In [7]:
## SUCCESS RATES AT EACH ROUND (INCL. O1) + stats to prove repetition does not matter. 

import json

# ---- configuration ----

NUM_ROUNDS = 5
NUM_BASIC_PROBLEMS = 50
NUM_LARGER_PROBLEMS = 20
NUM_LAGER_PROBLEMS = NUM_LARGER_PROBLEMS  # misspelled original name, kept as an alias

PRICE_PER_MINI_TOKEN = 12 / 1000000 # $12 per 1M output
PRICE_PER_PREVIEW_TOKEN = 60 / 1000000 # $60 per 1M output

RESULTS_ROOT = 'paper_data/responses/blocksworld'

PARENT_FOLDERS = [
    'baseline_o1_results',
    'o1_mini_results/main_results',
    'o1_mini_results/error_correction_through_repetition',
    'larger_problem_sizes/large_test_problems',
    'larger_problem_sizes/mid_test_problems'
]

SUBFOLDERS = [
    'no_strategy',
    'handwritten',
    'strategy_1',
    'strategy_2',
    'strategy_3'
]

# only these parent folders have their per-problem response files read for token counts
COST_ANALYSIS_FOLDERS = [
    'baseline_o1_results',
    'o1_mini_results/main_results'
]


# ---- helpers ----

def floor_percent(count, total):
    """Return count / total as a whole-number percentage, rounded down.

    Integer arithmetic is used on purpose: int(count / total * 100) goes through
    floats and can land just below the true value (29 of 50 comes out as 57).
    """
    return count * 100 // total


def cumulative_percentages(successes_per_round, num_problems):
    """Return, for each round, the running total of successes as a floored percentage.

    successes_per_round -- list of per-round success counts
    num_problems        -- denominator for the percentage
    """
    percentages = []
    running_total = 0
    for round_successes in successes_per_round:
        running_total += round_successes
        percentages.append(floor_percent(running_total, num_problems))
    return percentages


def count_successes(round_results):
    """Count the entries of a solution_results list whose test result is 'SUCCESS'."""
    return sum(1 for data in round_results if data['result_data']['test_result']['result'] == 'SUCCESS')


def mean_or_dash(values):
    """Return the mean of values truncated to an int, or '-' when values is empty."""
    return int(sum(values) / len(values)) if values else '-'


# ---- load results ----

# success_rate_dict[parent][subfolder]        -> list of NUM_ROUNDS success counts
# cost_dict[parent][subfolder][round_num]     -> list of reasoning-token counts, one per response file
success_rate_dict = {}
cost_dict = {parent : {subfolder:[[] for _ in range(NUM_ROUNDS)] for subfolder in SUBFOLDERS} for parent in COST_ANALYSIS_FOLDERS}

for parent_folder in PARENT_FOLDERS:
    success_rate_dict[parent_folder] = {}
    for subfolder_name in SUBFOLDERS:
        success_rate_dict[parent_folder][subfolder_name] = []
        for round_num in range(NUM_ROUNDS):
            round_dir = f'{RESULTS_ROOT}/{parent_folder}/{subfolder_name}/round_{round_num}'
            try:
                with open(f'{round_dir}/solution_results.json') as infile:
                    round_results = json.load(infile)
            except FileNotFoundError:
                # a round without a results file counts as 0 successes and has no token data
                success_rate_dict[parent_folder][subfolder_name].append(0)
                continue

            success_rate_dict[parent_folder][subfolder_name].append(count_successes(round_results))

            if parent_folder not in COST_ANALYSIS_FOLDERS:
                continue

            for result_data in round_results:
                with open(f"{round_dir}/{result_data['problem_data']['tag']}_response.json") as infile:
                    cost_dict[parent_folder][subfolder_name][round_num].append(json.load(infile)['response']['reasoning_tokens'])


# ---- success rates ----

print('\n======= SUCCESS RATES ========\n')

# small correction for ec with repetition: its round 0 count is taken from the main results
for subfolder_name in SUBFOLDERS:
    success_rate_dict['o1_mini_results/error_correction_through_repetition'][subfolder_name][0] = success_rate_dict['o1_mini_results/main_results'][subfolder_name][0]

for parent_key in success_rate_dict:
    print(f'\n{parent_key}\n------\n')
    num_problems = NUM_LARGER_PROBLEMS if parent_key.startswith('larger_problem_sizes') else NUM_BASIC_PROBLEMS
    for key in success_rate_dict[parent_key]:
        print(key, cumulative_percentages(success_rate_dict[parent_key][key], num_problems))


# ---- cost analysis ----

# columns: parent | subfolder | mean tokens in round 0 | mean tokens in rounds 1+ |
#          total tokens | total cost | total cost per success
print('\n======= COST ANALYSIS ========\n')
for parent_key in cost_dict:
    token_cost = PRICE_PER_PREVIEW_TOKEN if parent_key == 'baseline_o1_results' else PRICE_PER_MINI_TOKEN
    for key in cost_dict[parent_key]:
        tokens_by_round = cost_dict[parent_key][key]
        initial_tokens = tokens_by_round[0]
        # flatten rounds 1+ once instead of repeating sum(list_of_lists, [])
        ec_tokens = [tokens for round_tokens in tokens_by_round[1:] for tokens in round_tokens]

        initial_tpt = mean_or_dash(initial_tokens)
        ec_tpt = mean_or_dash(ec_tokens)
        total_tokens = sum(initial_tokens) + sum(ec_tokens)
        total_cost = round(total_tokens * token_cost, 2)
        total_successes = sum(success_rate_dict[parent_key][key])
        total_cps = round(total_cost / total_successes, 2) if total_successes else '-'
        print(' | '.join([str(x) for x in (parent_key, key, initial_tpt, ec_tpt, total_tokens, total_cost, total_cps)]))





baseline_o1_results
------

no_strategy [88, 98, 100, 100, 100]
handwritten [0, 0, 0, 0, 0]
strategy_1 [0, 0, 0, 0, 0]
strategy_2 [0, 0, 0, 0, 0]
strategy_3 [0, 0, 0, 0, 0]

o1_mini_results/main_results
------

no_strategy [30, 40, 54, 66, 68]
handwritten [98, 100, 100, 100, 100]
strategy_1 [56, 76, 90, 92, 94]
strategy_2 [50, 66, 76, 86, 90]
strategy_3 [32, 60, 70, 82, 86]

o1_mini_results/error_correction_through_repetition
------

no_strategy [30, 44, 52, 56, 62]
handwritten [98, 98, 98, 98, 98]
strategy_1 [56, 72, 80, 86, 90]
strategy_2 [50, 72, 84, 90, 90]
strategy_3 [32, 56, 70, 80, 86]

larger_problem_sizes/large_test_problems
------

no_strategy [0, 0, 0, 0, 5]
handwritten [70, 85, 100, 100, 100]
strategy_1 [0, 0, 0, 0, 10]
strategy_2 [0, 0, 0, 0, 0]
strategy_3 [0, 0, 0, 0, 0]

larger_problem_sizes/mid_test_problems
------

no_strategy [0, 5, 10, 15, 25]
handwritten [85, 100, 100, 100, 100]
strategy_1 [5, 20, 25, 30, 35]
strategy_2 [0, 0, 0, 0, 0]
strategy_3 [0, 0, 0, 0, 0]


Token and Cost Analysis for Blocksworld

CRT3 TABLE (ERROR CORRECTION STATS)

In [8]:
import json

# denominator for the percentages this cell prints
NUM_PROBLEMS = 150

# folder names under paper_data/responses/crt/
MODEL_TYPES = ['4o', '4o_mini', '3.5_turbo']

STRATEGY_TYPES = [
    'no_strategy',
    'handwritten_strategy',
    *(f'generated_strategy_{number}' for number in range(1, 4)),
]

# success_rate_dict[model][strategy] -> list that receives one count per round;
# every (model, strategy) pair gets its own fresh list
success_rate_dict = {}
for model in MODEL_TYPES:
    success_rate_dict[model] = {}
    for strategy in STRATEGY_TYPES:
        success_rate_dict[model][strategy] = []

# For every strategy/model pair, read rounds 0-2 and record how many entries are 'CORRECT'.
for strategy in STRATEGY_TYPES:
    for model in MODEL_TYPES:
        correct_per_round = success_rate_dict[model][strategy]
        for round_num in range(3):
            results_path = f'paper_data/responses/crt/{model}/{strategy}/round_{round_num}_results.json'
            with open(results_path) as infile:
                result_data = json.load(infile)
            # summing booleans counts the entries whose result is 'CORRECT'
            correct_per_round.append(sum(result_data[key]['result'] == 'CORRECT' for key in result_data))

# One output row per strategy. For each model (in MODEL_TYPES order) two whole-number
# percentages are printed: the round-0 count and the count summed over all rounds,
# both relative to NUM_PROBLEMS.
for strategy in STRATEGY_TYPES:
    print(strategy, end = ' ')
    for model in MODEL_TYPES:
        ini = success_rate_dict[model][strategy][0]
        total = sum(success_rate_dict[model][strategy])
        # Integer floor division keeps the percentage exact; the float form
        # int(x / NUM_PROBLEMS * 100) can come out one too low (87 of 150 gives 57, not 58).
        print(' '.join([str(x * 100 // NUM_PROBLEMS) for x in (ini, total)]), end = ' ')
    print()


no_strategy 75 92 56 85 2 16 
handwritten_strategy 96 96 90 97 28 57 
generated_strategy_1 87 95 82 96 33 64 
generated_strategy_2 86 94 79 94 24 61 
generated_strategy_3 91 95 80 93 7 14 


In [9]:
# average tokens per blocks strategy - supports the analysis on page 6 

import json

# fields of each file's 'response' section that get averaged
of_interest = ['reasoning_tokens', 'output_tokens']

# data[field] -> one value per generated strategy response file
data = {}
for key in of_interest:
    data[key] = []

for strategy_number in (1, 2, 3):
    response_path = f'paper_data/strategies/blocksworld/responses/generated_strategy_{strategy_number}_response.json'
    with open(response_path) as infile:
        strategy_response_data = json.load(infile)
    response = strategy_response_data['response']
    for key in of_interest:
        data[key].append(response[key])

# print the mean of each field, truncated to an int
for key, values in data.items():
    print(key, int(sum(values) / len(values)))

reasoning_tokens 3157
output_tokens 1137


In [10]:
# to show that the cause of token usage decrease is not directly from success rate

import json

NUM_ROUNDS = 5

# strategy folders compared in this cell
SUBFOLDERS = ['no_strategy'] + [f'strategy_{number}' for number in range(1, 4)]

# per-strategy lists of token counts, split by whether the attempt succeeded;
# each strategy gets its own fresh list in both dicts
correct_data = {}
incorrect_data = {}
for strat in SUBFOLDERS:
    correct_data[strat] = []
    incorrect_data[strat] = []

# Walk every round of every strategy in the o1-mini main results and file each
# response's reasoning-token count under correct_data or incorrect_data.
for strat in SUBFOLDERS:
    for round_num in range(NUM_ROUNDS):
        round_dir = f'paper_data/responses/blocksworld/o1_mini_results/main_results/{strat}/round_{round_num}'
        with open(f'{round_dir}/solution_results.json') as infile:
            result_data = json.load(infile)

        for entry in result_data:
            tag = entry['problem_data']['tag']
            result = entry['result_data']['test_result']['result']
            # the token count lives in a separate per-problem response file
            with open(f'{round_dir}/{tag}_response.json') as infile:
                tokens_used = json.load(infile)['response']['reasoning_tokens']
            bucket = correct_data if result == 'SUCCESS' else incorrect_data
            bucket[strat].append(tokens_used)

def get_avg(numlist):
    """Return the mean of numlist truncated to an int.

    An empty list returns the placeholder '-' instead of raising
    ZeroDivisionError, so a category with no entries can still be printed.
    """
    if not numlist:
        return '-'
    return int(sum(numlist) / len(numlist))


# Row 1: average reasoning tokens of successful vs. unsuccessful attempts without a strategy.
baseline = 'no_strategy'
print(baseline, get_avg(correct_data[baseline]), get_avg(incorrect_data[baseline]))

# Row 2: the same two averages, pooled over every subfolder after the first.
strategy_names = SUBFOLDERS[1:]
pooled_correct = [tokens for name in strategy_names for tokens in correct_data[name]]
pooled_incorrect = [tokens for name in strategy_names for tokens in incorrect_data[name]]
print('strategy', get_avg(pooled_correct), get_avg(pooled_incorrect))


## Not only is strategy lower in tokens for both success and failure, 
## the difference of ~900 tokens on average is not notable enough to explain 
## a 2000 token shift across only a 30% difference in success rates (starting at just 16%)


no_strategy 5048 6897
strategy 4148 5083
