In [1]:
import argparse
from functools import partial
from rank_bm25 import BM25Okapi
from models import gpts, claude 
from utils import load_json, save_json, generate_episodic_retrieval_queries, generate_semantic_retrieval_queries, generate_episodic_semantic_retrieval_queries, generate_random_retrieval_queries, generate_constant_retrieval_queries
from USACOBench.prompts import solve_prompt_fn, retrieval_prompt_fn, reflexion_prompt_fn, solve_detailed_prompt_fn, RetrievalType
from USACOBench.data_utils import load_corpus, load_problem_dict, load_problems
from evaluate import evaluate_model
from USACOBench.evaluation import print_metrics
from dotenv import load_dotenv
from utils import run_solve, run_retrieval, run_reflexion, calculate_final_rs, search
from USACOBench.evaluation.metrics import pass_at_k
from USACOBench.evaluation.result_type import ResultType
from collections import Counter
from enum import Enum

# Notebook-wide configuration: which model to query and which problem set to evaluate on.
model_name = 'gpt-4-turbo'
problem_dict = load_problem_dict('usaco_subset307')
# Bind the model name once so every later call can use model_fn without repeating it.
model_fn = partial(gpts, model=model_name)
# Pull settings from a local .env file into the environment (presumably API credentials — confirm).
load_dotenv()

def combine_list_dicts(solution_dict1, solution_dict2):
    """Merge two ``problem_id -> list`` dicts by concatenating their lists.

    Args:
        solution_dict1: Maps problem_id to a list of entries (e.g. attempts).
        solution_dict2: Second mapping with the same value type.

    Returns:
        A new dict. For every problem_id found in either input, the value is
        the entries from ``solution_dict1`` followed by those from
        ``solution_dict2``. Keys keep the order of ``solution_dict1``, followed
        by keys that only appear in ``solution_dict2``. Inputs are not mutated.
    """
    res = dict()
    # dict.fromkeys gives an order-preserving union of both key sets, so a
    # problem missing from one run neither raises KeyError nor gets dropped.
    for problem_id in dict.fromkeys([*solution_dict1, *solution_dict2]):
        res[problem_id] = solution_dict1.get(problem_id, []) + solution_dict2.get(problem_id, [])
    return res

import nest_asyncio
nest_asyncio.apply()

  from .autonotebook import tqdm as notebook_tqdm


## Retrieval Over Synthetic Knowledge Bases

#### Precursor 1:
One concern I have is that, if I recall correctly, retrieval with just the problem statement as the query did just as well as (if not better than) retrieval with the problem together with the solution to the problem as the query. This may suggest some memorization at play...?

Alternatively, it may suggest that similar problem environments (reasoning about the same types of problems) are what matters most for retrieval, which would be interesting. Let's verify these results again here.

In [2]:
# Running just solve over all queries
# Baseline without retrieval: one attempt per problem in problem_dict.
# The four returned values are reused by later cells (rs -> pass_at_k, ss -> run_retrieval).
attempts = 1
rdict, sdict, rs, ss = run_solve(model_fn, model_name, problem_dict, attempts)

Evaluating on a subset of 307 out of 307 available query-ground_truth pairs...
Evaluating on 307 queries...
Generating...


  0%|          | 0/307 [00:00<?, ?it/s]

100%|██████████| 307/307 [02:19<00:00,  2.20it/s]


Finished generation, took 139.36070942878723 seconds
Judging...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_02_2024_19_46_23_023715.pred'
Code execution failed, skipping this attempt...


Process Process-575:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/benshi34/USACOnew/USACOBench/evaluation/judges/usaco_utils.py", line 90, in unsafe_execute
    with create_tempdir():
  File "/usr/lib/python3.10/contextlib.py", line 142, in __exit__
  File "/home/benshi34/USACOnew/USACOBench/evaluation/judges/usaco_utils.py", line 199, in create_tempdir
    with tempfile.TemporaryDirectory() as dirname:
  File "/usr/lib/python3.10/tempfile.py", line 1017, in __exit__
    self.cleanup()
  File "/usr/lib/python3.10/tempfile.py", line 1021, in cleanup
    self._rmtree(self.name, ignore_errors=self._ignore_cleanup_errors)
  File "/usr/lib/python3.10/tempfile.py", line 1003, in _rmtree
    _rmtree(name, onerror=onerror)
  File "/usr/lib/python3.10/shutil.py", line 725, in rm

[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_02_2024_19_46_41_269317.pred'
Code execution failed, skipping this attempt...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_02_2024_19_47_36_125475.pred'
Code execution failed, skipping this attempt...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_02_2024_19_48_23_907348.pred'
Code execution failed, skipping this attempt...


100%|██████████| 307/307 [00:00<00:00, 299106.00it/s]


Saving results at judge_sandbox/result_sets_08_02_2024_19_46_19_597209.pickle...
Finished judging, took 287.2698097229004 seconds
Saved json at results/results_gpt-4-turbo_solve_1attempts_08_02_2024_19_51_06_868660.json


In [2]:
# Reload the saved gpt-4-turbo solve results instead of re-running generation and judging,
# then report pass_at_k over the reloaded result sets.
rdict, sdict, rs, ss = load_json('results/results_gpt-4-turbo_solve_1attempts_08_02_2024_19_51_06_868660')
pass_at_k(rs)

(0.05537459283387622, 0.013053167062670616)

A little interesting that the performance is so low... I remember it being a lot better on ionic.

In [9]:
# Episodic Retrieval with Problem Descrip + Generated Solution Query
# One attempt per problem, retrieving a single similar problem per query.
attempts = 1
num_retrieved = 1
# `ss` (solution sets left by the solve/load cell) is passed in — presumably so the queries can
# include the generated solution text; confirm in run_retrieval — and is then rebound to this
# run's own solution sets, together with rdict/sdict/rs.
rdict, sdict, rs, ss = run_retrieval(model_fn, model_name, problem_dict, attempts, ss, num_retrieved, RetrievalType.EPISODIC)

Saved json at queries_firstsolve_1problem_episodic_08_02_2024_20_02_27_029838.json
Evaluating on a subset of 307 out of 307 available query-ground_truth pairs...
Evaluating on 307 queries...
Generating...


100%|██████████| 307/307 [02:10<00:00,  2.36it/s]


Finished generation, took 130.28575921058655 seconds
Judging...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_02_2024_20_06_25_188861.pred'
Code execution failed, skipping this attempt...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_02_2024_20_07_02_838283.pred'
Code execution failed, skipping this attempt...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_02_2024_20_07_03_377422.pred'
Code execution failed, skipping this attempt...


100%|██████████| 307/307 [00:00<00:00, 318489.07it/s]


Saving results at judge_sandbox/result_sets_08_02_2024_20_04_37_330881.pickle...
Finished judging, took 287.1852343082428 seconds
Saved json at results/results_gpt-4-turbo_episodic_retrieval_1attempts_08_02_2024_20_09_24_517730.json


In [10]:
pass_at_k(rs)

(0.11400651465798045, 0.01813889851472319)

In [3]:
# Episodic Retrieval with Problem Descrip Query
# Same setting as the description + solution run, but with use_text=False
# (presumably this drops the solution text from the query — confirm in run_retrieval).
attempts = 1
num_retrieved = 1
# NOTE(review): `ss` is whatever the most recently executed cell left behind. Run top-to-bottom,
# that is the output of the previous retrieval run rather than the plain solve — confirm this is
# intended, or reload the solve results before this cell.
rdict, sdict, rs, ss = run_retrieval(model_fn, model_name, problem_dict, attempts, ss, num_retrieved, RetrievalType.EPISODIC, use_text=False)

Saved json at queries_firstsolve_1problem_episodic_08_02_2024_20_28_08_105072.json
Evaluating on a subset of 307 out of 307 available query-ground_truth pairs...
Evaluating on 307 queries...
Generating...


100%|██████████| 307/307 [02:03<00:00,  2.49it/s]


Finished generation, took 123.44882607460022 seconds
Judging...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_02_2024_20_30_56_668443.pred'
Code execution failed, skipping this attempt...


100%|██████████| 307/307 [00:00<00:00, 303691.35it/s]


Saving results at judge_sandbox/result_sets_08_02_2024_20_30_11_572542.pickle...
Finished judging, took 287.1614258289337 seconds
Saved json at results/results_gpt-4-turbo_episodic_retrieval_1attempts_08_02_2024_20_34_58_735782.json


In [4]:
pass_at_k(rs)

(0.11726384364820847, 0.018362353022901092)

Ok so...

It isn't necessarily better, but it performs very similarly. What can we say about this? It seems that the problem environment is the most important factor for retrieval. (Main interpretation?)

## Part 1: Benchmarking Synthetic Knowledge Retrieval

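We again use leave-one-out cross-validation (LOOCV) to estimate maximum retrieval effectiveness: each problem's own entry is excluded from the bank when retrieving for that problem.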

#### Part 1.1 Generating the synthetic knowledge base

1. Write prompt for a more readable solution. (Generate solution first and then code)
2. Try to solve every problem that can be solved.
3. Try settings where a mix of incorrect and correct solutions is included in the bank, as well as settings with only correct solutions (probably only correct solutions will work?)

In [None]:
# Solve every problem with the detailed prompt function; the saved output is
# the raw material for the synthetic retrieval banks.
attempts = 3
# One query per problem: its ID plus the problem statement.
queries = [
    {'problem_id': pid, 'problem_description': entry['description']}
    for pid, entry in problem_dict.items()
]

rdict, sdict, rs, ss = evaluate_model(
    model_fn,
    solve_detailed_prompt_fn,
    queries=queries,
    verbose=True,
    attempts=attempts,
    problem_ids=list(problem_dict),
)
# Persist all four outputs together so later cells can reload them with load_json.
save_json([rdict, sdict, rs, ss], f'results/results_{model_name}_solve_{attempts}attempts_detailed_prompt')

In [9]:
# Only takes in aggregated solution_dicts
class RetrievalBankType(int, Enum):
    """Policy used by gen_retrieval_bank to choose one stored solution per problem."""
    # Keep only a solution that passed all of its tests.
    CORRECT = 1
    # Keep the solution that passed the most tests, provided it passed at least one.
    P_CORRECT = 2
    # Same choice as P_CORRECT, but fall back to the last attempt when none passed any test.
    GENERAL = 3
    

def gen_retrieval_bank(solution_dict, option):
    """Select one solution per problem to form a retrieval bank.

    Args:
        solution_dict: Maps problem_id to a list of attempts. Each attempt is a
            dict whose 'result' entry provides 'num_tests' and 'num_passed'.
        option: A RetrievalBankType selecting the policy:
            CORRECT   - keep the last attempt that passed every test;
            P_CORRECT - keep the attempt with the most passed tests, only if
                        it passed at least one;
            GENERAL   - as P_CORRECT, but fall back to the last attempt when
                        no attempt passed any test.

    Returns:
        Dict mapping problem_id to the chosen attempt. Problems with no
        eligible attempt (including problems with an empty attempt list) are
        omitted.
    """
    retrieval_bank = dict()

    for problem_id, solution_set in solution_dict.items():
        max_tests_passed = 0
        best_solution = None
        for solution in solution_set:
            result = solution['result']
            if option == RetrievalBankType.CORRECT:
                # Require at least one test: a run with zero tests (nothing was
                # judged) must not count as having passed everything.
                if result['num_tests'] and result['num_passed'] == result['num_tests']:
                    best_solution = solution
            elif result['num_passed'] > max_tests_passed:
                # Strict '>' keeps the earliest attempt among ties and ignores
                # attempts that passed nothing.
                max_tests_passed = result['num_passed']
                best_solution = solution

        # Compare against None explicitly so a falsy-but-valid attempt is kept.
        if best_solution is not None:
            retrieval_bank[problem_id] = best_solution
        elif option == RetrievalBankType.GENERAL and solution_set:
            # Nothing passed any test: GENERAL still stores the last attempt.
            retrieval_bank[problem_id] = solution_set[-1]

    return retrieval_bank

# Aggregating solution texts
# Each saved results file holds [rdict, sdict, rs, ss]; index 1 is the per-problem solution dict
# and index 0 the per-problem result dict. The 2-attempt and 3-attempt detailed-prompt runs are
# merged so that each problem carries the attempts from both runs.
results1 = load_json('results/results_gpt-4-turbo_solve_2attempts_detailed_prompt_08_03_2024_12_05_38_124001')
results2 = load_json('results/results_gpt-4-turbo_solve_3attempts_detailed_prompt_08_03_2024_12_24_27_523579')
solution_dicts_combined = combine_list_dicts(results1[1], results2[1])
# NOTE(review): result_dicts_combined is not referenced again in the visible notebook.
result_dicts_combined = combine_list_dicts(results1[0], results2[0])

# One bank per selection policy, all built from the same merged attempts.
retrieval_bank_correct = gen_retrieval_bank(solution_dicts_combined, RetrievalBankType.CORRECT)
retrieval_bank_p_correct = gen_retrieval_bank(solution_dicts_combined, RetrievalBankType.P_CORRECT)
retrieval_bank_general = gen_retrieval_bank(solution_dicts_combined, RetrievalBankType.GENERAL)

198

In [59]:
# Performing retrieval over these retrieval banks:
# Solutions are the query solutions
def generate_bank_retrieval_queries(p, problem_dict, solutions, retrieval_bank, use_text=True):
    """Build leave-one-out BM25 retrieval queries over a synthetic solution bank.

    For each query problem, the bank entry of that same problem is excluded,
    the remaining entries are ranked with BM25 against the query text, and the
    top ``p`` entries are rendered into a retrieval section for the prompt.

    Args:
        p: Number of bank entries to retrieve per query.
        problem_dict: Maps problem_id to a dict with a 'description' string.
            Every problem_id in ``retrieval_bank`` must be present here.
        solutions: Iterable of solution sets. Only the first element of each
            set is read; it must provide 'problem_id' and 'solution'.
            Sets whose problem_id is not in ``problem_dict`` are skipped.
        retrieval_bank: Maps problem_id to a dict with a 'solution' string.
        use_text: If True, the query is the problem description followed by
            the query solution text. If False, the query is the problem
            description alone.

    Returns:
        A list with one dict per query, in input order, with the keys
        'problem_id', 'retrieval_text', 'retrieval_problem_ids' and
        'problem_description'.
    """
    ordinals = ["First", "Second", "Third", "Fourth", "Fifth", "Sixth", "Seventh"]

    # Render and tokenize every bank document once; only the leave-one-out
    # selection (and the BM25 index built from it) changes per query.
    bank_ids = list(retrieval_bank.keys())
    bank_docs = {
        bank_id: problem_dict[bank_id]['description'] + '\nSolution: \n' + retrieval_bank[bank_id]['solution']
        for bank_id in bank_ids
    }
    bank_tokens = {bank_id: doc.split(' ') for bank_id, doc in bank_docs.items()}

    resulting_queries = []
    for solution in solutions:
        first_attempt = solution[0]
        problem_id = first_attempt['problem_id']
        if problem_id not in problem_dict:
            continue

        description = problem_dict[problem_id]['description']
        if use_text:
            query_text = description + '\n' + first_attempt['solution']
        else:
            # Description-only query: the generated solution must not leak in.
            query_text = description

        # Leave-one-out: a problem may never retrieve its own bank entry.
        eligible_ids = [bank_id for bank_id in bank_ids if bank_id != problem_id]

        similar_problem_ids = []
        if eligible_ids:  # BM25Okapi cannot be built over an empty corpus
            bm25 = BM25Okapi([bank_tokens[bank_id] for bank_id in eligible_ids])
            # get_top_n returns the items of its `documents` argument at the
            # best-scoring positions, so handing it the aligned ID list yields
            # the retrieved IDs directly — no text matching, which would
            # over-count when two bank documents have identical text.
            similar_problem_ids = bm25.get_top_n(query_text.split(" "), eligible_ids, n=p)

        similar_problem_text = ""
        for rank, bank_id in enumerate(similar_problem_ids):
            # Fall back to a numeric label once the spelled-out ordinals run out.
            label = ordinals[rank] if rank < len(ordinals) else f"#{rank + 1}"
            similar_problem_text += f"\n\n {label} problem and solution \n\n" + bank_docs[bank_id]

        resulting_queries.append({
            'problem_id': problem_id,
            'retrieval_text': '[BEGIN SIMILAR PROBLEMS]\n' + similar_problem_text + '\n[END SIMILAR PROBLEMS]\n',
            'retrieval_problem_ids': similar_problem_ids,
            'problem_description': description,
        })
    return resulting_queries

# Query solutions come from the 2-attempt detailed-prompt run (one of the two runs the banks are built from).
rdict, sdict, rs, ss = load_json('results/results_gpt-4-turbo_solve_2attempts_detailed_prompt_08_03_2024_12_05_38_124001')
# Keep only the first attempt of every solution set; each set stays a one-element list.
ss = [[solution_set[0]] for solution_set in ss]

In [60]:
# Build one query list per retrieval bank: p=1 retrieved problem per query, use_text left at its default (True).
retrieval_bank_correct_queries = generate_bank_retrieval_queries(1, problem_dict, ss, retrieval_bank_correct)
retrieval_bank_p_correct_queries = generate_bank_retrieval_queries(1, problem_dict, ss, retrieval_bank_p_correct)
retrieval_bank_general_queries = generate_bank_retrieval_queries(1, problem_dict, ss, retrieval_bank_general)

In [64]:
save_json(retrieval_bank_correct_queries, 'queries_retrieval_bank_correct')
save_json(retrieval_bank_p_correct_queries, 'queries_retrieval_bank_p_correct')
save_json(retrieval_bank_general_queries, 'queries_retrieval_bank_general')

Saved json at queries_retrieval_bank_correct_08_03_2024_13_30_54_917355.json
Saved json at queries_retrieval_bank_p_correct_08_03_2024_13_30_54_931327.json
Saved json at queries_retrieval_bank_general_08_03_2024_13_30_54_941728.json


'queries_retrieval_bank_general_08_03_2024_13_30_54_941728.json'

#### Part 1.2 Evaluating the three synthetic retrieval settings:

In [73]:
# Evaluation Time: CORRECT
# One attempt per problem with the episodic retrieval prompt, using the queries built from the CORRECT bank.
attempts = 1
r_prompt_fn = partial(retrieval_prompt_fn, retrieval_type=RetrievalType.EPISODIC)
# Rebinds rdict/sdict/rs/ss, so the save_json and pass_at_k cells that follow refer to this run.
rdict, sdict, rs, ss = evaluate_model(model_fn, r_prompt_fn, queries=retrieval_bank_correct_queries, verbose=True, attempts=attempts, problem_ids=list(problem_dict.keys()))

Evaluating on a subset of 307 out of 307 available query-ground_truth pairs...
Evaluating on 307 queries...
Generating...


  0%|          | 0/307 [00:00<?, ?it/s]

100%|██████████| 307/307 [01:56<00:00,  2.64it/s]


Finished generation attempt 0, took 116.19124460220337 seconds
Judging...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_14_14_04_980643.pred'
Code execution failed, skipping this attempt...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_14_14_17_167991.pred'
Code execution failed, skipping this attempt...


Process Process-4462:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/benshi34/USACOnew/USACOBench/evaluation/judges/usaco_utils.py", line 90, in unsafe_execute
    with create_tempdir():
  File "/usr/lib/python3.10/contextlib.py", line 142, in __exit__
    next(self.gen)
  File "/home/benshi34/USACOnew/USACOBench/evaluation/judges/usaco_utils.py", line 199, in create_tempdir
    with tempfile.TemporaryDirectory() as dirname:
  File "/usr/lib/python3.10/tempfile.py", line 1017, in __exit__
  File "/usr/lib/python3.10/tempfile.py", line 1021, in cleanup
    self._rmtree(self.name, ignore_errors=self._ignore_cleanup_errors)
  File "/usr/lib/python3.10/tempfile.py", line 1003, in _rmtree
    _rmtree(name, onerror=onerror)
  File "/usr/lib/python3.10/shutil.py", line 725, in r

[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_14_16_10_324782.pred'
Code execution failed, skipping this attempt...


100%|██████████| 307/307 [00:00<00:00, 265823.97it/s]


Saving results at judge_sandbox/result_sets_08_03_2024_14_11_15_527621.pickle...
Finished judging attempt 0, took 320.04561734199524 seconds


In [74]:
save_json([rdict, sdict, rs, ss], f'results/results_{model_name}_correct_retrieval_{attempts}attempts')

Saved json at results/results_gpt-4-turbo_correct_retrieval_1attempts_08_03_2024_14_16_35_585102.json


'results/results_gpt-4-turbo_correct_retrieval_1attempts_08_03_2024_14_16_35_585102.json'

In [75]:
pass_at_k(rs)

(0.09771986970684039, 0.01694700465220386)

In [66]:
# Use this one, I guess, because it is better (?)
# NOTE(review): this cell's execution count (66) precedes the CORRECT evaluation shown above
# (counts 73-75), so the displayed score presumably comes from an earlier run of the same
# setting — confirm which of the two results should be reported.
pass_at_k(rs)

(0.11400651465798045, 0.01813889851472319)

In [67]:
# Evaluation Time: P_CORRECT
# One attempt per problem with the episodic retrieval prompt, using the queries built from the P_CORRECT bank.
attempts = 1
r_prompt_fn = partial(retrieval_prompt_fn, retrieval_type=RetrievalType.EPISODIC)
# Rebinds rdict/sdict/rs/ss, so the save_json and pass_at_k cells that follow refer to this run.
rdict, sdict, rs, ss = evaluate_model(model_fn, r_prompt_fn, queries=retrieval_bank_p_correct_queries, verbose=True, attempts=attempts, problem_ids=list(problem_dict.keys()))

Evaluating on a subset of 307 out of 307 available query-ground_truth pairs...
Evaluating on 307 queries...
Generating...


100%|██████████| 307/307 [02:06<00:00,  2.43it/s]


Finished generation attempt 0, took 126.54790019989014 seconds
Judging...


Process Process-71:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/benshi34/USACOnew/USACOBench/evaluation/judges/usaco_utils.py", line 90, in unsafe_execute
    with create_tempdir():
  File "/usr/lib/python3.10/contextlib.py", line 142, in __exit__
  File "/home/benshi34/USACOnew/USACOBench/evaluation/judges/usaco_utils.py", line 199, in create_tempdir
  File "/usr/lib/python3.10/tempfile.py", line 1017, in __exit__
    self.cleanup()
  File "/usr/lib/python3.10/tempfile.py", line 1021, in cleanup
    self._rmtree(self.name, ignore_errors=self._ignore_cleanup_errors)
  File "/usr/lib/python3.10/tempfile.py", line 1003, in _rmtree
    _rmtree(name, onerror=onerror)
  File "/usr/lib/python3.10/shutil.py", line 725, in rmtree
  File "/usr/lib/python3.10/shutil.py", line 63

[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_13_50_29_944745.pred'
Code execution failed, skipping this attempt...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_13_51_29_626027.pred'
Code execution failed, skipping this attempt...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_13_51_37_199330.pred'
Code execution failed, skipping this attempt...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_13_51_48_760224.pred'
Code execution failed, skipping this attempt...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_13_52_14_824068.pred'
Code execution failed, skipping this attempt...


100%|██████████| 307/307 [00:00<00:00, 266263.72it/s]


Saving results at judge_sandbox/result_sets_08_03_2024_13_48_09_069215.pickle...
Finished judging attempt 0, took 249.8976788520813 seconds


In [69]:
save_json([rdict, sdict, rs, ss], f'results/results_{model_name}_p_correct_retrieval_{attempts}attempts')

Saved json at results/results_gpt-4-turbo_p_correct_retrieval_1attempts_08_03_2024_13_56_38_004312.json


'results/results_gpt-4-turbo_p_correct_retrieval_1attempts_08_03_2024_13_56_38_004312.json'

In [68]:
pass_at_k(rs)

(0.09446254071661238, 0.016692209764851206)

In [70]:
# Evaluation Time: GENERAL
# One attempt per problem with the episodic retrieval prompt, using the queries built from the GENERAL bank.
attempts = 1
r_prompt_fn = partial(retrieval_prompt_fn, retrieval_type=RetrievalType.EPISODIC)
# Rebinds rdict/sdict/rs/ss, so the save_json and pass_at_k cells that follow refer to this run.
rdict, sdict, rs, ss = evaluate_model(model_fn, r_prompt_fn, queries=retrieval_bank_general_queries, verbose=True, attempts=attempts, problem_ids=list(problem_dict.keys()))

Evaluating on a subset of 307 out of 307 available query-ground_truth pairs...
Evaluating on 307 queries...
Generating...


100%|██████████| 307/307 [01:56<00:00,  2.64it/s]


Finished generation attempt 0, took 116.25402164459229 seconds
Judging...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_14_00_45_653261.pred'
Code execution failed, skipping this attempt...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_14_01_27_334279.pred'
Code execution failed, skipping this attempt...


Process Process-4441:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/benshi34/USACOnew/USACOBench/evaluation/judges/usaco_utils.py", line 90, in unsafe_execute
    with create_tempdir():
  File "/usr/lib/python3.10/contextlib.py", line 142, in __exit__
  File "/home/benshi34/USACOnew/USACOBench/evaluation/judges/usaco_utils.py", line 199, in create_tempdir
    with tempfile.TemporaryDirectory() as dirname:
  File "/usr/lib/python3.10/tempfile.py", line 1017, in __exit__
    self.cleanup()
  File "/usr/lib/python3.10/tempfile.py", line 1021, in cleanup
    self._rmtree(self.name, ignore_errors=self._ignore_cleanup_errors)
  File "/usr/lib/python3.10/tempfile.py", line 1003, in _rmtree
    _rmtree(name, onerror=onerror)
  File "/usr/lib/python3.10/shutil.py", line 725, in r

[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_14_02_12_996739.pred'
Code execution failed, skipping this attempt...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_14_02_19_552982.pred'
Code execution failed, skipping this attempt...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_14_02_24_473287.pred'
Code execution failed, skipping this attempt...


100%|██████████| 307/307 [00:00<00:00, 275139.17it/s]


Saving results at judge_sandbox/result_sets_08_03_2024_13_59_05_619814.pickle...
Finished judging attempt 0, took 224.04444909095764 seconds


In [71]:
save_json([rdict, sdict, rs, ss], f'results/results_{model_name}_general_retrieval_{attempts}attempts')

Saved json at results/results_gpt-4-turbo_general_retrieval_1attempts_08_03_2024_14_08_15_888396.json


'results/results_gpt-4-turbo_general_retrieval_1attempts_08_03_2024_14_08_15_888396.json'

In [72]:
pass_at_k(rs)

(0.0781758957654723, 0.015321146856523098)

#### Part 1.3 Evaluating Some Baselines
1. Random Retrieval
2. In Context Demo Retrieval (Just one question)

In [2]:
# Quick test of random retrieval setting:
# Baseline queries with 1 retrieved problem each (presumably sampled at random from problem_dict — see utils).
retrieval_random_queries = generate_random_retrieval_queries(1, problem_dict)

Saved json at queries_random_1problem_episodic_08_03_2024_16_49_40_385417.json


In [6]:
# Evaluation Time: RANDOM
# One attempt per problem with the episodic retrieval prompt, using the random-retrieval baseline queries.
attempts = 1
r_prompt_fn = partial(retrieval_prompt_fn, retrieval_type=RetrievalType.EPISODIC)
# Rebinds rdict/sdict/rs/ss, so the save_json and pass_at_k cells that follow refer to this run.
rdict, sdict, rs, ss = evaluate_model(model_fn, r_prompt_fn, queries=retrieval_random_queries, verbose=True, attempts=attempts, problem_ids=list(problem_dict.keys()))

Evaluating on a subset of 307 out of 307 available query-ground_truth pairs...
Evaluating on 307 queries...
Generating...


100%|██████████| 307/307 [02:02<00:00,  2.50it/s]


Finished generation attempt 0, took 122.95390510559082 seconds
Judging...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_16_54_00_099964.pred'
Code execution failed, skipping this attempt...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_16_54_56_147945.pred'
Code execution failed, skipping this attempt...


Process Process-4472:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/benshi34/USACOnew/USACOBench/evaluation/judges/usaco_utils.py", line 90, in unsafe_execute
    with create_tempdir():
  File "/usr/lib/python3.10/contextlib.py", line 142, in __exit__
    next(self.gen)
  File "/home/benshi34/USACOnew/USACOBench/evaluation/judges/usaco_utils.py", line 199, in create_tempdir
    with tempfile.TemporaryDirectory() as dirname:
  File "/usr/lib/python3.10/tempfile.py", line 1017, in __exit__
  File "/usr/lib/python3.10/tempfile.py", line 1021, in cleanup
    self._rmtree(self.name, ignore_errors=self._ignore_cleanup_errors)
  File "/usr/lib/python3.10/tempfile.py", line 1003, in _rmtree
    _rmtree(name, onerror=onerror)
  File "/usr/lib/python3.10/shutil.py", line 725, in r

[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_16_56_45_740015.pred'
Code execution failed, skipping this attempt...


100%|██████████| 307/307 [00:00<00:00, 325493.26it/s]


Saving results at judge_sandbox/result_sets_08_03_2024_16_53_24_158103.pickle...
Finished judging attempt 0, took 252.80005383491516 seconds


In [7]:
save_json([rdict, sdict, rs, ss], f'results/results_{model_name}_random_retrieval_{attempts}attempts')

Saved json at results/results_gpt-4-turbo_random_retrieval_1attempts_08_03_2024_16_58_20_731163.json


'results/results_gpt-4-turbo_random_retrieval_1attempts_08_03_2024_16_58_20_731163.json'

In [8]:
pass_at_k(rs)

(0.10749185667752444, 0.017677656394106317)

In [2]:
# Quick test of constant retrieval setting
# Baseline queries with 1 retrieved problem each (presumably the same problem for every query — see utils).
# NOTE(review): the log below reports these queries saved under a 'queries_random_' prefix,
# which looks like a naming slip inside generate_constant_retrieval_queries — confirm.
retrieval_constant_queries = generate_constant_retrieval_queries(1, problem_dict)

Saved json at queries_random_1problem_episodic_08_03_2024_17_16_26_794761.json


In [3]:
# Evaluation Time: CONSTANT
# One attempt per problem with the episodic retrieval prompt, using the constant-retrieval baseline queries.
attempts = 1
r_prompt_fn = partial(retrieval_prompt_fn, retrieval_type=RetrievalType.EPISODIC)
# Rebinds rdict/sdict/rs/ss, so the save_json and pass_at_k cells that follow refer to this run.
rdict, sdict, rs, ss = evaluate_model(model_fn, r_prompt_fn, queries=retrieval_constant_queries, verbose=True, attempts=attempts, problem_ids=list(problem_dict.keys()))

Evaluating on a subset of 307 out of 307 available query-ground_truth pairs...
Evaluating on 307 queries...
Generating...


100%|██████████| 307/307 [01:50<00:00,  2.79it/s]


Finished generation attempt 0, took 110.0956757068634 seconds
Judging...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_17_19_08_516480.pred'
Code execution failed, skipping this attempt...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_17_19_11_268419.pred'
Code execution failed, skipping this attempt...


Process Process-1108:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/benshi34/USACOnew/USACOBench/evaluation/judges/usaco_utils.py", line 90, in unsafe_execute
    with create_tempdir():
  File "/usr/lib/python3.10/contextlib.py", line 142, in __exit__
    next(self.gen)
  File "/home/benshi34/USACOnew/USACOBench/evaluation/judges/usaco_utils.py", line 199, in create_tempdir
    with tempfile.TemporaryDirectory() as dirname:
  File "/usr/lib/python3.10/tempfile.py", line 1017, in __exit__
  File "/usr/lib/python3.10/tempfile.py", line 1021, in cleanup
    self._rmtree(self.name, ignore_errors=self._ignore_cleanup_errors)
  File "/usr/lib/python3.10/tempfile.py", line 1003, in _rmtree
    _rmtree(name, onerror=onerror)
  File "/usr/lib/python3.10/shutil.py", line 725, in r

[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_17_20_59_197672.pred'
Code execution failed, skipping this attempt...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_17_21_01_438087.pred'
Code execution failed, skipping this attempt...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_17_22_09_868558.pred'
Code execution failed, skipping this attempt...
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_03_2024_17_22_24_698970.pred'
Code execution failed, skipping this attempt...


100%|██████████| 307/307 [00:00<00:00, 297447.75it/s]


Saving results at judge_sandbox/result_sets_08_03_2024_17_19_04_914581.pickle...
Finished judging attempt 0, took 232.3764204978943 seconds


In [9]:
retrieval_constant_queries[3]

{'problem_id': '1327_silver_field_day',
 'retrieval_text': '[BEGIN SIMILAR PROBLEMS]\n\n\n First problem and solution \n\nFarmer John, desperate to win the award for best cow photographer at the county\nfair, is trying to take the perfect photograph of his $N$ cows\n($2 \\leq N \\leq 2\\cdot 10^5$, $N$ even).\n\nFarmer John owns cows of two potential breeds: Guernseys and Holsteins.  \nTo make his photo as aesthetic as possible, he wants to line up his \ncows so that as many Guernseys are in even-numbered positions in the line \nas possible (the first position in the line is an odd position, the next is\nan even position, and so on). Due to his lack of strong\ncommunication with his cows, the only way he can achieve his goal is by asking\neven length "prefixes" of his cows to reverse themselves (a prefix \nconsists of the range of cows from the first cow up to the $j$th cow\nfor some position $j$).\n\nPlease count the minimum number of reversals required for Farmer John to achieve\nhis

In [4]:
save_json([rdict, sdict, rs, ss], f'results/results_{model_name}_constant_retrieval_{attempts}attempts')

Saved json at results/results_gpt-4-turbo_constant_retrieval_1attempts_08_03_2024_17_22_57_300383.json


'results/results_gpt-4-turbo_constant_retrieval_1attempts_08_03_2024_17_22_57_300383.json'

In [5]:
pass_at_k(rs)

(0.09771986970684039, 0.01694700465220386)

#### Part 1.4 Evaluating on different model

In [2]:
# Switch the notebook over to gpt-4o; the problem set stays the same.
model_name = 'gpt-4o'
problem_dict = load_problem_dict('usaco_subset307')
# Bind the new model name so later cells keep calling model_fn unchanged.
model_fn = partial(gpts, model=model_name)

In [3]:
# Running just solve over all queries
# Baseline without retrieval for the model configured in the preceding cell: one attempt per problem.
attempts = 1
rdict, sdict, rs, ss = run_solve(model_fn, model_name, problem_dict, attempts)

Evaluating on a subset of 307 out of 307 available query-ground_truth pairs...
Evaluating on 307 queries...
Generating...


100%|██████████| 307/307 [00:58<00:00,  5.21it/s]


Finished generation attempt 0, took 58.92554068565369 seconds
Judging...
Could not parse code from generated solution — returning entire solution
substring not found
[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_05_2024_19_54_45_258047.pred'
Code execution failed, skipping this attempt...


Process Process-128:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/benshi34/USACOnew/USACOBench/evaluation/judges/usaco_utils.py", line 90, in unsafe_execute
    with create_tempdir():
  File "/usr/lib/python3.10/contextlib.py", line 142, in __exit__
  File "/home/benshi34/USACOnew/USACOBench/evaluation/judges/usaco_utils.py", line 199, in create_tempdir
  File "/usr/lib/python3.10/tempfile.py", line 1017, in __exit__
    self.cleanup()
  File "/usr/lib/python3.10/tempfile.py", line 1021, in cleanup
    self._rmtree(self.name, ignore_errors=self._ignore_cleanup_errors)
  File "/usr/lib/python3.10/tempfile.py", line 1003, in _rmtree
    _rmtree(name, onerror=onerror)
  File "/usr/lib/python3.10/shutil.py", line 725, in rmtree
  File "/usr/lib/python3.10/shutil.py", line 6

[Errno 2] No such file or directory: '/home/benshi34/USACOnew/judge_sandbox/predictions/usaco/1_08_05_2024_19_54_47_980001.pred'
Code execution failed, skipping this attempt...


Process Process-165:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/benshi34/USACOnew/USACOBench/evaluation/judges/usaco_utils.py", line 90, in unsafe_execute
    with create_tempdir():
  File "/usr/lib/python3.10/contextlib.py", line 142, in __exit__
  File "/home/benshi34/USACOnew/USACOBench/evaluation/judges/usaco_utils.py", line 199, in create_tempdir
  File "/usr/lib/python3.10/tempfile.py", line 1017, in __exit__
  File "/usr/lib/python3.10/tempfile.py", line 1021, in cleanup
  File "/usr/lib/python3.10/tempfile.py", line 1003, in _rmtree
  File "/usr/lib/python3.10/shutil.py", line 725, in rmtree
  File "/usr/lib/python3.10/shutil.py", line 633, in _rmtree_safe_fd
  File "/usr/lib/python3.10/shutil.py", line 629, in _rmtree_safe_fd
OSError: [Errno 12] Cannot alloca

In [None]:
# NOTE(review): this reloads the gpt-4-turbo solve results although the surrounding section
# evaluates a different model — confirm the intended results file before reporting this score.
rdict, sdict, rs, ss = load_json('results/results_gpt-4-turbo_solve_1attempts_08_02_2024_19_51_06_868660')
pass_at_k(rs)