Code from A3 problem1-prompt.ipynb and https://github.com/SewoongLee/reproduce-llama3-arithmetic/blob/main/llama3-tutorial-gsm8k.ipynb

You need to set your OPENAI_API_KEY environment variable in .env

In [1]:
from openai import OpenAI
from tqdm import tqdm
import textwrap
import dotenv
import os
%load_ext dotenv
%dotenv

client=OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# we use gpt-4o-mini as default: current pricing is 
# $0.150 / 1M input tokens, $0.600 / 1M output tokens
def generate_gpt_response(prompt,messages=[], model="gpt-3.5-turbo", temperature=0.7):
    messages.append({"role": "user", "content": prompt})
            
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature = temperature
    )
    content=response.choices[0].message.content
    messages.append({'role': 'assistant', 'content': content})
    return messages


def print_message(message, width=80):
    text=message["role"]+": " + message["content"]
    wrapped_text = textwrap.fill(text, width=width)
    
    for line in wrapped_text.split('\n'):
        print(f" {line.ljust(width)} ")        
    

cannot find .env file


load ablated gsm8k json files

In [None]:
import os
import random
import json
import re

random.seed(56)

ablated_dataset = []
q_indices = []
folder_path = "gsm_template/output"

files = os.listdir(folder_path)
sorted_files = sorted(files, key=lambda x: int(x.split('--')[0][1:]))

print(sorted_files)
print()

repeat = 20

for file_name in sorted_files:
    f_path = os.path.join(folder_path, file_name)

    match = re.search(r'\d+', f_path)
    num = match.group(0)
    q_indices.append(int(num))

    with open(f_path, 'r') as f:
        lines = f.readlines()

    # random_lines = random.sample([json.loads(item.strip()) for item in lines], min(repeat, len(lines)))
    # print(random_lines)
    # print()

    # ablated_dataset.append(random_lines)
    ablated_dataset.append([json.loads(item.strip()) for item in lines])

print()
print(len(ablated_dataset))
print(len(ablated_dataset[0]))
print(q_indices)


['q0--NUM100.jsonl', 'q1--NUM100.jsonl', 'q2--NUM100.jsonl', 'q3--NUM100.jsonl', 'q4--NUM100.jsonl', 'q5--NUM100.jsonl', 'q6--NUM100.jsonl', 'q7--NUM100.jsonl', 'q8--NUM100.jsonl', 'q9--NUM100.jsonl', 'q11--NUM100.jsonl', 'q12--NUM100.jsonl', 'q13--NUM100.jsonl', 'q14--NUM100.jsonl', 'q15--NUM100.jsonl', 'q19--NUM100.jsonl', 'q20--NUM100.jsonl', 'q23--NUM100.jsonl', 'q24--NUM100.jsonl', 'q25--NUM100.jsonl']

[{'problem': 'Kc Otis sold clips to 48 of their friends in April, and then they sold 50% as many clips in May.How many clips did Kc Otis sell altogether in April and May?', 'result': '72', 'template_name': 0, 'idx': 71}, {'problem': 'Josh Joaquin sold clips to 48 of their friends in April, and then they sold 50% as many clips in May.How many clips did Josh Joaquin sell altogether in April and May?', 'result': '72', 'template_name': 0, 'idx': 1}, {'problem': 'Cristina Boogie sold clips to 48 of their friends in April, and then they sold 50% as many clips in May.How many clips did Cr

We will use the following system prompt

In [5]:
system_prompt= \
'''You are an expert math tutor specializing in grade-school-level problems like those in the GSM8K dataset. 
Solve each problem step by step, explaining the reasoning and calculations clearly and concisely to help the student understand the process. 
Please return your final answer after the delimiter \"####\" as a numeric number'''

We can answer from gpt api using fewshots in context learning and system prompt

In [6]:
import re

def extract_ans_from_response(answer: str, eos=None):
    if eos:
        answer = answer.split(eos)[0].strip()

    answer = answer.split('####')
    
    if len(answer) > 2:
        answer = answer[-2].strip()
    elif len(answer) == 2:
        answer = answer[-1].strip()
    else:
        answer = "NA"

    if isinstance(answer, str):
        for remove_char in [',', '$', '%', 'g']:
            answer = answer.replace(remove_char, '')

    try:
        return int(answer)
    except ValueError:
        return answer

# def extract_ans_from_response(answer: str, eos=None):
#     if eos:
#         answer = answer.split(eos)[0].strip()

#     answer = answer.split('####')[-1].strip()

#     for remove_char in [',', '$', '%', 'g']:
#         answer = answer.replace(remove_char, '')

#     try:
#         return int(answer)
#     except ValueError:
#         return answer

# def get_num_answer(text):
#     num_matches = re.findall(r'\d+\.?\d*', text)

#     if not num_matches:
#         return None
    
#     final_answer = num_matches[-1]

#     return final_answer

# Implement the function answer_from_gpt(question,temperature,num_shots), which should return the gpt answer as a string
# (see how it is called in the function evaluate_fewshots below)
# temperature is the temperature parameter passed to gpt api
# if num_shots>0, you should use the first num_shots examples in train_data for in-context learning
# Please use the system prompt provided above (see lecture10-gpt.ipynb for examples of system prompt usage)
#
def answer_from_gpt(question, temperature, num_shots):
    example = []

    # if num_shots > 0:
    #     for i in range(num_shots):
    #         curr_ex = train_data[i]
    #         example.append({"role": "user", "content": curr_ex['question']})
    #         example.append({"role": "assistant", "content": curr_ex['answer']})

    full_prompt = system_prompt + "\n" + question
    # print("Q: ", full_prompt)

    messages = generate_gpt_response(prompt=full_prompt, messages=example, temperature=temperature)
    answer = messages[-1]["content"]
    # print("A: ", answer)
    last_token = extract_ans_from_response(answer)
    return last_token

Evaluate on a small subset of gsm8k test data

In [None]:
def evaluate_fewshots(data, temperature,num_shots):
    num_corrects=0
    for k in  tqdm(range(len(data))):
        gpt_answer=answer_from_gpt(data[k]['problem'],temperature=temperature,num_shots=num_shots)
        true_answer= data[k]['result']
        # print(f'k={k}: gpt_answer={gpt_answer}  true_answer={true_answer}')   # this is for debug only
        
        if gpt_answer != "NA":
            try:
                if float(gpt_answer) == float(true_answer):
                    # print("CORRECT")
                    num_corrects += 1
                else:
                    # print("INCORRECT")
                    num_corrects += 0
            except ValueError:
                # print("INCORRECT")
                num_corrects += 0
    
    accuracy = num_corrects/len(data)
    print(f'accuracy={accuracy:0.2f}')
    return accuracy

temperature = 0
num_shots = 0

total_inaccurate = 0

print(f'=== temperature={temperature} num_shots={num_shots} ===')
for i in range(len(ablated_dataset)):
    curr_ablation = ablated_dataset[i]

    print("Question ", q_indices[i])
    accuracy = evaluate_fewshots(curr_ablation, temperature, num_shots)
    if accuracy < 1:
        total_inaccurate += 1
    print()

print("Total Inaccurate: ", total_inaccurate)

=== temperature=0 num_shots=0 ===
Question  0


  5%|▌         | 1/20 [00:01<00:27,  1.44s/it]

k=0: gpt_answer=72  true_answer=72
CORRECT


 10%|█         | 2/20 [00:02<00:22,  1.26s/it]

k=1: gpt_answer=72  true_answer=72
CORRECT


 15%|█▌        | 3/20 [00:03<00:19,  1.14s/it]

k=2: gpt_answer=72  true_answer=72
CORRECT


 20%|██        | 4/20 [00:04<00:18,  1.14s/it]

k=3: gpt_answer=72  true_answer=72
CORRECT


 25%|██▌       | 5/20 [00:05<00:17,  1.17s/it]

k=4: gpt_answer=72  true_answer=72
CORRECT


 30%|███       | 6/20 [00:07<00:16,  1.18s/it]

k=5: gpt_answer=72  true_answer=72
CORRECT


 35%|███▌      | 7/20 [00:08<00:16,  1.25s/it]

k=6: gpt_answer=72  true_answer=72
CORRECT


 40%|████      | 8/20 [00:09<00:15,  1.28s/it]

k=7: gpt_answer=72  true_answer=72
CORRECT


 45%|████▌     | 9/20 [00:10<00:13,  1.18s/it]

k=8: gpt_answer=72  true_answer=72
CORRECT


 50%|█████     | 10/20 [00:12<00:12,  1.20s/it]

k=9: gpt_answer=72  true_answer=72
CORRECT


 55%|█████▌    | 11/20 [00:13<00:11,  1.31s/it]

k=10: gpt_answer=72  true_answer=72
CORRECT


 60%|██████    | 12/20 [00:15<00:10,  1.36s/it]

k=11: gpt_answer=72  true_answer=72
CORRECT


 65%|██████▌   | 13/20 [00:16<00:09,  1.35s/it]

k=12: gpt_answer=72  true_answer=72
CORRECT


 70%|███████   | 14/20 [00:17<00:07,  1.27s/it]

k=13: gpt_answer=72  true_answer=72
CORRECT


 75%|███████▌  | 15/20 [00:18<00:06,  1.27s/it]

k=14: gpt_answer=72  true_answer=72
CORRECT


 80%|████████  | 16/20 [00:20<00:06,  1.53s/it]

k=15: gpt_answer=72  true_answer=72
CORRECT


 85%|████████▌ | 17/20 [00:22<00:04,  1.55s/it]

k=16: gpt_answer=72  true_answer=72
CORRECT


 90%|█████████ | 18/20 [00:23<00:02,  1.42s/it]

k=17: gpt_answer=72  true_answer=72
CORRECT


 95%|█████████▌| 19/20 [00:24<00:01,  1.34s/it]

k=18: gpt_answer=72  true_answer=72
CORRECT


100%|██████████| 20/20 [00:26<00:00,  1.30s/it]


k=19: gpt_answer=72  true_answer=72
CORRECT
accuracy=1.00

Question  1


  5%|▌         | 1/20 [00:00<00:17,  1.07it/s]

k=0: gpt_answer=10  true_answer=10
CORRECT


 10%|█         | 2/20 [00:02<00:19,  1.07s/it]

k=1: gpt_answer=10  true_answer=10
CORRECT


 15%|█▌        | 3/20 [00:03<00:19,  1.13s/it]

k=2: gpt_answer=10  true_answer=10
CORRECT


 20%|██        | 4/20 [00:04<00:21,  1.34s/it]

k=3: gpt_answer=10  true_answer=10
CORRECT


 25%|██▌       | 5/20 [00:06<00:18,  1.26s/it]

k=4: gpt_answer=10  true_answer=10
CORRECT


 30%|███       | 6/20 [00:07<00:17,  1.27s/it]

k=5: gpt_answer=10.0  true_answer=10
CORRECT


 35%|███▌      | 7/20 [00:08<00:16,  1.27s/it]

k=6: gpt_answer=10.0  true_answer=10
CORRECT


 40%|████      | 8/20 [00:10<00:16,  1.37s/it]

k=7: gpt_answer=10.00  true_answer=10
CORRECT


 45%|████▌     | 9/20 [00:11<00:13,  1.27s/it]

k=8: gpt_answer=10  true_answer=10
CORRECT


 50%|█████     | 10/20 [00:12<00:12,  1.29s/it]

k=9: gpt_answer=10.00  true_answer=10
CORRECT


 55%|█████▌    | 11/20 [00:13<00:11,  1.29s/it]

k=10: gpt_answer=10  true_answer=10
CORRECT


 60%|██████    | 12/20 [00:15<00:10,  1.26s/it]

k=11: gpt_answer=10.  true_answer=10
CORRECT


 65%|██████▌   | 13/20 [00:16<00:08,  1.22s/it]

k=12: gpt_answer=10.0  true_answer=10
CORRECT


 70%|███████   | 14/20 [00:17<00:07,  1.17s/it]

k=13: gpt_answer=10  true_answer=10
CORRECT


 75%|███████▌  | 15/20 [00:18<00:06,  1.28s/it]

k=14: gpt_answer=9.9996  true_answer=10
INCORRECT


 80%|████████  | 16/20 [00:20<00:05,  1.37s/it]

k=15: gpt_answer=9.9996  true_answer=10
INCORRECT


 85%|████████▌ | 17/20 [00:21<00:03,  1.29s/it]

k=16: gpt_answer=10  true_answer=10
CORRECT


 90%|█████████ | 18/20 [00:22<00:02,  1.26s/it]

k=17: gpt_answer=10  true_answer=10
CORRECT


 95%|█████████▌| 19/20 [00:23<00:01,  1.24s/it]

k=18: gpt_answer=10  true_answer=10
CORRECT


100%|██████████| 20/20 [00:25<00:00,  1.27s/it]


k=19: gpt_answer=9.9996  true_answer=10
INCORRECT
accuracy=0.85

Question  2


  5%|▌         | 1/20 [00:01<00:28,  1.47s/it]

k=0: gpt_answer=5  true_answer=5.0
CORRECT


 10%|█         | 2/20 [00:02<00:25,  1.41s/it]

k=1: gpt_answer=5  true_answer=5.0
CORRECT


 15%|█▌        | 3/20 [00:04<00:23,  1.37s/it]

k=2: gpt_answer=5  true_answer=5.0
CORRECT


 20%|██        | 4/20 [00:05<00:21,  1.34s/it]

k=3: gpt_answer=5  true_answer=5.0
CORRECT


 25%|██▌       | 5/20 [00:06<00:20,  1.35s/it]

k=4: gpt_answer=5  true_answer=5.0
CORRECT


 30%|███       | 6/20 [00:08<00:21,  1.54s/it]

k=5: gpt_answer=5  true_answer=5.0
CORRECT


 35%|███▌      | 7/20 [00:10<00:19,  1.52s/it]

k=6: gpt_answer=5  true_answer=5.0
CORRECT


 40%|████      | 8/20 [00:11<00:17,  1.48s/it]

k=7: gpt_answer=5  true_answer=5.0
CORRECT


 45%|████▌     | 9/20 [00:13<00:16,  1.53s/it]

k=8: gpt_answer=5  true_answer=5.0
CORRECT


 50%|█████     | 10/20 [00:14<00:14,  1.50s/it]

k=9: gpt_answer=5  true_answer=5.0
CORRECT


 55%|█████▌    | 11/20 [00:16<00:14,  1.60s/it]

k=10: gpt_answer=5  true_answer=5.0
CORRECT


 60%|██████    | 12/20 [00:17<00:12,  1.57s/it]

k=11: gpt_answer=5  true_answer=5.0
CORRECT


 65%|██████▌   | 13/20 [00:19<00:10,  1.53s/it]

k=12: gpt_answer=5  true_answer=5.0
CORRECT


 70%|███████   | 14/20 [00:20<00:08,  1.47s/it]

k=13: gpt_answer=5  true_answer=5.0
CORRECT


 75%|███████▌  | 15/20 [00:22<00:07,  1.50s/it]

k=14: gpt_answer=5  true_answer=5.0
CORRECT


 80%|████████  | 16/20 [00:23<00:05,  1.45s/it]

k=15: gpt_answer=5  true_answer=5.0
CORRECT


 85%|████████▌ | 17/20 [00:25<00:04,  1.60s/it]

k=16: gpt_answer=5  true_answer=5.0
CORRECT


 90%|█████████ | 18/20 [00:27<00:03,  1.57s/it]

k=17: gpt_answer=5  true_answer=5.0
CORRECT


 95%|█████████▌| 19/20 [00:28<00:01,  1.59s/it]

k=18: gpt_answer=5.  true_answer=5.0
CORRECT


100%|██████████| 20/20 [00:30<00:00,  1.53s/it]


k=19: gpt_answer=5  true_answer=5.0
CORRECT
accuracy=1.00

Question  3


  5%|▌         | 1/20 [00:00<00:18,  1.04it/s]

k=0: gpt_answer=42  true_answer=42
CORRECT


 10%|█         | 2/20 [00:01<00:17,  1.04it/s]

k=1: gpt_answer=42  true_answer=42
CORRECT


 15%|█▌        | 3/20 [00:02<00:16,  1.06it/s]

k=2: gpt_answer=42  true_answer=42
CORRECT


 20%|██        | 4/20 [00:03<00:15,  1.02it/s]

k=3: gpt_answer=42  true_answer=42
CORRECT


 25%|██▌       | 5/20 [00:04<00:15,  1.00s/it]

k=4: gpt_answer=42  true_answer=42
CORRECT


 30%|███       | 6/20 [00:05<00:13,  1.00it/s]

k=5: gpt_answer=42  true_answer=42
CORRECT


 35%|███▌      | 7/20 [00:06<00:12,  1.01it/s]

k=6: gpt_answer=42  true_answer=42
CORRECT


 40%|████      | 8/20 [00:07<00:11,  1.03it/s]

k=7: gpt_answer=42  true_answer=42
CORRECT


 45%|████▌     | 9/20 [00:09<00:11,  1.06s/it]

k=8: gpt_answer=42  true_answer=42
CORRECT


 50%|█████     | 10/20 [00:10<00:10,  1.05s/it]

k=9: gpt_answer=42  true_answer=42
CORRECT


 55%|█████▌    | 11/20 [00:11<00:09,  1.03s/it]

k=10: gpt_answer=42  true_answer=42
CORRECT


 60%|██████    | 12/20 [00:12<00:08,  1.03s/it]

k=11: gpt_answer=42  true_answer=42
CORRECT


 65%|██████▌   | 13/20 [00:13<00:06,  1.01it/s]

k=12: gpt_answer=42  true_answer=42
CORRECT


 70%|███████   | 14/20 [00:13<00:05,  1.02it/s]

k=13: gpt_answer=42  true_answer=42
CORRECT


 75%|███████▌  | 15/20 [00:15<00:05,  1.06s/it]

k=14: gpt_answer=42  true_answer=42
CORRECT


 80%|████████  | 16/20 [00:16<00:04,  1.08s/it]

k=15: gpt_answer=42  true_answer=42
CORRECT


 85%|████████▌ | 17/20 [00:17<00:03,  1.07s/it]

k=16: gpt_answer=42  true_answer=42
CORRECT


 90%|█████████ | 18/20 [00:18<00:02,  1.02s/it]

k=17: gpt_answer=42  true_answer=42
CORRECT


 95%|█████████▌| 19/20 [00:19<00:01,  1.07s/it]

k=18: gpt_answer=42  true_answer=42
CORRECT


100%|██████████| 20/20 [00:20<00:00,  1.02s/it]


k=19: gpt_answer=42  true_answer=42
CORRECT
accuracy=1.00

Question  4


  5%|▌         | 1/20 [00:01<00:24,  1.30s/it]

k=0: gpt_answer=624  true_answer=624
CORRECT


 10%|█         | 2/20 [00:02<00:23,  1.32s/it]

k=1: gpt_answer=624  true_answer=624
CORRECT


 15%|█▌        | 3/20 [00:04<00:28,  1.66s/it]

k=2: gpt_answer=624  true_answer=624
CORRECT


 20%|██        | 4/20 [00:06<00:25,  1.60s/it]

k=3: gpt_answer=624  true_answer=624
CORRECT


 25%|██▌       | 5/20 [00:07<00:20,  1.40s/it]

k=4: gpt_answer=624  true_answer=624
CORRECT


 30%|███       | 6/20 [00:08<00:20,  1.46s/it]

k=5: gpt_answer=624  true_answer=624
CORRECT


 35%|███▌      | 7/20 [00:10<00:17,  1.37s/it]

k=6: gpt_answer=624  true_answer=624
CORRECT


 40%|████      | 8/20 [00:11<00:15,  1.33s/it]

k=7: gpt_answer=624  true_answer=624
CORRECT


 45%|████▌     | 9/20 [00:12<00:14,  1.30s/it]

k=8: gpt_answer=624  true_answer=624
CORRECT


 50%|█████     | 10/20 [00:13<00:13,  1.31s/it]

k=9: gpt_answer=624  true_answer=624
CORRECT


 55%|█████▌    | 11/20 [00:14<00:11,  1.25s/it]

k=10: gpt_answer=624  true_answer=624
CORRECT


 60%|██████    | 12/20 [00:16<00:10,  1.26s/it]

k=11: gpt_answer=624  true_answer=624
CORRECT


 65%|██████▌   | 13/20 [00:17<00:09,  1.30s/it]

k=12: gpt_answer=624  true_answer=624
CORRECT


 70%|███████   | 14/20 [00:18<00:07,  1.31s/it]

k=13: gpt_answer=624  true_answer=624
CORRECT


 75%|███████▌  | 15/20 [00:20<00:06,  1.26s/it]

k=14: gpt_answer=624  true_answer=624
CORRECT


 80%|████████  | 16/20 [00:21<00:04,  1.20s/it]

k=15: gpt_answer=624  true_answer=624
CORRECT


 85%|████████▌ | 17/20 [00:23<00:04,  1.40s/it]

k=16: gpt_answer=624  true_answer=624
CORRECT


 90%|█████████ | 18/20 [00:24<00:02,  1.31s/it]

k=17: gpt_answer=624  true_answer=624
CORRECT


 95%|█████████▌| 19/20 [00:25<00:01,  1.25s/it]

k=18: gpt_answer=624  true_answer=624
CORRECT


100%|██████████| 20/20 [00:27<00:00,  1.35s/it]


k=19: gpt_answer=624  true_answer=624
CORRECT
accuracy=1.00

Question  5


  5%|▌         | 1/20 [00:01<00:35,  1.85s/it]

k=0: gpt_answer=22  true_answer=35
INCORRECT


 10%|█         | 2/20 [00:03<00:33,  1.85s/it]

k=1: gpt_answer=35  true_answer=35
CORRECT


 15%|█▌        | 3/20 [00:05<00:30,  1.81s/it]

k=2: gpt_answer=35  true_answer=35
CORRECT


 20%|██        | 4/20 [00:07<00:30,  1.90s/it]

k=3: gpt_answer=22  true_answer=35
INCORRECT


 25%|██▌       | 5/20 [00:09<00:28,  1.93s/it]

k=4: gpt_answer=35  true_answer=35
CORRECT


 30%|███       | 6/20 [00:11<00:27,  1.95s/it]

k=5: gpt_answer=35  true_answer=35
CORRECT


 35%|███▌      | 7/20 [00:13<00:26,  2.07s/it]

k=6: gpt_answer=35  true_answer=35
CORRECT


 40%|████      | 8/20 [00:15<00:24,  2.06s/it]

k=7: gpt_answer=35  true_answer=35
CORRECT


 45%|████▌     | 9/20 [00:17<00:21,  1.99s/it]

k=8: gpt_answer=35  true_answer=35
CORRECT


 50%|█████     | 10/20 [00:19<00:18,  1.90s/it]

k=9: gpt_answer=25  true_answer=35
INCORRECT


 55%|█████▌    | 11/20 [00:21<00:18,  2.03s/it]

k=10: gpt_answer=22  true_answer=35
INCORRECT


 60%|██████    | 12/20 [00:23<00:16,  2.04s/it]

k=11: gpt_answer=35  true_answer=35
CORRECT


 65%|██████▌   | 13/20 [00:25<00:13,  1.95s/it]

k=12: gpt_answer=35  true_answer=35
CORRECT


 70%|███████   | 14/20 [00:27<00:11,  1.90s/it]

k=13: gpt_answer=35  true_answer=35
CORRECT


 75%|███████▌  | 15/20 [00:29<00:09,  1.93s/it]

k=14: gpt_answer=35  true_answer=35
CORRECT


 80%|████████  | 16/20 [00:31<00:08,  2.06s/it]

k=15: gpt_answer=35  true_answer=35
CORRECT


 85%|████████▌ | 17/20 [00:33<00:06,  2.09s/it]

k=16: gpt_answer=35  true_answer=35
CORRECT


 90%|█████████ | 18/20 [00:35<00:03,  1.98s/it]

k=17: gpt_answer=35  true_answer=35
CORRECT


 95%|█████████▌| 19/20 [00:37<00:01,  1.94s/it]

k=18: gpt_answer=35  true_answer=35
CORRECT


100%|██████████| 20/20 [00:39<00:00,  1.96s/it]


k=19: gpt_answer=35  true_answer=35
CORRECT
accuracy=0.80

Question  6


  5%|▌         | 1/20 [00:01<00:23,  1.25s/it]

k=0: gpt_answer=48  true_answer=48
CORRECT


 10%|█         | 2/20 [00:02<00:20,  1.12s/it]

k=1: gpt_answer=48  true_answer=48
CORRECT


 15%|█▌        | 3/20 [00:03<00:19,  1.14s/it]

k=2: gpt_answer=48  true_answer=48
CORRECT


 20%|██        | 4/20 [00:04<00:19,  1.23s/it]

k=3: gpt_answer=48  true_answer=48
CORRECT


 25%|██▌       | 5/20 [00:06<00:19,  1.28s/it]

k=4: gpt_answer=48  true_answer=48
CORRECT


 30%|███       | 6/20 [00:07<00:18,  1.30s/it]

k=5: gpt_answer=48  true_answer=48
CORRECT


 35%|███▌      | 7/20 [00:08<00:16,  1.24s/it]

k=6: gpt_answer=48  true_answer=48
CORRECT


 40%|████      | 8/20 [00:09<00:14,  1.18s/it]

k=7: gpt_answer=48  true_answer=48
CORRECT


 45%|████▌     | 9/20 [00:10<00:13,  1.20s/it]

k=8: gpt_answer=48  true_answer=48
CORRECT


 50%|█████     | 10/20 [00:12<00:11,  1.17s/it]

k=9: gpt_answer=48  true_answer=48
CORRECT


 55%|█████▌    | 11/20 [00:13<00:10,  1.14s/it]

k=10: gpt_answer=48  true_answer=48
CORRECT


 60%|██████    | 12/20 [00:14<00:09,  1.17s/it]

k=11: gpt_answer=48  true_answer=48
CORRECT


 65%|██████▌   | 13/20 [00:15<00:08,  1.17s/it]

k=12: gpt_answer=48  true_answer=48
CORRECT


 70%|███████   | 14/20 [00:16<00:06,  1.14s/it]

k=13: gpt_answer=48  true_answer=48
CORRECT


 75%|███████▌  | 15/20 [00:17<00:05,  1.17s/it]

k=14: gpt_answer=48  true_answer=48
CORRECT


 80%|████████  | 16/20 [00:18<00:04,  1.12s/it]

k=15: gpt_answer=48  true_answer=48
CORRECT


 85%|████████▌ | 17/20 [00:19<00:03,  1.13s/it]

k=16: gpt_answer=48  true_answer=48
CORRECT


 90%|█████████ | 18/20 [00:21<00:02,  1.13s/it]

k=17: gpt_answer=48  true_answer=48
CORRECT


 95%|█████████▌| 19/20 [00:22<00:01,  1.11s/it]

k=18: gpt_answer=48  true_answer=48
CORRECT


100%|██████████| 20/20 [00:23<00:00,  1.17s/it]


k=19: gpt_answer=48  true_answer=48
CORRECT
accuracy=1.00

Question  7


  5%|▌         | 1/20 [00:01<00:37,  1.95s/it]

k=0: gpt_answer=18  true_answer=16
INCORRECT


 10%|█         | 2/20 [00:03<00:27,  1.51s/it]

k=1: gpt_answer=16  true_answer=16
CORRECT


 15%|█▌        | 3/20 [00:04<00:27,  1.63s/it]

k=2: gpt_answer=30  true_answer=16
INCORRECT


 20%|██        | 4/20 [00:06<00:25,  1.57s/it]

k=3: gpt_answer=16  true_answer=16
CORRECT


 25%|██▌       | 5/20 [00:07<00:21,  1.45s/it]

k=4: gpt_answer=16  true_answer=16
CORRECT


 30%|███       | 6/20 [00:09<00:21,  1.57s/it]

k=5: gpt_answer=30  true_answer=16
INCORRECT


 35%|███▌      | 7/20 [00:10<00:19,  1.54s/it]

k=6: gpt_answer=16  true_answer=16
CORRECT


 40%|████      | 8/20 [00:12<00:18,  1.51s/it]

k=7: gpt_answer=16  true_answer=16
CORRECT


 45%|████▌     | 9/20 [00:14<00:19,  1.79s/it]

k=8: gpt_answer=16  true_answer=16
CORRECT


 50%|█████     | 10/20 [00:16<00:16,  1.66s/it]

k=9: gpt_answer=16  true_answer=16
CORRECT


 55%|█████▌    | 11/20 [00:17<00:15,  1.68s/it]

k=10: gpt_answer=30  true_answer=16
INCORRECT


 60%|██████    | 12/20 [00:19<00:12,  1.56s/it]

k=11: gpt_answer=16  true_answer=16
CORRECT


 65%|██████▌   | 13/20 [00:20<00:10,  1.47s/it]

k=12: gpt_answer=16  true_answer=16
CORRECT


 70%|███████   | 14/20 [00:21<00:08,  1.43s/it]

k=13: gpt_answer=16  true_answer=16
CORRECT


 75%|███████▌  | 15/20 [00:24<00:09,  1.94s/it]

k=14: gpt_answer=14  true_answer=16
INCORRECT


 80%|████████  | 16/20 [00:27<00:08,  2.08s/it]

k=15: gpt_answer=18  true_answer=16
INCORRECT


 85%|████████▌ | 17/20 [00:28<00:05,  1.80s/it]

k=16: gpt_answer=16  true_answer=16
CORRECT


 90%|█████████ | 18/20 [00:29<00:03,  1.67s/it]

k=17: gpt_answer=16  true_answer=16
CORRECT


 95%|█████████▌| 19/20 [00:30<00:01,  1.51s/it]

k=18: gpt_answer=16  true_answer=16
CORRECT


100%|██████████| 20/20 [00:32<00:00,  1.64s/it]


k=19: gpt_answer=18  true_answer=16
INCORRECT
accuracy=0.65

Question  8


  5%|▌         | 1/20 [00:01<00:25,  1.36s/it]

k=0: gpt_answer=41  true_answer=41
CORRECT


 10%|█         | 2/20 [00:02<00:25,  1.41s/it]

k=1: gpt_answer=41.  true_answer=41
CORRECT


 15%|█▌        | 3/20 [00:04<00:28,  1.69s/it]

k=2: gpt_answer=41.  true_answer=41
CORRECT


 20%|██        | 4/20 [00:06<00:26,  1.65s/it]

k=3: gpt_answer=41  true_answer=41
CORRECT


 25%|██▌       | 5/20 [00:07<00:23,  1.60s/it]

k=4: gpt_answer=41  true_answer=41
CORRECT


 30%|███       | 6/20 [00:09<00:22,  1.57s/it]

k=5: gpt_answer=41  true_answer=41
CORRECT


 35%|███▌      | 7/20 [00:11<00:20,  1.57s/it]

k=6: gpt_answer=41  true_answer=41
CORRECT


 40%|████      | 8/20 [00:12<00:18,  1.54s/it]

k=7: gpt_answer=41  true_answer=41
CORRECT


 45%|████▌     | 9/20 [00:13<00:16,  1.50s/it]

k=8: gpt_answer=41  true_answer=41
CORRECT


 50%|█████     | 10/20 [00:15<00:15,  1.51s/it]

k=9: gpt_answer=41  true_answer=41
CORRECT


 55%|█████▌    | 11/20 [00:16<00:12,  1.39s/it]

k=10: gpt_answer=41  true_answer=41
CORRECT


 60%|██████    | 12/20 [00:18<00:11,  1.41s/it]

k=11: gpt_answer=41  true_answer=41
CORRECT


 65%|██████▌   | 13/20 [00:19<00:10,  1.46s/it]

k=12: gpt_answer=41  true_answer=41
CORRECT


 70%|███████   | 14/20 [00:21<00:08,  1.47s/it]

k=13: gpt_answer=41  true_answer=41
CORRECT


 75%|███████▌  | 15/20 [00:22<00:07,  1.45s/it]

k=14: gpt_answer=41  true_answer=41
CORRECT


 80%|████████  | 16/20 [00:23<00:05,  1.47s/it]

k=15: gpt_answer=41  true_answer=41
CORRECT


 85%|████████▌ | 17/20 [00:25<00:04,  1.49s/it]

k=16: gpt_answer=41  true_answer=41
CORRECT


 90%|█████████ | 18/20 [00:26<00:02,  1.48s/it]

k=17: gpt_answer=41  true_answer=41
CORRECT


 95%|█████████▌| 19/20 [00:28<00:01,  1.41s/it]

k=18: gpt_answer=41.  true_answer=41
CORRECT


100%|██████████| 20/20 [00:29<00:00,  1.49s/it]


k=19: gpt_answer=41  true_answer=41
CORRECT
accuracy=1.00

Question  9


  5%|▌         | 1/20 [00:01<00:32,  1.69s/it]

k=0: gpt_answer=990.  true_answer=990.0
CORRECT


 10%|█         | 2/20 [00:03<00:30,  1.71s/it]

k=1: gpt_answer=990.  true_answer=990.0
CORRECT


 15%|█▌        | 3/20 [00:05<00:28,  1.67s/it]

k=2: gpt_answer=990  true_answer=990.0
CORRECT


 20%|██        | 4/20 [00:06<00:26,  1.69s/it]

k=3: gpt_answer=990  true_answer=990.0
CORRECT


 25%|██▌       | 5/20 [00:08<00:24,  1.63s/it]

k=4: gpt_answer=990.  true_answer=990.0
CORRECT


 30%|███       | 6/20 [00:09<00:23,  1.65s/it]

k=5: gpt_answer=Total earnins = 990  true_answer=990.0
INCORRECT


 35%|███▌      | 7/20 [00:11<00:20,  1.60s/it]

k=6: gpt_answer=990  true_answer=990.0
CORRECT


 40%|████      | 8/20 [00:13<00:19,  1.60s/it]

k=7: gpt_answer=990  true_answer=990.0
CORRECT


 45%|████▌     | 9/20 [00:14<00:17,  1.56s/it]

k=8: gpt_answer=990  true_answer=990.0
CORRECT


 50%|█████     | 10/20 [00:16<00:15,  1.60s/it]

k=9: gpt_answer=990  true_answer=990.0
CORRECT


 55%|█████▌    | 11/20 [00:17<00:14,  1.65s/it]

k=10: gpt_answer=990.  true_answer=990.0
CORRECT


 60%|██████    | 12/20 [00:19<00:13,  1.72s/it]

k=11: gpt_answer=990  true_answer=990.0
CORRECT


 65%|██████▌   | 13/20 [00:21<00:12,  1.81s/it]

k=12: gpt_answer=990  true_answer=990.0
CORRECT


 70%|███████   | 14/20 [00:26<00:16,  2.68s/it]

k=13: gpt_answer=990  true_answer=990.0
CORRECT


 75%|███████▌  | 15/20 [00:27<00:11,  2.24s/it]

k=14: gpt_answer=990  true_answer=990.0
CORRECT


 80%|████████  | 16/20 [00:29<00:07,  1.99s/it]

k=15: gpt_answer=990.  true_answer=990.0
CORRECT


 85%|████████▌ | 17/20 [00:31<00:05,  1.97s/it]

k=16: gpt_answer=990.  true_answer=990.0
CORRECT


 90%|█████████ | 18/20 [00:32<00:03,  1.89s/it]

k=17: gpt_answer=990  true_answer=990.0
CORRECT


 95%|█████████▌| 19/20 [00:34<00:01,  1.74s/it]

k=18: gpt_answer=990  true_answer=990.0
CORRECT


100%|██████████| 20/20 [00:35<00:00,  1.80s/it]


k=19: gpt_answer=990.  true_answer=990.0
CORRECT
accuracy=0.95

Question  11


  5%|▌         | 1/20 [00:02<00:51,  2.71s/it]

k=0: gpt_answer=5  true_answer=5
CORRECT


 10%|█         | 2/20 [00:05<00:47,  2.63s/it]

k=1: gpt_answer=5  true_answer=5
CORRECT


 15%|█▌        | 3/20 [00:07<00:42,  2.50s/it]

k=2: gpt_answer=5  true_answer=5
CORRECT


 20%|██        | 4/20 [00:10<00:41,  2.62s/it]

k=3: gpt_answer=5  true_answer=5
CORRECT


 25%|██▌       | 5/20 [00:12<00:34,  2.30s/it]

k=4: gpt_answer=5  true_answer=5
CORRECT


 30%|███       | 6/20 [00:14<00:30,  2.15s/it]

k=5: gpt_answer=5  true_answer=5
CORRECT


 35%|███▌      | 7/20 [00:15<00:26,  2.05s/it]

k=6: gpt_answer=5  true_answer=5
CORRECT


 40%|████      | 8/20 [00:17<00:24,  2.07s/it]

k=7: gpt_answer=2  true_answer=5
INCORRECT


 45%|████▌     | 9/20 [00:19<00:22,  2.04s/it]

k=8: gpt_answer=5  true_answer=5
CORRECT


 50%|█████     | 10/20 [00:22<00:21,  2.11s/it]

k=9: gpt_answer=5  true_answer=5
CORRECT


 55%|█████▌    | 11/20 [00:24<00:19,  2.19s/it]

k=10: gpt_answer=1  true_answer=5
INCORRECT


 60%|██████    | 12/20 [00:26<00:17,  2.23s/it]

k=11: gpt_answer=5  true_answer=5
CORRECT


 65%|██████▌   | 13/20 [00:28<00:14,  2.12s/it]

k=12: gpt_answer=1  true_answer=5
INCORRECT


 70%|███████   | 14/20 [00:30<00:12,  2.06s/it]

k=13: gpt_answer=5  true_answer=5
CORRECT


 75%|███████▌  | 15/20 [00:32<00:10,  2.09s/it]

k=14: gpt_answer=2  true_answer=5
INCORRECT


 80%|████████  | 16/20 [00:34<00:07,  1.93s/it]

k=15: gpt_answer=0  true_answer=5
INCORRECT


 85%|████████▌ | 17/20 [00:36<00:05,  1.95s/it]

k=16: gpt_answer=12  true_answer=5
INCORRECT


 90%|█████████ | 18/20 [00:38<00:03,  1.89s/it]

k=17: gpt_answer=0  true_answer=5
INCORRECT


 95%|█████████▌| 19/20 [00:39<00:01,  1.84s/it]

k=18: gpt_answer=9  true_answer=5
INCORRECT


100%|██████████| 20/20 [00:42<00:00,  2.10s/it]


k=19: gpt_answer=5  true_answer=5
CORRECT
accuracy=0.60

Question  12


  5%|▌         | 1/20 [00:01<00:20,  1.08s/it]

k=0: gpt_answer=85  true_answer=85
CORRECT


 10%|█         | 2/20 [00:02<00:18,  1.04s/it]

k=1: gpt_answer=85  true_answer=85
CORRECT


 15%|█▌        | 3/20 [00:03<00:18,  1.11s/it]

k=2: gpt_answer=85  true_answer=85
CORRECT


 20%|██        | 4/20 [00:04<00:19,  1.21s/it]

k=3: gpt_answer=85  true_answer=85
CORRECT


 25%|██▌       | 5/20 [00:05<00:16,  1.13s/it]

k=4: gpt_answer=85  true_answer=85
CORRECT


 30%|███       | 6/20 [00:06<00:15,  1.13s/it]

k=5: gpt_answer=85  true_answer=85
CORRECT


 35%|███▌      | 7/20 [00:07<00:14,  1.14s/it]

k=6: gpt_answer=85  true_answer=85
CORRECT


 40%|████      | 8/20 [00:09<00:13,  1.15s/it]

k=7: gpt_answer=85  true_answer=85
CORRECT


 45%|████▌     | 9/20 [00:10<00:13,  1.18s/it]

k=8: gpt_answer=85  true_answer=85
CORRECT


 50%|█████     | 10/20 [00:11<00:11,  1.12s/it]

k=9: gpt_answer=85  true_answer=85
CORRECT


 55%|█████▌    | 11/20 [00:12<00:09,  1.10s/it]

k=10: gpt_answer=85  true_answer=85
CORRECT


 60%|██████    | 12/20 [00:13<00:08,  1.08s/it]

k=11: gpt_answer=85  true_answer=85
CORRECT


 65%|██████▌   | 13/20 [00:14<00:07,  1.00s/it]

k=12: gpt_answer=85  true_answer=85
CORRECT


 70%|███████   | 14/20 [00:15<00:05,  1.00it/s]

k=13: gpt_answer=85  true_answer=85
CORRECT


 75%|███████▌  | 15/20 [00:16<00:05,  1.00s/it]

k=14: gpt_answer=85  true_answer=85
CORRECT


 80%|████████  | 16/20 [00:17<00:04,  1.12s/it]

k=15: gpt_answer=85  true_answer=85
CORRECT


 85%|████████▌ | 17/20 [00:18<00:03,  1.13s/it]

k=16: gpt_answer=85  true_answer=85
CORRECT


 90%|█████████ | 18/20 [00:20<00:02,  1.18s/it]

k=17: gpt_answer=85  true_answer=85
CORRECT


 95%|█████████▌| 19/20 [00:21<00:01,  1.20s/it]

k=18: gpt_answer=85  true_answer=85
CORRECT


100%|██████████| 20/20 [00:22<00:00,  1.12s/it]


k=19: gpt_answer=85  true_answer=85
CORRECT
accuracy=1.00

Question  13


  5%|▌         | 1/20 [00:01<00:23,  1.25s/it]

k=0: gpt_answer=35  true_answer=35
CORRECT


 10%|█         | 2/20 [00:02<00:25,  1.43s/it]

k=1: gpt_answer=35.  true_answer=35
CORRECT


 15%|█▌        | 3/20 [00:04<00:26,  1.54s/it]

k=2: gpt_answer=22.50  true_answer=35
INCORRECT


 20%|██        | 4/20 [00:06<00:26,  1.63s/it]

k=3: gpt_answer=35  true_answer=35
CORRECT


 25%|██▌       | 5/20 [00:07<00:23,  1.59s/it]

k=4: gpt_answer=22.5  true_answer=35
INCORRECT


 30%|███       | 6/20 [00:09<00:20,  1.47s/it]

k=5: gpt_answer=30  true_answer=35
INCORRECT


 35%|███▌      | 7/20 [00:10<00:18,  1.45s/it]

k=6: gpt_answer=27.50  true_answer=35
INCORRECT


 40%|████      | 8/20 [00:11<00:16,  1.39s/it]

k=7: gpt_answer=35  true_answer=35
CORRECT


 45%|████▌     | 9/20 [00:13<00:15,  1.38s/it]

k=8: gpt_answer=35  true_answer=35
CORRECT


 50%|█████     | 10/20 [00:14<00:14,  1.42s/it]

k=9: gpt_answer=35  true_answer=35
CORRECT


 55%|█████▌    | 11/20 [00:16<00:13,  1.52s/it]

k=10: gpt_answer=22.50  true_answer=35
INCORRECT


 60%|██████    | 12/20 [00:17<00:12,  1.54s/it]

k=11: gpt_answer=35.  true_answer=35
CORRECT


 65%|██████▌   | 13/20 [00:19<00:10,  1.54s/it]

k=12: gpt_answer=35  true_answer=35
CORRECT


 70%|███████   | 14/20 [00:20<00:08,  1.46s/it]

k=13: gpt_answer=35  true_answer=35
CORRECT


 75%|███████▌  | 15/20 [00:22<00:07,  1.46s/it]

k=14: gpt_answer=32.50  true_answer=35
INCORRECT


 80%|████████  | 16/20 [00:23<00:05,  1.43s/it]

k=15: gpt_answer=22.50  true_answer=35
INCORRECT


 85%|████████▌ | 17/20 [00:24<00:04,  1.40s/it]

k=16: gpt_answer=32.50  true_answer=35
INCORRECT


 90%|█████████ | 18/20 [00:26<00:02,  1.42s/it]

k=17: gpt_answer=35  true_answer=35
CORRECT


 95%|█████████▌| 19/20 [00:27<00:01,  1.39s/it]

k=18: gpt_answer=22.5  true_answer=35
INCORRECT


100%|██████████| 20/20 [00:29<00:00,  1.45s/it]


k=19: gpt_answer=45  true_answer=35
INCORRECT
accuracy=0.50

Question  14


  5%|▌         | 1/20 [00:01<00:23,  1.22s/it]

k=0: gpt_answer=5.  true_answer=5
CORRECT


 10%|█         | 2/20 [00:02<00:22,  1.23s/it]

k=1: gpt_answer=5  true_answer=5
CORRECT


 15%|█▌        | 3/20 [00:03<00:22,  1.30s/it]

k=2: gpt_answer=5  true_answer=5
CORRECT


 20%|██        | 4/20 [00:05<00:20,  1.29s/it]

k=3: gpt_answer=5  true_answer=5
CORRECT


 25%|██▌       | 5/20 [00:06<00:19,  1.30s/it]

k=4: gpt_answer=5  true_answer=5
CORRECT


 30%|███       | 6/20 [00:07<00:18,  1.31s/it]

k=5: gpt_answer=5  true_answer=5
CORRECT


 35%|███▌      | 7/20 [00:08<00:16,  1.28s/it]

k=6: gpt_answer=5  true_answer=5
CORRECT


 40%|████      | 8/20 [00:10<00:16,  1.35s/it]

k=7: gpt_answer=5  true_answer=5
CORRECT


 45%|████▌     | 9/20 [00:12<00:15,  1.44s/it]

k=8: gpt_answer=5  true_answer=5
CORRECT


 50%|█████     | 10/20 [00:13<00:14,  1.46s/it]

k=9: gpt_answer=5  true_answer=5
CORRECT


 55%|█████▌    | 11/20 [00:15<00:13,  1.50s/it]

k=10: gpt_answer=5  true_answer=5
CORRECT


 60%|██████    | 12/20 [00:16<00:12,  1.52s/it]

k=11: gpt_answer=5  true_answer=5
CORRECT


 65%|██████▌   | 13/20 [00:18<00:10,  1.54s/it]

k=12: gpt_answer=5  true_answer=5
CORRECT


 70%|███████   | 14/20 [00:19<00:08,  1.45s/it]

k=13: gpt_answer=5  true_answer=5
CORRECT


 75%|███████▌  | 15/20 [00:20<00:06,  1.39s/it]

k=14: gpt_answer=5  true_answer=5
CORRECT


 80%|████████  | 16/20 [00:22<00:05,  1.33s/it]

k=15: gpt_answer=5.  true_answer=5
CORRECT


 85%|████████▌ | 17/20 [00:23<00:03,  1.29s/it]

k=16: gpt_answer=5  true_answer=5
CORRECT


 90%|█████████ | 18/20 [00:24<00:02,  1.30s/it]

k=17: gpt_answer=5.  true_answer=5
CORRECT


 95%|█████████▌| 19/20 [00:25<00:01,  1.30s/it]

k=18: gpt_answer=5.  true_answer=5
CORRECT


100%|██████████| 20/20 [00:27<00:00,  1.35s/it]


k=19: gpt_answer=5  true_answer=5
CORRECT
accuracy=1.00

Question  15


  5%|▌         | 1/20 [00:02<00:43,  2.28s/it]

k=0: gpt_answer=450000  true_answer=448000.0
INCORRECT


 10%|█         | 2/20 [00:03<00:32,  1.82s/it]

k=1: gpt_answer=450000.  true_answer=448000.0
INCORRECT


 15%|█▌        | 3/20 [00:05<00:30,  1.82s/it]

k=2: gpt_answer=150450000.  true_answer=448000.0
INCORRECT


 20%|██        | 4/20 [00:07<00:27,  1.70s/it]

k=3: gpt_answer=450000.  true_answer=448000.0
INCORRECT


 25%|██▌       | 5/20 [00:08<00:26,  1.74s/it]

k=4: gpt_answer=450000.  true_answer=448000.0
INCORRECT


 30%|███       | 6/20 [00:10<00:23,  1.69s/it]

k=5: gpt_answer=450000.  true_answer=448000.0
INCORRECT


 35%|███▌      | 7/20 [00:12<00:21,  1.65s/it]

k=6: gpt_answer=450000.  true_answer=448000.0
INCORRECT


 40%|████      | 8/20 [00:14<00:22,  1.86s/it]

k=7: gpt_answer=150450000.  true_answer=448000.0
INCORRECT


 45%|████▌     | 9/20 [00:17<00:23,  2.10s/it]

k=8: gpt_answer=624900000  true_answer=448000.0
INCORRECT


 50%|█████     | 10/20 [00:19<00:22,  2.23s/it]

k=9: gpt_answer=450000.  true_answer=448000.0
INCORRECT


 55%|█████▌    | 11/20 [00:20<00:17,  1.98s/it]

k=10: gpt_answer=450000.  true_answer=448000.0
INCORRECT


 60%|██████    | 12/20 [00:22<00:14,  1.81s/it]

k=11: gpt_answer=450000  true_answer=448000.0
INCORRECT


 65%|██████▌   | 13/20 [00:24<00:12,  1.83s/it]

k=12: gpt_answer=450000.  true_answer=448000.0
INCORRECT


 70%|███████   | 14/20 [00:25<00:10,  1.79s/it]

k=13: gpt_answer=450000.  true_answer=448000.0
INCORRECT


 75%|███████▌  | 15/20 [00:28<00:09,  1.95s/it]

k=14: gpt_answer=99550000.  true_answer=448000.0
INCORRECT


 80%|████████  | 16/20 [00:29<00:07,  1.88s/it]

k=15: gpt_answer=450000.  true_answer=448000.0
INCORRECT


 85%|████████▌ | 17/20 [00:31<00:05,  1.77s/it]

k=16: gpt_answer=45000  true_answer=448000.0
INCORRECT


 90%|█████████ | 18/20 [00:33<00:03,  1.74s/it]

k=17: gpt_answer=450000.  true_answer=448000.0
INCORRECT


 95%|█████████▌| 19/20 [00:35<00:01,  1.78s/it]

k=18: gpt_answer=450000  true_answer=448000.0
INCORRECT


100%|██████████| 20/20 [00:36<00:00,  1.83s/it]


k=19: gpt_answer=450000  true_answer=448000.0
INCORRECT
accuracy=0.00

Question  19


  5%|▌         | 1/20 [00:02<00:42,  2.23s/it]

k=0: gpt_answer=16  true_answer=16
CORRECT


 10%|█         | 2/20 [00:04<00:37,  2.10s/it]

k=1: gpt_answer=16  true_answer=16
CORRECT


 15%|█▌        | 3/20 [00:05<00:28,  1.70s/it]

k=2: gpt_answer=16  true_answer=16
CORRECT


 20%|██        | 4/20 [00:06<00:23,  1.48s/it]

k=3: gpt_answer=16  true_answer=16
CORRECT


 25%|██▌       | 5/20 [00:08<00:25,  1.72s/it]

k=4: gpt_answer=16  true_answer=16
CORRECT


 30%|███       | 6/20 [00:09<00:21,  1.54s/it]

k=5: gpt_answer=16  true_answer=16
CORRECT


 35%|███▌      | 7/20 [00:11<00:19,  1.47s/it]

k=6: gpt_answer=16  true_answer=16
CORRECT


 40%|████      | 8/20 [00:12<00:16,  1.39s/it]

k=7: gpt_answer=12  true_answer=16
INCORRECT


 45%|████▌     | 9/20 [00:14<00:17,  1.58s/it]

k=8: gpt_answer=16  true_answer=16
CORRECT


 50%|█████     | 10/20 [00:15<00:15,  1.50s/it]

k=9: gpt_answer=12  true_answer=16
INCORRECT


 55%|█████▌    | 11/20 [00:16<00:12,  1.39s/it]

k=10: gpt_answer=16  true_answer=16
CORRECT


 60%|██████    | 12/20 [00:18<00:11,  1.39s/it]

k=11: gpt_answer=16  true_answer=16
CORRECT


 65%|██████▌   | 13/20 [00:20<00:10,  1.55s/it]

k=12: gpt_answer=16  true_answer=16
CORRECT


 70%|███████   | 14/20 [00:21<00:08,  1.46s/it]

k=13: gpt_answer=16  true_answer=16
CORRECT


 75%|███████▌  | 15/20 [00:22<00:06,  1.36s/it]

k=14: gpt_answer=16  true_answer=16
CORRECT


 80%|████████  | 16/20 [00:23<00:05,  1.31s/it]

k=15: gpt_answer=16  true_answer=16
CORRECT


 85%|████████▌ | 17/20 [00:25<00:04,  1.41s/it]

k=16: gpt_answer=16  true_answer=16
CORRECT


 90%|█████████ | 18/20 [00:27<00:03,  1.57s/it]

k=17: gpt_answer=16  true_answer=16
CORRECT


 95%|█████████▌| 19/20 [00:28<00:01,  1.44s/it]

k=18: gpt_answer=16  true_answer=16
CORRECT


100%|██████████| 20/20 [00:29<00:00,  1.49s/it]


k=19: gpt_answer=16  true_answer=16
CORRECT
accuracy=0.90

Question  20


  5%|▌         | 1/20 [00:01<00:27,  1.44s/it]

k=0: gpt_answer=38  true_answer=38
CORRECT


 10%|█         | 2/20 [00:03<00:28,  1.56s/it]

k=1: gpt_answer=38  true_answer=38
CORRECT


 15%|█▌        | 3/20 [00:04<00:28,  1.65s/it]

k=2: gpt_answer=38  true_answer=38
CORRECT


 20%|██        | 4/20 [00:06<00:25,  1.62s/it]

k=3: gpt_answer=38  true_answer=38
CORRECT


 25%|██▌       | 5/20 [00:07<00:23,  1.56s/it]

k=4: gpt_answer=38  true_answer=38
CORRECT


 30%|███       | 6/20 [00:09<00:21,  1.51s/it]

k=5: gpt_answer=38  true_answer=38
CORRECT


 35%|███▌      | 7/20 [00:10<00:19,  1.52s/it]

k=6: gpt_answer=38  true_answer=38
CORRECT


 40%|████      | 8/20 [00:12<00:17,  1.49s/it]

k=7: gpt_answer=38  true_answer=38
CORRECT


 45%|████▌     | 9/20 [00:13<00:16,  1.53s/it]

k=8: gpt_answer=38  true_answer=38
CORRECT


 50%|█████     | 10/20 [00:15<00:14,  1.49s/it]

k=9: gpt_answer=38  true_answer=38
CORRECT


 55%|█████▌    | 11/20 [00:17<00:15,  1.68s/it]

k=10: gpt_answer=38  true_answer=38
CORRECT


 60%|██████    | 12/20 [00:18<00:12,  1.60s/it]

k=11: gpt_answer=38  true_answer=38
CORRECT


 65%|██████▌   | 13/20 [00:20<00:10,  1.55s/it]

k=12: gpt_answer=38  true_answer=38
CORRECT


 70%|███████   | 14/20 [00:21<00:09,  1.55s/it]

k=13: gpt_answer=38  true_answer=38
CORRECT


 75%|███████▌  | 15/20 [00:23<00:07,  1.52s/it]

k=14: gpt_answer=38  true_answer=38
CORRECT


 80%|████████  | 16/20 [00:24<00:06,  1.55s/it]

k=15: gpt_answer=38  true_answer=38
CORRECT


 85%|████████▌ | 17/20 [00:26<00:04,  1.50s/it]

k=16: gpt_answer=38  true_answer=38
CORRECT


 90%|█████████ | 18/20 [00:27<00:02,  1.46s/it]

k=17: gpt_answer=38  true_answer=38
CORRECT


 95%|█████████▌| 19/20 [00:28<00:01,  1.44s/it]

k=18: gpt_answer=38  true_answer=38
CORRECT


100%|██████████| 20/20 [00:30<00:00,  1.54s/it]


k=19: gpt_answer=38  true_answer=38
CORRECT
accuracy=1.00

Question  23


  5%|▌         | 1/20 [00:01<00:28,  1.51s/it]

k=0: gpt_answer=5  true_answer=5
CORRECT


 10%|█         | 2/20 [00:03<00:27,  1.50s/it]

k=1: gpt_answer=5  true_answer=5
CORRECT


 15%|█▌        | 3/20 [00:04<00:27,  1.62s/it]

k=2: gpt_answer=5  true_answer=5
CORRECT


 20%|██        | 4/20 [00:06<00:25,  1.61s/it]

k=3: gpt_answer=5  true_answer=5
CORRECT


 25%|██▌       | 5/20 [00:07<00:24,  1.61s/it]

k=4: gpt_answer=5  true_answer=5
CORRECT


 30%|███       | 6/20 [00:09<00:22,  1.63s/it]

k=5: gpt_answer=5  true_answer=5
CORRECT


 35%|███▌      | 7/20 [00:10<00:19,  1.53s/it]

k=6: gpt_answer=5  true_answer=5
CORRECT


 40%|████      | 8/20 [00:12<00:20,  1.67s/it]

k=7: gpt_answer=5  true_answer=5
CORRECT


 45%|████▌     | 9/20 [00:14<00:17,  1.59s/it]

k=8: gpt_answer=5  true_answer=5
CORRECT


 50%|█████     | 10/20 [00:15<00:15,  1.59s/it]

k=9: gpt_answer=5  true_answer=5
CORRECT


 55%|█████▌    | 11/20 [00:18<00:15,  1.74s/it]

k=10: gpt_answer=5  true_answer=5
CORRECT


 60%|██████    | 12/20 [00:19<00:13,  1.68s/it]

k=11: gpt_answer=5  true_answer=5
CORRECT


 65%|██████▌   | 13/20 [00:20<00:11,  1.58s/it]

k=12: gpt_answer=5  true_answer=5
CORRECT


 70%|███████   | 14/20 [00:22<00:09,  1.55s/it]

k=13: gpt_answer=5  true_answer=5
CORRECT


 75%|███████▌  | 15/20 [00:24<00:07,  1.58s/it]

k=14: gpt_answer=5  true_answer=5
CORRECT


 80%|████████  | 16/20 [00:25<00:06,  1.54s/it]

k=15: gpt_answer=5  true_answer=5
CORRECT


 85%|████████▌ | 17/20 [00:27<00:04,  1.55s/it]

k=16: gpt_answer=5  true_answer=5
CORRECT


 90%|█████████ | 18/20 [00:29<00:03,  1.68s/it]

k=17: gpt_answer=5  true_answer=5
CORRECT


 95%|█████████▌| 19/20 [00:30<00:01,  1.68s/it]

k=18: gpt_answer=5  true_answer=5
CORRECT


100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


k=19: gpt_answer=5  true_answer=5
CORRECT
accuracy=1.00

Question  24


  5%|▌         | 1/20 [00:00<00:18,  1.01it/s]

k=0: gpt_answer=62  true_answer=62
CORRECT


 10%|█         | 2/20 [00:02<00:21,  1.21s/it]

k=1: gpt_answer=62  true_answer=62
CORRECT


 15%|█▌        | 3/20 [00:03<00:18,  1.12s/it]

k=2: gpt_answer=62  true_answer=62
CORRECT


 20%|██        | 4/20 [00:04<00:16,  1.06s/it]

k=3: gpt_answer=62  true_answer=62
CORRECT


 25%|██▌       | 5/20 [00:05<00:15,  1.05s/it]

k=4: gpt_answer=62  true_answer=62
CORRECT


 30%|███       | 6/20 [00:06<00:14,  1.05s/it]

k=5: gpt_answer=62  true_answer=62
CORRECT


 35%|███▌      | 7/20 [00:07<00:14,  1.09s/it]

k=6: gpt_answer=62  true_answer=62
CORRECT


 40%|████      | 8/20 [00:08<00:13,  1.09s/it]

k=7: gpt_answer=62  true_answer=62
CORRECT


 45%|████▌     | 9/20 [00:09<00:11,  1.07s/it]

k=8: gpt_answer=62  true_answer=62
CORRECT


 50%|█████     | 10/20 [00:11<00:12,  1.25s/it]

k=9: gpt_answer=62  true_answer=62
CORRECT


 55%|█████▌    | 11/20 [00:12<00:10,  1.21s/it]

k=10: gpt_answer=62  true_answer=62
CORRECT


 60%|██████    | 12/20 [00:13<00:09,  1.14s/it]

k=11: gpt_answer=62  true_answer=62
CORRECT


 65%|██████▌   | 13/20 [00:14<00:08,  1.16s/it]

k=12: gpt_answer=62  true_answer=62
CORRECT


 70%|███████   | 14/20 [00:15<00:06,  1.11s/it]

k=13: gpt_answer=62  true_answer=62
CORRECT


 75%|███████▌  | 15/20 [00:16<00:05,  1.13s/it]

k=14: gpt_answer=62  true_answer=62
CORRECT


 80%|████████  | 16/20 [00:17<00:04,  1.10s/it]

k=15: gpt_answer=62  true_answer=62
CORRECT


 85%|████████▌ | 17/20 [00:18<00:03,  1.07s/it]

k=16: gpt_answer=62  true_answer=62
CORRECT


 90%|█████████ | 18/20 [00:19<00:02,  1.09s/it]

k=17: gpt_answer=62  true_answer=62
CORRECT


 95%|█████████▌| 19/20 [00:21<00:01,  1.09s/it]

k=18: gpt_answer=62  true_answer=62
CORRECT


100%|██████████| 20/20 [00:22<00:00,  1.14s/it]


k=19: gpt_answer=62  true_answer=62
CORRECT
accuracy=1.00

Question  25


  5%|▌         | 1/20 [00:01<00:22,  1.17s/it]

k=0: gpt_answer=110  true_answer=110
CORRECT


 10%|█         | 2/20 [00:02<00:22,  1.24s/it]

k=1: gpt_answer=110  true_answer=110
CORRECT


 15%|█▌        | 3/20 [00:03<00:21,  1.25s/it]

k=2: gpt_answer=110  true_answer=110
CORRECT


 20%|██        | 4/20 [00:05<00:23,  1.47s/it]

k=3: gpt_answer=110  true_answer=110
CORRECT


 25%|██▌       | 5/20 [00:06<00:20,  1.35s/it]

k=4: gpt_answer=110  true_answer=110
CORRECT


 30%|███       | 6/20 [00:07<00:17,  1.26s/it]

k=5: gpt_answer=110  true_answer=110
CORRECT


 35%|███▌      | 7/20 [00:09<00:17,  1.32s/it]

k=6: gpt_answer=110  true_answer=110
CORRECT


 40%|████      | 8/20 [00:10<00:15,  1.26s/it]

k=7: gpt_answer=110  true_answer=110
CORRECT


 45%|████▌     | 9/20 [00:11<00:14,  1.29s/it]

k=8: gpt_answer=110  true_answer=110
CORRECT


 50%|█████     | 10/20 [00:12<00:12,  1.29s/it]

k=9: gpt_answer=110  true_answer=110
CORRECT


 55%|█████▌    | 11/20 [00:14<00:11,  1.30s/it]

k=10: gpt_answer=110  true_answer=110
CORRECT


 60%|██████    | 12/20 [00:16<00:11,  1.43s/it]

k=11: gpt_answer=110  true_answer=110
CORRECT


 65%|██████▌   | 13/20 [00:17<00:09,  1.32s/it]

k=12: gpt_answer=110  true_answer=110
CORRECT


 70%|███████   | 14/20 [00:18<00:07,  1.31s/it]

k=13: gpt_answer=110  true_answer=110
CORRECT


 75%|███████▌  | 15/20 [00:19<00:05,  1.19s/it]

k=14: gpt_answer=110  true_answer=110
CORRECT


 80%|████████  | 16/20 [00:20<00:05,  1.27s/it]

k=15: gpt_answer=110  true_answer=110
CORRECT


 85%|████████▌ | 17/20 [00:22<00:03,  1.27s/it]

k=16: gpt_answer=110  true_answer=110
CORRECT


 90%|█████████ | 18/20 [00:23<00:02,  1.25s/it]

k=17: gpt_answer=110  true_answer=110
CORRECT


 95%|█████████▌| 19/20 [00:24<00:01,  1.34s/it]

k=18: gpt_answer=110  true_answer=110
CORRECT


100%|██████████| 20/20 [00:26<00:00,  1.31s/it]

k=19: gpt_answer=110  true_answer=110
CORRECT
accuracy=1.00

Total Inaccurate:  8



