Code from A3 problem1-prompt.ipynb and https://github.com/SewoongLee/reproduce-llama3-arithmetic/blob/main/llama3-tutorial-gsm8k.ipynb

You need to set your OPENAI_API_KEY environment variable in .env

In [34]:
from openai import OpenAI
from tqdm import tqdm
import textwrap
import dotenv
import os
%load_ext dotenv
%dotenv

# Shared OpenAI client used by generate_gpt_response. The API key is read from the
# OPENAI_API_KEY environment variable (os.environ.get yields None when it is unset).
client=OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# NOTE: the default model below is gpt-3.5-turbo; pass model="gpt-4o-mini" to use
# gpt-4o-mini, whose pricing was noted as
# $0.150 / 1M input tokens, $0.600 / 1M output tokens
def generate_gpt_response(prompt, messages=None, model="gpt-3.5-turbo", temperature=0.7):
    """Send ``prompt`` as the next user turn and return the updated conversation.

    Args:
        prompt: Text of the new user message.
        messages: Prior conversation as a list of ``{"role": ..., "content": ...}``
            dicts. A caller-supplied list is extended in place; when omitted, a
            fresh list is created for this call.
        model: Name of the chat model to query.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The conversation list, ending with the assistant's reply.
    """
    # A literal [] default would be created once and shared by every call that
    # omits `messages`, silently accumulating history across conversations.
    if messages is None:
        messages = []
    messages.append({"role": "user", "content": prompt})

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    content = response.choices[0].message.content
    messages.append({"role": "assistant", "content": content})
    return messages


def print_message(message, width=80):
    """Print one chat message as "<role>: <content>", wrapped to ``width`` columns.

    Every printed row is padded to ``width`` characters and framed by a single
    space on each side.
    """
    labelled = ": ".join((message["role"], message["content"]))
    for row in textwrap.wrap(labelled, width=width):
        print(" " + row.ljust(width) + " ")
    

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv
cannot find .env file


Load the ablated GSM8K JSONL files

In [35]:
import os
import random
import json
import re

# Fixed seed so the same variants are sampled from each template file on every run.
random.seed(56)

ablated_dataset = []   # one list of sampled problem dicts per template file
q_indices = []         # question index parsed from each file name, same order
folder_path = "gsm_template/output"

# Files are named like "q<index>--NUM100.jsonl". Parsing the index from the file
# name (not from the joined path) keeps digits in directory names from being
# picked up, and lets unrelated files in the folder be skipped instead of
# crashing the sort.
template_name_re = re.compile(r'^q(\d+)--')

files = os.listdir(folder_path)
indexed_files = []
for candidate in files:
    name_match = template_name_re.match(candidate)
    if name_match:
        indexed_files.append((int(name_match.group(1)), candidate))
# Sort on the numeric index only, so "q11" comes after "q9".
indexed_files = sorted(indexed_files, key=lambda pair: pair[0])
sorted_files = [name for _, name in indexed_files]

print(sorted_files)
print()

for q_index, file_name in indexed_files:
    f_path = os.path.join(folder_path, file_name)
    q_indices.append(q_index)

    with open(f_path, 'r', encoding='utf-8') as f:
        # One JSON object per line; blank lines (e.g. a trailing newline) are skipped.
        records = [json.loads(line) for line in f if line.strip()]

    # Keep at most 10 variants per template.
    random_lines = random.sample(records, min(10, len(records)))
    print(random_lines)
    print()

    ablated_dataset.append(random_lines)

print()
print(len(ablated_dataset))
print(q_indices)


['q0--NUM100.jsonl', 'q1--NUM100.jsonl', 'q2--NUM100.jsonl', 'q3--NUM100.jsonl', 'q4--NUM100.jsonl', 'q5--NUM100.jsonl', 'q6--NUM100.jsonl', 'q7--NUM100.jsonl', 'q8--NUM100.jsonl', 'q9--NUM100.jsonl', 'q11--NUM100.jsonl', 'q12--NUM100.jsonl', 'q13--NUM100.jsonl', 'q14--NUM100.jsonl', 'q15--NUM100.jsonl', 'q19--NUM100.jsonl', 'q20--NUM100.jsonl', 'q23--NUM100.jsonl', 'q24--NUM100.jsonl', 'q25--NUM100.jsonl']

[{'problem': 'Kc Otis sold clips to 48 of their friends in April, and then they sold 50% as many clips in May.How many clips did Kc Otis sell altogether in April and May?', 'result': '72', 'template_name': 0, 'idx': 71}, {'problem': 'Josh Joaquin sold clips to 48 of their friends in April, and then they sold 50% as many clips in May.How many clips did Josh Joaquin sell altogether in April and May?', 'result': '72', 'template_name': 0, 'idx': 1}, {'problem': 'Cristina Boogie sold clips to 48 of their friends in April, and then they sold 50% as many clips in May.How many clips did Cr

We will use the following system prompt

In [36]:
# Instructions sent to the model with every question: solve step by step and put the
# final numeric answer after the "####" delimiter, which extract_ans_from_response splits on.
system_prompt= \
'''You are an expert math tutor specializing in grade-school-level problems like those in the GSM8K dataset. 
Solve each problem step by step, explaining the reasoning and calculations clearly and concisely to help the student understand the process. 
Please return your final answer after the delimiter \"####\" as a numeric number'''

We can obtain answers from the GPT API using few-shot in-context learning and a system prompt.

In [37]:
import re

def extract_ans_from_response(answer: str, eos=None):
    """Pull the final answer out of a model reply that uses the "####" delimiter.

    Args:
        answer: Full text of the model reply.
        eos: Optional end-of-sequence marker; when given, only the text before
            its first occurrence is considered.

    Returns:
        An ``int`` when the extracted text parses as an integer, otherwise the
        cleaned text as a ``str`` (e.g. ``"32.50"``). Returns ``"NA"`` when the
        reply has no "####" delimiter or nothing usable follows it.
    """
    if eos:
        answer = answer.split(eos)[0].strip()

    segments = answer.split('####')

    if len(segments) > 2:
        # Several delimiters: take the second-to-last segment, which is the
        # answer when the reply wraps it as "#### 72 ####".
        candidate = segments[-2]
    elif len(segments) == 2:
        candidate = segments[-1]
    else:
        return "NA"

    candidate = candidate.strip()
    # Drop thousands separators and unit symbols so "$1,250" parses as 1250.
    # ('g' is presumably a gram suffix - confirm against real replies.)
    for remove_char in (',', '$', '%', 'g'):
        candidate = candidate.replace(remove_char, '')

    # A sentence-ending period ("10.") would otherwise block integer conversion.
    candidate = candidate.strip().rstrip('.').strip()
    if not candidate:
        return "NA"

    try:
        return int(candidate)
    except ValueError:
        return candidate

# def extract_ans_from_response(answer: str, eos=None):
#     if eos:
#         answer = answer.split(eos)[0].strip()

#     answer = answer.split('####')[-1].strip()

#     for remove_char in [',', '$', '%', 'g']:
#         answer = answer.replace(remove_char, '')

#     try:
#         return int(answer)
#     except ValueError:
#         return answer

# def get_num_answer(text):
#     num_matches = re.findall(r'\d+\.?\d*', text)

#     if not num_matches:
#         return None
    
#     final_answer = num_matches[-1]

#     return final_answer

# Implement the function answer_from_gpt(question,temperature,num_shots), which should return the gpt answer as a string
# (see how it is called in the function evaluate_fewshots below)
# temperature is the temperature parameter passed to gpt api
# if num_shots>0, you should use the first num_shots examples in train_data for in-context learning
# Please use the system prompt provided above (see lecture10-gpt.ipynb for examples of system prompt usage)
#
def answer_from_gpt(question, temperature, num_shots):
    """Ask the model one question and return its extracted final answer.

    Args:
        question: Problem text to solve.
        temperature: Sampling temperature forwarded to the API call.
        num_shots: Number of in-context examples requested. NOTE: currently
            ignored, so every call is zero-shot. Few-shot prompting would need
            a ``train_data`` list of question/answer examples, and none is
            visible in this notebook.

    Returns:
        The value produced by ``extract_ans_from_response`` for the reply: an
        ``int`` or a ``str`` ("NA" when the reply has no "####" delimiter).
    """
    # Send the instructions with the dedicated "system" role instead of
    # prepending them to the user's question, so the model treats them as
    # standing instructions and the user turn holds only the problem.
    conversation = [{"role": "system", "content": system_prompt}]

    messages = generate_gpt_response(
        prompt=question,
        messages=conversation,
        temperature=temperature,
    )
    reply = messages[-1]["content"]
    return extract_ans_from_response(reply)

Evaluate on a small subset of gsm8k test data

In [38]:
def evaluate_fewshots(data, temperature,num_shots):
    """Query the model on every example in ``data`` and return the accuracy.

    Args:
        data: Sequence of dicts with a 'problem' text and its expected 'result'.
        temperature: Sampling temperature forwarded to ``answer_from_gpt``.
        num_shots: Number of in-context examples forwarded to ``answer_from_gpt``.

    Returns:
        Fraction of examples whose answer numerically matches the expected
        result; 0.0 when ``data`` is empty.
    """
    import math  # local import: only this function needs it

    if not data:
        # Avoid dividing by zero when a template produced no samples.
        print('no examples to evaluate')
        return 0.0

    num_corrects=0
    for k, example in enumerate(tqdm(data)):
        gpt_answer=answer_from_gpt(example['problem'],temperature=temperature,num_shots=num_shots)
        true_answer= example['result']
        print(f'k={k}: gpt_answer={gpt_answer}  true_answer={true_answer}')   # this is for debug only

        try:
            # Compare as floats so "5" matches "5.0"; isclose absorbs float
            # round-off in the stored results without accepting wrong answers.
            is_correct = gpt_answer != "NA" and math.isclose(
                float(gpt_answer), float(true_answer), rel_tol=1e-9, abs_tol=1e-9
            )
        except (TypeError, ValueError):
            # Non-numeric text on either side counts as a miss.
            is_correct = False

        # Every example gets a verdict line, including "NA" answers.
        if is_correct:
            print("CORRECT")
            num_corrects += 1
        else:
            print("INCORRECT")

    accuracy = num_corrects/len(data)
    print(f'accuracy={accuracy:0.2f}')
    return accuracy

# Evaluation settings shared by every question template.
temperature = 0
num_shots = 0

# Counts templates on which at least one sampled variant was answered wrongly.
total_inaccurate = 0

print(f'=== temperature={temperature} num_shots={num_shots} ===')
for q_index, curr_ablation in zip(q_indices, ablated_dataset):
    print("Question ", q_index)
    accuracy = evaluate_fewshots(curr_ablation, temperature, num_shots)
    if accuracy < 1:
        total_inaccurate += 1
    print()

print("Total Inaccurate: ", total_inaccurate)

=== temperature=0 num_shots=0 ===
Question  0


 10%|█         | 1/10 [00:01<00:12,  1.35s/it]

k=0: gpt_answer=72  true_answer=72
CORRECT


 20%|██        | 2/10 [00:02<00:10,  1.33s/it]

k=1: gpt_answer=72  true_answer=72
CORRECT


 30%|███       | 3/10 [00:03<00:09,  1.33s/it]

k=2: gpt_answer=72  true_answer=72
CORRECT


 40%|████      | 4/10 [00:05<00:08,  1.45s/it]

k=3: gpt_answer=72  true_answer=72
CORRECT


 50%|█████     | 5/10 [00:06<00:07,  1.41s/it]

k=4: gpt_answer=72  true_answer=72
CORRECT


 60%|██████    | 6/10 [00:08<00:05,  1.35s/it]

k=5: gpt_answer=72  true_answer=72
CORRECT


 70%|███████   | 7/10 [00:09<00:04,  1.41s/it]

k=6: gpt_answer=72  true_answer=72
CORRECT


 80%|████████  | 8/10 [00:11<00:02,  1.41s/it]

k=7: gpt_answer=72  true_answer=72
CORRECT


 90%|█████████ | 9/10 [00:14<00:02,  2.00s/it]

k=8: gpt_answer=72  true_answer=72
CORRECT


100%|██████████| 10/10 [00:15<00:00,  1.56s/it]


k=9: gpt_answer=72  true_answer=72
CORRECT
accuracy=1.00

Question  1


 10%|█         | 1/10 [00:01<00:09,  1.06s/it]

k=0: gpt_answer=9.9996  true_answer=10
INCORRECT


 20%|██        | 2/10 [00:02<00:09,  1.13s/it]

k=1: gpt_answer=10.  true_answer=10
CORRECT


 30%|███       | 3/10 [00:03<00:07,  1.12s/it]

k=2: gpt_answer=10  true_answer=10
CORRECT


 40%|████      | 4/10 [00:04<00:06,  1.05s/it]

k=3: gpt_answer=10  true_answer=10
CORRECT


 50%|█████     | 5/10 [00:05<00:05,  1.12s/it]

k=4: gpt_answer=9.9996  true_answer=10
INCORRECT


 60%|██████    | 6/10 [00:06<00:04,  1.16s/it]

k=5: gpt_answer=10.0  true_answer=10
CORRECT


 70%|███████   | 7/10 [00:08<00:03,  1.27s/it]

k=6: gpt_answer=10.0  true_answer=10
CORRECT


 80%|████████  | 8/10 [00:09<00:02,  1.33s/it]

k=7: gpt_answer=9.9996  true_answer=10
INCORRECT


 90%|█████████ | 9/10 [00:10<00:01,  1.29s/it]

k=8: gpt_answer=10.  true_answer=10
CORRECT


100%|██████████| 10/10 [00:12<00:00,  1.24s/it]


k=9: gpt_answer=10.00  true_answer=10
CORRECT
accuracy=0.70

Question  2


 10%|█         | 1/10 [00:01<00:11,  1.31s/it]

k=0: gpt_answer=5  true_answer=5.0
CORRECT


 20%|██        | 2/10 [00:03<00:12,  1.58s/it]

k=1: gpt_answer=5  true_answer=5.0
CORRECT


 30%|███       | 3/10 [00:04<00:10,  1.46s/it]

k=2: gpt_answer=5  true_answer=5.0
CORRECT


 40%|████      | 4/10 [00:05<00:08,  1.44s/it]

k=3: gpt_answer=5  true_answer=5.0
CORRECT


 50%|█████     | 5/10 [00:07<00:07,  1.45s/it]

k=4: gpt_answer=5.  true_answer=5.0
CORRECT


 60%|██████    | 6/10 [00:08<00:05,  1.41s/it]

k=5: gpt_answer=5  true_answer=5.0
CORRECT


 70%|███████   | 7/10 [00:10<00:04,  1.48s/it]

k=6: gpt_answer=5  true_answer=5.0
CORRECT


 80%|████████  | 8/10 [00:11<00:02,  1.46s/it]

k=7: gpt_answer=5  true_answer=5.0
CORRECT


 90%|█████████ | 9/10 [00:13<00:01,  1.47s/it]

k=8: gpt_answer=5  true_answer=5.0
CORRECT


100%|██████████| 10/10 [00:14<00:00,  1.45s/it]


k=9: gpt_answer=5  true_answer=5.0
CORRECT
accuracy=1.00

Question  3


 10%|█         | 1/10 [00:00<00:08,  1.03it/s]

k=0: gpt_answer=42  true_answer=42
CORRECT


 20%|██        | 2/10 [00:02<00:08,  1.07s/it]

k=1: gpt_answer=42  true_answer=42
CORRECT


 30%|███       | 3/10 [00:03<00:07,  1.00s/it]

k=2: gpt_answer=42  true_answer=42
CORRECT


 40%|████      | 4/10 [00:04<00:06,  1.09s/it]

k=3: gpt_answer=42  true_answer=42
CORRECT


 50%|█████     | 5/10 [00:05<00:05,  1.02s/it]

k=4: gpt_answer=42  true_answer=42
CORRECT


 60%|██████    | 6/10 [00:06<00:03,  1.00it/s]

k=5: gpt_answer=42  true_answer=42
CORRECT


 70%|███████   | 7/10 [00:07<00:02,  1.03it/s]

k=6: gpt_answer=42  true_answer=42
CORRECT


 80%|████████  | 8/10 [00:07<00:01,  1.05it/s]

k=7: gpt_answer=42  true_answer=42
CORRECT


 90%|█████████ | 9/10 [00:09<00:01,  1.00s/it]

k=8: gpt_answer=42  true_answer=42
CORRECT


100%|██████████| 10/10 [00:09<00:00,  1.00it/s]


k=9: gpt_answer=42  true_answer=42
CORRECT
accuracy=1.00

Question  4


 10%|█         | 1/10 [00:03<00:33,  3.67s/it]

k=0: gpt_answer=624  true_answer=624
CORRECT


 20%|██        | 2/10 [00:05<00:18,  2.33s/it]

k=1: gpt_answer=624  true_answer=624
CORRECT


 30%|███       | 3/10 [00:06<00:12,  1.84s/it]

k=2: gpt_answer=624  true_answer=624
CORRECT


 40%|████      | 4/10 [00:07<00:09,  1.54s/it]

k=3: gpt_answer=312  true_answer=624
INCORRECT


 50%|█████     | 5/10 [00:08<00:07,  1.43s/it]

k=4: gpt_answer=624  true_answer=624
CORRECT


 60%|██████    | 6/10 [00:10<00:05,  1.42s/it]

k=5: gpt_answer=624  true_answer=624
CORRECT


 70%|███████   | 7/10 [00:11<00:04,  1.37s/it]

k=6: gpt_answer=624  true_answer=624
CORRECT


 80%|████████  | 8/10 [00:12<00:02,  1.44s/it]

k=7: gpt_answer=624  true_answer=624
CORRECT


 90%|█████████ | 9/10 [00:15<00:01,  1.69s/it]

k=8: gpt_answer=624  true_answer=624
CORRECT


100%|██████████| 10/10 [00:16<00:00,  1.65s/it]


k=9: gpt_answer=624  true_answer=624
CORRECT
accuracy=0.90

Question  5


 10%|█         | 1/10 [00:02<00:18,  2.08s/it]

k=0: gpt_answer=22  true_answer=35
INCORRECT


 20%|██        | 2/10 [00:04<00:18,  2.33s/it]

k=1: gpt_answer=35  true_answer=35
CORRECT


 30%|███       | 3/10 [00:06<00:15,  2.21s/it]

k=2: gpt_answer=35  true_answer=35
CORRECT


 40%|████      | 4/10 [00:08<00:12,  2.10s/it]

k=3: gpt_answer=35  true_answer=35
CORRECT


 50%|█████     | 5/10 [00:10<00:09,  1.99s/it]

k=4: gpt_answer=35  true_answer=35
CORRECT


 60%|██████    | 6/10 [00:12<00:07,  1.94s/it]

k=5: gpt_answer=35  true_answer=35
CORRECT


 70%|███████   | 7/10 [00:14<00:06,  2.01s/it]

k=6: gpt_answer=35  true_answer=35
CORRECT


 80%|████████  | 8/10 [00:16<00:03,  2.00s/it]

k=7: gpt_answer=22  true_answer=35
INCORRECT


 90%|█████████ | 9/10 [00:18<00:01,  1.97s/it]

k=8: gpt_answer=35  true_answer=35
CORRECT


100%|██████████| 10/10 [00:20<00:00,  2.02s/it]


k=9: gpt_answer=35  true_answer=35
CORRECT
accuracy=0.80

Question  6


 10%|█         | 1/10 [00:01<00:10,  1.15s/it]

k=0: gpt_answer=48  true_answer=48
CORRECT


 20%|██        | 2/10 [00:02<00:09,  1.16s/it]

k=1: gpt_answer=48  true_answer=48
CORRECT


 30%|███       | 3/10 [00:04<00:09,  1.42s/it]

k=2: gpt_answer=48  true_answer=48
CORRECT


 40%|████      | 4/10 [00:05<00:07,  1.31s/it]

k=3: gpt_answer=48  true_answer=48
CORRECT


 50%|█████     | 5/10 [00:06<00:06,  1.22s/it]

k=4: gpt_answer=48  true_answer=48
CORRECT


 60%|██████    | 6/10 [00:08<00:05,  1.43s/it]

k=5: gpt_answer=48  true_answer=48
CORRECT


 70%|███████   | 7/10 [00:09<00:04,  1.33s/it]

k=6: gpt_answer=48  true_answer=48
CORRECT


 80%|████████  | 8/10 [00:10<00:02,  1.32s/it]

k=7: gpt_answer=48  true_answer=48
CORRECT


 90%|█████████ | 9/10 [00:11<00:01,  1.26s/it]

k=8: gpt_answer=48  true_answer=48
CORRECT


100%|██████████| 10/10 [00:12<00:00,  1.27s/it]


k=9: gpt_answer=48  true_answer=48
CORRECT
accuracy=1.00

Question  7


 10%|█         | 1/10 [00:01<00:10,  1.20s/it]

k=0: gpt_answer=16  true_answer=16
CORRECT


 20%|██        | 2/10 [00:03<00:12,  1.58s/it]

k=1: gpt_answer=30  true_answer=16
INCORRECT


 30%|███       | 3/10 [00:04<00:10,  1.48s/it]

k=2: gpt_answer=16  true_answer=16
CORRECT


 40%|████      | 4/10 [00:06<00:10,  1.69s/it]

k=3: gpt_answer=18  true_answer=16
INCORRECT


 50%|█████     | 5/10 [00:08<00:09,  1.80s/it]

k=4: gpt_answer=18  true_answer=16
INCORRECT


 60%|██████    | 6/10 [00:10<00:07,  1.87s/it]

k=5: gpt_answer=30  true_answer=16
INCORRECT


 70%|███████   | 7/10 [00:12<00:05,  1.92s/it]

k=6: gpt_answer=18  true_answer=16
INCORRECT


 80%|████████  | 8/10 [00:13<00:03,  1.77s/it]

k=7: gpt_answer=16  true_answer=16
CORRECT


 90%|█████████ | 9/10 [00:15<00:01,  1.83s/it]

k=8: gpt_answer=18  true_answer=16
INCORRECT


100%|██████████| 10/10 [00:18<00:00,  1.82s/it]


k=9: gpt_answer=1.25  true_answer=16
INCORRECT
accuracy=0.30

Question  8


 10%|█         | 1/10 [00:01<00:15,  1.74s/it]

k=0: gpt_answer=41  true_answer=41
CORRECT


 20%|██        | 2/10 [00:03<00:12,  1.60s/it]

k=1: gpt_answer=41  true_answer=41
CORRECT


 30%|███       | 3/10 [00:05<00:12,  1.82s/it]

k=2: gpt_answer=41.  true_answer=41
CORRECT


 40%|████      | 4/10 [00:06<00:10,  1.75s/it]

k=3: gpt_answer=41.  true_answer=41
CORRECT


 50%|█████     | 5/10 [00:08<00:08,  1.70s/it]

k=4: gpt_answer=41  true_answer=41
CORRECT


 60%|██████    | 6/10 [00:10<00:06,  1.67s/it]

k=5: gpt_answer=41  true_answer=41
CORRECT


 70%|███████   | 7/10 [00:11<00:04,  1.58s/it]

k=6: gpt_answer=57.  true_answer=41
INCORRECT


 80%|████████  | 8/10 [00:13<00:03,  1.56s/it]

k=7: gpt_answer=41  true_answer=41
CORRECT


 90%|█████████ | 9/10 [00:14<00:01,  1.43s/it]

k=8: gpt_answer=41  true_answer=41
CORRECT


100%|██████████| 10/10 [00:15<00:00,  1.59s/it]


k=9: gpt_answer=41  true_answer=41
CORRECT
accuracy=0.90

Question  9


 10%|█         | 1/10 [00:01<00:13,  1.47s/it]

k=0: gpt_answer=990.  true_answer=990.0
CORRECT


 20%|██        | 2/10 [00:03<00:14,  1.85s/it]

k=1: gpt_answer=990  true_answer=990.0
CORRECT


 30%|███       | 3/10 [00:05<00:12,  1.82s/it]

k=2: gpt_answer=990  true_answer=990.0
CORRECT


 40%|████      | 4/10 [00:07<00:11,  1.92s/it]

k=3: gpt_answer=990.00  true_answer=990.0
CORRECT


 50%|█████     | 5/10 [00:08<00:08,  1.77s/it]

k=4: gpt_answer=990  true_answer=990.0
CORRECT


 60%|██████    | 6/10 [00:10<00:07,  1.77s/it]

k=5: gpt_answer=990  true_answer=990.0
CORRECT


 70%|███████   | 7/10 [00:12<00:05,  1.70s/it]

k=6: gpt_answer=990.  true_answer=990.0
CORRECT


 80%|████████  | 8/10 [00:13<00:03,  1.63s/it]

k=7: gpt_answer=990.  true_answer=990.0
CORRECT


 90%|█████████ | 9/10 [00:15<00:01,  1.70s/it]

k=8: gpt_answer=990.  true_answer=990.0
CORRECT


100%|██████████| 10/10 [00:17<00:00,  1.72s/it]


k=9: gpt_answer=990  true_answer=990.0
CORRECT
accuracy=1.00

Question  11


 10%|█         | 1/10 [00:01<00:16,  1.84s/it]

k=0: gpt_answer=5  true_answer=5
CORRECT


 20%|██        | 2/10 [00:04<00:17,  2.18s/it]

k=1: gpt_answer=5  true_answer=5
CORRECT


 30%|███       | 3/10 [00:06<00:15,  2.19s/it]

k=2: gpt_answer=5  true_answer=5
CORRECT


 40%|████      | 4/10 [00:09<00:14,  2.35s/it]

k=3: gpt_answer=2  true_answer=5
INCORRECT


 50%|█████     | 5/10 [00:10<00:10,  2.07s/it]

k=4: gpt_answer=4  true_answer=5
INCORRECT


 60%|██████    | 6/10 [00:13<00:08,  2.22s/it]

k=5: gpt_answer=5  true_answer=5
CORRECT


 70%|███████   | 7/10 [00:14<00:05,  1.97s/it]

k=6: gpt_answer=0  true_answer=5
INCORRECT


 80%|████████  | 8/10 [00:16<00:04,  2.08s/it]

k=7: gpt_answer=0  true_answer=5
INCORRECT


 90%|█████████ | 9/10 [00:18<00:01,  1.90s/it]

k=8: gpt_answer=0  true_answer=5
INCORRECT


100%|██████████| 10/10 [00:21<00:00,  2.16s/it]


k=9: gpt_answer=5  true_answer=5
CORRECT
accuracy=0.50

Question  12


 10%|█         | 1/10 [00:01<00:11,  1.23s/it]

k=0: gpt_answer=85  true_answer=85
CORRECT


 20%|██        | 2/10 [00:02<00:08,  1.00s/it]

k=1: gpt_answer=85  true_answer=85
CORRECT


 30%|███       | 3/10 [00:03<00:07,  1.11s/it]

k=2: gpt_answer=85  true_answer=85
CORRECT


 40%|████      | 4/10 [00:04<00:06,  1.12s/it]

k=3: gpt_answer=85  true_answer=85
CORRECT


 50%|█████     | 5/10 [00:05<00:05,  1.18s/it]

k=4: gpt_answer=85  true_answer=85
CORRECT


 60%|██████    | 6/10 [00:06<00:04,  1.13s/it]

k=5: gpt_answer=85  true_answer=85
CORRECT


 70%|███████   | 7/10 [00:08<00:03,  1.19s/it]

k=6: gpt_answer=85  true_answer=85
CORRECT


 80%|████████  | 8/10 [00:09<00:02,  1.16s/it]

k=7: gpt_answer=85  true_answer=85
CORRECT


 90%|█████████ | 9/10 [00:10<00:01,  1.14s/it]

k=8: gpt_answer=85  true_answer=85
CORRECT


100%|██████████| 10/10 [00:11<00:00,  1.13s/it]


k=9: gpt_answer=85  true_answer=85
CORRECT
accuracy=1.00

Question  13


 10%|█         | 1/10 [00:01<00:13,  1.50s/it]

k=0: gpt_answer=32.50  true_answer=35
INCORRECT


 20%|██        | 2/10 [00:03<00:13,  1.68s/it]

k=1: gpt_answer=32.50  true_answer=35
INCORRECT


 30%|███       | 3/10 [00:04<00:10,  1.56s/it]

k=2: gpt_answer=22.5  true_answer=35
INCORRECT


 40%|████      | 4/10 [00:06<00:09,  1.53s/it]

k=3: gpt_answer=32.50  true_answer=35
INCORRECT


 50%|█████     | 5/10 [00:07<00:07,  1.41s/it]

k=4: gpt_answer=32.50  true_answer=35
INCORRECT


 60%|██████    | 6/10 [00:09<00:06,  1.50s/it]

k=5: gpt_answer=32.50  true_answer=35
INCORRECT


 70%|███████   | 7/10 [00:10<00:04,  1.43s/it]

k=6: gpt_answer=35  true_answer=35
CORRECT


 80%|████████  | 8/10 [00:11<00:02,  1.48s/it]

k=7: gpt_answer=35  true_answer=35
CORRECT


 90%|█████████ | 9/10 [00:13<00:01,  1.47s/it]

k=8: gpt_answer=35  true_answer=35
CORRECT


100%|██████████| 10/10 [00:14<00:00,  1.47s/it]


k=9: gpt_answer=32.50  true_answer=35
INCORRECT
accuracy=0.30

Question  14


 10%|█         | 1/10 [00:01<00:11,  1.32s/it]

k=0: gpt_answer=5  true_answer=5
CORRECT


 20%|██        | 2/10 [00:02<00:09,  1.23s/it]

k=1: gpt_answer=5  true_answer=5
CORRECT


 30%|███       | 3/10 [00:04<00:09,  1.37s/it]

k=2: gpt_answer=5  true_answer=5
CORRECT


 40%|████      | 4/10 [00:05<00:08,  1.35s/it]

k=3: gpt_answer=5  true_answer=5
CORRECT


 50%|█████     | 5/10 [00:06<00:06,  1.39s/it]

k=4: gpt_answer=5  true_answer=5
CORRECT


 60%|██████    | 6/10 [00:07<00:05,  1.31s/it]

k=5: gpt_answer=5  true_answer=5
CORRECT


 70%|███████   | 7/10 [00:09<00:03,  1.26s/it]

k=6: gpt_answer=5.  true_answer=5
CORRECT


 80%|████████  | 8/10 [00:10<00:02,  1.30s/it]

k=7: gpt_answer=5  true_answer=5
CORRECT


 90%|█████████ | 9/10 [00:11<00:01,  1.26s/it]

k=8: gpt_answer=5  true_answer=5
CORRECT


100%|██████████| 10/10 [00:13<00:00,  1.32s/it]


k=9: gpt_answer=5  true_answer=5
CORRECT
accuracy=1.00

Question  15


 10%|█         | 1/10 [00:01<00:13,  1.45s/it]

k=0: gpt_answer=450000.  true_answer=448000.0
INCORRECT


 20%|██        | 2/10 [00:03<00:12,  1.56s/it]

k=1: gpt_answer=450000  true_answer=448000.0
INCORRECT


 30%|███       | 3/10 [00:04<00:10,  1.51s/it]

k=2: gpt_answer=450000.  true_answer=448000.0
INCORRECT


 40%|████      | 4/10 [00:05<00:08,  1.48s/it]

k=3: gpt_answer=450000  true_answer=448000.0
INCORRECT


 50%|█████     | 5/10 [00:07<00:08,  1.62s/it]

k=4: gpt_answer=450000  true_answer=448000.0
INCORRECT


 60%|██████    | 6/10 [00:09<00:06,  1.64s/it]

k=5: gpt_answer=450000.  true_answer=448000.0
INCORRECT


 70%|███████   | 7/10 [00:11<00:05,  1.71s/it]

k=6: gpt_answer=450000  true_answer=448000.0
INCORRECT


 80%|████████  | 8/10 [00:12<00:03,  1.66s/it]

k=7: gpt_answer=450000  true_answer=448000.0
INCORRECT


 90%|█████████ | 9/10 [00:14<00:01,  1.63s/it]

k=8: gpt_answer=450000.  true_answer=448000.0
INCORRECT


100%|██████████| 10/10 [00:16<00:00,  1.60s/it]


k=9: gpt_answer=450000.  true_answer=448000.0
INCORRECT
accuracy=0.00

Question  19


 10%|█         | 1/10 [00:01<00:11,  1.26s/it]

k=0: gpt_answer=16  true_answer=16
CORRECT


 20%|██        | 2/10 [00:02<00:11,  1.46s/it]

k=1: gpt_answer=12  true_answer=16
INCORRECT


 30%|███       | 3/10 [00:04<00:09,  1.35s/it]

k=2: gpt_answer=16  true_answer=16
CORRECT


 40%|████      | 4/10 [00:05<00:07,  1.27s/it]

k=3: gpt_answer=16  true_answer=16
CORRECT


 50%|█████     | 5/10 [00:06<00:06,  1.26s/it]

k=4: gpt_answer=16  true_answer=16
CORRECT


 60%|██████    | 6/10 [00:07<00:05,  1.30s/it]

k=5: gpt_answer=12  true_answer=16
INCORRECT


 70%|███████   | 7/10 [00:09<00:03,  1.28s/it]

k=6: gpt_answer=12  true_answer=16
INCORRECT


 80%|████████  | 8/10 [00:10<00:02,  1.33s/it]

k=7: gpt_answer=16  true_answer=16
CORRECT


 90%|█████████ | 9/10 [00:11<00:01,  1.36s/it]

k=8: gpt_answer=12  true_answer=16
INCORRECT


100%|██████████| 10/10 [00:14<00:00,  1.40s/it]


k=9: gpt_answer=16  true_answer=16
CORRECT
accuracy=0.60

Question  20


 10%|█         | 1/10 [00:01<00:12,  1.42s/it]

k=0: gpt_answer=38  true_answer=38
CORRECT


 20%|██        | 2/10 [00:02<00:11,  1.39s/it]

k=1: gpt_answer=38  true_answer=38
CORRECT


 30%|███       | 3/10 [00:04<00:09,  1.41s/it]

k=2: gpt_answer=38  true_answer=38
CORRECT


 40%|████      | 4/10 [00:05<00:08,  1.48s/it]

k=3: gpt_answer=38  true_answer=38
CORRECT


 50%|█████     | 5/10 [00:07<00:07,  1.46s/it]

k=4: gpt_answer=38  true_answer=38
CORRECT


 60%|██████    | 6/10 [00:08<00:05,  1.49s/it]

k=5: gpt_answer=38  true_answer=38
CORRECT


 70%|███████   | 7/10 [00:10<00:04,  1.53s/it]

k=6: gpt_answer=38  true_answer=38
CORRECT


 80%|████████  | 8/10 [00:12<00:03,  1.71s/it]

k=7: gpt_answer=38  true_answer=38
CORRECT


 90%|█████████ | 9/10 [00:14<00:01,  1.80s/it]

k=8: gpt_answer=38  true_answer=38
CORRECT


100%|██████████| 10/10 [00:16<00:00,  1.61s/it]


k=9: gpt_answer=38  true_answer=38
CORRECT
accuracy=1.00

Question  23


 10%|█         | 1/10 [00:01<00:13,  1.53s/it]

k=0: gpt_answer=5  true_answer=5
CORRECT


 20%|██        | 2/10 [00:03<00:13,  1.66s/it]

k=1: gpt_answer=5  true_answer=5
CORRECT


 30%|███       | 3/10 [00:05<00:11,  1.70s/it]

k=2: gpt_answer=5  true_answer=5
CORRECT


 40%|████      | 4/10 [00:06<00:10,  1.70s/it]

k=3: gpt_answer=5  true_answer=5
CORRECT


 50%|█████     | 5/10 [00:08<00:08,  1.71s/it]

k=4: gpt_answer=5  true_answer=5
CORRECT


 60%|██████    | 6/10 [00:09<00:06,  1.59s/it]

k=5: gpt_answer=5  true_answer=5
CORRECT


 70%|███████   | 7/10 [00:11<00:04,  1.63s/it]

k=6: gpt_answer=5  true_answer=5
CORRECT


 80%|████████  | 8/10 [00:13<00:03,  1.67s/it]

k=7: gpt_answer=5  true_answer=5
CORRECT


 90%|█████████ | 9/10 [00:14<00:01,  1.62s/it]

k=8: gpt_answer=5.00  true_answer=5
CORRECT


100%|██████████| 10/10 [00:16<00:00,  1.66s/it]


k=9: gpt_answer=5  true_answer=5
CORRECT
accuracy=1.00

Question  24


 10%|█         | 1/10 [00:01<00:09,  1.03s/it]

k=0: gpt_answer=62  true_answer=62
CORRECT


 20%|██        | 2/10 [00:02<00:08,  1.02s/it]

k=1: gpt_answer=62  true_answer=62
CORRECT


 30%|███       | 3/10 [00:03<00:07,  1.08s/it]

k=2: gpt_answer=62  true_answer=62
CORRECT


 40%|████      | 4/10 [00:04<00:06,  1.11s/it]

k=3: gpt_answer=62  true_answer=62
CORRECT


 50%|█████     | 5/10 [00:05<00:05,  1.03s/it]

k=4: gpt_answer=62  true_answer=62
CORRECT


 60%|██████    | 6/10 [00:06<00:04,  1.01s/it]

k=5: gpt_answer=62  true_answer=62
CORRECT


 70%|███████   | 7/10 [00:07<00:03,  1.14s/it]

k=6: gpt_answer=62  true_answer=62
CORRECT


 80%|████████  | 8/10 [00:08<00:02,  1.12s/it]

k=7: gpt_answer=62  true_answer=62
CORRECT


 90%|█████████ | 9/10 [00:10<00:01,  1.24s/it]

k=8: gpt_answer=62  true_answer=62
CORRECT


100%|██████████| 10/10 [00:11<00:00,  1.13s/it]


k=9: gpt_answer=62  true_answer=62
CORRECT
accuracy=1.00

Question  25


 10%|█         | 1/10 [00:01<00:10,  1.18s/it]

k=0: gpt_answer=110  true_answer=110
CORRECT


 20%|██        | 2/10 [00:02<00:10,  1.26s/it]

k=1: gpt_answer=110  true_answer=110
CORRECT


 30%|███       | 3/10 [00:03<00:09,  1.30s/it]

k=2: gpt_answer=110  true_answer=110
CORRECT


 40%|████      | 4/10 [00:04<00:07,  1.24s/it]

k=3: gpt_answer=110  true_answer=110
CORRECT


 50%|█████     | 5/10 [00:06<00:06,  1.28s/it]

k=4: gpt_answer=110  true_answer=110
CORRECT


 60%|██████    | 6/10 [00:07<00:05,  1.33s/it]

k=5: gpt_answer=110  true_answer=110
CORRECT


 70%|███████   | 7/10 [00:09<00:04,  1.42s/it]

k=6: gpt_answer=110  true_answer=110
CORRECT


 80%|████████  | 8/10 [00:10<00:02,  1.38s/it]

k=7: gpt_answer=110  true_answer=110
CORRECT


 90%|█████████ | 9/10 [00:11<00:01,  1.32s/it]

k=8: gpt_answer=110  true_answer=110
CORRECT


100%|██████████| 10/10 [00:12<00:00,  1.29s/it]

k=9: gpt_answer=110  true_answer=110
CORRECT
accuracy=1.00

Total Inaccurate:  9



