Code from A3 problem1-prompt.ipynb and https://github.com/SewoongLee/reproduce-llama3-arithmetic/blob/main/llama3-tutorial-gsm8k.ipynb

You need to set your OPENAI_API_KEY environment variable in .env

In [1]:
from openai import OpenAI
from tqdm import tqdm
import textwrap
import dotenv
import os
%load_ext dotenv
%dotenv

client=OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

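# NOTE: the default model in the signature below is gpt-3.5-turbo; pass
# model="gpt-4o-mini" to use gpt-4o-mini, whose pricing at the time of writing was
# $0.150 / 1M input tokens, $0.600 / 1M output tokens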
def generate_gpt_response(prompt, messages=None, model="gpt-3.5-turbo", temperature=0.7):
    """Send ``prompt`` as the next user turn and return the updated conversation.

    Args:
        prompt: Text of the user message to append to the conversation.
        messages: Prior conversation as a list of ``{"role", "content"}``
            dicts. A list passed in is extended in place; when omitted, a
            fresh conversation is started for this call.
        model: Name of the chat model to query.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The conversation list, ending with the new user message followed by
        the assistant's reply.
    """
    # A literal ``[]`` default is created once and shared by every call, which
    # would silently carry history over between unrelated conversations.
    if messages is None:
        messages = []
    messages.append({"role": "user", "content": prompt})

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    content = response.choices[0].message.content
    messages.append({'role': 'assistant', 'content': content})
    return messages


def print_message(message, width=80):
    """Print a chat message as ``role: content``, wrapped to ``width`` columns.

    Each wrapped line is left-justified to ``width`` and framed by a single
    space on either side.
    """
    labelled = ": ".join((message["role"], message["content"]))
    # fill() joins the wrapped lines with newlines; split them apart again so
    # that every line can be padded individually.
    rows = textwrap.fill(labelled, width=width).split('\n')
    print("\n".join(f" {row.ljust(width)} " for row in rows))
    

cannot find .env file


Load the GSM8K dataset. Only use the data points for questions that were ablated in gsm_template.

In [2]:
import os
import re

# Collect the question indices encoded in the template file names
# (files named q<index>.py inside gsm_template/).
q_indices = []
pattern = re.compile(r'q(\d+)\.py')

for fname in os.listdir("gsm_template/"):
    # fullmatch (rather than match) so that names that merely start with the
    # pattern, e.g. "q3.pyc" or "q3.py.bak", are not counted as templates.
    match = pattern.fullmatch(fname)
    if match:
        q_indices.append(int(match.group(1)))

# os.listdir returns entries in arbitrary order; sort for a stable selection.
q_indices.sort()
print(q_indices)
print(len(q_indices))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 19, 20, 23, 24, 25]
20


In [3]:
from datasets import load_dataset

# Load the 'main' configuration of GSM8K through the `datasets` library.
dataset = load_dataset("gsm8k", 'main')

# Keep only the training rows whose index was collected in q_indices.
# (To experiment with a different subset, select a range instead, e.g.
# dataset['train'].select(range(10)).)
train_data = dataset['train'].select(q_indices)

# Show every selected question; the reference answers are not printed.
for row in train_data:
    print('Q:')
    print(row['question'])
    print()


Q:
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?

Q:
Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?

Q:
Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?

Q:
Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?

Q:
James writes a 3-page letter to 2 different friends twice a week.  How many pages does he write a year?

Q:
Mark has a garden with flowers. He planted plants of three different colors in it. Ten of them are yellow, and there are 8

We will use the following system prompt

In [4]:
# Instructions given to the model for every question: solve step by step and
# put the final numeric answer after the "####" delimiter so it can be parsed.
# The pieces are concatenated explicitly so that the trailing spaces and line
# breaks of the prompt are visible.
system_prompt = (
    "You are an expert math tutor specializing in grade-school-level problems like those in the GSM8K dataset. \n"
    "Solve each problem step by step, explaining the reasoning and calculations clearly and concisely to help the student understand the process. \n"
    'Please return your final answer after the delimiter "####" as a numeric number'
)

We can obtain answers from the GPT API using few-shot in-context learning and the system prompt.

In [5]:
import re

def extract_ans_from_response(answer: str, eos=None):
    """Pull the final answer that follows the ``####`` delimiter out of a reply.

    Args:
        answer: Full text of the model's reply.
        eos: Optional end-of-sequence marker; when given, only the text before
            its first occurrence is considered.

    Returns:
        The answer as an ``int`` when the extracted text parses as one,
        otherwise the cleaned text itself. ``"NA"`` is returned when the reply
        contains no ``####`` delimiter.
    """
    text = answer.split(eos)[0].strip() if eos else answer
    pieces = text.split('####')

    if len(pieces) < 2:
        # No delimiter at all.
        candidate = "NA"
    else:
        # Exactly one delimiter: take what follows it. More than one: take the
        # piece just before the last delimiter.
        candidate = pieces[-1 if len(pieces) == 2 else -2].strip()

    # Drop the characters ',', '$', '%' and 'g' in a single pass.
    candidate = candidate.translate(str.maketrans('', '', ',$%g'))

    try:
        return int(candidate)
    except ValueError:
        return candidate

# def get_num_answer(text):
#     num_matches = re.findall(r'\d+\.?\d*', text)

#     if not num_matches:
#         return None
    
#     final_answer = num_matches[-1]

#     return final_answer

# Implement the function answer_from_gpt(question,temperature,num_shots), which should return the gpt answer as a string
# (see how it is called in the function evaluate_fewshots below)
# temperature is the temperature parameter passed to gpt api
# if num_shots>0, you should use the first num_shots examples in train_data for in-context learning
# Please use the system prompt provided above (see lecture10-gpt.ipynb for examples of system prompt usage)
#
def answer_from_gpt(question, temperature, num_shots):
    """Ask the model to solve ``question`` and return its extracted final answer.

    Args:
        question: Text of the problem to solve.
        temperature: Sampling temperature passed on to the API call.
        num_shots: Number of in-context examples; the first ``num_shots`` rows
            of ``train_data`` are replayed as earlier user/assistant turns.

    Returns:
        Whatever ``extract_ans_from_response`` yields for the reply: an ``int``
        when the text after ``####`` parses as one, otherwise the cleaned text
        (``"NA"`` when the reply has no delimiter).
    """
    # The instructions go in a dedicated system message at the start of the
    # conversation instead of being glued onto the question text.
    conversation = [{"role": "system", "content": system_prompt}]

    # Few-shot demonstrations; the loop body never runs when num_shots <= 0.
    for i in range(num_shots):
        shot = train_data[i]
        conversation.append({"role": "user", "content": shot['question']})
        conversation.append({"role": "assistant", "content": shot['answer']})

    messages = generate_gpt_response(prompt=question, messages=conversation, temperature=temperature)
    reply = messages[-1]["content"]
    return extract_ans_from_response(reply)

Evaluate on a small subset of the GSM8K training data (the questions selected above)

In [None]:
# Number of independent model samples drawn for each question.
repeats = 100


def evaluate_fewshots(temperature,num_shots):
    """Sample every question in ``train_data`` ``repeats`` times and report accuracy.

    For each question the per-question accuracy (fraction of samples whose
    numeric answer equals the reference) is printed, followed at the end by
    the number of questions that were not answered correctly on every sample.

    Args:
        temperature: Sampling temperature passed to ``answer_from_gpt``.
        num_shots: Number of in-context examples passed to ``answer_from_gpt``.
    """
    total_inaccurate = 0
    for k in tqdm(range(len(train_data))):
        print("Question ", q_indices[k])
        # Fetch the row and parse the reference once per question rather than
        # on every repeat.
        row = train_data[k]
        # The reference is the last whitespace-separated token of the solution
        # text; strip thousands separators so float() can parse e.g. "1,000".
        true_answer = row['answer'].split()[-1].replace(',', '')

        num_corrects = 0
        for _ in range(repeats):
            gpt_answer = answer_from_gpt(row['question'], temperature=temperature, num_shots=num_shots)
            if gpt_answer == "NA":
                # No delimiter in the reply: counts as incorrect.
                continue
            try:
                if float(gpt_answer) == float(true_answer):
                    num_corrects += 1
            except ValueError:
                # A non-numeric answer (or reference) counts as incorrect.
                pass
        accuracy = num_corrects/repeats
        print(f'accuracy={accuracy:0.2f}')
        print()

        if accuracy < 1:
            total_inaccurate += 1

    print("Total Inaccurate: ", total_inaccurate)
# Baseline configuration: temperature 0 and no in-context examples.
temperature, num_shots = 0, 0

print(f'=== temperature={temperature} num_shots={num_shots} ===')
evaluate_fewshots(temperature, num_shots)

=== temperature=0 num_shots=0 ===


  0%|          | 0/20 [00:00<?, ?it/s]

Question  0
k=0: gpt_answer=72  true_answer=72
CORRECT
k=1: gpt_answer=72  true_answer=72
CORRECT
k=2: gpt_answer=72  true_answer=72
CORRECT
k=3: gpt_answer=72  true_answer=72
CORRECT
k=4: gpt_answer=72  true_answer=72
CORRECT
k=5: gpt_answer=72  true_answer=72
CORRECT
k=6: gpt_answer=72  true_answer=72
CORRECT
k=7: gpt_answer=72  true_answer=72
CORRECT
k=8: gpt_answer=72  true_answer=72
CORRECT
k=9: gpt_answer=72  true_answer=72
CORRECT
k=10: gpt_answer=72  true_answer=72
CORRECT
k=11: gpt_answer=72  true_answer=72
CORRECT
k=12: gpt_answer=72  true_answer=72
CORRECT
k=13: gpt_answer=72  true_answer=72
CORRECT
k=14: gpt_answer=72  true_answer=72
CORRECT
k=15: gpt_answer=72  true_answer=72
CORRECT
k=16: gpt_answer=72  true_answer=72
CORRECT
k=17: gpt_answer=72  true_answer=72
CORRECT
k=18: gpt_answer=72  true_answer=72
CORRECT


  5%|▌         | 1/20 [00:23<07:17, 23.01s/it]

k=19: gpt_answer=72  true_answer=72
CORRECT
accuracy=1.00

Question  1
k=0: gpt_answer=10  true_answer=10
CORRECT
k=1: gpt_answer=9.9996  true_answer=10
INCORRECT
k=2: gpt_answer=10.00  true_answer=10
CORRECT
k=3: gpt_answer=10.0  true_answer=10
CORRECT
k=4: gpt_answer=10.00  true_answer=10
CORRECT
k=5: gpt_answer=9.9996  true_answer=10
INCORRECT
k=6: gpt_answer=10.0  true_answer=10
CORRECT
k=7: gpt_answer=10.00  true_answer=10
CORRECT
k=8: gpt_answer=9.9996  true_answer=10
INCORRECT
k=9: gpt_answer=9.9996  true_answer=10
INCORRECT
k=10: gpt_answer=10  true_answer=10
CORRECT
k=11: gpt_answer=10  true_answer=10
CORRECT
k=12: gpt_answer=10.0  true_answer=10
CORRECT
k=13: gpt_answer=9.9996  true_answer=10
INCORRECT
k=14: gpt_answer=9.9996  true_answer=10
INCORRECT
k=15: gpt_answer=10.  true_answer=10
CORRECT
k=16: gpt_answer=10  true_answer=10
CORRECT
k=17: gpt_answer=10  true_answer=10
CORRECT
k=18: gpt_answer=9.9996  true_answer=10
INCORRECT


 10%|█         | 2/20 [00:49<07:33, 25.21s/it]

k=19: gpt_answer=9.9996  true_answer=10
INCORRECT
accuracy=0.60

Question  2
k=0: gpt_answer=5  true_answer=5
CORRECT
k=1: gpt_answer=5  true_answer=5
CORRECT
k=2: gpt_answer=5  true_answer=5
CORRECT
k=3: gpt_answer=5  true_answer=5
CORRECT
k=4: gpt_answer=5  true_answer=5
CORRECT
k=5: gpt_answer=5  true_answer=5
CORRECT
k=6: gpt_answer=5  true_answer=5
CORRECT
k=7: gpt_answer=5  true_answer=5
CORRECT
k=8: gpt_answer=5  true_answer=5
CORRECT
k=9: gpt_answer=5  true_answer=5
CORRECT
k=10: gpt_answer=5  true_answer=5
CORRECT
k=11: gpt_answer=5  true_answer=5
CORRECT
k=12: gpt_answer=5  true_answer=5
CORRECT
k=13: gpt_answer=5  true_answer=5
CORRECT
k=14: gpt_answer=5  true_answer=5
CORRECT
k=15: gpt_answer=5  true_answer=5
CORRECT
k=16: gpt_answer=5  true_answer=5
CORRECT
k=17: gpt_answer=5  true_answer=5
CORRECT
k=18: gpt_answer=5  true_answer=5
CORRECT


 15%|█▌        | 3/20 [01:16<07:18, 25.81s/it]

k=19: gpt_answer=5  true_answer=5
CORRECT
accuracy=1.00

Question  3
k=0: gpt_answer=42  true_answer=42
CORRECT
k=1: gpt_answer=42  true_answer=42
CORRECT
k=2: gpt_answer=42  true_answer=42
CORRECT
k=3: gpt_answer=42  true_answer=42
CORRECT
k=4: gpt_answer=42  true_answer=42
CORRECT
k=5: gpt_answer=42  true_answer=42
CORRECT
k=6: gpt_answer=42  true_answer=42
CORRECT
k=7: gpt_answer=42  true_answer=42
CORRECT
k=8: gpt_answer=42  true_answer=42
CORRECT
k=9: gpt_answer=42  true_answer=42
CORRECT
k=10: gpt_answer=42  true_answer=42
CORRECT
k=11: gpt_answer=42  true_answer=42
CORRECT
k=12: gpt_answer=42  true_answer=42
CORRECT
k=13: gpt_answer=42  true_answer=42
CORRECT
k=14: gpt_answer=42  true_answer=42
CORRECT
k=15: gpt_answer=42  true_answer=42
CORRECT
k=16: gpt_answer=42  true_answer=42
CORRECT
k=17: gpt_answer=42  true_answer=42
CORRECT
k=18: gpt_answer=42  true_answer=42
CORRECT


 20%|██        | 4/20 [01:35<06:10, 23.18s/it]

k=19: gpt_answer=42  true_answer=42
CORRECT
accuracy=1.00

Question  4
k=0: gpt_answer=624  true_answer=624
CORRECT
k=1: gpt_answer=624  true_answer=624
CORRECT
k=2: gpt_answer=624  true_answer=624
CORRECT
k=3: gpt_answer=624  true_answer=624
CORRECT
k=4: gpt_answer=624  true_answer=624
CORRECT
k=5: gpt_answer=624  true_answer=624
CORRECT
k=6: gpt_answer=312  true_answer=624
INCORRECT
k=7: gpt_answer=624  true_answer=624
CORRECT
k=8: gpt_answer=624  true_answer=624
CORRECT
k=9: gpt_answer=624  true_answer=624
CORRECT
k=10: gpt_answer=624  true_answer=624
CORRECT
k=11: gpt_answer=624  true_answer=624
CORRECT
k=12: gpt_answer=624  true_answer=624
CORRECT
k=13: gpt_answer=624  true_answer=624
CORRECT
k=14: gpt_answer=624  true_answer=624
CORRECT
k=15: gpt_answer=624  true_answer=624
CORRECT
k=16: gpt_answer=624  true_answer=624
CORRECT
k=17: gpt_answer=624  true_answer=624
CORRECT
k=18: gpt_answer=624  true_answer=624
CORRECT


 25%|██▌       | 5/20 [02:01<06:02, 24.16s/it]

k=19: gpt_answer=624  true_answer=624
CORRECT
accuracy=0.95

Question  5
k=0: gpt_answer=22  true_answer=35
INCORRECT
k=1: gpt_answer=35  true_answer=35
CORRECT
k=2: gpt_answer=22  true_answer=35
INCORRECT
k=3: gpt_answer=35  true_answer=35
CORRECT
k=4: gpt_answer=35  true_answer=35
CORRECT
k=5: gpt_answer=35  true_answer=35
CORRECT
k=6: gpt_answer=35  true_answer=35
CORRECT
k=7: gpt_answer=25  true_answer=35
INCORRECT
k=8: gpt_answer=22  true_answer=35
INCORRECT
k=9: gpt_answer=35  true_answer=35
CORRECT
k=10: gpt_answer=35  true_answer=35
CORRECT
k=11: gpt_answer=25  true_answer=35
INCORRECT
k=12: gpt_answer=35  true_answer=35
CORRECT
k=13: gpt_answer=22  true_answer=35
INCORRECT
k=14: gpt_answer=35  true_answer=35
CORRECT
k=15: gpt_answer=22  true_answer=35
INCORRECT
k=16: gpt_answer=22  true_answer=35
INCORRECT
k=17: gpt_answer=22  true_answer=35
INCORRECT
k=18: gpt_answer=35  true_answer=35
CORRECT


 30%|███       | 6/20 [02:38<06:41, 28.69s/it]

k=19: gpt_answer=35  true_answer=35
CORRECT
accuracy=0.55

Question  6
k=0: gpt_answer=48  true_answer=48
CORRECT
k=1: gpt_answer=48  true_answer=48
CORRECT
k=2: gpt_answer=48  true_answer=48
CORRECT
k=3: gpt_answer=48  true_answer=48
CORRECT
k=4: gpt_answer=48  true_answer=48
CORRECT
k=5: gpt_answer=48  true_answer=48
CORRECT
k=6: gpt_answer=48  true_answer=48
CORRECT
k=7: gpt_answer=48  true_answer=48
CORRECT
k=8: gpt_answer=48  true_answer=48
CORRECT
k=9: gpt_answer=48  true_answer=48
CORRECT
k=10: gpt_answer=48  true_answer=48
CORRECT
k=11: gpt_answer=48  true_answer=48
CORRECT
k=12: gpt_answer=48  true_answer=48
CORRECT
k=13: gpt_answer=48  true_answer=48
CORRECT
k=14: gpt_answer=48  true_answer=48
CORRECT
k=15: gpt_answer=48  true_answer=48
CORRECT
k=16: gpt_answer=48  true_answer=48
CORRECT
k=17: gpt_answer=48  true_answer=48
CORRECT
k=18: gpt_answer=48  true_answer=48
CORRECT


 35%|███▌      | 7/20 [03:02<05:49, 26.90s/it]

k=19: gpt_answer=48  true_answer=48
CORRECT
accuracy=1.00

Question  7
k=0: gpt_answer=16  true_answer=16
CORRECT
k=1: gpt_answer=16  true_answer=16
CORRECT
k=2: gpt_answer=16  true_answer=16
CORRECT
k=3: gpt_answer=16  true_answer=16
CORRECT
k=4: gpt_answer=16  true_answer=16
CORRECT
k=5: gpt_answer=30  true_answer=16
INCORRECT
k=6: gpt_answer=16  true_answer=16
CORRECT
k=7: gpt_answer=16  true_answer=16
CORRECT
k=8: gpt_answer=30  true_answer=16
INCORRECT
k=9: gpt_answer=16  true_answer=16
CORRECT
k=10: gpt_answer=8  true_answer=16
INCORRECT
k=11: gpt_answer=16  true_answer=16
CORRECT
k=12: gpt_answer=16  true_answer=16
CORRECT
k=13: gpt_answer=16  true_answer=16
CORRECT
k=14: gpt_answer=16  true_answer=16
CORRECT
k=15: gpt_answer=16  true_answer=16
CORRECT
k=16: gpt_answer=16  true_answer=16
CORRECT
k=17: gpt_answer=16  true_answer=16
CORRECT
k=18: gpt_answer=16  true_answer=16
CORRECT


 40%|████      | 8/20 [03:31<05:32, 27.70s/it]

k=19: gpt_answer=16  true_answer=16
CORRECT
accuracy=0.85

Question  8
k=0: gpt_answer=41  true_answer=41
CORRECT
k=1: gpt_answer=41  true_answer=41
CORRECT
k=2: gpt_answer=41  true_answer=41
CORRECT
k=3: gpt_answer=41  true_answer=41
CORRECT
k=4: gpt_answer=41  true_answer=41
CORRECT
k=5: gpt_answer=41  true_answer=41
CORRECT
k=6: gpt_answer=41  true_answer=41
CORRECT
k=7: gpt_answer=41  true_answer=41
CORRECT
k=8: gpt_answer=41  true_answer=41
CORRECT
k=9: gpt_answer=41  true_answer=41
CORRECT
k=10: gpt_answer=41  true_answer=41
CORRECT
k=11: gpt_answer=41  true_answer=41
CORRECT
k=12: gpt_answer=41  true_answer=41
CORRECT
k=13: gpt_answer=41  true_answer=41
CORRECT
k=14: gpt_answer=41  true_answer=41
CORRECT
k=15: gpt_answer=41  true_answer=41
CORRECT
k=16: gpt_answer=41  true_answer=41
CORRECT
k=17: gpt_answer=41  true_answer=41
CORRECT
k=18: gpt_answer=41  true_answer=41
CORRECT


 45%|████▌     | 9/20 [04:03<05:19, 29.00s/it]

k=19: gpt_answer=41  true_answer=41
CORRECT
accuracy=1.00

Question  9
k=0: gpt_answer=990.00  true_answer=990
CORRECT
k=1: gpt_answer=990.00  true_answer=990
CORRECT
k=2: gpt_answer=990.00  true_answer=990
CORRECT
k=3: gpt_answer=990.00  true_answer=990
CORRECT
k=4: gpt_answer=990.00  true_answer=990
CORRECT
k=5: gpt_answer=990.00  true_answer=990
CORRECT
k=6: gpt_answer=990.00  true_answer=990
CORRECT
k=7: gpt_answer=990.00  true_answer=990
CORRECT
k=8: gpt_answer=990.00  true_answer=990
CORRECT
k=9: gpt_answer=990.00  true_answer=990
CORRECT
k=10: gpt_answer=990.00  true_answer=990
CORRECT
k=11: gpt_answer=990.00  true_answer=990
CORRECT
k=12: gpt_answer=990.00  true_answer=990
CORRECT
k=13: gpt_answer=990.00  true_answer=990
CORRECT
k=14: gpt_answer=990.00  true_answer=990
CORRECT
k=15: gpt_answer=990.00  true_answer=990
CORRECT
k=16: gpt_answer=990.00  true_answer=990
CORRECT
k=17: gpt_answer=990.00  true_answer=990
CORRECT
k=18: gpt_answer=990  true_answer=990
CORRECT


 50%|█████     | 10/20 [04:39<05:12, 31.23s/it]

k=19: gpt_answer=990.00  true_answer=990
CORRECT
accuracy=1.00

Question  11
k=0: gpt_answer=1  true_answer=5
INCORRECT
k=1: gpt_answer=1  true_answer=5
INCORRECT
k=2: gpt_answer=0  true_answer=5
INCORRECT
k=3: gpt_answer=0  true_answer=5
INCORRECT
k=4: gpt_answer=8  true_answer=5
INCORRECT
k=5: gpt_answer=8  true_answer=5
INCORRECT
k=6: gpt_answer=0  true_answer=5
INCORRECT
k=7: gpt_answer=1  true_answer=5
INCORRECT
k=8: gpt_answer=8  true_answer=5
INCORRECT
k=9: gpt_answer=8  true_answer=5
INCORRECT
k=10: gpt_answer=0  true_answer=5
INCORRECT
k=11: gpt_answer=0  true_answer=5
INCORRECT
k=12: gpt_answer=2  true_answer=5
INCORRECT
k=13: gpt_answer=1  true_answer=5
INCORRECT
k=14: gpt_answer=1  true_answer=5
INCORRECT
k=15: gpt_answer=1  true_answer=5
INCORRECT
k=16: gpt_answer=8  true_answer=5
INCORRECT
k=17: gpt_answer=1  true_answer=5
INCORRECT
k=18: gpt_answer=1  true_answer=5
INCORRECT


 55%|█████▌    | 11/20 [05:27<05:26, 36.31s/it]

k=19: gpt_answer=8  true_answer=5
INCORRECT
accuracy=0.00

Question  12
k=0: gpt_answer=85  true_answer=85
CORRECT
k=1: gpt_answer=85  true_answer=85
CORRECT
k=2: gpt_answer=85  true_answer=85
CORRECT
k=3: gpt_answer=85  true_answer=85
CORRECT
k=4: gpt_answer=85  true_answer=85
CORRECT
k=5: gpt_answer=85  true_answer=85
CORRECT
k=6: gpt_answer=85  true_answer=85
CORRECT
k=7: gpt_answer=85  true_answer=85
CORRECT
k=8: gpt_answer=85  true_answer=85
CORRECT
k=9: gpt_answer=85  true_answer=85
CORRECT
k=10: gpt_answer=85  true_answer=85
CORRECT
k=11: gpt_answer=85  true_answer=85
CORRECT
k=12: gpt_answer=85  true_answer=85
CORRECT
k=13: gpt_answer=85  true_answer=85
CORRECT
k=14: gpt_answer=85  true_answer=85
CORRECT
k=15: gpt_answer=85  true_answer=85
CORRECT
k=16: gpt_answer=85  true_answer=85
CORRECT
k=17: gpt_answer=85  true_answer=85
CORRECT
k=18: gpt_answer=85  true_answer=85
CORRECT


 60%|██████    | 12/20 [05:48<04:13, 31.74s/it]

k=19: gpt_answer=85  true_answer=85
CORRECT
accuracy=1.00

Question  13
k=0: gpt_answer=35  true_answer=35
CORRECT
k=1: gpt_answer=32.50  true_answer=35
INCORRECT
k=2: gpt_answer=22.50  true_answer=35
INCORRECT
k=3: gpt_answer=22.50  true_answer=35
INCORRECT
k=4: gpt_answer=22.50  true_answer=35
INCORRECT
k=5: gpt_answer=22.50  true_answer=35
INCORRECT
k=6: gpt_answer=22.50  true_answer=35
INCORRECT
k=7: gpt_answer=22.50  true_answer=35
INCORRECT
k=8: gpt_answer=22.50  true_answer=35
INCORRECT
k=9: gpt_answer=22.50  true_answer=35
INCORRECT
k=10: gpt_answer=35  true_answer=35
CORRECT
k=11: gpt_answer=35.00  true_answer=35
CORRECT
k=12: gpt_answer=35  true_answer=35
CORRECT
k=13: gpt_answer=22.50  true_answer=35
INCORRECT
k=14: gpt_answer=22.50  true_answer=35
INCORRECT
k=15: gpt_answer=22.50  true_answer=35
INCORRECT
k=16: gpt_answer=22.50  true_answer=35
INCORRECT
k=17: gpt_answer=22.50  true_answer=35
INCORRECT
k=18: gpt_answer=35.00  true_answer=35
CORRECT


 65%|██████▌   | 13/20 [06:24<03:49, 32.84s/it]

k=19: gpt_answer=35  true_answer=35
CORRECT
accuracy=0.30

Question  14
k=0: gpt_answer=5  true_answer=5
CORRECT
k=1: gpt_answer=5  true_answer=5
CORRECT
k=2: gpt_answer=5  true_answer=5
CORRECT
k=3: gpt_answer=5  true_answer=5
CORRECT
k=4: gpt_answer=5  true_answer=5
CORRECT
k=5: gpt_answer=5  true_answer=5
CORRECT
k=6: gpt_answer=5  true_answer=5
CORRECT
k=7: gpt_answer=5  true_answer=5
CORRECT
k=8: gpt_answer=5  true_answer=5
CORRECT
k=9: gpt_answer=5  true_answer=5
CORRECT
k=10: gpt_answer=5  true_answer=5
CORRECT
k=11: gpt_answer=5  true_answer=5
CORRECT
k=12: gpt_answer=5  true_answer=5
CORRECT
k=13: gpt_answer=5  true_answer=5
CORRECT
k=14: gpt_answer=5  true_answer=5
CORRECT
k=15: gpt_answer=5  true_answer=5
CORRECT
k=16: gpt_answer=5  true_answer=5
CORRECT
k=17: gpt_answer=5  true_answer=5
CORRECT
k=18: gpt_answer=5  true_answer=5
CORRECT


 70%|███████   | 14/20 [06:52<03:08, 31.45s/it]

k=19: gpt_answer=5  true_answer=5
CORRECT
accuracy=1.00

Question  15
k=0: gpt_answer=150450000.  true_answer=448000
INCORRECT
k=1: gpt_answer=150450000.  true_answer=448000
INCORRECT
k=2: gpt_answer=450000  true_answer=448000
INCORRECT
k=3: gpt_answer=150450000.  true_answer=448000
INCORRECT
k=4: gpt_answer=450000.  true_answer=448000
INCORRECT
k=5: gpt_answer=450000  true_answer=448000
INCORRECT
k=6: gpt_answer=450000.  true_answer=448000
INCORRECT
k=7: gpt_answer=450000  true_answer=448000
INCORRECT
k=8: gpt_answer=450000  true_answer=448000
INCORRECT
k=9: gpt_answer=150450000.  true_answer=448000
INCORRECT
k=10: gpt_answer=450000.  true_answer=448000
INCORRECT
k=11: gpt_answer=150450000.  true_answer=448000
INCORRECT
k=12: gpt_answer=150450000.  true_answer=448000
INCORRECT
k=13: gpt_answer=450000  true_answer=448000
INCORRECT
k=14: gpt_answer=450000  true_answer=448000
INCORRECT
k=15: gpt_answer=150450000.  true_answer=448000
INCORRECT
k=16: gpt_answer=150450000.  true_answer=4480

 75%|███████▌  | 15/20 [07:29<02:46, 33.21s/it]

k=19: gpt_answer=450000.  true_answer=448000
INCORRECT
accuracy=0.00

Question  19
k=0: gpt_answer=16  true_answer=16
CORRECT
k=1: gpt_answer=16  true_answer=16
CORRECT
k=2: gpt_answer=16  true_answer=16
CORRECT
k=3: gpt_answer=16  true_answer=16
CORRECT
k=4: gpt_answer=16  true_answer=16
CORRECT
k=5: gpt_answer=16  true_answer=16
CORRECT
k=6: gpt_answer=16  true_answer=16
CORRECT
k=7: gpt_answer=16  true_answer=16
CORRECT
k=8: gpt_answer=16  true_answer=16
CORRECT
k=9: gpt_answer=16  true_answer=16
CORRECT
k=10: gpt_answer=16  true_answer=16
CORRECT
k=11: gpt_answer=16  true_answer=16
CORRECT
k=12: gpt_answer=16  true_answer=16
CORRECT
k=13: gpt_answer=16  true_answer=16
CORRECT
k=14: gpt_answer=16  true_answer=16
CORRECT
k=15: gpt_answer=16  true_answer=16
CORRECT
k=16: gpt_answer=16  true_answer=16
CORRECT
k=17: gpt_answer=16  true_answer=16
CORRECT
k=18: gpt_answer=16  true_answer=16
CORRECT


 80%|████████  | 16/20 [07:57<02:06, 31.58s/it]

k=19: gpt_answer=16  true_answer=16
CORRECT
accuracy=1.00

Question  20
k=0: gpt_answer=38  true_answer=38
CORRECT
k=1: gpt_answer=38  true_answer=38
CORRECT
k=2: gpt_answer=38  true_answer=38
CORRECT
k=3: gpt_answer=38  true_answer=38
CORRECT
k=4: gpt_answer=38  true_answer=38
CORRECT
k=5: gpt_answer=38  true_answer=38
CORRECT
k=6: gpt_answer=38  true_answer=38
CORRECT
k=7: gpt_answer=38  true_answer=38
CORRECT
k=8: gpt_answer=38  true_answer=38
CORRECT
k=9: gpt_answer=38  true_answer=38
CORRECT
k=10: gpt_answer=38  true_answer=38
CORRECT
k=11: gpt_answer=38  true_answer=38
CORRECT
k=12: gpt_answer=38  true_answer=38
CORRECT
k=13: gpt_answer=38  true_answer=38
CORRECT
k=14: gpt_answer=38  true_answer=38
CORRECT
k=15: gpt_answer=38  true_answer=38
CORRECT
k=16: gpt_answer=38  true_answer=38
CORRECT
k=17: gpt_answer=38  true_answer=38
CORRECT
k=18: gpt_answer=38  true_answer=38
CORRECT


 85%|████████▌ | 17/20 [08:31<01:36, 32.23s/it]

k=19: gpt_answer=38  true_answer=38
CORRECT
accuracy=1.00

Question  23
k=0: gpt_answer=5  true_answer=5
CORRECT
k=1: gpt_answer=5  true_answer=5
CORRECT
k=2: gpt_answer=5  true_answer=5
CORRECT
k=3: gpt_answer=5  true_answer=5
CORRECT
k=4: gpt_answer=5  true_answer=5
CORRECT
k=5: gpt_answer=5  true_answer=5
CORRECT
k=6: gpt_answer=5  true_answer=5
CORRECT
k=7: gpt_answer=5.  true_answer=5
CORRECT
k=8: gpt_answer=5  true_answer=5
CORRECT
k=9: gpt_answer=5  true_answer=5
CORRECT
k=10: gpt_answer=5  true_answer=5
CORRECT
k=11: gpt_answer=5  true_answer=5
CORRECT
k=12: gpt_answer=5  true_answer=5
CORRECT
k=13: gpt_answer=5  true_answer=5
CORRECT
k=14: gpt_answer=5  true_answer=5
CORRECT
k=15: gpt_answer=5  true_answer=5
CORRECT
k=16: gpt_answer=5  true_answer=5
CORRECT
k=17: gpt_answer=5  true_answer=5
CORRECT
k=18: gpt_answer=5.  true_answer=5
CORRECT


 90%|█████████ | 18/20 [09:05<01:05, 32.93s/it]

k=19: gpt_answer=5  true_answer=5
CORRECT
accuracy=1.00

Question  24
k=0: gpt_answer=62  true_answer=62
CORRECT
k=1: gpt_answer=62  true_answer=62
CORRECT
k=2: gpt_answer=62  true_answer=62
CORRECT
k=3: gpt_answer=62  true_answer=62
CORRECT
k=4: gpt_answer=62  true_answer=62
CORRECT
k=5: gpt_answer=62  true_answer=62
CORRECT
k=6: gpt_answer=62  true_answer=62
CORRECT
k=7: gpt_answer=62  true_answer=62
CORRECT
k=8: gpt_answer=62  true_answer=62
CORRECT
k=9: gpt_answer=62  true_answer=62
CORRECT
k=10: gpt_answer=62  true_answer=62
CORRECT
k=11: gpt_answer=62  true_answer=62
CORRECT
k=12: gpt_answer=62  true_answer=62
CORRECT
k=13: gpt_answer=62  true_answer=62
CORRECT
k=14: gpt_answer=62  true_answer=62
CORRECT
k=15: gpt_answer=62  true_answer=62
CORRECT
k=16: gpt_answer=62  true_answer=62
CORRECT
k=17: gpt_answer=62  true_answer=62
CORRECT
k=18: gpt_answer=62  true_answer=62
CORRECT


 95%|█████████▌| 19/20 [09:29<00:30, 30.32s/it]

k=19: gpt_answer=62  true_answer=62
CORRECT
accuracy=1.00

Question  25
k=0: gpt_answer=110  true_answer=110
CORRECT
k=1: gpt_answer=110  true_answer=110
CORRECT
k=2: gpt_answer=110  true_answer=110
CORRECT
k=3: gpt_answer=110  true_answer=110
CORRECT
k=4: gpt_answer=110  true_answer=110
CORRECT
k=5: gpt_answer=110  true_answer=110
CORRECT
k=6: gpt_answer=110  true_answer=110
CORRECT
k=7: gpt_answer=110  true_answer=110
CORRECT
k=8: gpt_answer=110  true_answer=110
CORRECT
k=9: gpt_answer=110  true_answer=110
CORRECT
k=10: gpt_answer=110  true_answer=110
CORRECT
k=11: gpt_answer=110  true_answer=110
CORRECT
k=12: gpt_answer=110  true_answer=110
CORRECT
k=13: gpt_answer=110  true_answer=110
CORRECT
k=14: gpt_answer=110  true_answer=110
CORRECT
k=15: gpt_answer=110  true_answer=110
CORRECT
k=16: gpt_answer=110  true_answer=110
CORRECT
k=17: gpt_answer=110  true_answer=110
CORRECT
k=18: gpt_answer=110  true_answer=110
CORRECT


100%|██████████| 20/20 [10:01<00:00, 30.09s/it]

k=19: gpt_answer=110  true_answer=110
CORRECT
accuracy=1.00

Total Inaccurate:  7



