In [5]:
# Imports
from openai import OpenAI

import os
import json

from utils import constants
from utils import util
from utils import keys
from utils import prompts

In [7]:
# Load data and Initialize OpenAI
os.environ['OPENAI_API_KEY'] = keys.WX
client = OpenAI()
client.api_key = os.getenv('OPENAI_API_KEY')

with open("../data/new_test_data_nolabel/SP_new_test_corrected.json") as f:
    data = f.read()
data = json.loads(data)

{'question': 'In a small village, two farmers are working in their fields - a diligent farmer and a lazy farmer. The hardworking farmer is the son of the lazy farmer, but the lazy farmer is not the father of the hardworking farmer. Can you explain this unusual relationship?', 'choice_list': ['The lazy farmer is his mother.', 'The lazy farmer is not a responsible father as he is lazy.', 'The diligent farmer devoted himself to the farm and gradually forgot his father.', 'None of above.']}


In [9]:
# Model settings
MODEL = "gpt-3.5-turbo"
SYSTEM_PROMPT = {"role": "system", 
                 "content": "You are a Question Answering Model, your response must be a number from the choices that are delimited by the symbol \";\" ."}
MULTI_PREFIX = "You are a Question Answering Model, your response must be a number from the choices that are delimited by the symbol \";\". Here are some examples: \n"
SP_QUESTION = "Think outside of the box and respond with the number corresponding to the best choice for the following question.\nQuestion: "
WP_QUESTION = "For the following word problem, look at the meaning and letters in the words and respond with the number corresponding to the best choice.\nQuestion: "

In [10]:
# Eval Multishot GPT3.5-TURBO
preds = util.run_eval_cot(client, MODEL, data, prompts.CHAIN_SP_BASE, prompts.CHAIN_SYSTEM, MULTI_PREFIX, m=0, n=len(data))

In [8]:
util.submission_log(preds, "answer_sen")

Saved submission to /Users/alvinchen/Documents/GitHub/brainteaser-data/submission/answer_sen.txt


In [98]:
import re

with open("test_logs/gpt-3.5-turbo_eval_2023-12-18_21-48-05.log", "r") as f:
    raw = f.readlines()
lines = raw[30:]

In [99]:
# Copy answers and ids
ans = {}
for i in range(len(lines)-1):
    m = re.search(r"Question [0-9]+:", lines[i])
    if m is not None:
        n = m.group(0)
        id = int(re.search(r"[0-9]+", n).group(0))
        an = re.search(r"answer is [0-3]", lines[i+1])
        if an is not None:
            an = int(an.group(0)[-1])
            ans[id] = an

In [100]:
notin = []
for i in range(1, 120):
    if i not in ans:
        notin.append(i)
print(notin)

[3, 23, 26, 28, 31, 34, 38, 58, 59, 92, 95, 114]


In [18]:
repeatid = []
test_questions = []
order = []
for data in sp_test:
    test_questions.append(data["question"])
for data in sp_all:
    if data["question"] in test_questions:
        repeatid.append(data["id"])
        m = [i for i in range(len(test_questions)) if test_questions[i] == data["question"]][0]
        order.append(m)
print(order)

[38, 47, 98, 39, 5, 23, 3, 102, 51, 28, 50, 77, 1, 24, 88, 94, 71, 85, 11, 111, 93, 86, 119, 66, 104, 108, 61, 60, 116, 73, 59, 110, 52, 72, 36, 17, 92, 21, 69, 62, 113, 107, 90, 81, 112, 35, 16, 57, 19, 40, 43, 70, 46, 79, 13, 83, 64, 18, 49, 27, 58, 115, 31, 33, 2, 109, 6, 9, 8, 97, 84, 41, 89, 10, 101, 91, 53, 106, 55, 117, 82, 99, 37, 56, 87, 105, 44, 100, 67, 22, 20, 15, 12, 103, 75, 4, 7, 14, 29, 34, 32, 42, 65, 95, 25, 96, 114, 30, 54, 118, 63, 76, 0, 26, 74, 78, 45, 80, 68, 48]


In [19]:
correct = []
label = []
choices = []
for i in order:
    id = repeatid[i]
    for data in sp_all:
        if data["id"] == id:
            correct.append(data["answer"])
            label.append(data["label"])
            choices.append("".join(str(j) + " = " + data["choice_list"][j] + "; " for j in range(4)))

In [105]:
with open("../submission/answer_sen.txt", "r") as f:
    answers = f.readlines()

with open("chain_of_thought_output.txt", "w") as f:
    for r in raw[:29]:
        f.write(r)
    for i in range(120):
        f.write(lines[i*3])
        f.write("Choices: " + choices[i] + "\n")
        f.write(lines[i*3+1])
        answer = answers[i].strip()
        f.write(f"Corresponding Answer: {str(answer)} = {sp_test[i]['choice_list'][int(answers[i])]}\n")
        f.write(f"Correct Answer: {label[i]} = {correct[i]}\n")
        f.write("\n")

In [3]:
wrepeatid = []
wtest_questions = []
worder = []
for data in wp_test:
    wtest_questions.append(data["question"])
for data in wp_all:
    if data["question"] in wtest_questions:
        wrepeatid.append(data["id"])
        m = [i for i in range(len(wtest_questions)) if wtest_questions[i] == data["question"]][0]
        worder.append(m)
print(worder)

[33, 34, 35, 27, 28, 29, 63, 64, 65, 48, 49, 50, 84, 85, 86, 96, 97, 98, 39, 40, 41, 36, 37, 38, 72, 73, 74, 6, 7, 8, 114, 115, 116, 21, 22, 23, 24, 25, 26, 78, 79, 80, 30, 31, 32, 18, 19, 20, 105, 106, 107, 57, 58, 59, 3, 4, 5, 75, 76, 77, 0, 1, 2, 54, 55, 56, 90, 91, 92, 93, 94, 95, 51, 52, 53, 45, 46, 47, 108, 109, 110, 69, 70, 71, 15, 16, 17, 42, 43, 44, 12, 13, 14, 81, 82, 83, 111, 112, 113, 9, 9, 11, 60, 61, 62, 87, 88, 89, 117, 118, 119, 102, 103, 104, 66, 66, 68, 99, 100, 101]


In [21]:
# Eval Multishot GPT4
preds = util.run_eval_cot(client,"gpt-4", data, prompts.CHAIN_SP_BASE, prompts.CHAIN_SYSTEM, MULTI_PREFIX, m=0, n=len(data))

In [29]:
import re

with open("test_logs/gpt-4_eval_2024-01-30_18-17-18.log", "r") as f:
    raw = f.readlines()
lines = raw[30:]
# Copy answers and ids
ans = {}
for i in range(len(lines)-1):
    m = re.search(r"Question [0-9]+:", lines[i])
    if m is not None:
        n = m.group(0)
        id = int(re.search(r"[0-9]+", n).group(0))
        an = re.search(r"answer is [0-3]", lines[i+1])
        if an is not None:
            an = int(an.group(0)[-1])
            ans[id] = an
notin = []
for i in range(1, 120):
    if i not in ans:
        notin.append(i)
print(notin)
print(ans)

[67, 89]
{1: 0, 2: 2, 3: 2, 4: 1, 5: 3, 6: 0, 7: 0, 8: 1, 9: 3, 10: 1, 11: 1, 12: 3, 13: 2, 14: 0, 15: 1, 16: 1, 17: 0, 18: 2, 19: 2, 20: 0, 21: 0, 22: 0, 23: 1, 24: 0, 25: 1, 26: 0, 27: 0, 28: 2, 29: 0, 30: 2, 31: 3, 32: 2, 33: 2, 34: 2, 35: 3, 36: 2, 37: 3, 38: 0, 39: 0, 40: 0, 41: 1, 42: 3, 43: 0, 44: 0, 45: 2, 46: 2, 47: 2, 48: 2, 49: 0, 50: 0, 51: 0, 52: 0, 53: 3, 54: 1, 55: 2, 56: 0, 57: 1, 58: 1, 59: 1, 60: 1, 61: 3, 62: 1, 63: 1, 64: 1, 65: 1, 66: 2, 68: 1, 69: 2, 70: 0, 71: 3, 72: 0, 73: 3, 74: 2, 75: 1, 76: 2, 77: 0, 78: 1, 79: 2, 80: 0, 81: 3, 82: 2, 83: 3, 84: 3, 85: 3, 86: 1, 87: 2, 88: 3, 90: 2, 91: 1, 92: 0, 93: 3, 94: 0, 95: 2, 96: 3, 97: 1, 98: 2, 99: 0, 100: 2, 101: 1, 102: 3, 103: 0, 104: 2, 105: 2, 106: 2, 107: 3, 108: 1, 109: 2, 110: 1, 111: 1, 112: 1, 113: 1, 114: 3, 115: 2, 116: 1, 117: 2, 118: 1, 119: 1, 120: 2}


In [34]:
with open("../submission/answer_sen.txt", "w") as f:
    for i in range(1, 121):
        if i in ans:
            f.write(str(ans[i]) + "\n")
        else:
            f.write("\n")

In [33]:
with open("../submission/answer_sen.txt", "r") as f:
    answers = f.readlines()

with open("chain_of_thought_gpt4.txt", "w") as f:
    for r in raw[:29]:
        f.write(r)
    for i in range(120):
        f.write(lines[i*3])
        f.write("Choices: " + choices[i] + "\n")
        f.write(lines[i*3+1])
        answer = answers[i].strip()
        f.write(f"Corresponding Answer: {str(answer)} = {sp_test[i]['choice_list'][int(answers[i])]}\n")
        f.write(f"Correct Answer: {label[i]} = {correct[i]}\n")
        f.write("\n")

TypeError: can only concatenate str (not "int") to str

In [16]:
# Eval Zeroshot GPT3.5-TURBO
preds = util.run_eval_cot(client, MODEL, sp_test, prompts.ZERO_SP_BASE, prompts.CHAIN_SYSTEM, MULTI_PREFIX, m=0, n=len(sp_test))

In [32]:
import re

with open("test_logs/gpt-3.5-turbo_eval_2023-12-19_20-02-28.log", "r") as f:
    raw = f.readlines()
lines = raw[30:]
# Copy answers and ids
ans = {}
for i in range(len(lines)-1):
    m = re.search(r"Question [0-9]+:", lines[i])
    if m is not None:
        n = m.group(0)
        id = int(re.search(r"[0-9]+", n).group(0))
        an = re.findall(r"[^0-9][0-3][^0-9]", lines[i+1])
        if len(an) > 0:
            an = [int(i[1]) for i in an]
            ans[id] = an
notin = []
for i in range(1, 120):
    if i not in ans:
        notin.append(i)
print(notin)
print(ans)

[1, 2, 3, 4, 5, 6, 7, 33, 43, 72, 86, 93, 100, 107, 112]
{8: [2], 9: [2], 10: [2], 11: [2], 12: [0], 13: [1], 14: [3], 15: [0], 16: [0], 17: [1], 18: [2], 19: [0], 20: [2], 21: [2], 22: [0], 23: [0], 24: [0], 25: [2], 26: [0], 27: [2], 28: [3], 29: [0], 30: [1], 31: [2], 32: [3], 34: [3, 1], 35: [0], 36: [3], 37: [2], 38: [2], 39: [1], 40: [2], 41: [1], 42: [0], 44: [0], 45: [3], 46: [0], 47: [1], 48: [2], 49: [2], 50: [2], 51: [1], 52: [1], 53: [1], 54: [3], 55: [2], 56: [2], 57: [0], 58: [0], 59: [3], 60: [1], 61: [2], 62: [1], 63: [0], 64: [0], 65: [3], 66: [1], 67: [0], 68: [2], 69: [1], 70: [3], 71: [2], 73: [0], 74: [2], 75: [2], 76: [2], 77: [3], 78: [2], 79: [2], 80: [0, 0], 81: [2], 82: [3], 83: [2], 84: [3], 85: [2], 87: [3, 0, 1, 2], 88: [2], 89: [2], 90: [2], 91: [3], 92: [3], 94: [0], 95: [0], 96: [1], 97: [3], 98: [2], 99: [3], 101: [3], 102: [2], 103: [2], 104: [3, 3], 105: [0], 106: [2], 108: [0], 109: [0], 110: [2], 111: [0], 113: [3], 114: [2], 115: [3], 116: [3], 117

In [34]:
with open("../submission/answer_sen.txt", "w") as f:
    for i in range(1, 121):
        if i in ans:
            if len(ans[i]) == 1:
                f.write(str(ans[i][0]) + "\n")
            else:
                f.write("\n")
        else:
            f.write("\n")

In [36]:
# Accuracy testing
with open("../submission/answer_sen.txt", "r") as f:
    answers = f.readlines()
with open("../data/new_test_data_nolabel/sp_choices.txt", "r") as f:
    choices = f.readlines()

for i in range(len(answers)):
    answers[i] = int(answers[i].strip())
    choices[i] = int(choices[i].strip())

acc = 0
for i in range(len(answers)):
    if answers[i] == choices[i]:
        acc += 1
print(acc/len(answers))

0.8416666666666667
