In [1]:
# Load data
import util
import random
import numpy as np


# util.load_data() returns the loaded datasets together with `datakey`
# (presumably a lookup describing what each index holds -- confirm in util).
dataset, datakey = util.load_data()
# TODO: split data into named train/eval sets.
# According to the load log recorded in this cell's output, the indices are:
#   0 = WP-train.npy, 1 = SP_eval_data_for_practice.npy,
#   2 = SP-train.npy, 3 = WP_eval_data_for_practice.npy
# NOTE(review): verify the order is stable across runs before hard-coding it.
# wp_train = dataset[0]
# sp_eval = dataset[1]
# sp_train = dataset[2]
# wp_eval = dataset[3]

Loaded WP-train.npy at index 0
Loaded SP_eval_data_for_practice.npy at index 1
Loaded SP-train.npy at index 2
Loaded WP_eval_data_for_practice.npy at index 3


In [None]:
# Initialize GPT Model
import keys
import os
import random
from openai import OpenAI
from util import avg_f1_score

# Export the API key from the local `keys` module (kept outside the notebook so
# the secret itself is never written into a cell) before creating the client.
os.environ['OPENAI_API_KEY'] = keys.WX
# Module-level client shared by the test functions defined in this cell.
client = OpenAI()
# Explicitly set the key on the client from the environment variable set above.
# NOTE(review): OpenAI() presumably already reads OPENAI_API_KEY from the
# environment, which would make this assignment redundant -- confirm.
client.api_key = os.getenv('OPENAI_API_KEY')

# Model settings: chat model name plus the prompt fragments used to build requests.
MODEL = 'gpt-3.5-turbo'

# System message used for zero-shot runs (no in-context examples).
SYSTEM_PROMPT = dict(
    role='system',
    content=(
        'You are a Question Answering Model, your response must be a number '
        'from the choices that are delimited by the symbol ";" .'
    ),
)

# Header of the few-shot system message; worked examples are appended after it.
MULTI_PREFIX = (
    'You are a Question Answering Model, your response must be a number '
    'from the choices that are delimited by the symbol ";". '
    'Here are some examples: \n'
)

# Per-question instruction for the SP dataset.
SP_QUESTION = (
    'Think outside of the box and respond with the number corresponding '
    'to the best choice for the following question.\n\n'
    'Question: '
)

# Per-question instruction for the WP dataset.
WP_QUESTION = (
    'For the following word problem, look at the meaning and letters in the words '
    'and respond with the number corresponding to the best choice.\n\n'
    'Question: '
)


# Testing functions
def generate_prompt(question, prefix):
    """
    Build the user chat message for one multiple-choice question.

    question is a dict with a "question" string and a "choice_list" of at
    least four strings; prefix is the instruction text placed before the
    quoted question.
    Returns a {"role": "user", "content": ...} dict whose content lists the
    first four choices as "<index> = <choice>; " and ends with "Response: ".
    """
    options = question["choice_list"]
    pieces = []
    for idx in range(4):
        pieces.extend((str(idx), " = ", options[idx], "; "))
    numbered = "".join(pieces)
    text = "".join((prefix, "\"", question["question"], "\"\nChoices: ", numbered, "\nResponse: "))
    return dict(role="user", content=text)


def multishot_prompt(training, prefix):
    """
    Build the few-shot system message from solved training examples.

    training is a sequence of dicts, each with a "question" string, a
    "choice_list" of at least four strings and a "label" (the index of the
    correct choice); prefix is the instruction text the examples follow.
    Returns a {"role": "system", "content": ...} dict. An empty training
    sequence yields a message containing only the prefix.
    """
    parts = [prefix]
    for example in training:
        choices = "".join(str(j) + " = " + example["choice_list"][j] + "; " for j in range(4))
        parts.append(
            "Question: " + "\"" + example["question"] + "\"\nChoices: " + choices
            # Labels are integer indices elsewhere in this file (they are wrapped
            # in str() when printed), so convert before concatenating.
            + "\nResponse: " + str(example["label"]) + "\n"
        )
    return {"role": "system", "content": "".join(parts)}


def _parse_choice(raw, classes):
    """Return the class index encoded in raw, or None if it is not a valid choice."""
    try:
        value = int(raw)
    except (TypeError, ValueError):
        return None
    return value if value in classes else None


def run_test_nofinetune(dataset,
            question_prompt,
            n=20,
            training_data=None,
            m=30,
            model=MODEL,
            system_prompt=SYSTEM_PROMPT,
            multi_prefix=MULTI_PREFIX,
            classes=[0, 1, 2, 3],):
    """
    Query the (non-finetuned) chat model on random questions and score it.

    dataset: sequence of dicts, each with "question", "choice_list",
        "label" and "answer"; it is not modified (a shuffled copy is used)
    question_prompt: instruction text placed before each question
    n=20: number of questions to test; capped at len(dataset)
    training_data: None for zero-shot, or a sequence of solved examples
        used as in-context demonstrations (not modified either)
    m=30: number of training examples to put in the few-shot prompt
    model: chat model name passed to the API
    system_prompt: system message used when training_data is None
    multi_prefix: header of the few-shot system message
    classes: valid answer indices (the default list is never mutated here)

    No finetuning means some outputs may not be ints; any response that is
    not one of `classes` is recorded as an incorrect answer.
    Returns the value of avg_f1_score(answers, labels, classes).
    """
    answers = []
    labels = []
    # Shuffle copies so the caller's data keeps its order between runs.
    questions = list(dataset)
    random.shuffle(questions)
    if training_data is not None:
        examples = list(training_data)
        random.shuffle(examples)
        # The few-shot context is identical for every question: build it once.
        context = multishot_prompt(examples[:m], multi_prefix)
    else:
        context = system_prompt
    # Run test
    for question in questions[:max(n, 0)]:
        user_prompt = generate_prompt(question, question_prompt)
        response = client.chat.completions.create(model=model, messages=[context, user_prompt])
        raw = response.choices[0].message.content
        label = question["label"]
        choice = _parse_choice(raw, classes)
        if choice is None:
            chosen_text = "(invalid response)"
            # Score unusable output as a wrong answer: pick any class that
            # differs from the true label (-1 only if no such class exists).
            choice = next((c for c in classes if c != label), -1)
        else:
            chosen_text = question["choice_list"][choice]
        answers.append(choice)
        labels.append(label)
        print(user_prompt["content"] + str(raw) + " " + chosen_text)
        print("Correct Answer: " + str(label) + " " + question["answer"] + "\n")
    # Get F1
    f1 = avg_f1_score(answers, labels, classes)
    return f1