In [2]:
# Change to the project directory and load the API key.
import os

# The project root is machine-specific. It can be overridden through the
# CSC2542_PROJECT_DIR environment variable; the original location stays the
# default so existing setups keep working unchanged.
PROJECT_DIR = os.environ.get("CSC2542_PROJECT_DIR", "/w/246/ikozlov/csc2542-project/")
os.chdir(PROJECT_DIR)

from dotenv import load_dotenv
load_dotenv()  # This loads the environment variables from a .env file.

# Create the OpenAI client used by every later cell.
# NOTE(review): presumably the client picks up the API key from the
# environment populated by load_dotenv() above -- confirm against the
# openai package documentation.
from openai import OpenAI
client = OpenAI()

# Define Function for Calling the Model with a Tool (Function Calling)

In [80]:
from epistemic_logic import is_entailment
import json

# Tool schema handed to the chat completions call in `gpt_sym` (tools=tools).
# It declares a single function, "is_entailment", taking two required string
# arguments ("premise" and "hypothesis") written in the $$...$$ epistemic
# logic notation explained in the description below.
#
# NOTE(review): the description strings are triple-quoted, so their embedded
# newlines and indentation are part of the text sent to the model. They also
# contain typos ("usd", "encapsulted", "believes the Bob"). They are left
# untouched here so the prompt stays identical to the one used for recorded
# runs -- fix deliberately if the experiments are re-run.
tools = [
        {
            "type": "function",
            "function": {
                # Must match the name of the Python function that gpt_sym
                # invokes with the parsed arguments.
                "name": "is_entailment",
                "description": '''Get as output the words 'entailment' or 'non-entailment' 
                                to find out whether the given hypothesis is entailed or not entailed by the premise. 
                                Note that premise and hypothesis need to be converted to symbolic epistemic logic, where 
                                $$B(Agent,p)$$ is used to denote that Agent believes p and $$K(Agent,p)$$ to denote that Agent knows p and 
                                p ^ q is usd to denote p and q. For example the sentence "Anna believes the Bob knows that the sky is blue" 
                                may be written in symbolic epistemic logic by letting p be "Sky is Blue" and then writing the sentence as 
                                "$$B(Anna, K(Bob, p))$$"''',
                # JSON-schema style description of the function arguments.
                "parameters": {
                    "type": "object",
                    "properties": {
                        "premise": {
                            "type": "string",
                            "description": '''The premise of the problem in epistemic logic format encapsulted by the symbols $$, 
                            e.g. $$K(Anna, B(Bob, p))$$''',
                        },
                        "hypothesis": {
                            "type": "string",
                            "description": '''The hypothesis in the given problem in epistemic logic format, 
                            e.g. $$B(Marry, p)$$''',
                        },
                    },
                    # Both arguments are declared mandatory.
                    "required": ["premise", "hypothesis"],
                },
            },
        }
    ]

def _string_arg(function_args, name):
    """Return the tool-call argument `name`, raising TypeError unless it is a string."""
    value = function_args.get(name)
    if not isinstance(value, str):
        raise TypeError(f"tool call argument {name!r} is missing or not a string")
    return value


def gpt_sym(model, messages, prompt):
    """Query a chat model that may answer through the `is_entailment` tool.

    The request is sent with the module-level `tools` schema and
    `tool_choice="auto"`. When the model answers with tool calls, each call's
    JSON arguments are parsed and passed to `is_entailment`; the results are
    concatenated into one summary string. Otherwise the plain message content
    is returned.

    Args:
        model: Model identifier forwarded to the chat completions endpoint.
        messages: Chat messages forwarded unchanged.
        prompt: Unused; kept so existing callers keep working.

    Returns:
        A tuple `(model_output, tool_calls, function_call)`:
        - model_output: the summary string built from the tool calls, or the
          message content when no tool was called;
        - tool_calls: the raw `tool_calls` attribute of the response message;
        - function_call: True when at least one tool call was made.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        tools=tools,
        tool_choice="auto",
    )

    response_message = completion.choices[0].message
    tool_calls = response_message.tool_calls

    if not tool_calls:
        return response_message.content, tool_calls, False

    pieces = []
    for tool_call in tool_calls:
        premise = ""
        hypothesis = ""
        try:
            function_args = json.loads(tool_call.function.arguments)
            premise = _string_arg(function_args, "premise")
            hypothesis = _string_arg(function_args, "hypothesis")
            reasoning_output = is_entailment(premise, hypothesis)
        except Exception:
            # Best effort: malformed JSON (e.g. invalid backslash escapes),
            # missing arguments or a formula the reasoner rejects must not
            # abort the evaluation run. The entry is emitted without an
            # "Answer:" part. `Exception` (not a bare except) is caught so
            # that Ctrl-C / SystemExit still stop a long run.
            pieces.append(f"Function arguments: Premise: {premise}, Hypothesis: {hypothesis}. ")
        else:
            pieces.append(f"Function arguments: Premise: {premise}, Hypothesis: {hypothesis}, Answer: {reasoning_output}. ")

    return "".join(pieces), tool_calls, True

# Define Function for Testing GPT on Given Datasets

In [76]:
from tqdm import tqdm 
import numpy as np
import re
import csv
import json 
import random 

output_dir = "results"

def test_gpt(dataset, model, model_name, n_shots, chain_of_thought, prefix = None):
    # Load dataset 
    with open(f"./datasets/{dataset}.json") as data_file: 
        data = json.load(data_file) 

    prefix = data["task_prefix"] if prefix == None else prefix 
    questions = data["examples"]

    # Set up output file for data
    savedir = f"./{output_dir}/{model_name}/{model}"
    if not os.path.exists(savedir):
        os.makedirs(savedir)
    output_file = f"{savedir}/{dataset}_{n_shots}shot"
    if chain_of_thought and (n_shots > 0):
        output_file += "_cot"
    output_file += ".csv"

    # Start writing CSV
    with open(output_file, 'w', newline='') as output:
        writer = csv.writer(output)
        header = ["question_index", "response", "is_correct"]
        writer.writerow(header)

    # Get prompt 
    if model_name == "basic":
        prompt_name = f'respond_{n_shots}shot' + ('_cot' if chain_of_thought else "")
    elif model_name == "logical": 
        prompt_name = f'translate_gpt_{n_shots}shot' + ('_cot' if chain_of_thought else "")
    else: 
        raise ValueError("Wrong model name: ", model_name)

    if n_shots != 0: 
        with open(f"./prompts/{prompt_name}.txt", 'r') as f:
            prompt = f.read()
    else: 
        prompt = ""

    # Pattern to match to parse model output 
    pattern = r"Answer: ([a-zA-Z-]+)"

    # Compute score 
    scores = []
    failures = []
    function_calls = []
    questions = random.sample(questions, 500)
    num_runs = len(questions)
    for i in tqdm(range(0,num_runs)):
        if model_name == "basic": 
            completion = client.chat.completions.create(
                model = model, 
                messages=[
                    {"role": "system", "content":prefix},
                    {"role": "user", "content":prompt + questions[i]["input"]}
                ]
            )

            model_output = completion.choices[0].message.content

        elif model_name == "logical": 
            messages=[
                    {"role": "system", "content":prefix + (prompt if (n_shots != 0) else "")},
                    {"role": "user", "content":questions[i]["input"]}
                ]
            model_output, function_call = gpt_sym(model, messages, prompt)
            function_calls.append(function_call)

        else: 
            raise ValueError("Wrong model name: ", model_name)

        try: 
            response = re.search(pattern, model_output).group(1)
        except: 
            response = model_output

        response = response.lower()
        is_invalid = response not in ["entailment", "non-entailment"]
        failures.append(is_invalid)

        scores.append(float('NaN') if is_invalid else questions[i]["target_scores"][response])

        with open(output_file, 'a', newline='') as output:
            writer = csv.writer(output)
            writer.writerow([i, model_output, scores[-1]])

    accuracy = np.nanmean(np.array(scores)) 
    failure_rate = np.mean(np.array(failures))
    
    if model_name == "basic":
        return accuracy, failure_rate
    else: 
        function_calls = np.array(function_calls)
        function_call_rate = np.mean(function_calls)
        accurate_func_call_rate = np.nansum(function_calls == np.array(scores))/np.sum(function_calls)
        return accuracy, failure_rate, function_call_rate, accurate_func_call_rate

# Baseline 0-shot GPT-4: 

In [77]:
# Zero-shot GPT-4 baseline on the "task" dataset: plain chat completion
# ("basic"), no few-shot prompt, explicit system prefix.
dataset, model, model_name = "task", "gpt-4", "basic"
n_shots, chain_of_thought = 0, False

prefix = "Identify the relation between the following premises and hypotheses, choosing from the options 'entailment' or 'non-entailment'. Indicate the response after the word 'Answer: '\n"
accuracy, failure_rate = test_gpt(
    dataset, model, model_name, n_shots, chain_of_thought, prefix=prefix
)

print("Accuracy: ", accuracy)
print("Failure Rate: ", failure_rate)

  0%|                                                                                                         | 0/500 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [06:09<00:00,  1.35it/s]

Accuracy:  0.722
Failure Rate:  0.0





# Logical 0-shot GPT-4:

In [78]:
# Zero-shot GPT-4 on the "task" dataset with the "logical" pipeline, i.e. the
# model is queried through gpt_sym and may call the is_entailment tool.
dataset, model, model_name = "task", "gpt-4", "logical"
n_shots, chain_of_thought = 0, False

prefix = "Identify the relation between the following premises and hypotheses, choosing from the options 'entailment' or 'non-entailment'. Indicate the response after the word 'Answer: '\n"
accuracy, failure_rate, function_call_rate, accurate_func_call_rate = test_gpt(
    dataset, model, model_name, n_shots, chain_of_thought, prefix=prefix
)

print("Accuracy: ", accuracy)
print("Failure Rate: ", failure_rate)
print("Function call rate: ", function_call_rate)
print("Accuracy at function calls: ", accurate_func_call_rate)

  5%|████▍                                                                                           | 23/500 [00:42<15:56,  2.00s/it]

 65%|█████████████████████████████████████████████████████████████▌                                 | 324/500 [11:12<06:05,  2.07s/it]


JSONDecodeError: Invalid \escape: line 2 column 37 (char 38)

# Basic GPT-3.5-Turbo Model Tests: 

In [6]:
# 3-shot GPT-3.5-Turbo on the "task" dataset, plain chat completion without
# chain of thought. No prefix is passed, so test_gpt falls back to the
# dataset's own "task_prefix".
dataset, model, model_name = "task", "gpt-3.5-turbo", "basic"
n_shots, chain_of_thought = 3, False

accuracy, failure_rate = test_gpt(
    dataset, model, model_name, n_shots, chain_of_thought
)

print("Accuracy: ", accuracy)
print("Failure Rate: ", failure_rate)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [19:07<00:00,  1.74it/s]

Accuracy:  0.6082164328657315
Failure Rate:  0.002





In [7]:
# 3-shot chain-of-thought GPT-3.5-Turbo on the "task" dataset, plain chat
# completion. No prefix is passed, so test_gpt falls back to the dataset's
# own "task_prefix".
dataset, model, model_name = "task", "gpt-3.5-turbo", "basic"
n_shots, chain_of_thought = 3, True

accuracy, failure_rate = test_gpt(
    dataset, model, model_name, n_shots, chain_of_thought
)

print("Accuracy: ", accuracy)
print("Failure Rate: ", failure_rate)

 54%|████████████████████████████████████████████████████████████████████                                                         | 1088/2000 [1:02:52<52:07,  3.43s/it]

In [5]:
# 3-shot GPT-3.5-Turbo on the "mixed_reasoning" dataset, plain chat completion
# without chain of thought, using an explicit system prefix.
dataset, model, model_name = "mixed_reasoning", "gpt-3.5-turbo", "basic"
n_shots, chain_of_thought = 3, False

prefix = "Identify the relation between the following premises and hypotheses, choosing from the options 'entailment' or 'non-entailment'.\n"
accuracy, failure_rate = test_gpt(
    dataset, model, model_name, n_shots, chain_of_thought, prefix = prefix
)

print("Accuracy: ", accuracy)
print("Failure Rate: ", failure_rate)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [04:55<00:00,  1.69it/s]

Accuracy:  0.616
Failure Rate:  0.0





In [6]:
# 3-shot chain-of-thought GPT-3.5-Turbo on the "mixed_reasoning" dataset,
# plain chat completion, using an explicit system prefix.
dataset, model, model_name = "mixed_reasoning", "gpt-3.5-turbo", "basic"
n_shots, chain_of_thought = 3, True

prefix = "Identify the relation between the following premises and hypotheses, choosing from the options 'entailment' or 'non-entailment'.\n"
accuracy, failure_rate = test_gpt(
    dataset, model, model_name, n_shots, chain_of_thought, prefix = prefix
)

print("Accuracy: ", accuracy)
print("Failure Rate: ", failure_rate)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [29:13<00:00,  3.51s/it]

Accuracy:  0.6895833333333333
Failure Rate:  0.04



