In [1]:
import json
from matplotlib import pyplot as plt
import numpy as np
from openai import OpenAI
import os
from scipy.stats import gaussian_kde
from typing import Optional, Tuple

In [2]:
# Fail fast with a readable message when the API key is missing or empty.
# `.get` is used because indexing `os.environ[...]` raises KeyError for a missing
# variable before the assertion message below could ever be shown.
assert os.environ.get("OPENAI_API_KEY"), "ERROR: No OPENAI_API_KEY environment variable defined"

In [3]:
# Single OpenAI client for this notebook; the API key is read from the environment.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [4]:
def get_linprob(logprob: float) -> float:
    """Convert a natural-log probability into a percentage.

    Args:
        logprob: Natural logarithm of a probability.

    Returns:
        `exp(logprob)` scaled to the 0-100 range and rounded to two decimals.
    """
    probability = np.exp(logprob)
    percentage = probability * 100.0
    return np.round(percentage, 2)

In [5]:
# Sanity check: a log-probability just below zero should print a percentage just below 100.
print(get_linprob(logprob=-0.00011129))

99.99


In [6]:
def run(
    user_prompt: str,
    system_prompt: Optional[str] = None,
    answer_token_index: int = 8,
) -> Tuple[float, float]:
    """Ask the model one question and return its answer plus the probability of one output token.

    Args:
        user_prompt: Text sent as the user message.
        system_prompt: Text sent as the system message. When None, a default prompt
            requesting a JSON object with a numerical `answer` field is used.
        answer_token_index: Position, within the generated token list, of the token
            whose probability is reported. The default of 8 is the position this
            notebook has always read.
            NOTE(review): presumably that position is the answer's deciding token for
            the default prompt's output layout - confirm against the real token stream
            whenever the prompt (e.g. an added `reasoning` field) or the model changes.

    Returns:
        `(answer, linprob)`: the value stored under `answer` in the JSON reply, and the
        selected token's probability as a percentage (see `get_linprob`).

    Raises:
        KeyError: The JSON reply has no `answer` field.
        IndexError: The reply has no token at `answer_token_index`.
    """
    if system_prompt is None:
        system_prompt = "Return JSON output where the answer is specified as a single numerical value in the `answer` field."

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        response_format={"type": "json_object"},
        logprobs=True,
        top_logprobs=2,
        temperature=0.1,
    )

    choice = completion.choices[0]
    answer = json.loads(choice.message.content)["answer"]

    token_logprobs = choice.logprobs.content
    # Explicit range check so a short or differently shaped reply produces an
    # actionable message instead of a bare "list index out of range".
    if not -len(token_logprobs) <= answer_token_index < len(token_logprobs):
        raise IndexError(
            f"answer_token_index={answer_token_index} is out of range: "
            f"the response contains only {len(token_logprobs)} tokens"
        )
    linprob = get_linprob(token_logprobs[answer_token_index].logprob)

    return (answer, linprob)

In [None]:
# One-off call: fetch a single answer and the reported token probability, then show both.
answer, linprob = run("9.11 or 9.9 - which one is higher?")
print(answer, linprob)

In [14]:
# Experiment settings: number of repeated requests per prompt, and the value
# each returned answer is compared against.
num_trials, correct_answer = 1_000, 9.9

In [13]:
def run_experiment(num_trials: int, correct_answer: float, user_prompt: str, system_prompt: Optional[str] = None) -> None:
    """Send the same prompt `num_trials` times, print the accuracy and plot the token probabilities.

    Args:
        num_trials: Number of times `run` is called; must be at least 1.
        correct_answer: Value each returned answer is compared against with `==`.
        user_prompt: Forwarded to `run` as the user message.
        system_prompt: Forwarded to `run`; None lets `run` apply its default.

    Raises:
        ValueError: `num_trials` is smaller than 1 (no accuracy can be computed).

    Side effects:
        Prints both prompts and the percentage of correct answers, then shows a plot
        of the distribution of the per-trial token probabilities.
    """
    if num_trials < 1:
        raise ValueError(f"num_trials must be at least 1, got {num_trials}")

    trial_results = []
    trial_linprobs = []

    for _ in range(num_trials):
        answer, linprob = run(
            user_prompt=user_prompt,
            system_prompt=system_prompt,
        )
        trial_results.append(answer == correct_answer)
        trial_linprobs.append(linprob)

    p_correct = np.round(sum(trial_results) / len(trial_results) * 100.0, 2)

    print("<USER_PROMPT>:", user_prompt)
    print("<SYSTEM_PROMPT>:", system_prompt)
    print("P(CORRECT) = ", p_correct)

    lowest, highest = min(trial_linprobs), max(trial_linprobs)
    if lowest == highest:
        # gaussian_kde cannot be fitted to a single sample or to samples that are all
        # identical (zero variance), so show the point mass as a histogram instead.
        plt.hist(trial_linprobs, density=True)
    else:
        kde = gaussian_kde(trial_linprobs)
        dist_space = np.linspace(lowest, highest, 100)
        plt.plot(dist_space, kde(dist_space))

    plt.xlabel("Token probability (%)")
    plt.ylabel("Density")
    plt.title("Distribution of token probability across trials")
    plt.show()

In [None]:
# Phrasing under test: the two candidates come first, the question last.
user_prompt = "9.11 or 9.9 - which one is higher?"
run_experiment(
    num_trials=num_trials,
    correct_answer=correct_answer,
    user_prompt=user_prompt,
)

In [None]:
# Phrasing under test: the question comes first, the two candidates last.
user_prompt = "Which one is higher, 9.11 or 9.9?"
run_experiment(
    num_trials=num_trials,
    correct_answer=correct_answer,
    user_prompt=user_prompt,
)

In [None]:
# Phrasing under test: question-first wording, with the user message also asking for a `reasoning` field.
user_prompt = "Which one is higher, 9.11 or 9.9? Provide reasoning as to how you obtained the answer in `reasoning` field."
run_experiment(
    num_trials=num_trials,
    correct_answer=correct_answer,
    user_prompt=user_prompt,
)

In [None]:
# Phrasing under test: question-first wording, with the `reasoning` request placed in a custom system prompt.
user_prompt = "Which one is higher, 9.11 or 9.9?"
# The string starts and ends with a newline; both are kept so the prompt text is unchanged.
system_prompt = (
    "\n"
    "Return JSON output where the answer is specified as a single numerical value in the `answer` field.\n"
    "Provide reasoning as to how you obtained the answer in `reasoning` field.\n"
)
run_experiment(
    num_trials=num_trials,
    correct_answer=correct_answer,
    user_prompt=user_prompt,
    system_prompt=system_prompt,
)