In [1]:
from dotenv import load_dotenv
import yaml
import os
import openai
from src.tree_node import TreeNode
from src.api_completion import Completion
import numpy as np

In [4]:
config_file = "config.yaml"

# Load the experiment config. Later cells read the "generator", "verifier"
# and "evaluator" sections from `cfg`.
# The file is opened explicitly as UTF-8 so that parsing does not depend on
# the platform's locale encoding (prompts stored in the config may contain
# non-ASCII text). `yaml.safe_load` is the shorthand for loading with
# SafeLoader, i.e. only plain YAML data types are constructed.
with open(config_file, encoding="utf-8") as file:
    cfg = yaml.safe_load(file)


In [5]:
def _request_args(section, *extra_keys):
    """Assemble the keyword arguments for one model role from ``cfg``.

    Args:
        section: Name of the top-level config section to read
            ("generator", "verifier" or "evaluator").
        *extra_keys: Additional keys copied verbatim from that section,
            appended after the four common ones in the order given.

    Returns:
        A dict with "model", "system" (taken from the section's
        "system_prompt"), "max_tokens", "temperature", followed by one
        entry per extra key.

    Raises:
        KeyError: If the section or any requested key is missing.
    """
    settings = cfg[section]
    request_args = {
        "model": settings["model"],
        "system": settings["system_prompt"],
        "max_tokens": settings["max_tokens"],
        "temperature": settings["temperature"],
    }
    for key in extra_keys:
        request_args[key] = settings[key]
    return request_args


# load_dotenv() runs before the key lookup so the lookup can see whatever it
# loaded into the environment; its return value is deliberately discarded.
_ = load_dotenv()
oai_client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# One Completion wrapper per role, all sharing the same client. Each role's
# arguments are later unpacked into the matching `.run(prompt, **args)` call.

# Generator: additionally passes "n".
generator = Completion(oai_client=oai_client)
generator_args = _request_args("generator", "n")

# Verifier: additionally passes the log-probability settings.
verifier = Completion(oai_client=oai_client)
verifier_args = _request_args("verifier", "logprobs", "top_logprobs")

# Evaluator: same argument shape as the verifier, read from its own section.
evaluator = Completion(oai_client=oai_client)
evaluator_args = _request_args("evaluator", "logprobs", "top_logprobs")

In [6]:
# The problem statement used as the root of the thought tree. Adjacent string
# literals are concatenated by the parser, so this is a single string with no
# added line breaks.
task = (
    "Kacey is picking out jewelry to wear for school. "
    "She has 10 rings, 4 bracelets, and 4 necklaces. "
    "How many jewelry combinations are possible if she is going to wear "
    "1 ring, 1 bracelet, and 2 necklaces?"
)

In [7]:
# Root of the thought tree, constructed from the task text. Later cells attach
# generated thoughts to it with `add_child` and read the task back as
# `path[0].value` when building verifier/evaluator prompts.
tree_root = TreeNode(task)

In [8]:
# First generation step: there are no thoughts yet, so the prompt consists of
# the task, a placeholder line, and the cue for the next thought.
prompt = "\n".join(
    (
        f"<task>{task}</task>",
        "(no previous thoughts)",
        "Next thought:",
    )
)

# `result.choices` is consumed by the following cells, one candidate thought
# per choice. The raw response object is printed as-is for inspection.
result = generator.run(prompt, **generator_args)
print(result)

ChatCompletion(id='chatcmpl-AYO1lmOaOqjfioyVq24zkQnPVIzlJ', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='<thought>Kacey can choose 1 ring from 10 options, 1 bracelet from 4 options, and 2 necklaces from 4 options, so we need to calculate the total combinations using the formula: 10 * 4 * (4 choose 2).</thought>', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None)), Choice(finish_reason='stop', index=1, logprobs=None, message=ChatCompletionMessage(content='<thought>To find the total combinations, I need to multiply the number of choices for each type of jewelry: 10 rings, 4 bracelets, and the number of ways to choose 2 necklaces from 4, which is calculated using combinations as C(4,2).</thought>', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1732758877, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_0705

In [9]:
# Show each candidate thought under a 1-based heading, followed by a blank line.
for thought_number, choice in enumerate(result.choices, start=1):
    thought_text = choice.message.content
    print(f"Thought {thought_number}:\n{thought_text}\n")



Thought 1:
<thought>Kacey can choose 1 ring from 10 options, 1 bracelet from 4 options, and 2 necklaces from 4 options, so we need to calculate the total combinations using the formula: 10 * 4 * (4 choose 2).</thought>

Thought 2:
<thought>To find the total combinations, I need to multiply the number of choices for each type of jewelry: 10 rings, 4 bracelets, and the number of ways to choose 2 necklaces from 4, which is calculated using combinations as C(4,2).</thought>



In [10]:
# Attach every generated thought to the root as its own child node.
# NOTE(review): re-running this cell appends the same thoughts again, since
# nothing here checks what the root already holds.
thought_texts = [choice.message.content for choice in result.choices]
for thought_text in thought_texts:
    tree_root.add_child(TreeNode(thought_text))

In [11]:
# Snapshot of the tree's paths. The scoring loop below treats each path as an
# indexable sequence of nodes: path[0] is the task node, path[-1] the most
# recent thought, and path[1:-1] any thoughts in between.
# NOTE(review): presumably these are root-to-leaf paths — confirm against
# TreeNode.get_paths.
current_paths = tree_root.get_paths()

In [23]:
def _build_review_prompt(path):
    """Build the prompt shared by the verifier and the evaluator for one path.

    Args:
        path: Indexable sequence of at least two nodes exposing ``.value``;
            ``path[0]`` is the task node and ``path[-1]`` the thought under
            review.

    Returns:
        The task wrapped in ``<task>`` tags, then any intermediate thoughts
        (one per line), then the last thought under a "Last thought:" header.
    """
    prompt = f"<task>{path[0].value}</task>\n"
    earlier_thoughts = path[1:-1]
    if earlier_thoughts:
        prompt += "\n".join(node.value for node in earlier_thoughts)
    prompt += f"\nLast thought:\n{path[-1].value}"
    return prompt


def _first_token_percentages(completion):
    """Map each candidate for the first generated token to its probability in %.

    Args:
        completion: A response whose first choice carries token log
            probabilities (``choices[0].logprobs.content[0].top_logprobs``).

    Returns:
        ``{token: percentage}`` where percentage is ``exp(logprob) * 100``
        rounded to two decimals, in the order the candidates were returned.

    Raises:
        ValueError: If the response contains no log probabilities, which
            happens when they were not requested.
    """
    logprobs = completion.choices[0].logprobs
    if logprobs is None or not logprobs.content:
        raise ValueError(
            "completion has no logprobs; enable `logprobs` (and `top_logprobs`) "
            "in the config section for this model"
        )
    return {
        candidate.token: float(np.round(np.exp(candidate.logprob) * 100, 2))
        for candidate in logprobs.content[0].top_logprobs
    }


# Score the latest thought on every path: the verifier and the evaluator
# receive the same prompt, and only the distribution over each model's first
# output token is kept.
for path in current_paths:
    if len(path) < 2:
        # A root-only path has no thought to judge; without this guard the
        # task itself would be submitted as the "last thought".
        continue

    prompt = _build_review_prompt(path)
    verifier_result = verifier.run(prompt, **verifier_args)
    evaluator_result = evaluator.run(prompt, **evaluator_args)

    correctness_scores = _first_token_percentages(verifier_result)
    helpfulness_scores = _first_token_percentages(evaluator_result)
    print(f"{prompt}\n{correctness_scores = }\n{helpfulness_scores = }\n\n")


<task>Kacey is picking out jewelry to wear for school. She has 10 rings, 4 bracelets, and 4 necklaces. How many jewelry combinations are possible if she is going to wear 1 ring, 1 bracelet, and 2 necklaces?</task>
<thought>Kacey can choose 1 ring from 10 options, 1 bracelet from 4 options, and 2 necklaces from 4 options, so we need to calculate the total combinations using the formula: 10 * 4 * (4 choose 2).</thought>
<thought>Kacey can choose 1 ring from 10 options, 1 bracelet from 4 options, and 2 necklaces from 4 options, so we need to calculate the total combinations using the formula: 10 * 4 * (4 choose 2).</thought>
Last thought:
<thought>Kacey can choose 1 ring from 10 options, 1 bracelet from 4 options, and 2 necklaces from 4 options, so we need to calculate the total combinations using the formula: 10 * 4 * (4 choose 2).</thought>
correctness_scores = {'1': 99.48, '0': 0.52}
helpfulness_scores = {'9': 55.48, '8': 43.21, '7': 1.3, '6': 0.01, '5': 0.0, 'The': 0.0, '4': 0.0, 'Thi