# Env

In [1]:
import json
import time
from tot.prompts.strategy_qa import propose_prompt, value_prompt, answer_prompt
from tot.models import gpt, gpt_usage
from tot.tasks.strategy_qa import StrategyQAEnv, StrategyQATask
from tot.methods.evaluation import compute_f1_strategy, compute_comparison
import os

# Single environment instance that the later cells pass around and reset per question.
env = StrategyQAEnv()

# Prompt

In [2]:
def propose_prompt_wrap(obs):
    """Return the proposal prompt with ``obs`` substituted for its ``input`` field."""
    return propose_prompt.format(input=obs)

# print(propose_prompt_wrap(env.reset(0)))
# print('---------')
# print(prompt_wrap(env.step('h2. value')[0]))

def answer_prompt_wrap(self):
    """Return the answer prompt filled in from an environment-like object.

    Although the parameter is called ``self``, this is a plain function: the
    argument only needs to expose ``data`` (used as the original question)
    and ``decomp`` (used as the decomposition).
    """
    return answer_prompt.format(
        original_question=self.data,
        decomposition=self.decomp,
    )

# print(answer_prompt_wrap(env))

In [3]:
import re
import copy
from tot.models import gpt

def parse_line(input_str):
    """Extract the word following ``confidence level:`` from ``input_str``.

    The marker is matched case-insensitively; the captured word is returned
    with its original casing. When the marker is absent a notice is printed
    and ``None`` is returned.
    """
    found = re.search(r'confidence level: (\w+)', input_str, re.IGNORECASE)
    if found is None:
        print("No confidence level found.")
        return None
    return found.group(1)

# Numeric weight for each confidence word parsed out of a model response.
# 'stop' is set to 100 so that a candidate carrying it reaches the `>= 100`
# threshold that dfs() treats as "decomposition finished".
confidence_to_value = {'stop': 100, 'certain': 1, 'high': 0.5, 'medium': 0.2, 'low': 0.1}  # TODO: ad hoc

def parse_response(response):
    """Split a model response into its candidate text and a numeric score.

    The last non-blank line is expected to carry ``confidence level: <word>``;
    every line before it forms the candidate text.

    Args:
        response: Raw response text.

    Returns:
        A one-element list ``[(candidate, score)]``. ``score`` is looked up in
        ``confidence_to_value`` and is 0 when the confidence word is missing
        or unknown.
    """
    # Drop trailing whitespace/newlines first: otherwise a response that ends
    # with a newline has an empty last line, the confidence level is not found
    # and the confidence line itself leaks into the candidate text.
    lines = response.rstrip().split('\n')
    level = parse_line(lines[-1].strip())
    candidate = '\n'.join(lines[:-1])

    # parse_line matches case-insensitively but the lookup table uses
    # lower-case keys, so normalise before the lookup ("High" -> "high").
    if level is not None:
        level = level.lower()

    return [(candidate, confidence_to_value.get(level, 0))]


def get_candidates_to_scores(env):
    """Ask the model for next-step candidates for the current environment state.

    The rendered state is used as a cache key. If that state has been seen
    before, no model call is made and an empty list is returned, which callers
    treat as "no candidates" (repetition stops the decomposition). Otherwise
    three responses are requested, lower-cased, parsed, and the scores of
    identical candidates are summed.

    Returns:
        A dict mapping candidate text to its summed score, or ``[]`` on a
        cache hit.
    """
    state = env.render()
    if state in env.cache:
        print('cache hit: stopping the decomposition because of repetition!')
        return []

    print('call gpt')
    totals = {}
    for raw in gpt(propose_prompt_wrap(state), n=3):
        # Identical candidates coming from different samples accumulate score.
        for text, value in parse_response(raw.lower()) or ():
            totals[text] = totals.get(text, 0) + value

    # Remember this state so a later visit is detected as a repetition.
    env.cache[state] = totals
    return totals

def propose_score(env, idx):
    """Greedily roll out one episode, always taking the best-scored candidate.

    At every step five responses are sampled, parsed and aggregated by
    candidate text. Candidates are tried on a deep copy of the environment in
    descending score order, and the first one that leaves no status entry
    equal to 2 is applied to the real environment. If every candidate produces
    such an entry, the last one tried is applied anyway.

    Args:
        env: Environment to drive; it is reset to ``idx`` first.
        idx: Index of the problem to load.

    Returns:
        The list of ``info`` values returned by ``env.step`` for each step.
    """
    obs = env.reset(idx)
    infos = []
    done = False
    while not done:
        totals = {}
        for raw in gpt(propose_prompt_wrap(obs), n=5):
            for text, value in parse_response(raw) or ():
                totals[text] = totals.get(text, 0) + value

        # Highest aggregated score first.
        ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
        print(ranked)
        if not totals:
            break

        for candidate, _ in ranked:
            # Try the candidate on a throw-away copy before committing to it.
            trial = copy.deepcopy(env)
            trial.step(candidate)
            if not any(flag == 2 for flag in trial.status):
                break

        print(candidate)
        obs, _reward, done, info = env.step(candidate)
        print(obs)
        print(env.steps, info)
        print('-------------------\n\n\n')
        infos.append(info)
    return infos
    

# DFS

In [4]:
import copy

def dfs(env, actions, infos, time_limit, prune, max_per_state, done=False):
    """Depth-first exploration of candidate actions, with backtracking.

    Candidates for the current state come from ``get_candidates_to_scores``
    and are tried in descending score order. Each candidate is applied with
    ``env.step``; depending on the resulting state the search recurses,
    terminates, or restores the environment to the state it had on entry and
    moves on to the next candidate.

    Args:
        env: Environment being explored; it is stepped and reset in place.
        actions: Current action path; appended to before recursing and popped
            when backtracking.
        infos: Log of visited states; appended to in place. Its length is the
            budget counter compared against ``time_limit``.
        time_limit: Maximum number of entries allowed in ``infos``.
        prune: When true, do not recurse from a state whose
            ``env.prompt_status()`` count for ``'impossible'`` is at least 1.
        max_per_state: Maximum number of candidates explored per call.
        done: Termination flag threaded through the recursion.

    Returns:
        A ``(env, done)`` tuple.
    """
    # get candidate thoughts
    candidates_to_scores = get_candidates_to_scores(env)
    # No candidates (including the cache-hit case, which returns an empty list): stop the search.
    if len(candidates_to_scores) == 0:
        done = True
        return env, done
    print("sorted candidates to score:")
    print(sorted(candidates_to_scores.items(), key=lambda x: x[1], reverse=True)) # candidates with their scores, highest first

    # back up current state so it can be restored when a candidate is abandoned
    decomp, status, steps = copy.copy(env.decomp), env.status.copy(), env.steps

    # try each candidate
    cnt_per_state = 0
    for action in sorted(candidates_to_scores, key=candidates_to_scores.get, reverse=True): # highest score first
        # Apply the candidate to the environment
        obs = env.step(action)
        print('### action ###')
        print(obs)
        # Explore further only while budget remains, the env has taken fewer than 2 steps,
        # no status entry equals 2, and the candidate is below the score-100 stop threshold.
        if len(infos) < time_limit and env.steps < 2 and not any(_ == 2 for _ in env.status) and candidates_to_scores[action] < 100:
            cnt_per_state += 1
            # NOTE(review): this break leaves env in the stepped state (no reset to the backup) - confirm this is intended.
            if cnt_per_state > max_per_state: break
            count = env.prompt_status()
            actions.append(action)

            info = {'total_step': len(infos), 'env_step': env.steps, 'actions': actions.copy(), 'count': count} # record of the explored state
            infos.append(info)

            if not prune or count['impossible'] < 1:  # recurse unless pruning is on and the state is counted as impossible
                print("DFS recursive call")
                env, done = dfs(env, actions, infos, time_limit, prune, max_per_state, done) # explore below this candidate

            if done: break
            actions.pop() # undo the last action before trying the next candidate

        # Terminal conditions: budget exhausted, env step cap reached, or a score >= 100
        # (the value assigned to 'stop' in confidence_to_value).
        # NOTE(review): with `env.steps < 2` above and `>= 4` here, states at 2-3 steps are neither explored nor terminal and fall through to the backtrack branch - confirm the thresholds.
        if len(infos) >= time_limit or env.steps >= 4 or candidates_to_scores[action] >= 100:
            info = {'total_step': len(infos), 'env_step': env.steps, 'decomp_state': env.decomp}
            infos.append(info)

            if len(infos) >= time_limit:
                reason = 'time out'
            elif env.steps >= 4:
                reason = 'environment steps'
            else:
                reason = 'decomposition finished'

            print(f'$$$ end exploration because of: {reason} $$$')
            done = True
            break
        else:
            env.reset(env.idx, decomp=copy.copy(decomp), status=status.copy(), steps=steps) # restore the state backed up on entry, then try the next candidate
            print('$$$ backtrack $$$')

    return env, done

In [5]:
import time
import random
import json
import argparse
from tot.methods.bfs import naive_solve

def process_strategy(env, args, task, sample):
    """Run one prompting strategy over every second problem index below 200.

    For ``sample == 'dfs'`` the question is decomposed with ``dfs`` and the
    final answer is requested from the model at temperature 0; any other value
    is delegated to ``naive_solve``. After each question the accumulated log
    is rewritten to ``../../logs/strategy_qa/infoss_<sample>_gpt35.json``.

    Args:
        env: Environment used for the DFS strategy and reset for every index.
        args: Namespace forwarded to ``naive_solve``.
        task: Task object forwarded to ``naive_solve`` and queried for the
            ground-truth answer on the non-DFS path.
        sample: Strategy name; also embedded in the log file name.

    Returns:
        A list with one entry per question. Each entry is that question's
        ``infos`` list, whose last element is the summary dict.
    """
    use_dfs = sample == 'dfs'
    log_path = f'../../logs/strategy_qa/infoss_{sample}_gpt35.json'

    infoss = []
    for i in range(0, 200, 2):
        env.reset(i)
        infos = []
        actions = []

        started = time.time()
        if use_dfs:
            env.reset(i)
            final_env, _ = dfs(env, actions, infos, 6, prune=True, max_per_state=2)
            answer = gpt(answer_prompt_wrap(final_env), temperature=0)
            score = env.eval_status(answer)
        else:
            ys, _info, score = naive_solve(args, task, i)
        elapsed_time = time.time() - started

        # The ground truth is looked up outside the timed section.
        if use_dfs:
            given, expected = answer, env.ans_gt
        else:
            given, expected = ys, task.get_answer(i)

        infos.append({
            "answer": given,
            "answer_gt": expected,
            "gpt usage": gpt_usage()['cost'],
            "elapsed time": elapsed_time,
            "score": score,
        })
        infoss.append(infos)

        # Rewrite the whole log after every question so partial runs are kept.
        with open(log_path, 'w') as fout:
            json.dump(infoss, fout, indent=4)

    return infoss

In [6]:
# Experiment driver: run every strategy three times, score the logged answers
# and write the collected results to ../../logs/strategy_qa/results.json.
task = StrategyQATask()
args = argparse.Namespace(backend='gpt-3.5-turbo', temperature=0.7, task='strategy_qa', naive_run=True, n_generate_sample=1)
samples = ['dfs', 'cot', 'standard']

resultss = []
# The loop variable is deliberately not called `iter`: at notebook top level
# that would shadow the builtin iter() for every cell executed afterwards.
for run_idx in range(3):
    # Process each strategy
    for sample in samples:
        # Only the naive strategies read args.prompt_sample; 'dfs' ignores it.
        if sample in ('cot', 'standard'):
            args.prompt_sample = sample

        infoss = process_strategy(env, args, task, sample)

    # Reload the per-strategy logs that process_strategy just wrote.
    with open('../../logs/strategy_qa/infoss_dfs_gpt35.json', 'r') as file_1:
        data_1 = json.load(file_1)
    with open('../../logs/strategy_qa/infoss_cot_gpt35.json', 'r') as file_2:
        data_2 = json.load(file_2)
    with open('../../logs/strategy_qa/infoss_standard_gpt35.json', 'r') as file_3:
        data_3 = json.load(file_3)

    results_1 = compute_f1_strategy(data_1)
    results_2 = compute_f1_strategy(data_2)
    results_3 = compute_f1_strategy(data_3)

    resultss.append(results_1)
    resultss.append(results_2)
    resultss.append(results_3)

    score = {'dfs': 0, 'cot': 0, 'io': 0}

    # Compare the three strategies on every fifth logged question.
    for i in range(0, len(data_1), 5):
        # process_strategy logs problem indices 0, 2, 4, ..., so log entry i
        # corresponds to problem index i * 2.
        env.reset(i*2)
        question = env.data
        ans_gt = env.ans_gt
        # The DFS log entry holds the exploration records followed by the
        # summary dict; take the first item that has an 'answer' key.
        # NOTE(review): the fallback 0 would fail on `dfs_ans[0]` below if no
        # such item exists - confirm that case cannot occur.
        dfs_ans = next((item['answer'] for item in data_1[i] if 'answer' in item), 0)
        cot_ans = data_2[i][0]['answer']
        io_ans = data_3[i][0]['answer']

        new_scores = compute_comparison(score, dfs_ans[0].replace('\n', ''), cot_ans[0].replace('\n', ''), io_ans, question, ans_gt, run_idx)
        score = {key: score[key] + new_scores[key] for key in score}

    resultss.append(score)

    # Persist after every run so earlier runs survive an interruption.
    with open('../../logs/strategy_qa/results.json', 'w') as file:
        json.dump(resultss, file, indent=4)

call gpt
sorted candidates to score:
[('output:\nare more people today related to genghis khan than julius caesar?\n- how many descendants does genghis khan have?\n- how many descendants does julius caesar have?\n- what is the estimated population of descendants for each?', 0.2), ('output:\nare more people today related to genghis khan than julius caesar?\n- how many descendants does genghis khan have today?\n- how many descendants does julius caesar have today?', 0.2), ('output:\nare more people today related to genghis khan than julius caesar?\n- how many descendants does genghis khan have today?\n- how many descendants does julius caesar have today?\n- what is the estimated world population of descendants of genghis khan?\n- what is the estimated world population of descendants of julius caesar?', 0.2)]
steps: 1
### action ###
Current Decomposition:
output:
are more people today related to genghis khan than julius caesar?
- how many descendants does genghis khan have?
- how many des

In [None]:
# Summarise results.json: average the F1 scores, the individual scores and the
# comparison scores of the three strategies over the three runs.
with open('../../logs/strategy_qa/results.json', 'r') as file:
    data = json.load(file)

f1_scores = []
summed_scores = []
summed_values = {'dfs': 0, 'cot': 0, 'io': 0}

for entry in data:
    # Dict entries are the per-run comparison scores.
    if isinstance(entry, dict):
        for key in summed_values:
            summed_values[key] += entry[key]
        continue
    if not isinstance(entry, list):
        continue
    # List entries mix nested lists of dicts carrying "f1" with plain floats.
    for part in entry:
        if isinstance(part, float):
            summed_scores.append(part)
        elif isinstance(part, list):
            f1_scores.extend(cell["f1"] for cell in part if isinstance(cell, dict))

# Values are read at offsets 0/1/2 (dfs/cot/io) of positions 0, 3 and 6.
print('AVERAGE F1 SCORE')
average_f1_dfs, average_f1_cot, average_f1_io = [
    (f1_scores[k] + f1_scores[k + 3] + f1_scores[k + 6]) / 3 for k in range(3)
]
print("Average f1 score MTS:", average_f1_dfs)
print("Average f1 score COT:", average_f1_cot)
print("Average f1 score IO:", average_f1_io)

print('\n')
print('AVERAGE INDIVIDUAL SCORE')

average_scores_dfs, average_scores_cot, average_scores_io = [
    (summed_scores[k] + summed_scores[k + 3] + summed_scores[k + 6]) / 3 for k in range(3)
]
print("Average score MTS:", average_scores_dfs)
print("Average score COT:", average_scores_cot)
print("Average score IO:", average_scores_io)

print('\n')
print('AVERAGE COMPARISON SCORE')

for key in summed_values:
    summed_values[key] /= 3
print('Average explainability score:', summed_values)