# Env

In [1]:
import json
import time
from tot.prompts.strategy_qa import propose_prompt, value_prompt, answer_prompt_retrieval
from tot.models import gpt, gpt_usage
from tot.tasks.strategy_qa import StrategyQAEnv
from tot.methods.evaluation import compute_f1_strategy
import os

# Single StrategyQA environment instance reused by the cells below.
env = StrategyQAEnv()

# The OpenAI key is read from the process environment; the literal placeholder
# is used only when OPENAI_API_KEY is not set.
api_key = os.environ.get('OPENAI_API_KEY', 'PUT-YOUR-KEY-HERE')

# Vector Search

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

# Read the facts file into LangChain document objects.
loader = TextLoader("facts.txt")
doc = loader.load()

# Split on newlines with a target chunk size of 100 characters and no overlap.
text_splitter = CharacterTextSplitter(
    separator='\n',
    chunk_size=100,
    chunk_overlap=0,
)
splits = text_splitter.split_documents(doc)

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

# Directory handed to Chroma as its persistence location.
persist_directory = 'docs/chroma/'

# Embedding model used to index the fact chunks.
embedding = OpenAIEmbeddings(openai_api_key=api_key)

# Build the vector store from the chunks produced by the text splitter.
vectordb = Chroma.from_documents(
    documents=splits, embedding=embedding, persist_directory=persist_directory
)
# print(vectordb._collection.count())

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Chat model used to answer sub-questions (temperature 0).
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=api_key,
)

# Question-answering chain that retrieves from the vector store built above.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())

# Prompt

In [5]:
# Global usage counters; the helpers below add whitespace-separated word
# counts of prompts and completions to them.
prompt_tokens = 0
completion_tokens = 0

In [6]:
def propose_prompt_wrap(obs):
    """Render the proposal prompt for ``obs`` and account for its size.

    Args:
        obs: value substituted for the ``input`` field of ``propose_prompt``.

    Returns:
        The formatted prompt string.

    Side effects:
        Adds the number of whitespace-separated words in the rendered prompt
        to the global ``prompt_tokens`` counter.
    """
    global completion_tokens, prompt_tokens
    rendered = propose_prompt.format(input=obs)
    prompt_tokens = prompt_tokens + len(rendered.split())
    return rendered

# print(propose_prompt_wrap(env.reset(0)))
# print('---------')
# print(prompt_wrap(env.step('h2. value')[0]))

# def answer_prompt_wrap(self):
#     global completion_tokens, prompt_tokens
#     prompt = answer_prompt.format(original_question=self.data, facts=self.facts_gt, decomposition=self.decomp)
#     prompt_tokens += len(prompt.split())
#     # print(prompt)
#     return prompt

def answer_prompt_wrap(final_decomp, original_question):
    """Render the retrieval answer prompt and account for its size.

    Args:
        final_decomp: text substituted for the template's ``decomposition``
            field.
        original_question: text substituted for the template's
            ``original_question`` field.

    Returns:
        The formatted prompt string.

    Side effects:
        Adds the number of whitespace-separated words in the rendered prompt
        to the global ``prompt_tokens`` counter.
    """
    global completion_tokens, prompt_tokens
    rendered = answer_prompt_retrieval.format(
        original_question=original_question,
        decomposition=final_decomp,
    )
    prompt_tokens = prompt_tokens + len(rendered.split())
    return rendered

# print(answer_prompt_wrap(env))

In [7]:
import re
import copy
from tot.models import gpt

def parse_line(input_str):
    """Extract the confidence label from a ``confidence level: <word>`` line.

    The match is case-insensitive and the label is returned lower-cased, so
    that ``"Confidence level: High"`` yields ``"high"`` and can be looked up
    directly in a table keyed by lowercase labels.

    Args:
        input_str: text expected to contain ``confidence level: <word>``.

    Returns:
        The lower-cased label, or ``None`` when no label is present (a notice
        is printed in that case).
    """
    # Allow any amount of whitespace after the colon; models are not
    # consistent about spacing.
    match = re.search(r'confidence level:\s*(\w+)', input_str, re.IGNORECASE)
    if match is None:
        print("No confidence level found.")
        return None
    return match.group(1).lower()

# Numeric weight given to each confidence label.  TODO: ad hoc
confidence_to_value = {
    'stop': 100,
    'certain': 1,
    'high': 0.5,
    'medium': 0.2,
    'low': 0.1,
}

def parse_response(response):
    """Split a model response into a candidate text and its numeric score.

    The last non-blank line is expected to carry the confidence label; all
    preceding lines, joined by newlines, form the candidate.

    Args:
        response: raw response text from the model.

    Returns:
        A one-element list ``[(candidate, score)]``. ``score`` is looked up in
        ``confidence_to_value`` and is 0 when the label is missing or unknown.
    """
    # Drop trailing whitespace/newlines first: otherwise a response ending in
    # "\n" would make the last element empty and the label would be missed.
    lines = response.rstrip().split('\n')

    label = parse_line(lines[-1].strip())
    if label is not None:
        # The score table is keyed by lowercase labels.
        label = label.lower()

    candidate = '\n'.join(lines[:-1])
    return [(candidate, confidence_to_value.get(label, 0))]


def get_candidates_to_scores(env):
    """Sample next-step proposals for the current state and total their scores.

    Args:
        env: environment providing ``render()`` and a ``cache`` mapping.

    Returns:
        An empty list when the rendered state is already in ``env.cache``
        (treated as a repetition); otherwise a dict mapping each candidate
        text to the sum of the scores it received over three sampled
        responses. The dict is also stored in ``env.cache`` under the
        rendered state.

    Side effects:
        Adds the word count of every response to the global
        ``completion_tokens`` counter (``propose_prompt_wrap`` updates
        ``prompt_tokens``).
    """
    global completion_tokens, prompt_tokens
    state = env.render()

    # A state seen before means the decomposition is repeating: stop here.
    if state in env.cache:
        print('cache hit: stopping the decomposition because of repetition!')
        return []

    print('call gpt')
    scores_by_candidate = {}
    for reply in gpt(propose_prompt_wrap(state), n=3):
        completion_tokens += len(reply.split())
        # Identical candidates proposed by several samples pool their scores.
        for candidate, score in parse_response(reply.lower()) or ():
            scores_by_candidate[candidate] = scores_by_candidate.get(candidate, 0) + score

    # Remember the proposals so that revisiting this state is detected above.
    env.cache[state] = scores_by_candidate
    return scores_by_candidate

def propose_score(env, idx):
    """Greedy rollout: repeatedly propose, rank and apply one candidate.

    Each round samples five responses for the current observation, sums the
    scores per candidate and walks the candidates from highest to lowest
    score. The first candidate whose trial step (on a deep copy of ``env``)
    leaves no entry equal to 2 in ``status`` is applied to the real
    environment; if every candidate fails that check, the lowest-ranked one is
    applied.

    Args:
        env: environment to roll out; it is reset to ``idx`` first.
        idx: index passed to ``env.reset``.

    Returns:
        The list of ``info`` values returned by ``env.step`` for each round.

    NOTE(review): this function unpacks ``env.step`` as a 4-tuple, whereas
    ``dfs`` treats its return value as a single observation - confirm which
    signature the environment actually has.
    """
    obs = env.reset(idx)
    infos = []
    done = False
    while not done:
        tally = {}
        for reply in gpt(propose_prompt_wrap(obs), n=5):
            for option, weight in parse_response(reply) or ():
                # Same candidate from several samples: scores add up.
                tally[option] = tally.get(option, 0) + weight

        ranked = sorted(tally.items(), key=lambda x: x[1], reverse=True)
        print(ranked)
        if not tally:
            break

        # Best-first; the loop variable intentionally survives the loop so
        # that the last candidate is used when none passes the check.
        for candidate, _ in ranked:
            trial_env = copy.deepcopy(env)
            trial_env.step(candidate)
            if not any(flag == 2 for flag in trial_env.status):
                break
        print(candidate)

        obs, _reward, done, info = env.step(candidate)
        print(obs)
        print(env.steps, info)
        print('-------------------\n\n\n')
        infos.append(info)
    return infos

def is_leaf_node(list):
    """Return the ``-`` question lines that have no nested sub-question.

    A line counts as a question when, after stripping, it starts with ``-``.
    It is a leaf unless the line immediately after it is also a question and
    is indented more deeply (i.e. it is a child of the current line).

    The previous implementation stripped the following line before testing
    for a leading tab, so the child check could never succeed and every
    parent was reported as a leaf. Indentation is now measured on the raw
    lines.

    Args:
        list: sequence of decomposition lines. (The name shadows the builtin
            but is kept so existing callers are unaffected.)

    Returns:
        The stripped text of every leaf question, in input order.
    """
    lines = list

    def _indent(text):
        # Number of leading whitespace characters.
        return len(text) - len(text.lstrip())

    leaf_nodes = []
    for i, raw in enumerate(lines):
        line = raw.strip()
        if not line.startswith('-'):
            continue

        has_child = False
        if i + 1 < len(lines):
            following = lines[i + 1]
            has_child = (
                following.strip().startswith('-')
                and _indent(following) > _indent(raw)
            )

        if not has_child:
            leaf_nodes.append(line)
    return leaf_nodes

def get_ans(env, vectordb, qa_chain):
    """Append a retrieved answer to every leaf question of the decomposition.

    Args:
        env: environment whose ``decomp`` attribute holds the decomposition as
            newline-separated text.
        vectordb: vector store; ``similarity_search`` is called for each leaf
            question and the retrieved documents are printed.
        qa_chain: callable taking ``{"query": ...}`` and returning a mapping
            with a ``"result"`` entry.

    Returns:
        The decomposition text in which each leaf question line is followed,
        on the same line, by its answer. Other lines are unchanged.
    """
    question_list = env.decomp.split('\n')

    # Compute the leaves once instead of once per line. ``is_leaf_node``
    # returns stripped lines, so membership is tested on stripped text too;
    # otherwise indented sub-questions would never match.
    leaf_questions = set(is_leaf_node(question_list))

    result = []
    for sub_question in question_list:
        if sub_question.strip() in leaf_questions:
            docs = vectordb.similarity_search(sub_question, k=3)
            print(docs)
            answer = qa_chain({"query": sub_question})["result"]
            result.append(f"{sub_question} {answer}")
        else:
            result.append(sub_question)
    return '\n'.join(result)
    

# DFS

In [8]:
import copy

def dfs(env, actions, infos, time_limit, prune, max_per_state, done=False):
    """Depth-first exploration of decomposition candidates.

    For the current environment state, candidates are obtained from
    ``get_candidates_to_scores`` and tried from highest to lowest score. Each
    candidate is applied with ``env.step``; depending on the resulting state
    the search recurses, terminates, or restores the backed-up state and moves
    on to the next candidate.

    Args:
        env: environment; this function uses ``step``, ``status``, ``steps``,
            ``decomp``, ``idx``, ``prompt_status`` and ``reset`` on it.
        actions: list of actions on the current path; appended to before
            recursing and popped when backtracking (mutated in place).
        infos: list of info dicts collected so far (mutated in place). Its
            length is what is compared against ``time_limit``.
        time_limit: maximum number of entries in ``infos`` before the search
            stops.
        prune: when truthy, a state whose ``count['impossible']`` is at least
            1 is not explored recursively.
        max_per_state: maximum number of candidates expanded per call.
        done: termination flag threaded through the recursion.

    Returns:
        ``(env, done)``: the environment and whether the search terminated.
    """
    global completion_tokens, prompt_tokens
    # get candidate thoughts
    candidates_to_scores = get_candidates_to_scores(env)
    # No candidates (including the cache-hit case, which returns an empty
    # list): treat the search as finished at this state.
    if len(candidates_to_scores) == 0:
        done = True
        return env, done
    print("sorted candidates to score:")
    print(sorted(candidates_to_scores.items(), key=lambda x: x[1], reverse=True)) # candidates and their summed scores, highest first

    # back up current state
    decomp, status, steps = copy.copy(env.decomp), env.status.copy(), env.steps

    # try each candidate
    cnt_per_state = 0
    for action in sorted(candidates_to_scores, key=candidates_to_scores.get, reverse=True): # highest summed score first
        # obs, r, done, info = env.step(action) # Takes a step in the environment with the selected action
        obs = env.step(action)
        print('### action ###')
        print(obs)
        # r = info['r_word'] # collects reward for this action
        # Expand this candidate only while the budget allows it, the
        # environment has taken fewer than 2 steps, no status entry equals 2
        # and the candidate's score is below 100.
        if len(infos) < time_limit and env.steps < 2 and not any(_ == 2 for _ in env.status) and candidates_to_scores[action] < 100:  # not violating any existing constraints
            cnt_per_state += 1
            # NOTE(review): this break leaves the loop after env.step(action)
            # without restoring the backed-up state - confirm that is intended.
            if cnt_per_state > max_per_state: break
            count, prompt, res = env.prompt_status()
            # Usage accounting by whitespace-separated word count.
            prompt_tokens += len(prompt.split())
            completion_tokens += len(res.split())
            actions.append(action)

            info = {'total_step': len(infos), 'env_step': env.steps, 'actions': actions.copy(), 'count': count} # Information about the explored states is collected
            infos.append(info)
            print(info)

            if not prune or count['impossible'] < 1:  # only continue if the current status is possible and is not pruned
                print("DFS recursive call")
                env, done = dfs(env, actions, infos, time_limit, prune, max_per_state, done) ### RECURSIVELY CALLS DFS FOR FURTHER EXPLORATION ###

            if done: break
            actions.pop() # Pops the last action from the list of actions to backtrack (undoing the last decision made during the exploration)
            # print('remaining actions:')
            # print(actions)

        # Termination check. NOTE(review): expansion above requires
        # env.steps < 2 while termination requires env.steps >= 4, so states
        # with 2 or 3 steps are neither expanded nor terminal and only
        # backtrack - confirm the thresholds are meant to differ.
        if len(infos) >= time_limit or env.steps >= 4 or candidates_to_scores[action] >= 100:
            info = {'total_step': len(infos), 'env_step': env.steps, 'decomp_state': env.decomp}
            infos.append(info)
            print(info)

            if len(infos) >= time_limit:
                reason = 'time out'
            elif env.steps >= 4:
                reason = 'environment steps'
            else:
                reason = 'decomposition finished'

            print(f'$$$ end exploration because of: {reason} $$$')
            done = True
            break
        else:
            env.reset(env.idx, decomp=copy.copy(decomp), status=status.copy(), steps=steps) # restore the state backed up before the loop so the next candidate starts from it
            print('$$$ backtrack $$$')
        # print(env.render())

    # print(env.render())
    return env, done

In [None]:
# dfs with pruning
#
# Runs the retrieval-augmented DFS experiment three times over every fifth
# question in range(0, 100), logging per-question details and computing F1
# after each run.

# One path for the per-question log: it is written inside the loop and read
# back for scoring. (Previously scoring read a differently named file,
# 'infoss_dfs_gpt35.json', i.e. not the log produced by this cell.)
infoss_log_path = '../../logs/strategy_qa/infoss_dfs_retrieval_gpt35.json'
results_path = '../../logs/strategy_qa/results_retrieval.json'

resultss = []
for run_idx in range(3):  # named run_idx so the builtin ``iter`` is not shadowed
    infoss = []
    for i in range(0, 100, 5):
        env.reset(i)
        infos = []
        actions = []

        start_time = time.time()
        final_env, _ = dfs(env, actions, infos, 6, prune=True, max_per_state=2)
        score, prompt, res = env.eval_status()
        answer = get_ans(final_env, vectordb, qa_chain)
        answer = gpt(answer_prompt_wrap(answer, final_env.data), temperature=0)
        end_time = time.time()
        elapsed_time = end_time - start_time

        # Usage accounting by whitespace-separated word count.
        prompt_tokens += len(prompt.split())
        completion_tokens += len(res.split())
        completion_tokens += len(str(answer).split())
        print("and the answer is: "+str(answer))
        gpt_cost = gpt_usage()
        new_entry = {"answer": answer, "answer_gt": env.ans_gt, "gpt usage": gpt_cost['cost'], "elapsed time": elapsed_time, "scores": score}
        infos.append(new_entry)
        infoss.append(infos)

        # Rewrite the log after every question so partial progress survives
        # an interrupted run.
        with open(infoss_log_path, 'w') as fout:
            json.dump(infoss, fout, indent=4)

    # Score this run from the log that was just written.
    with open(infoss_log_path, 'r') as file:
        data = json.load(file)

    results = compute_f1_strategy(data)
    resultss.append(results)

    with open(results_path, 'w') as file:
        json.dump(resultss, file, indent=4)

In [None]:
# Load the per-run results written by the experiment cell.
with open('../../logs/strategy_qa/results_retrieval.json', 'r') as file:
    data_lists = json.load(file)

# Flatten the per-run lists into a single list of result records.
combined_data = [item for data_list in data_lists for item in data_list]

# Collect the 'f1' field of every record and report the mean.
f1_scores = [item['f1'] for item in combined_data]

print(f1_scores)
average_f1_score = sum(f1_scores) / len(f1_scores)
print(f"Average F1 score: {average_f1_score}")