First, do `pip install -r requirements.txt`

In [None]:
# Read local credentials from secrets.json; the OpenAI key is looked up in
# this mapping when the client is configured.
import json

with open('secrets.json') as secrets_file:
    secrets = json.loads(secrets_file.read())

In [None]:
import openai
# Configure the OpenAI client with the value stored under "OPEN-AI-KEY" in `secrets`.
openai.api_key = secrets["OPEN-AI-KEY"]

import wikienv, wrappers
import random
import time
import pandas as pd
from tqdm import tqdm

In [None]:
def llm(prompt, stop=["\n"]):
    """Send `prompt` to gpt-3.5-turbo as one user message and return the reply text.

    Args:
        prompt: text placed in the single user message of the chat request.
        stop: stop sequences forwarded unchanged to the API call
            (defaults to a single newline).

    Returns:
        The `content` field of the first choice's message in the response.
    """
    user_message = {"role": "user", "content": prompt}
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[user_message],
        stop=stop,
    )

    first_choice = response['choices'][0]
    return first_choice['message']['content']

In [None]:
# Build the environment used by step()/webthink(): a WikiEnv wrapped first by
# FeverWrapper (split="train") and then by LoggingWrapper.
wiki_env = wikienv.WikiEnv()
fever_env = wrappers.FeverWrapper(wiki_env, split="train")
env = wrappers.LoggingWrapper(fever_env)

def step(env, action, max_attempts=10):
    """Call ``env.step(action)``, retrying when the call raises.

    Args:
        env: object exposing a ``step(action)`` method.
        action: value passed through unchanged to ``env.step``.
        max_attempts: maximum number of calls to make before giving up
            (default 10).

    Returns:
        Whatever ``env.step(action)`` returns on the first call that succeeds.

    Raises:
        RuntimeError: if every attempt raised; chained to the last error so
            the root cause is visible instead of a silent ``None`` return.
    """
    last_error = None
    for _ in range(max_attempts):
        try:
            return env.step(action)
        except Exception as err:
            # Only ordinary errors are retried. KeyboardInterrupt/SystemExit
            # propagate so a stuck run can still be interrupted by hand.
            last_error = err
    raise RuntimeError(
        f"env.step({action!r}) failed after {max_attempts} attempts"
    ) from last_error

In [None]:
# Few-shot prompt: one instruction line followed by three worked examples, one
# per label (SUPPORTS, REFUTES, NOT ENOUGH INFO). Each example is a Claim and
# Label followed by numbered Thought / Action / Observation steps ending in
# Finish[done]. webthink() appends the new claim and its label to this text.
webthink_prompt = """
Reason about why the claim is supported, refuted, or has not enough information based on Wikipedia evidences.

Claim: Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.
Label: SUPPORTS
Thought 1: I need to search Nikolaj Coster-Waldau and find if he has worked with the Fox Broadcasting Company.
Action 1: Search[Nikolaj Coster-Waldau]
Observation 1: Nikolaj William Coster-Waldau (born 27 July 1970) is a Danish actor and producer. He graduated from the Danish National School of Performing Arts in Copenhagen in 1993,[1] and had his breakthrough role in Denmark with the film Nightwatch (1994). He played Jaime Lannister in the HBO fantasy drama series Game of Thrones, for which he received two Primetime Emmy Award nominations for Outstanding Supporting Actor in a Drama Series.. Coster-Waldau has appeared in numerous films in his native Denmark and Scandinavia, including Headhunters (2011) and A Thousand Times Good Night (2013). In the U.S, his debut film role was in the war film Black Hawk Down (2001), playing Medal of Honor recipient Gary Gordon.[2] He then played a detective in the short-lived Fox television series New Amsterdam (2008), and appeared in the 2009 Fox television film Virtuality, originally intended as a pilot.
Thought 2: Because he "appeared in the 2009 Fox television film Virtuality", he should have worked with the Fox Broadcasting Company. So, the claim is indeed supported by the evidence.
Action 2: Finish[done]

Claim: Stranger Things is set in Bloomington, Indiana.
Label: REFUTES
Thought 1: I should search for Stranger Things, and see if it is set in Bloomington, Indiana.
Action 1: Search[Stranger Things]
Observation 1: Stranger Things is an American science fiction horror drama television series created by the Duffer Brothers. Set in the 1980s, primarily in the fictional town of Hawkins, Indiana, the series centers on a number of mysteries and supernatural events occurring around the town and their impact on an ensemble of child and adult characters. 
Thought 2: The observation says that it is set in a "fictional town of Hawkins, Indiana", so it is not set in Bloomington. So, the claim is indeed refuted by the evidence.
Action 2: Finish[done]

Claim: Beautiful reached number two on the Billboard Hot 100 in 2003.?
Label: NOT ENOUGH INFO
Thought 1: I need to search the song Beautiful and find if it reached number two on the Billboard Hot 100 in 2003.
Action 1: Search[Beautiful]
Observation 1: Could not find [Beautiful]. Similar: ['Beautiful', 'Beautiful, Beautiful', 'A Beautiful Mind (film)', 'Beautiful (Christina Aguilera song)', 'Life Is Beautiful'].
Thought 2: From suggestions, I should search "Beautiful (Christina Aguilera song)" to find the song.
Action 2: Search[Beautiful (Christina Aguilera song)]
Observation 2: "Beautiful" is a song recorded by American singer Christina Aguilera for her fourth studio album, Stripped (2002).
Thought 3: It does not mention Billboard, so I need to look up "Billboard Hot 100" to find if it reached number two on it in 2003.
Action 3: Lookup[Billboard Hot 100]
Observation 3: (Result 1 / 3) The song peaked at number two on the Billboard Hot 100 in the United States, where it was certified Gold for 500,000 units shipped.
Thought 4: It only says the song peaked at number two on the Billboard Hot 100, but not if it was in 2003. I am not sure if this claim is true or not. So, the evidence is indeed not sufficient to verify the claim.
Action 4: Finish[done]
"""

In [None]:
from datasets import load_dataset
# Load the "fever" dataset ("v1.0" config) and materialize its 'train' entry
# as a DataFrame; webthink() looks up each claim's label in df_train.
d_train = load_dataset("fever", "v1.0") # indexed by split name below; only 'train' is used
df_train = pd.DataFrame(d_train['train'])

In [None]:
def webthink(idx=None, prompt=webthink_prompt, to_print=False, max_steps=7):
    """Run one Thought/Action/Observation episode for a claim, with its label given up front.

    The claim returned by ``env.reset`` and its label (looked up in ``df_train``)
    are appended to ``prompt``. The model is then asked for up to ``max_steps``
    thought/action pairs; each action is executed in ``env`` and the resulting
    observation is appended to the prompt before the next step.

    Args:
        idx: example index forwarded to ``env.reset(idx=idx)``.
        prompt: few-shot prefix the episode text is appended to.
        to_print: if true, print the question, each step, and the final info.
        max_steps: maximum number of thought/action steps (default 7).

    Returns:
        ``(r, info)`` from the last ``step`` call, where ``info`` has been
        extended with ``n_calls`` (LLM calls made), ``n_badcalls`` (steps whose
        reply could not be split into thought and action) and ``traj`` (the
        full prompt including every recorded step).

    Raises:
        IndexError: if no row of ``df_train`` has a claim equal to the question text.
    """
    question = env.reset(idx=idx)
    # question[7:] drops the first 7 characters — presumably the "Claim: "
    # prefix added by the env wrapper; TODO confirm against wrappers.FeverWrapper.
    claim_text = question[7:]
    matching_labels = df_train.loc[df_train['claim'] == claim_text, 'label'].values
    if len(matching_labels) == 0:
        raise IndexError(f"no label found in df_train for claim {claim_text!r}")
    label = matching_labels[0]

    if to_print:
        print(idx, question)

    prompt += question + "\n"
    prompt += "Label: " + label + "\n"

    n_calls, n_badcalls = 0, 0
    done = False  # bound up front so the post-loop check is valid even if no step runs
    for i in range(1, max_steps + 1):
        n_calls += 1
        thought_action = llm(prompt + f"Thought {i}:", stop=[f"\nObservation {i}:"])
        try:
            thought, action = thought_action.strip().split(f"\nAction {i}: ")
        except ValueError:
            # The reply did not split into exactly (thought, action): keep its
            # first line as the thought and ask again for the action alone.
            n_badcalls += 1
            n_calls += 1
            thought = thought_action.strip().split('\n')[0]
            action = llm(prompt + f"Thought {i}: {thought}\nAction {i}:", stop=[f"\n"]).strip()
        # Lower-case only the first character of the action before executing
        # it; slicing (not indexing) keeps an empty reply from raising.
        obs, r, done, info = step(env, action[:1].lower() + action[1:])
        # Removes literal backslash-n character pairs, not real newline characters.
        obs = obs.replace('\\n', '')
        step_str = f"Thought {i}: {thought}\nAction {i}: {action}\nObservation {i}: {obs}\n"
        prompt += step_str
        if to_print:
            print(step_str)
        if done:
            break
    if not done:
        # Step budget exhausted: force the episode to end with an empty finish.
        # This forced step is not appended to the recorded trajectory.
        obs, r, done, info = step(env, "finish[]")
    if to_print:
        print(info, '\n')
    info.update({'n_calls': n_calls, 'n_badcalls': n_badcalls, 'traj': prompt})
    return r, info

In [None]:
# Load the previously generated chains and collect their question_idx values
# so the processing loop can skip examples that already have a trajectory.
df_react_reasoning_chains = pd.read_csv('data/react/df_react_chains_1381_forced_gt.csv')
seen = df_react_reasoning_chains['question_idx'].values

For 1500 examples, this loop may take about a day. It may also need manual intervention now and then (every few minutes) to make sure it doesn't get stuck. You can tell it is stuck if, after you clear the output, no new output appears for 20 seconds or so.

In [None]:
# Candidate example indices in a fixed pseudo-random order.
# random.shuffle() works in place and returns None, and a range object cannot
# be shuffled, so build a concrete list first and shuffle that.
idxs = list(range(3000))
random.Random(233).shuffle(idxs) # use this seed
num_to_process = 1500 # message Dean on slack how many you choose to process
num_processed = 0
infos = [] # holds our trajectories (and other info from react process)

for i in tqdm(idxs):
    # Skip examples that already have a trajectory from an earlier run.
    if i in seen:
        continue

    try:
        r, info = webthink(i, prompt=webthink_prompt)
    except Exception as err:
        # Best-effort: a failing example is reported and skipped. Catching
        # Exception (not a bare except) lets KeyboardInterrupt stop a stuck run.
        tqdm.write(f"skipping example {i}: {err!r}")
        continue

    infos.append(info)

    num_processed += 1
    if num_processed >= num_to_process:
        break

Now save results to a new dataframe

In [None]:
# This code is for the new react prompt template (forcing it to give reasoning for ground truth)
# Unpack each collected info dict into four parallel lists (one entry per example).
labels = []
claims = []
trajectories = []
question_idxs = []
for record in infos:
    # 'traj' holds the few-shot prompt plus this example's steps; keep only
    # the text after the last "Claim: " marker, i.e. the current example.
    example_text = record['traj'].split("Claim: ")[-1]
    example_lines = example_text.split('\n')

    question_idxs.append(record['question_idx'])
    # The first line after the marker is the claim text itself.
    claims.append(example_lines[0])
    labels.append(record['gt_answer'])
    # Drop the last four lines of the example — presumably the final
    # Thought/Action/Observation plus the trailing empty line; TODO confirm
    # this is still right when an observation spans several lines.
    trajectories.append("\n".join(example_lines[:-4]))

In [None]:
import pandas as pd
# Assemble the parallel result lists into one frame, one row per processed example.
# NOTE: this rebinds df_react_reasoning_chains, replacing the frame that was
# loaded from the earlier CSV under the same name.
df_react_reasoning_chains = pd.DataFrame({
    'question_idx': question_idxs,
    'claim': claims,
    'label': labels,
    #'evidence': evidences,
    #'assertions': assertions
    'trajectories': trajectories
})

In [None]:
# Persist the new chains; the DataFrame index is written too (to_csv default).
df_react_reasoning_chains.to_csv('data/react/df_react_chains_next_1500_forced_gt.csv')