# Contents

1. [Loading the FEVER dataset](#Loading-the-FEVER-dataset)
1. [Using an internal company LLM API (Falcon-40B) to get assumptions](#Falcon-40B-Offering-to-get-assumptions)
1. [Using ReAct to get reasoning chains](#Using-ReAct-to-get-reasoning-chains)
1. [Using Open AI API for reasoning chains](#Using-Open-AI-API)
    1. [Prompt version 1](#Prompt-version-1)
    1. [Prompt version 2](#Prompt-version-2)

In [3]:
import json
with open('secrets.json') as f:
    secrets = json.load(f)

In [2]:
import pickle
import pandas as pd
from tqdm import tqdm

## Loading the FEVER dataset

In [4]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
d_train = load_dataset("fever", "v1.0") # training set

In [None]:
d_val = load_dataset("fever", "v2.0") # validation set

In [None]:
example_claim = d_train['train'][0]

In [57]:
example_claim

{'id': 75397,
 'label': 'SUPPORTS',
 'claim': 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.',
 'evidence_annotation_id': 92206,
 'evidence_id': 104971,
 'evidence_wiki_url': 'Nikolaj_Coster-Waldau',
 'evidence_sentence_id': 7}

In [None]:
ex_with_evidence = d_train['train'][1059]

In [None]:
d_wiki = load_dataset("fever", "wiki_pages") # wikipedia evidence
d_wikipages = d_wiki['wikipedia_pages']

In [59]:
d_wikipages[104971]

{'id': '1974_in_Cambodia',
 'text': 'The following lists events that happened during 1974 in Cambodia . ',
 'lines': '0\tThe following lists events that happened during 1974 in Cambodia .\t1974\t1974\tCambodia\tCambodia\n1\t'}

In [None]:
wiki_text = {val: text for val, text in zip(d_wikipages['id'], d_wikipages['text'])}

In [None]:
with open('data/fever/wiki_text.pkl', 'wb') as f:
    pickle.dump(wiki_text, f)

In [17]:
with open('data/fever/wiki_text.pkl', 'rb') as f:
    wiki_text = pickle.load(f)

In [18]:
import re
bad_pattern_1 = re.compile(r'\s*-RCB-\s*')
bad_css = ['align = ', 'width = ', 'colspan = ', 'style = ', 'bgcolor = ']
cleaned_wiki_text = {}
#Removing any CSS properties or unneeded phrases/characters from the wikipedia evidence (so we have less chance of exceeding the context window)
m = 0
for v, t in wiki_text.items():
    l = len(t)

    temp = ''
    for token in t.split('|'):
        token = re.sub(bad_pattern_1, ' ', token)
        if token == ' ' or token == ' -  ' or any(pat in token for pat in bad_css):
            continue

        temp += token
    t = temp
    cleaned_wiki_text[v] = t
    
    if l > m:
        m = l
        x = t

In [19]:
wiki_text = cleaned_wiki_text

Load the test set

In [12]:
d_train['paper_test'][0]

{'id': 113501,
 'label': 'NOT ENOUGH INFO',
 'claim': 'Grease had bad reviews.',
 'evidence_annotation_id': 133128,
 'evidence_id': -1,
 'evidence_wiki_url': '',
 'evidence_sentence_id': -1}

In [31]:
claims = []
evidences = []
labels = []

In [32]:
prev_claim = ''
prev_l = ''
curr_wiki_set = set()
for i in range(len(d_train['paper_test'])):
    example = d_train['paper_test'][i]
    c, e, l = example['claim'], example['evidence_wiki_url'], example['label']
    if e in wiki_text:
        evidence = wiki_text[e]

    if c == prev_claim:
        curr_wiki_set.add(evidence)
    else:
        if len(curr_wiki_set) > 0:
            claims.append(prev_claim)
            evidences.append('\n'.join(ev for ev in list(curr_wiki_set)))
            labels.append(prev_l)
        curr_wiki_set = set([evidence])

    prev_claim = c
    prev_l = l

In [33]:
df_test_set = pd.DataFrame({
    'claim': claims,
    'evidence': evidences,
    'label': labels
})

In [40]:
df_test_set['label'].value_counts()

label
NOT ENOUGH INFO    3333
SUPPORTS           3333
REFUTES            3332
Name: count, dtype: int64

In [37]:
df_test_set.to_csv('data/test.csv')

## Falcon 40B Offering to get assumptions

In [None]:
import requests
uri, api_key = secrets["DELL-LLM-URI"], secrets["DELL-LLM-API-KEY"]
model_name = "falcon-40b-instruct"
def llm_api(prompt):
    url = uri + model_name
    payload = {
        "instruction": prompt
    }
    headers = {
        "accept": "application/json",
        "api-key": api_key,
        "Content-Type": "application/json"
    }

    try:
        response = requests.post(url, headers=headers, json=payload, verify=False)
        response.raise_for_status()
        response = response.json()
        generated_text = response['response'][0]['generated_text']
        return generated_text
    except Exception as err:
        if response is not None:
            print("Error code:", response.status_code)
            print("Error message:", response.text)
        else:
            print("Error:", err)
        return None

In [None]:
prompt = f"""
Here is a statement:
{ex_with_evidence['claim']}
Make a numbered list of the assumptions you made when producing the above statement.
"""
res = llm_api(prompt)

In [None]:
evidence = [wiki_text[url] for url in ex_with_evidence['evidence_wiki_url'] if url in wiki_text]

In [None]:
prompt = f"""
Here is a numbered list of assertions:
{res}
Here is an evidence passage:
{evidence}
For each assertion, determine whether it is SUPPORTED, REFUTED, or NOT ENOUGH INFO based on the evidence given. Give back a numbered list, each with 1 of these labels, where each number corresponds to its corresponding assertion. Also, give back a final answer based on a majority vote
"""
res2 = llm_api(prompt)

In [110]:
print(res2)

1. The movie is called Sully
2. Sully is a real person
3. Tom Hanks is an actor
4. Tom Hanks has played real people before
5. Tom Hanks played Sully in a movie called Sully.


## Using ReAct to get reasoning chains

Seeing what the data looks like from ReAct paper

In [104]:
import json
with open('data/react/fever.json', 'r') as f:
    x = json.load(f)

In [3]:
x.keys()

dict_keys(['webact_simple3', 'cotqa_simple3', 'webqa_simple3', 'webthink_simple3'])

In [None]:
print(x['webact_simple3'])

In [None]:
print(x['cotqa_simple3'])

In [None]:
print(x['webqa_simple3'])

In [None]:
print(x['webthink_simple3'])

Follow the same ReAct code from the original source (just using GPT-3.5 turbo instead now)

In [112]:
import os
import openai
openai.api_key = secrets["OPEN-AI-KEY"]

import wikienv, wrappers
import json
import sys
import random
import time

In [113]:
def llm(prompt, stop=["\n"]):
    res = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        stop=stop
    )

    res = res['choices'][0]['message']['content']
    return res

In [None]:
!pip install gym beautifulsoup4

In [114]:
env = wikienv.WikiEnv()
env = wrappers.FeverWrapper(env, split="train")
env = wrappers.LoggingWrapper(env)

def step(env, action):
    attempts = 0
    while attempts < 10:
        try:
            return env.step(action)
        except:
            attempts += 1

In [126]:
folder = 'prompts/'
prompt_file = 'fever.json'
with open('data/react/' + folder + prompt_file, 'r') as f:
    prompt_dict = json.load(f)

webthink_prompt = prompt_dict['webthink_simple3']

In [None]:
print(webthink_prompt)

In [159]:
webthink_prompt = """
Reason about why the claim is supported, refuted, or has not enough information based on Wikipedia evidences.

Claim: Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.
Label: SUPPORTS
Thought 1: I need to search Nikolaj Coster-Waldau and find if he has worked with the Fox Broadcasting Company.
Action 1: Search[Nikolaj Coster-Waldau]
Observation 1: Nikolaj William Coster-Waldau (born 27 July 1970) is a Danish actor and producer. He graduated from the Danish National School of Performing Arts in Copenhagen in 1993,[1] and had his breakthrough role in Denmark with the film Nightwatch (1994). He played Jaime Lannister in the HBO fantasy drama series Game of Thrones, for which he received two Primetime Emmy Award nominations for Outstanding Supporting Actor in a Drama Series.. Coster-Waldau has appeared in numerous films in his native Denmark and Scandinavia, including Headhunters (2011) and A Thousand Times Good Night (2013). In the U.S, his debut film role was in the war film Black Hawk Down (2001), playing Medal of Honor recipient Gary Gordon.[2] He then played a detective in the short-lived Fox television series New Amsterdam (2008), and appeared in the 2009 Fox television film Virtuality, originally intended as a pilot.
Thought 2: Because he "appeared in the 2009 Fox television film Virtuality", he should have worked with the Fox Broadcasting Company. So, the claim is indeed supported by the evidence.
Action 2: Finish[done]

Claim: Stranger Things is set in Bloomington, Indiana.
Label: REFUTES
Thought 1: I should search for Stranger Things, and see if it is set in Bloomington, Indiana.
Action 1: Search[Stranger Things]
Observation 1: Stranger Things is an American science fiction horror drama television series created by the Duffer Brothers. Set in the 1980s, primarily in the fictional town of Hawkins, Indiana, the series centers on a number of mysteries and supernatural events occurring around the town and their impact on an ensemble of child and adult characters. 
Thought 2: The observation says that it is set in a "fictional town of Hawkins, Indiana", so it is not set in Bloomington. So, the claim is indeed refuted by the evidence.
Action 2: Finish[done]

Claim: Beautiful reached number two on the Billboard Hot 100 in 2003.?
Label: NOT ENOUGH INFO
Thought 1: I need to search the song Beautiful and find if it reached number two on the Billboard Hot 100 in 2003.
Action 1: Search[Beautiful]
Observation 1: Could not find [Beautiful]. Similar: ['Beautiful', 'Beautiful, Beautiful', 'A Beautiful Mind (film)', 'Beautiful (Christina Aguilera song)', 'Life Is Beautiful'].
Thought 2: From suggestions, I should search "Beautiful (Christina Aguilera song)" to find the song.
Action 2: Search[Beautiful (Christina Aguilera song)]
Observation 2: "Beautiful" is a song recorded by American singer Christina Aguilera for her fourth studio album, Stripped (2002).
Thought 3: It does not mention Billboard, so I need to look up "Billboard Hot 100" to find if it reached number two on it in 2003.
Action 3: Lookup[Billboard Hot 100]
Observation 3: (Result 1 / 3) The song peaked at number two on the Billboard Hot 100 in the United States, where it was certified Gold for 500,000 units shipped.
Thought 4: It only says the song peaked at number two on the Billboard Hot 100, but not if it was in 2003. I am not sure if this claim is true or not. So, the evidence is indeed not sufficient to verify the claim.
Action 4: Finish[done]
"""

In [133]:
d_train['train'][350]

{'id': 65112,
 'label': 'SUPPORTS',
 'claim': 'Mark Zuckerberg co-founded a website.',
 'evidence_annotation_id': 81592,
 'evidence_id': 93354,
 'evidence_wiki_url': 'Facebook',
 'evidence_sentence_id': 1}

In [135]:
df_train = pd.DataFrame(d_train['train'])

In [164]:
def webthink(idx=None, prompt=webthink_prompt, to_print=False):
    question = env.reset(idx=idx)
    label = df_train[df_train['claim'] == question[7:]]['label'].values[0]

    if to_print:
        print(idx, question)
    
    prompt += question + "\n"
    prompt += "Label: " + label + "\n"
    #print(prompt)

    n_calls, n_badcalls = 0, 0
    for i in range(1, 8):
        n_calls += 1
        thought_action = llm(prompt + f"Thought {i}:", stop=[f"\nObservation {i}:"])
        try:
            thought, action = thought_action.strip().split(f"\nAction {i}: ")
        except:
            #print('ohh...', thought_action)
            n_badcalls += 1
            n_calls += 1
            thought = thought_action.strip().split('\n')[0]
            action = llm(prompt + f"Thought {i}: {thought}\nAction {i}:", stop=[f"\n"]).strip()
        obs, r, done, info = step(env, action[0].lower() + action[1:])
        obs = obs.replace('\\n', '')
        step_str = f"Thought {i}: {thought}\nAction {i}: {action}\nObservation {i}: {obs}\n"
        prompt += step_str
        if to_print:
            print(step_str)
        if done:
            break
    if not done:
        obs, r, done, info = step(env, "finish[]")
    if to_print:
        print(info, '\n')
    info.update({'n_calls': n_calls, 'n_badcalls': n_badcalls, 'traj': prompt})
    return r, info

In [170]:
idxs = list(range(3000))
random.Random(233).shuffle(idxs)
rs = []
infos = []
old_time = time.time()

In [119]:
with open('data/react/react_output_first_1564_seed_233.pkl', 'rb') as f:
    so_far = pickle.load(f)

In [208]:
seen = set([x['question_idx'] for x in infos])

Generate examples

In [None]:
#prog = tqdm(initial=n, total=len(idxs))
prev = len(infos)
n = 0
for i in tqdm(idxs):
    if i in seen:
        continue
    
    try:
        r, info = webthink(i, prompt=webthink_prompt)
    except:
        continue
    rs.append(info['em'])
    infos.append(info)
    #print(i)
    #infos2.append(info)

    # if info['answer'] != info['gt_answer']:
    #     print("answers unequal at", i)
    
    # if n % 5 == 0:
    #     time.sleep(25)

    #prog.update(1)
    assert len(infos) == prev + 1
    prev = len(infos)
    n += 1

In [None]:
infos

In [175]:
with open('data/react_output_first_1381.pkl', 'wb') as f:
    pickle.dump(so_far + infos, f)

In [19]:
with open('data/react_output_first_1564_seed_233.pkl', 'wb') as f:
    pickle.dump(so_far + infos, f)

In [115]:
with open('data/react_output_first_1077_seed_233.pkl', 'rb') as f:
    x = pickle.load(f)

In [54]:
# This code is for the original react prompt template
labels = []
claims = []
assertions = []
evidences = []
question_idxs = []
for i in infos:
    qi = i['question_idx']
    a, ga = i['answer'], i['gt_answer']
    traj = i['traj']
    assertion_num = 1

    # Only process react output if it aligns with ground truth
    if a == ga:
        evidence = ""
        claim = ""
        assertion_str = ""
        start = traj.rfind("Claim:")
        end = traj.rfind("Finish[")
        steps = traj[start:end].split('\n')
        for s in steps:
            if s.startswith("Observation"):
                evidence += s[15:]
            elif s.startswith("Action"):
                continue
            elif s.startswith("Claim"):
                claim = s[7:]
            elif s.startswith("Thought"):
                thought = s[11:]
                if thought.startswith("I"):
                    continue
                assertion_str += f"{assertion_num}. {thought} \n"
                assertion_num += 1

        labels.append(a)
        claims.append(claim)
        assertions.append(assertion_str)
        question_idxs.append(qi)
        evidences.append(evidence)

In [210]:
# This code is for the new react prompt template (forcing it to give reasoning for ground truth)
labels = []
claims = []
trajectories = []
question_idxs = []
n = 0
for i in infos:
    qi = i['question_idx']
    a, ga = i['answer'], i['gt_answer']
    traj = i['traj']

    last_claim_chain = traj.split("Claim: ")[-1]
    lines = last_claim_chain.split('\n')
    claim = lines[0]
    lines[1] = "Label: " + lines[1]
    trajectories.append("\n".join(lines[:-4]))

    labels.append(ga)
    claims.append(claim)
    question_idxs.append(qi)

In [211]:
import pandas as pd
df_react_reasoning_chains = pd.DataFrame({
    'question_idx': question_idxs,
    'claim': claims,
    'label': labels,
    #'evidence': evidences,
    #'assertions': assertions
    'trajectories': trajectories
})

In [212]:
df_react_reasoning_chains.to_csv('data/react/df_react_chains_1381_forced_gt.csv')

## Using Open AI API

In [99]:
import os
import openai
openai.api_key = secrets["OPEN-AI-KEY"]

In [100]:
def llm(prompt):
    res = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}]
    )

    res = res['choices'][0]['message']['content']
    return res

### Prompt version 1

In [17]:
prompt = f"""
Here is a claim:
{ex_with_evidence['claim']}
Make a numbered list of the assumptions made when producing the above claim.
"""

In [20]:
res = llm(prompt)

In [25]:
print(res['choices'][0]['message']['content'])

1. The claim assumes that "Sully" is a title of a movie.
2. It assumes that Tom Hanks is an actor who has appeared in movies.
3. It assumes that Tom Hanks has appeared in the movie titled "Sully."
4. It assumes that there is no other movie titled "Sully" that does not feature Tom Hanks.
5. It assumes that the claim refers to a specific movie and not a book, TV show, or any other form of media.


In [31]:
evidence = [wiki_text[url] for url in ex_with_evidence['evidence_wiki_url'] if url in wiki_text]

In [37]:
prompt = f"""
Here is a numbered list of assertions:
{res}
Here is an evidence passage:
{evidence}
For each assertion, determine whether it is SUPPORTED, REFUTED, or NOT ENOUGH INFO based on the evidence given. Give back a numbered list, each with 1 of these labels, where each number corresponds to its corresponding assertion. Also, give back a final answer based on a majority vote
"""

In [38]:
res2 = llm(prompt)

In [40]:
print(res2['choices'][0]['message']['content'])

The assertions given are:

1. The claim assumes that "Sully" is a title of a movie.
2. It assumes that Tom Hanks is an actor who has appeared in movies.
3. It assumes that Tom Hanks has appeared in the movie titled "Sully."
4. It assumes that there is no other movie titled "Sully" that does not feature Tom Hanks.
5. It assumes that the claim refers to a specific movie and not a book, TV show, or any other form of media.

Based on the evidence passage provided, we can determine the following:

1. SUPPORTED - The evidence passage mentions that "Sully" is a 2016 American biographical drama film.
2. SUPPORTED - The evidence passage mentions that Tom Hanks stars in the film.
3. SUPPORTED - The evidence passage mentions that Tom Hanks plays the role of Sullenberger in the film.
4. SUPPORTED - There is no evidence provided that suggests the existence of another movie titled "Sully" that does not feature Tom Hanks.
5. SUPPORTED - The evidence passage specifically refers to the movie "Sully" an

### Prompt version 2

This version involves providing the ground truth label and asking the model to generate reasoning, based on the evidence, for why the ground truth label is what it is

In [66]:
def get_prompt_ex(claim, evidence, label):
    prompt = f"""
    Given the following claim, evidence, and label indicating whether the
    the claim is supported, refuted, or not having enough information.
    Derive a bullet list of up to 8 assertions verifying the label for the claim based
    on the given evidence. Please provide up to three lists of assertions.

    Claim - {claim}

    Evidence - {evidence}

    Label - {label}
    """

    return prompt

In [67]:
# Make sure we are targeting the same sample of data used by ReAct
import random
idxs = list(range(3000))
random.Random(233).shuffle(idxs)
react_generated_examples = pd.read_csv('data/react/df_react_chains_826.csv')

In [69]:
env = wikienv.WikiEnv()
env = wrappers.FeverWrapper(env, split="train")
env = wrappers.LoggingWrapper(env)

In [85]:
with open('data/fever/claim_evidence.pkl', 'rb') as f:
    claim_evidence = pickle.load(f)
with open('data/fever/claim_labels.pkl', 'rb') as f:
    claim_labels = pickle.load(f)

In [80]:
folder = 'prompts/'
prompt_file = 'fever.json'
with open('data/react/' + folder + prompt_file, 'r') as f:
    prompt_dict = json.load(f)

webthink_prompt = prompt_dict['webthink_simple3']

In [None]:
labels = []
claims = []
assertions = []
evidences = []
question_idxs = []
for i in tqdm(idxs):
    claim = env.reset(idx=i)
    claim = claim[7:]
    if claim in react_generated_examples['claim'].values:
        #print("sadasd")
        continue
    
    if claim in claim_evidence:
        evidence = claim_evidence[claim]
        label = claim_labels[claim]

        prompt = get_prompt_ex(claim, evidence, label)
        response = llm(prompt)

        print(response)
        break

        assertion_str = ...
        
        labels.append(label)
        claims.append(claim)
        assertions.append(assertion_str)
        question_idxs.append(i)
        evidences.append(evidence)