# Setup

In [44]:
import os
import transformers 
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import torch
import re
from transformers import StoppingCriteria, StoppingCriteriaList

In [2]:
# load model : llama2
model_path = "/share/nikola/llama2/llama-2-7b-chat-hf-converted"
cache_dir = "/home/cw862/KG/llama_cache"
tokenizer = AutoTokenizer.from_pretrained(model_path,cache_dir=cache_dir)
model = transformers.AutoModelForCausalLM.from_pretrained(model_path,load_in_8bit=True)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 2/2 [02:00<00:00, 60.16s/it]


In [120]:
stop_words="\nObservation 1:"
stop_word_ids = tokenizer.encode(stop_words,add_prefix_space=False)

TypeError: _batch_encode_plus() got an unexpected keyword argument 'add_prefix_space'

In [99]:
class StoppingCriteriaSub(StoppingCriteria):

    def __init__(self, stops = [], encounters=1):
        super().__init__()
        self.stops = [stop.to("cuda") for stop in stops]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        for stop in self.stops:
            if torch.all((stop == input_ids[0][-len(stop):])).item():
                return True

        return False

In [100]:
def llm(prompt, stopping_criteria, max_new_tokens=400):
    str_len = len(prompt)
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    import pdb; pdb.set_trace()
    outputs = model.generate(
        **inputs, 
        max_length=2048,
        do_sample=True,
        temperature=0.1,
        stopping_criteria=stopping_criteria
    )
    
    text_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # print('text output:', text_output)
    response = text_output[str_len:]
    # print('raw text output:\n', text_output)
    
    # hack for the stop functionality in the openai API
    if len(stop) > 0:
        stop_pattern = "|".join([f"({s})" for s in stop])
        match = re.search(stop_pattern, response)
        # print('match:', match)
        if match:
            response = response[:match.start()]
    return response

In [82]:
import wikienv, wrappers
env = wikienv.WikiEnv()
env = wrappers.HotPotQAWrapper(env, split="dev")
env = wrappers.LoggingWrapper(env)

def step(env, action):
    attempts = 0
    while attempts < 10:
        try:
            return env.step(action)
        except requests.exceptions.Timeout:
            attempts += 1

# ReAct

In [117]:
import json
import sys

folder = './prompts/'
prompt_file = 'prompts_naive.json'
with open(folder + prompt_file, 'r') as f:
    prompt_dict = json.load(f)

webthink_examples = prompt_dict['webthink_simple6']
instruction = """Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types: 
(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
(3) Finish[answer], which returns the answer and finishes the task.
Here are some examples.
"""
webthink_prompt = instruction + webthink_examples

def webthink(idx=None, prompt=webthink_prompt, to_print=True):
    question = env.reset(idx=idx)
    if to_print:
        print(idx, question)
    prompt += question + "\n"
    n_calls, n_badcalls = 0, 0
    for i in range(1, 8):
        n_calls += 1
        
        # stopping_criteria for generation
        
        stop_words=[f"\nObservation {i}:"]
        stop_word_ids = [tokenizer.encode(stop_word,add_prefix_space=False)['input_ids'].squeeze() for stop_word in stop_words]
        stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub(stops=stop_word_ids)])
        
        thought_action = llm(prompt + f"Thought {i}:", stopping_criteria=stopping_criteria)
        try:
            thought, action = thought_action.strip().split(f"\nAction {i}: ")
        except:
            print('ohh...', thought_action)
            n_badcalls += 1
            n_calls += 1
            thought = thought_action.strip().split('\n')[0]
            action = llm(prompt + f"Thought {i}: {thought}\nAction {i}:", stop=[f"\n"]).strip()
        obs, r, done, info = step(env, action[0].lower() + action[1:])
        obs = obs.replace('\\n', '')
        step_str = f"Thought {i}: {thought}\nAction {i}: {action}\nObservation {i}: {obs}\n"
        prompt += step_str
        if to_print:
            print(step_str)
        if done:
            break
    if not done:
        obs, r, done, info = step(env, "finish[]")
    if to_print:
        print(info, '\n')
    info.update({'n_calls': n_calls, 'n_badcalls': n_badcalls, 'traj': prompt})
    return r, info

In [118]:
import random
import time
idxs = list(range(7405))
random.Random(233).shuffle(idxs)

rs = []
infos = []
old_time = time.time()
for i in idxs[:500]:
    r, info = webthink(i, to_print=True)
    rs.append(info['em'])
    infos.append(info)
    print(sum(rs), len(rs), sum(rs) / len(rs), (time.time() - old_time) / len(rs))
    print('-----------')
    print()

3687 Question: What movie did actress Irene Jacob complete before the American action crime thriller film directed by Stuart Bird?


TypeError: _batch_encode_plus() got an unexpected keyword argument 'add_prefix_space'