# Setup

In [44]:
import os
import transformers 
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import torch
import re
from transformers import StoppingCriteria, StoppingCriteriaList

In [2]:
# load model : llama2
model_path = "/share/nikola/llama2/llama-2-7b-chat-hf-converted"
cache_dir = "/home/cw862/KG/llama_cache"
tokenizer = AutoTokenizer.from_pretrained(model_path,cache_dir=cache_dir)
model = transformers.AutoModelForCausalLM.from_pretrained(model_path,load_in_8bit=True)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 2/2 [02:00<00:00, 60.16s/it]


In [99]:
class StoppingCriteriaSub(StoppingCriteria):

    def __init__(self, stops = [], encounters=1):
        super().__init__()
        self.stops = [stop.to("cuda") for stop in stops]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        for stop in self.stops:
            if torch.all((stop == input_ids[0][-len(stop):])).item():
                return True

        return False

In [100]:
def llm(prompt, stopping_criteria, max_new_tokens=400):
    str_len = len(prompt)
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    import pdb; pdb.set_trace()
    outputs = model.generate(
        **inputs, 
        max_length=2048,
        do_sample=True,
        temperature=0.1,
        stopping_criteria=stopping_criteria
    )
    
    text_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # print('text output:', text_output)
    response = text_output[str_len:]
    # print('raw text output:\n', text_output)
    
    # hack for the stop functionality in the openai API
    if len(stop) > 0:
        stop_pattern = "|".join([f"({s})" for s in stop])
        match = re.search(stop_pattern, response)
        # print('match:', match)
        if match:
            response = response[:match.start()]
    return response

In [82]:
import wikienv, wrappers
env = wikienv.WikiEnv()
env = wrappers.HotPotQAWrapper(env, split="dev")
env = wrappers.LoggingWrapper(env)

def step(env, action):
    attempts = 0
    while attempts < 10:
        try:
            return env.step(action)
        except requests.exceptions.Timeout:
            attempts += 1

# ReAct

In [115]:
import json
import sys

folder = './prompts/'
prompt_file = 'prompts_naive.json'
with open(folder + prompt_file, 'r') as f:
    prompt_dict = json.load(f)

webthink_examples = prompt_dict['webthink_simple6']
instruction = """Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types: 
(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
(3) Finish[answer], which returns the answer and finishes the task.
Here are some examples.
"""
webthink_prompt = instruction + webthink_examples

def webthink(idx=None, prompt=webthink_prompt, to_print=True):
    question = env.reset(idx=idx)
    if to_print:
        print(idx, question)
    prompt += question + "\n"
    n_calls, n_badcalls = 0, 0
    for i in range(1, 8):
        n_calls += 1
        
        # stopping_criteria for generation
        
        stop_words=[f"\nObservation {i}:"]
        stop_word_ids = [tokenizer(stop_word,return_tensors="pt")['input_ids'].squeeze() for stop_word in stop_words]
        stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub(stops=stop_word_ids)])
        
        thought_action = llm(prompt + f"Thought {i}:", stopping_criteria=stopping_criteria)
        try:
            thought, action = thought_action.strip().split(f"\nAction {i}: ")
        except:
            print('ohh...', thought_action)
            n_badcalls += 1
            n_calls += 1
            thought = thought_action.strip().split('\n')[0]
            action = llm(prompt + f"Thought {i}: {thought}\nAction {i}:", stop=[f"\n"]).strip()
        obs, r, done, info = step(env, action[0].lower() + action[1:])
        obs = obs.replace('\\n', '')
        step_str = f"Thought {i}: {thought}\nAction {i}: {action}\nObservation {i}: {obs}\n"
        prompt += step_str
        if to_print:
            print(step_str)
        if done:
            break
    if not done:
        obs, r, done, info = step(env, "finish[]")
    if to_print:
        print(info, '\n')
    info.update({'n_calls': n_calls, 'n_badcalls': n_badcalls, 'traj': prompt})
    return r, info

In [116]:
import random
import time
idxs = list(range(7405))
random.Random(233).shuffle(idxs)

rs = []
infos = []
old_time = time.time()
for i in idxs[:500]:
    r, info = webthink(i, to_print=True)
    rs.append(info['em'])
    infos.append(info)
    print(sum(rs), len(rs), sum(rs) / len(rs), (time.time() - old_time) / len(rs))
    print('-----------')
    print()

3687 Question: What movie did actress Irene Jacob complete before the American action crime thriller film directed by Stuart Bird?
> [0;32m/tmp/ipykernel_268772/1731617159.py[0m(6)[0;36mllm[0;34m()[0m
[0;32m      4 [0;31m[0;34m[0m[0m
[0m[0;32m      5 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 6 [0;31m    outputs = model.generate(
[0m[0;32m      7 [0;31m        [0;34m**[0m[0minputs[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      8 [0;31m        [0mmax_length[0m[0;34m=[0m[0;36m2048[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> n
> [0;32m/tmp/ipykernel_268772/1731617159.py[0m(7)[0;36mllm[0;34m()[0m
[0;32m      5 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      6 [0;31m    outputs = model.generate(
[0m[0;32m---->

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


> [0;32m/tmp/ipykernel_268772/1731617159.py[0m(14)[0;36mllm[0;34m()[0m
[0;32m     12 [0;31m    )
[0m[0;32m     13 [0;31m[0;34m[0m[0m
[0m[0;32m---> 14 [0;31m    [0mtext_output[0m [0;34m=[0m [0mtokenizer[0m[0;34m.[0m[0mdecode[0m[0;34m([0m[0moutputs[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0mskip_special_tokens[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     15 [0;31m    [0;31m# print('text output:', text_output)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     16 [0;31m    [0mresponse[0m [0;34m=[0m [0mtext_output[0m[0;34m[[0m[0mstr_len[0m[0;34m:[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> n
> [0;32m/tmp/ipykernel_268772/1731617159.py[0m(16)[0;36mllm[0;34m()[0m
[0;32m     14 [0;31m    [0mtext_output[0m [0;34m=[0m [0mtokenizer[0m[0;34m.[0m[0mdecode[0m[0;34m([0m[0moutputs[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0mskip_special_tokens[0m[0;34m=[0m[0;32mTrue

ipdb> print(outputs[0])
tensor([    1,  4956,   345,  ..., 29896, 29929, 29947], device='cuda:0')
ipdb> n
> [0;32m/tmp/ipykernel_268772/1731617159.py[0m(20)[0;36mllm[0;34m()[0m
[0;32m     18 [0;31m[0;34m[0m[0m
[0m[0;32m     19 [0;31m    [0;31m# hack for the stop functionality in the openai API[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 20 [0;31m    [0;32mif[0m [0mlen[0m[0;34m([0m[0mstop[0m[0;34m)[0m [0;34m>[0m [0;36m0[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     21 [0;31m        [0mstop_pattern[0m [0;34m=[0m [0;34m"|"[0m[0;34m.[0m[0mjoin[0m[0;34m([0m[0;34m[[0m[0;34mf"({s})"[0m [0;32mfor[0m [0ms[0m [0;32min[0m [0mstop[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     22 [0;31m        [0mmatch[0m [0;34m=[0m [0mre[0m[0;34m.[0m[0msearch[0m[0;34m([0m[0mstop_pattern[0m[0;34m,[0m [0mresponse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> n
> [0;32m/tmp/ipykernel_268772/1731617159.p

ipdb> n
> [0;32m/tmp/ipykernel_268772/2155143608.py[0m(43)[0;36mwebthink[0;34m()[0m
[0;32m     41 [0;31m            [0maction[0m [0;34m=[0m [0mllm[0m[0;34m([0m[0mprompt[0m [0;34m+[0m [0;34mf"Thought {i}: {thought}\nAction {i}:"[0m[0;34m,[0m [0mstop[0m[0;34m=[0m[0;34m[[0m[0;34mf"\n"[0m[0;34m][0m[0;34m)[0m[0;34m.[0m[0mstrip[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     42 [0;31m        [0mobs[0m[0;34m,[0m [0mr[0m[0;34m,[0m [0mdone[0m[0;34m,[0m [0minfo[0m [0;34m=[0m [0mstep[0m[0;34m([0m[0menv[0m[0;34m,[0m [0maction[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m.[0m[0mlower[0m[0;34m([0m[0;34m)[0m [0;34m+[0m [0maction[0m[0;34m[[0m[0;36m1[0m[0;34m:[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 43 [0;31m        [0mobs[0m [0;34m=[0m [0mobs[0m[0;34m.[0m[0mreplace[0m[0;34m([0m[0;34m'\\n'[0m[0;34m,[0m [0;34m''[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m


ipdb> n
> [0;32m/tmp/ipykernel_268772/2155143608.py[0m(47)[0;36mwebthink[0;34m()[0m
[0;32m     45 [0;31m        [0mprompt[0m [0;34m+=[0m [0mstep_str[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     46 [0;31m        [0;32mif[0m [0mto_print[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 47 [0;31m            [0mprint[0m[0;34m([0m[0mstep_str[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     48 [0;31m        [0;32mif[0m [0mdone[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     49 [0;31m            [0;32mbreak[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> 
Thought 1: I need to search Irene Jacob and the movie, find the movie she completed before the American action crime thriller film directed by Stuart Bird.
Action 1: Search[Irene Jacob]
Observation 1: Irene Jacob is a French-Swiss actress who has appeared in over 50 films and television shows.
Thought 2: I need to search the movie she completed before the American action crime thriller film di

ipdb> n
> [0;32m/tmp/ipykernel_268772/1731617159.py[0m(7)[0;36mllm[0;34m()[0m
[0;32m      5 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      6 [0;31m    outputs = model.generate(
[0m[0;32m----> 7 [0;31m        [0;34m**[0m[0minputs[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      8 [0;31m        [0mmax_length[0m[0;34m=[0m[0;36m2048[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      9 [0;31m        [0mdo_sample[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> n
> [0;32m/tmp/ipykernel_268772/1731617159.py[0m(6)[0;36mllm[0;34m()[0m
[0;32m      4 [0;31m[0;34m[0m[0m
[0m[0;32m      5 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 6 [0;31m    outputs = model.generate(
[0m[0;32m      7 [0

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


ValueError: Input length of input_ids is 2159, but `max_length` is set to 2048. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.
> [0;32m/tmp/ipykernel_268772/1731617159.py[0m(6)[0;36mllm[0;34m()[0m
[0;32m      4 [0;31m[0;34m[0m[0m
[0m[0;32m      5 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 6 [0;31m    outputs = model.generate(
[0m[0;32m      7 [0;31m        [0;34m**[0m[0minputs[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      8 [0;31m        [0mmax_length[0m[0;34m=[0m[0;36m2048[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m
--KeyboardInterrupt--

KeyboardInterrupt: Interrupted by user


ValueError: Input length of input_ids is 2159, but `max_length` is set to 2048. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.