In [1]:
from discussion_agents.cog.agent.reflexion import ReflexionReActAgent
from langchain_community.chat_models import ChatOpenAI
from typing import List, Dict, Optional

import dotenv
import os

dotenv.load_dotenv("../.env")
openai_api_key = os.getenv("OPENAI_API_KEY")
llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125", openai_api_key=openai_api_key)

In [2]:
import joblib
hotpot = joblib.load('../agent/hotpot-qa-distractor-sample.joblib').reset_index(drop=True)
hotpot.head()

Unnamed: 0,id,question,answer,type,level,supporting_facts,context
0,5a7613c15542994ccc9186bf,VIVA Media AG changed it's name in 2004. What ...,Gesellschaft mit beschränkter Haftung,bridge,hard,"{'title': ['VIVA Media', 'Gesellschaft mit bes...","{'title': ['Constantin Medien', 'VIVA Poland',..."
1,5adf2fa35542993344016c11,Which of Jonny Craig and Pete Doherty has been...,"Jonny"" Craig",comparison,hard,"{'title': ['Jonny Craig', 'Jonny Craig', 'Pete...","{'title': ['Pete Doherty', 'Relativity (Emaros..."
2,5adfdef9554299025d62a36b,Where was the first governor after the The Mis...,"Bath, Maine",bridge,hard,"{'title': ['Maine gubernatorial election, 1820...","{'title': ['Compromise of 1790', 'Anti-Nebrask..."
3,5a7180205542994082a3e856,"The creator of ""Wallace and Gromit"" also creat...",Creature Comforts,bridge,hard,"{'title': ['Creature Comforts', 'Creature Comf...","{'title': ['Creature Comforts', 'Tata Steel Zo..."
4,5a78bc6b554299148911f979,Woman's Era and Naj are what kind of magazines?,fortnightly women interest magazine,comparison,hard,"{'title': ['Woman's Era', 'Naj'], 'sent_id': [...","{'title': ['Lifestyle trends and media', 'Chin..."


In [3]:
def gather_experience(
    reflexion_react_agent: ReflexionReActAgent,
    questions: List[str],
    keys: List[str],
    strategy: Optional[str] = "reflexion",
) -> Dict[str, List]:
    experiences = {
        "idxs": [],
        "questions": [],
        "keys": [],
        "trajectories": [],
        "reflections": []
    }
    for idx, (question, key) in enumerate(zip(questions, keys)):
        trajectory = reflexion_react_agent.generate(
            question=question, key=key, strategy=strategy, reset=True
        )

        experiences["idxs"].append(idx)
        experiences["questions"].append(question)
        experiences["keys"].append(key)
        experiences["trajectories"].append(trajectory)
        experiences["reflections"].append(reflexion_react_agent.reflector.reflections)
        
    return experiences

In [4]:
k = 5

agent = ReflexionReActAgent(
    self_reflect_llm=llm,
    action_llm=llm,
    max_steps=7,
    max_trials=3,
)

# experiences_tmp = gather_experience(agent, questions=hotpot.question.values.tolist()[10:10+k], keys=hotpot.answer.values.tolist()[10:10+k])

In [56]:
# joblib.dump(experiences, "experiences_10.joblib")

['experiences_5.joblib']

In [5]:
import joblib
experiences = joblib.load("exp_15_compare_fake.joblib")

In [6]:

def categorize_experiences(experiences: Dict[str, List]) -> Dict[str, List]:
    count_dict = {
        "compare": [],
        "success": [],
        "fail": []
    }

    for idx in experiences["idxs"]:  # Index for a particular task.
        trajectory = experiences["trajectories"][idx]
        trials_are_correct = [trial[0] for trial in trajectory]  # (is_correct, answer, output)[0]

        # Success.
        if all(trials_are_correct) and len(trials_are_correct) == 1:  # If success @ first trial, then stop generation.
            count_dict["success"].append(idx)
        # Compare.
        elif trials_are_correct[-1]:  # If fail(s), then succeeds, then only last trial is True.
            count_dict["compare"].append(idx)
        # Fail.
        elif not all(trials_are_correct):  # All trials failed, then fail case.
            count_dict["fail"].append(idx)
        else:
            raise ValueError(f"Unhandled scenario for trajectory at index {idx}.")

    return count_dict

categories = categorize_experiences(experiences)
categories

{'compare': [10, 11, 12, 13, 14],
 'success': [1, 3, 6, 7, 8],
 'fail': [0, 2, 4, 5, 9]}

In [7]:
experiences.keys()

dict_keys(['idxs', 'questions', 'keys', 'trajectories', 'reflections'])

In [8]:
import random

def get_folds(categories: Dict[str, List], n_instances: int, n_folds: int = 2) -> Dict[str, List]:
    folds = {fold: [] for fold in range(n_folds)}

    # Assign labels for 'compare', 'success', and  'fail'.
    for _, indices in categories.items():
        random.shuffle(indices)
        for count, idx in enumerate(indices):
            folds[count % n_folds].append(idx)

    # Each fold is a validation set. Take the difference to get the training set of each fold.
    folds = {fold: list(set(list(range(n_instances))).difference(values)) for fold, values in folds.items()}

    return folds

folds = get_folds(categories, 15)
folds

{0: [1, 2, 4, 6, 12, 14], 1: [0, 3, 5, 7, 8, 9, 10, 11, 13]}

# Insight Extraction

In [9]:
train_idxs = folds[0]

train_category_idxs = {
    category: list(set(train_idxs).intersection(set(category_idxs))) \
        for category, category_idxs in categories.items()
}

In [10]:
train_idxs

[1, 2, 4, 6, 12, 14]

In [11]:
train_category_idxs

{'compare': [12, 14], 'success': [1, 6], 'fail': [2, 4]}

In [12]:
from langchain_core.prompts.chat import HumanMessagePromptTemplate
from langchain_core.messages.human import HumanMessage

from discussion_agents.cog.prompts.expel import (
    SYSTEM_TEMPLATE, 
    SYSTEM_CRITIQUE_EXISTING_RULES_INSTRUCTION,
    EXISTING_RULES_AI_NAME,
    NON_EXISTENT_RULES_AT_NAME,
    HUMAN_CRITIQUE_EXISTING_RULES_TEMPLATE,
    CRITIQUE_SUMMARY_SUFFIX_FULL,
    CRITIQUE_SUMMARY_SUFFIX_NOT_FULL
)

def _build_compare_prompt(
    rule_items: List[str], 
    question: str,
    failed_traj: str, 
    success_traj: str, 
    is_full: bool,
) -> List[HumanMessage]:
    # is_full = self.max_num_rules <= len(self.rule_items_with_count)   ->    20 <= len(self.rule_items_with_count)

    critique_history = []

    if rule_items == []:
        rule_items = ['']

    # System prompt.
    prefix = (
        HumanMessagePromptTemplate.from_template(SYSTEM_TEMPLATE)
        .format_messages(
            ai_name=NON_EXISTENT_RULES_AT_NAME if not rule_items else EXISTING_RULES_AI_NAME,
            instruction=SYSTEM_CRITIQUE_EXISTING_RULES_INSTRUCTION
        )
    )
    critique_history.extend(prefix)

    # Task prompt.
    human_format_dict = {
        'question': question,
        'failed_traj': failed_traj,
        'success_traj': success_traj,
        'existing_rules': '\n'.join([f'{i}. {r}' for i, r in enumerate(rule_items, 1)])
    }

    human_critique_summary_message = HumanMessagePromptTemplate.from_template(HUMAN_CRITIQUE_EXISTING_RULES_TEMPLATE).format_messages(**human_format_dict)[0]
    critique_summary_suffix = CRITIQUE_SUMMARY_SUFFIX_FULL if is_full else CRITIQUE_SUMMARY_SUFFIX_NOT_FULL
    human_critique_summary_message.content = human_critique_summary_message.content + critique_summary_suffix
    critique_history.append(human_critique_summary_message)

    return critique_history

In [13]:
failed_traj = experiences['trajectories'][11][0][-1]
success_traj = experiences['trajectories'][11][-1][-1]
question = experiences['questions'][11]

In [14]:
compare_prompt_msgs = _build_compare_prompt([], question, failed_traj, success_traj, is_full=False)
compare_prompt_msgs

[HumanMessage(content='You are an advanced reasoning agent that can add, edit or remove rules from your existing rule set, based on forming new critiques of past task trajectories. You will be given two previous task trials in which you were given access to a Docstore API environment and a question to answer: one successful and one unsuccessful trial. You failed the trial either because you guessed the wrong answer with Finish[<answer>], or you used up your set number of reasoning steps.'),
 HumanMessage(content='\nHere are the two previous trials to compare and critique:\nTRIAL TASK:\nThe creator of "Wallace and Gromit" also created what animation comedy that matched animated zoo animals with a soundtrack of people talking about their homes? \n\nSUCCESSFUL TRIAL:\n\nThought: I need to search for the creator of "Wallace and Gromit" and find out what other animation comedy they created that involves animated zoo animals and people talking about their homes. \nAction: Search[creator of W

In [15]:
from langchain_core.messages.chat import ChatMessage

def collapse_prompts(prompt_history: List[ChatMessage]) -> List[ChatMessage]:
    """Courtesy of GPT4"""
    if not prompt_history:
        return []

    new_prompt_history = []
    scratch_pad = prompt_history[0].content
    last_message_type = type(prompt_history[0])

    for message in prompt_history[1:]:
        current_message_type = type(message)
        if current_message_type == last_message_type:
            scratch_pad += '\n' + message.content
        else:
            new_prompt_history.append(last_message_type(content=scratch_pad))
            scratch_pad = message.content
            last_message_type = current_message_type

    # Handle the last accumulated message.
    new_prompt_history.append(last_message_type(content=scratch_pad))

    return new_prompt_history

In [16]:
compare_prompt_msgs = collapse_prompts(compare_prompt_msgs)
compare_prompt_msgs

[HumanMessage(content='You are an advanced reasoning agent that can add, edit or remove rules from your existing rule set, based on forming new critiques of past task trajectories. You will be given two previous task trials in which you were given access to a Docstore API environment and a question to answer: one successful and one unsuccessful trial. You failed the trial either because you guessed the wrong answer with Finish[<answer>], or you used up your set number of reasoning steps.\n\nHere are the two previous trials to compare and critique:\nTRIAL TASK:\nThe creator of "Wallace and Gromit" also created what animation comedy that matched animated zoo animals with a soundtrack of people talking about their homes? \n\nSUCCESSFUL TRIAL:\n\nThought: I need to search for the creator of "Wallace and Gromit" and find out what other animation comedy they created that involves animated zoo animals and people talking about their homes. \nAction: Search[creator of Wallace and Gromit]\nObser

In [25]:
from langchain_core.language_models.chat_models import BaseChatModel

def _prompt_compare_critique(compare_prompt_msgs: List[HumanMessage], llm: BaseChatModel, replace_newline: bool = False):
    out = llm(compare_prompt_msgs).content.strip('\n').strip()
    if replace_newline:
        out = out.replace('\n', '')
    return out

In [26]:
out = _prompt_compare_critique(compare_prompt_msgs, llm)

In [27]:
import re

def parse_rules(llm_text):
    pattern = r'((?:REMOVE|EDIT|ADD|AGREE)(?: \d+|)): (?:[a-zA-Z\s\d]+: |)(.*)'
    matches = re.findall(pattern, llm_text)

    res = []
    banned_words = ['ADD', 'AGREE', 'EDIT']
    for operation, text in matches:
        text = text.strip()
        if text != '' and not any([w in text for w in banned_words]) and text.endswith('.'):
        # if text is not empty
        # if text doesn't contain banned words (avoid weird formatting cases from llm)
        # if text ends with a period (avoid cut off sentences from llm)
            if 'ADD' in operation:
                res.append(('ADD', text))
            else:
                res.append((operation.strip(), text))
    return(res)

In [28]:
parse_rules(out)

[('ADD',
  'Clearly define the specific information needed to answer the question.'),
 ('EDIT 3',
  'Refine search terms if initial search does not yield relevant results.'),
 ('REMOVE 2', 'Avoid redundant searches for the same information.'),
 ('AGREE 4',
  'Narrow down search results to find specific information needed for the answer.')]

In [None]:
from typing import Tuple

def create_rules(experiences: Dict[str, List], categories: Dict[str, int], train_idxs: List[int]):
    rule_items = []
    rule_items_with_count: List[Tuple[str, int]] = []

    # Intersect between train_idxs and each category (compare, success, fail).
    train_category_idxs = {
        category: list(set(train_idxs).intersection(set(category_idxs))) \
            for category, category_idxs in categories.items()
    }

    # Compare.
    for train_idx in train_category_idxs["compare"]:
        question = experiences["questions"][train_idx]
        key = experiences["keys"][train_idx]
        trajectory = experiences["trajectories"][train_idx]
        reflection = experiences["reflections"][train_idx]

    # Success.
    for train_idx in train_category_idxs["success"]:
        question = experiences["questions"][train_idx]
        key = experiences["keys"][train_idx]
        trajectory = experiences["trajectories"][train_idx]
        reflection = experiences["reflections"][train_idx]
    
    # Fail.
    for train_idx in train_category_idxs["fail"]:
        question = experiences["questions"][train_idx]
        key = experiences["keys"][train_idx]
        trajectory = experiences["trajectories"][train_idx]
        reflection = experiences["reflections"][train_idx]


In [None]:
for fold, train_idxs in folds.items():
    print(fold, train_idxs)
    create_rules(experiences, categories, train_idxs)

In [None]:
for k, eval_idxs in enumerate(eval_idx_list):
    if k < starting_fold:
        continue
    training_ids = set(range(num_training_tasks)) - set(eval_idxs)
    (SAVE_PATH / f"fold_{k}").mkdir(exist_ok=True)
    log += f'################## FOLD {k} ##################\n'
    log += react_agent.create_rules(
        list(training_ids),
        cache_fold=k,
        logging_dir=str(SAVE_PATH / f"fold_{k}"),
        run_name=cfg.run_name,
        loaded_dict=dicts[-1] if resume and resume_starting_fold == starting_fold else None,
        loaded_log=critique_summary_log if resume and resume_starting_fold == starting_fold else None,
        eval_idx_list=eval_idx_list,
        saving_dict=True,
    )
    starting_fold += 1