In [5]:
from discussion_agents.cog.agent.reflexion import ReflexionReActAgent
from langchain_community.chat_models import ChatOpenAI
from typing import List, Dict, Optional

import dotenv
import os

# Load environment variables from the .env file one directory up, so the API key is not hardcoded here.
dotenv.load_dotenv("../.env")
# os.getenv returns None when OPENAI_API_KEY is unset; that None is then passed on as the key below.
openai_api_key = os.getenv("OPENAI_API_KEY")
# Chat model shared by the agent's action and self-reflection roles in the cells below.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125", openai_api_key=openai_api_key)

In [52]:
import joblib
# Load the serialized HotpotQA distractor sample and renumber its rows from 0 (old index dropped).
# NOTE(review): joblib.load unpickles the file, so only load files from a trusted source.
hotpot = joblib.load('../agent/hotpot-qa-distractor-sample.joblib').reset_index(drop=True)
# Preview the first rows.
hotpot.head()

Unnamed: 0,id,question,answer,type,level,supporting_facts,context
0,5a7613c15542994ccc9186bf,VIVA Media AG changed it's name in 2004. What ...,Gesellschaft mit beschränkter Haftung,bridge,hard,"{'title': ['VIVA Media', 'Gesellschaft mit bes...","{'title': ['Constantin Medien', 'VIVA Poland',..."
1,5adf2fa35542993344016c11,Which of Jonny Craig and Pete Doherty has been...,"Jonny"" Craig",comparison,hard,"{'title': ['Jonny Craig', 'Jonny Craig', 'Pete...","{'title': ['Pete Doherty', 'Relativity (Emaros..."
2,5adfdef9554299025d62a36b,Where was the first governor after the The Mis...,"Bath, Maine",bridge,hard,"{'title': ['Maine gubernatorial election, 1820...","{'title': ['Compromise of 1790', 'Anti-Nebrask..."
3,5a7180205542994082a3e856,"The creator of ""Wallace and Gromit"" also creat...",Creature Comforts,bridge,hard,"{'title': ['Creature Comforts', 'Creature Comf...","{'title': ['Creature Comforts', 'Tata Steel Zo..."
4,5a78bc6b554299148911f979,Woman's Era and Naj are what kind of magazines?,fortnightly women interest magazine,comparison,hard,"{'title': ['Woman's Era', 'Naj'], 'sent_id': [...","{'title': ['Lifestyle trends and media', 'Chin..."


In [53]:
def gather_experience(
    reflexion_react_agent: ReflexionReActAgent,
    questions: List[str],
    keys: List[str],
    strategy: Optional[str] = "reflexion",
) -> Dict[str, List]:
    """Run the agent once per (question, key) pair and record what it produced.

    Args:
        reflexion_react_agent: Agent whose `generate` method is called for every task and
            whose `reflector.reflections` attribute is read after each call.
        questions: Questions to ask; paired positionally with `keys`.
        keys: Answer keys, one per question. Pairing uses `zip`, so any surplus items in
            the longer list are silently ignored.
        strategy: Forwarded unchanged to `generate`.

    Returns:
        A dict of parallel lists with the keys "idxs", "questions", "keys",
        "trajectories" and "reflections"; position `i` in every list describes task `i`.
    """
    idxs: List = []
    asked: List = []
    answer_keys: List = []
    trajectories: List = []
    reflections: List = []

    for position, pair in enumerate(zip(questions, keys)):
        task_question, task_key = pair

        # `reset=True` is passed on every call — presumably so state from the previous
        # task does not carry over; confirm against the agent implementation.
        outcome = reflexion_react_agent.generate(
            question=task_question,
            key=task_key,
            strategy=strategy,
            reset=True,
        )

        idxs.append(position)
        asked.append(task_question)
        answer_keys.append(task_key)
        trajectories.append(outcome)
        # Stored by reference, exactly as exposed by the reflector at this point.
        # NOTE(review): if the agent reuses this object across tasks, entries will alias.
        reflections.append(reflexion_react_agent.reflector.reflections)

    return {
        "idxs": idxs,
        "questions": asked,
        "keys": answer_keys,
        "trajectories": trajectories,
        "reflections": reflections,
    }

In [54]:
# Number of tasks, taken from the top of `hotpot`, to run through the agent.
k = 10

# The same chat model is used for both the acting and the self-reflection roles.
agent = ReflexionReActAgent(
    self_reflect_llm=llm,
    action_llm=llm,
    max_steps=7,
    max_trials=3,
)

# Questions come from the `question` column and the answer keys from the `answer` column.
experiences = gather_experience(agent, questions=hotpot.question.values.tolist()[:k], keys=hotpot.answer.values.tolist()[:k])



  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


In [56]:
# Persist the gathered experiences to disk; a later cell reloads this same file name.
# NOTE(review): the file name hardcodes "10" independently of `k` — keep the two in sync.
joblib.dump(experiences, "experiences_10.joblib")

['experiences_5.joblib']

In [6]:
import joblib
# Reload the cached experiences from disk instead of re-running the agent.
experiences = joblib.load("experiences_10.joblib")

In [7]:
def categorize_experiences(experiences: Dict[str, List]) -> Dict[str, List]:
    """Bucket task indices by how the agent's trials turned out.

    Each trajectory is a sequence of trials whose first element is the correctness flag
    (the trials are indexed as `(is_correct, answer, output)`).

    Categories:
        - "success": exactly one trial, and it was correct.
        - "compare": more than one trial, and the last one was correct
          (earlier failure(s) followed by a success).
        - "fail": the last trial was incorrect.

    Args:
        experiences: Dict with at least "idxs" and "trajectories". Every value in "idxs"
            is used as a position into "trajectories".

    Returns:
        Dict with the keys "compare", "success" and "fail", each mapping to the list of
        task indices in that category, in the order they appear in "idxs".

    Raises:
        ValueError: If a trajectory contains no trials and so cannot be categorized.
    """
    categories: Dict[str, List] = {"compare": [], "success": [], "fail": []}

    for idx in experiences["idxs"]:  # Index for a particular task.
        trajectory = experiences["trajectories"][idx]
        trials_are_correct = [trial[0] for trial in trajectory]  # (is_correct, answer, output)[0]

        # An empty trajectory has no "last trial"; report it explicitly instead of
        # letting the `[-1]` lookup below fail with an IndexError.
        if not trials_are_correct:
            raise ValueError(f"Unhandled scenario for trajectory at index {idx}.")

        if not trials_are_correct[-1]:
            # Fail: the final trial was still wrong.
            categories["fail"].append(idx)
        elif len(trials_are_correct) == 1:
            # Success: correct on the first and only trial.
            categories["success"].append(idx)
        else:
            # Compare: at least one earlier trial preceded the final, correct one.
            categories["compare"].append(idx)

    return categories

# Bucket the recorded tasks by outcome; the bare name on the last line displays the result.
categories = categorize_experiences(experiences)
categories

{'compare': [], 'success': [1, 3, 6, 7, 8], 'fail': [0, 2, 4, 5, 9]}

In [30]:
import random

def get_folds(
    categories: Dict[str, List[int]],
    n_instances: int,
    n_folds: int = 2,
    seed: Optional[int] = None,
) -> Dict[int, List[int]]:
    """Split task indices into stratified folds and return each fold's training indices.

    Within every category the indices are shuffled and dealt round-robin into `n_folds`
    validation sets, so each fold receives a share of every category. The training set of
    a fold is every index in `range(n_instances)` that is not in that fold's validation set.

    Args:
        categories: Mapping of category name to the task indices in that category.
            The lists are not modified.
        n_instances: Total number of tasks; indices are assumed to lie in
            `range(n_instances)`.
        n_folds: Number of folds to produce. Must be at least 1.
        seed: Optional seed for a reproducible split. When None, the module-level
            `random` generator is used.

    Returns:
        Dict mapping fold number (0 .. n_folds - 1) to the sorted list of training indices
        for that fold.

    Raises:
        ValueError: If `n_folds` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError(f"n_folds must be a positive integer, got {n_folds}.")

    # A private generator makes seeded runs reproducible without touching global RNG state.
    rng = random.Random(seed) if seed is not None else random

    validation: Dict[int, List[int]] = {fold: [] for fold in range(n_folds)}

    # Assign labels for 'compare', 'success', and 'fail'.
    for indices in categories.values():
        # Shuffle a copy: shuffling in place would reorder the caller's category lists.
        shuffled = list(indices)
        rng.shuffle(shuffled)
        for count, idx in enumerate(shuffled):
            validation[count % n_folds].append(idx)

    # Each fold is a validation set. Take the difference to get the training set of each fold.
    all_idxs = set(range(n_instances))
    return {
        fold: sorted(all_idxs.difference(held_out))  # sorted: set order is not guaranteed
        for fold, held_out in validation.items()
    }

# 10 is the number of tasks; it should match the number of entries in `experiences`.
# Each value of the returned dict is the list of training indices for that fold.
folds = get_folds(categories, 10)
folds

{0: [3, 5, 6, 9], 1: [0, 1, 2, 4, 7, 8]}

# Insight Extraction

In [51]:
# Work with the training indices of fold 0.
train_idxs = folds[0]

# For every category keep only the indices that are also in the training split.
# The intersection goes through sets, so the order of each resulting list is not guaranteed.
train_category_idxs = {
    category: list(set(train_idxs).intersection(set(category_idxs))) \
        for category, category_idxs in categories.items()
}

In [53]:
train_idxs

[3, 5, 6, 9]

In [52]:
train_category_idxs

{'compare': [], 'success': [3, 6], 'fail': [9, 5]}

In [49]:
from langchain_core.prompts.chat import HumanMessagePromptTemplate
from langchain_core.messages.human import HumanMessage

from discussion_agents.cog.prompts.expel import (
    system_template, 
    SYSTEM_CRITIQUE_EXISTING_RULES_INSTRUCTION,
    EXISTING_RULES_AI_NAME,
    NON_EXISTENT_RULES_AT_NAME
)

def _build_compare_prompt(
    rule_items: List[str],
    fail_history: str,
    success_history: str,
    question: str,
    reflections: Optional[List[str]] = None,
) -> List[HumanMessage]:
    """Build the message list used to critique rules from a failed/successful trial pair.

    Args:
        rule_items: Existing rules. When empty, the system prompt is formatted with
            `NON_EXISTENT_RULES_AT_NAME`; otherwise with `EXISTING_RULES_AI_NAME`.
        fail_history: Text of the failed attempt.
        success_history: Text of the successful attempt.
        question: The task the two attempts were trying to solve.
        reflections: Optional reflections, rendered as a "- " bulleted list.

    Returns:
        The messages produced by formatting `system_template`.

    NOTE(review): the task-prompt fields are assembled below but are never formatted into
    a message or added to the returned list, so only the system prompt is returned.
    """
    # System prompt.
    system_prompt = HumanMessagePromptTemplate.from_template(system_template)
    if rule_items:
        ai_name = EXISTING_RULES_AI_NAME
    else:
        ai_name = NON_EXISTENT_RULES_AT_NAME
    system_messages = system_prompt.format_messages(
        ai_name=ai_name,
        instruction=SYSTEM_CRITIQUE_EXISTING_RULES_INSTRUCTION,
    )
    critique_history = list(system_messages)

    # Task prompt fields (rules are numbered starting at 1).
    numbered_rules = [f'{i}. {r}' for i, r in enumerate(rule_items, 1)]
    task_fields = {
        'instruction': '',
        'fail_history': fail_history,
        'task': question,
        'success_history': success_history,
        'existing_rules': '\n'.join(numbered_rules),
    }
    if reflections:
        task_fields['reflections_list'] = '- ' + '\n- '.join(reflections)

    return critique_history

In [50]:
# NOTE(review): only `rule_items` is passed here; `fail_history`, `success_history` and
# `question` have no defaults, so this call raises the TypeError recorded in the output.
_build_compare_prompt([])[0]

TypeError: _build_compare_prompt() missing 3 required positional arguments: 'fail_history', 'success_history', and 'question'

In [None]:
from typing import Tuple

def _lookup_experience(experiences: Dict[str, List], idx: int) -> Tuple:
    """Return the (question, key, trajectory, reflection) recorded at position `idx`."""
    return (
        experiences["questions"][idx],
        experiences["keys"][idx],
        experiences["trajectories"][idx],
        experiences["reflections"][idx],
    )


def create_rules(
    experiences: Dict[str, List],
    categories: Dict[str, List[int]],
    train_idxs: List[int],
) -> None:
    """Walk the training experiences category by category (rule extraction is still a stub).

    Args:
        experiences: Parallel lists under "questions", "keys", "trajectories" and
            "reflections", indexed by task position.
        categories: Mapping of category name to task indices. Must contain "compare",
            "success" and "fail".
        train_idxs: Task indices that belong to the training split.

    Returns:
        None. The rule containers below are created but not yet filled or returned.

    Raises:
        KeyError: If `categories` lacks one of the three expected categories.
    """
    # TODO: fill these with the extracted rules; nothing writes to them yet.
    rule_items: List[str] = []
    rule_items_with_count: List[Tuple[str, int]] = []

    # Intersect between train_idxs and each category (compare, success, fail).
    train_set = set(train_idxs)
    train_category_idxs = {
        category: list(train_set.intersection(category_idxs))
        for category, category_idxs in categories.items()
    }

    # Compare.
    for train_idx in train_category_idxs["compare"]:
        question, key, trajectory, reflection = _lookup_experience(experiences, train_idx)
        # TODO: derive rules from the failed-then-successful trials.

    # Success.
    for train_idx in train_category_idxs["success"]:
        question, key, trajectory, reflection = _lookup_experience(experiences, train_idx)
        # TODO: derive rules from first-try successes.

    # Fail.
    for train_idx in train_category_idxs["fail"]:
        question, key, trajectory, reflection = _lookup_experience(experiences, train_idx)
        # TODO: decide how failed tasks contribute to rules.


In [None]:
# Run rule creation once per fold on that fold's training indices.
# Note: the loop variable rebinds the notebook-level `train_idxs` on every iteration.
for fold, train_idxs in folds.items():
    print(fold, train_idxs)
    create_rules(experiences, categories, train_idxs)

In [None]:
# NOTE(review): `eval_idx_list`, `starting_fold`, `num_training_tasks`, `SAVE_PATH`, `log`,
# `react_agent`, `cfg`, `dicts`, `resume`, `resume_starting_fold` and `critique_summary_log`
# are not defined in any cell visible here, so this cell presumably cannot run on a fresh
# kernel as-is. The loop variable `k` also rebinds the earlier notebook-level `k`.
for k, eval_idxs in enumerate(eval_idx_list):
    # Skip folds that come before the starting fold.
    if k < starting_fold:
        continue
    # Training ids are all task ids except this fold's evaluation ids.
    training_ids = set(range(num_training_tasks)) - set(eval_idxs)
    # One output directory per fold; an already existing directory is not an error.
    (SAVE_PATH / f"fold_{k}").mkdir(exist_ok=True)
    log += f'################## FOLD {k} ##################\n'
    # Whatever `create_rules` returns is appended to the running log.
    log += react_agent.create_rules(
        list(training_ids),
        cache_fold=k,
        logging_dir=str(SAVE_PATH / f"fold_{k}"),
        run_name=cfg.run_name,
        # Previously loaded state is passed only when resuming at exactly this starting fold.
        loaded_dict=dicts[-1] if resume and resume_starting_fold == starting_fold else None,
        loaded_log=critique_summary_log if resume and resume_starting_fold == starting_fold else None,
        eval_idx_list=eval_idx_list,
        saving_dict=True,
    )
    starting_fold += 1