In [4]:
from agential.cog.agent.critic import CriticAgent
from langchain_community.utilities.google_search import GoogleSearchAPIWrapper
from langchain_openai import ChatOpenAI




from agential.cog.prompts.benchmark.hotpotqa import (
    HOTPOTQA_FEWSHOT_EXAMPLES_COT,
    HOTPOTQA_FEWSHOT_EXAMPLES_DIRECT,
    HOTPOTQA_FEWSHOT_EXAMPLES_REACT,
)
from agential.cog.prompts.benchmark.fever import (
    FEVER_FEWSHOT_EXAMPLES_COT,
    FEVER_FEWSHOT_EXAMPLES_DIRECT,
    FEVER_FEWSHOT_EXAMPLES_REACT,
)
from agential.cog.prompts.benchmark.triviaqa import (
    TRIVIAQA_FEWSHOT_EXAMPLES_COT,
    TRIVIAQA_FEWSHOT_EXAMPLES_DIRECT,
    TRIVIAQA_FEWSHOT_EXAMPLES_REACT,
)
from agential.cog.prompts.benchmark.ambignq import (
    AMBIGNQ_FEWSHOT_EXAMPLES_COT,
    AMBIGNQ_FEWSHOT_EXAMPLES_DIRECT,
    AMBIGNQ_FEWSHOT_EXAMPLES_REACT,
)
from agential.cog.prompts.benchmark.gsm8k import (
    GSM8K_FEWSHOT_EXAMPLES_POT,
)
from agential.cog.prompts.benchmark.svamp import (
    SVAMP_FEWSHOT_EXAMPLES_POT,
)
from agential.cog.prompts.benchmark.tabmwp import (
    TABMWP_FEWSHOT_EXAMPLES_POT,
)
from agential.cog.prompts.benchmark.humaneval import (
    HUMANEVAL_FEWSHOT_EXAMPLES_POT,
)
from agential.cog.prompts.benchmark.mbpp import (
    MBPP_FEWSHOT_EXAMPLES_POT,
)

from agential.cog.prompts.agent.critic import (
    # QA.
    CRITIC_INSTRUCTION_HOTPOTQA,
    CRITIC_CRITIQUE_INSTRUCTION_HOTPOTQA,
    HOTPOTQA_FEWSHOT_EXAMPLES_CRITIC,

    CRITIC_INSTRUCTION_FEVER,
    CRITIC_CRITIQUE_INSTRUCTION_FEVER,
    FEVER_FEWSHOT_EXAMPLES_CRITIC,

    CRITIC_INSTRUCTION_AMBIGNQ,
    CRITIC_CRITIQUE_INSTRUCTION_AMBIGNQ,
    AMBIGNQ_FEWSHOT_EXAMPLES_CRITIC,

    CRITIC_INSTRUCTION_TRIVIAQA,
    CRITIC_CRITIQUE_INSTRUCTION_TRIVIAQA,
    TRIVIAQA_FEWSHOT_EXAMPLES_CRITIC,

    # Math.
    CRITIC_POT_INSTRUCTION_GSM8K,
    CRITIC_CRITIQUE_INSTRUCTION_GSM8K,
    GSM8K_FEWSHOT_EXAMPLES_CRITIC,
    CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_GSM8K,
    GSM8K_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL,

    CRITIC_POT_INSTRUCTION_SVAMP,
    CRITIC_CRITIQUE_INSTRUCTION_SVAMP,
    SVAMP_FEWSHOT_EXAMPLES_CRITIC,
    CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_SVAMP,
    SVAMP_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL,

    CRITIC_POT_INSTRUCTION_TABMWP,
    CRITIC_CRITIQUE_INSTRUCTION_TABMWP,
    TABMWP_FEWSHOT_EXAMPLES_CRITIC,
    CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_TABMWP,
    TABMWP_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL,

    # Code.
    CRITIC_POT_INSTRUCTION_HUMANEVAL,
    CRITIC_CRITIQUE_INSTRUCTION_HUMANEVAL,
    HUMANEVAL_FEWSHOT_EXAMPLES_CRITIC,
    CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_HUMANEVAL,
    HUMANEVAL_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL,

    CRITIC_POT_INSTRUCTION_MBPP,
    CRITIC_CRITIQUE_INSTRUCTION_MBPP,
    MBPP_FEWSHOT_EXAMPLES_CRITIC,
    CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_MBPP,
    MBPP_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL,
)

import warnings

import dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API credentials read by the
# ChatOpenAI and GoogleSearchAPIWrapper clients created below — confirm.
dotenv.load_dotenv()

# Silence all warnings for the rest of the session to keep cell output short.
warnings.filterwarnings("ignore")

# Objects shared by every example cell below.
llm = ChatOpenAI()
search = GoogleSearchAPIWrapper()

# QA

### HotpotQA

In [5]:
# HotpotQA example: run the CRITIC agent on one question with the tool enabled.
question = 'Who was once considered the best kick boxer in the world, however he has been involved in a number of controversies relating to his "unsportsmanlike conducts" in the sport and crimes of violence outside of the ring'

# Search-related settings; the same values are handed to both the constructor
# and generate().
search_kwargs = {"evidence_length": 400, "num_results": 8}

agent = CriticAgent(
    llm=llm,
    mode={"qa": "hotpotqa"},
    search=search,
    **search_kwargs
)

use_tool = True

# The original author noted HOTPOTQA_FEWSHOT_EXAMPLES_DIRECT and
# HOTPOTQA_FEWSHOT_EXAMPLES_REACT as alternatives for `examples`.
out = agent.generate(
    question=question,
    examples=HOTPOTQA_FEWSHOT_EXAMPLES_COT,
    prompt=CRITIC_INSTRUCTION_HOTPOTQA,
    critique_examples=HOTPOTQA_FEWSHOT_EXAMPLES_CRITIC,
    critique_prompt=CRITIC_CRITIQUE_INSTRUCTION_HOTPOTQA,
    additional_keys={},
    critique_additional_keys={},
    max_interactions=3,
    use_tool=use_tool,
    reset=True,
    # Extra keyword arguments (kwargs).
    **search_kwargs
)

In [6]:
# Display the list of CriticOutput records returned by the HotpotQA run above.
out

[CriticOutput(answer="Thought: Let's think step by step. The person described was once considered the best kick boxer in the world but has been involved in controversies and crimes of violence. This description matches with the career and controversies of Badr Hari, a Moroccan-Dutch kickboxer. So the answer is Badr Hari.\nAction: Finish[Badr Hari]", critique='\nThe question asks for the name of the kickboxer described, and the answer "Badr Hari" is a name. So it\'s plausible.\n\n2. Truthfulness:\n\nLet\'s search the question in google:\n\n> Search Query: Who was once considered the best kick boxer in the world, however he has been involved in a number of controversies relating to his "unsportsmanlike conducts" in the sport and crimes of violence outside of the ring\n> Evidence: [tyzhu/lmind_hotpot_train500_eval300_v1_qa · Datasets at Hugging ...] ... he has been involved in a number of controversies relating to his "unsportsmanlike conducts" in the sport and crimes of violence outside 

### FEVER

In [9]:
# FEVER example: run the CRITIC agent on one claim with the tool disabled.
question = "Nikolaj Coster-Waldau worked with the Fox Broadcasting Company."

# Search-related settings; the same values are handed to both the constructor
# and generate().
search_kwargs = {"evidence_length": 400, "num_results": 8}

agent = CriticAgent(
    llm=llm,
    mode={"qa": "fever"},
    search=search,
    **search_kwargs
)

use_tool = False

# The original author noted FEVER_FEWSHOT_EXAMPLES_DIRECT and
# FEVER_FEWSHOT_EXAMPLES_REACT as alternatives for `examples`.
out = agent.generate(
    question=question,
    examples=FEVER_FEWSHOT_EXAMPLES_COT,
    prompt=CRITIC_INSTRUCTION_FEVER,
    critique_examples=FEVER_FEWSHOT_EXAMPLES_CRITIC,
    critique_prompt=CRITIC_CRITIQUE_INSTRUCTION_FEVER,
    additional_keys={},
    critique_additional_keys={},
    max_interactions=3,
    use_tool=use_tool,
    reset=True,
    # Extra keyword arguments (kwargs).
    **search_kwargs
)

In [10]:
# Display the list of CriticOutput records returned by the FEVER run above.
out

[CriticOutput(answer='Yes, Nikolaj Coster-Waldau worked with the Fox Broadcasting Company as he appeared in the 2009 Fox television film Virtuality. So the answer is: SUPPORTS.', critique='Yes, Nikolaj Coster-Waldau worked with the Fox Broadcasting Company as he appeared in the 2009 Fox television film Virtuality. So the answer is: SUPPORTS.', external_tool_info={'search_query': '', 'search_result': ''})]

### AmbigNQ

In [11]:
# AmbigNQ example: run the CRITIC agent on one question with the tool disabled.
question = "When did the simpsons first air on television?"

# Search-related settings; the same values are handed to both the constructor
# and generate().
search_kwargs = {"evidence_length": 400, "num_results": 8}

agent = CriticAgent(
    llm=llm,
    mode={"qa": "ambignq"},
    search=search,
    **search_kwargs
)

use_tool = False

# The original author noted AMBIGNQ_FEWSHOT_EXAMPLES_DIRECT and
# AMBIGNQ_FEWSHOT_EXAMPLES_REACT as alternatives for `examples`.
out = agent.generate(
    question=question,
    examples=AMBIGNQ_FEWSHOT_EXAMPLES_COT,
    prompt=CRITIC_INSTRUCTION_AMBIGNQ,
    critique_examples=AMBIGNQ_FEWSHOT_EXAMPLES_CRITIC,
    critique_prompt=CRITIC_CRITIQUE_INSTRUCTION_AMBIGNQ,
    additional_keys={},
    critique_additional_keys={},
    max_interactions=3,
    use_tool=use_tool,
    reset=True,
    # Extra keyword arguments (kwargs).
    **search_kwargs
)

In [12]:
# Display the list of CriticOutput records returned by the AmbigNQ run above.
out

[CriticOutput(answer='1989', critique='\n\nThe question asks for a specific time, and the answer is "1989", which is a year. So it\'s plausible.\n\n2. Truthfulness:\n\nLet\'s verify the accuracy of the answer by searching the question in Google:\n\n> Search Query: When did the simpsons first air on television?\n> Evidence: [The Simpsons - Wikipedia] The Simpsons first aired on television on December 17, 1989.\n\nThe evidence confirms that The Simpsons first aired on television on December 17, 1989, not just in 1989. Therefore, the answer should specify the exact date.\n\nQuestion: When did the simpsons first air on television?\nHere\'s the most possible answer: The Simpsons first aired on television on December 17, 1989. So the answer is: December 17, 1989.', external_tool_info={'search_query': 'When did the simpsons first air on television?', 'search_result': "[The Simpsons - Wikipedia] The Simpsons first aired on television on December 17, 1989.\n\nThe evidence confirms that The Simp

### TriviaQA

In [13]:
# TriviaQA example: run the CRITIC agent on one question with the tool disabled.
question = "Which American-born Sinclair won the Nobel Prize for Literature in 1930?"

# Search-related settings; the same values are handed to both the constructor
# and generate().
search_kwargs = {"evidence_length": 400, "num_results": 8}

agent = CriticAgent(
    llm=llm,
    mode={"qa": "triviaqa"},
    search=search,
    **search_kwargs
)

use_tool = False

# The original author noted TRIVIAQA_FEWSHOT_EXAMPLES_DIRECT and
# TRIVIAQA_FEWSHOT_EXAMPLES_REACT as alternatives for `examples`.
out = agent.generate(
    question=question,
    examples=TRIVIAQA_FEWSHOT_EXAMPLES_COT,
    prompt=CRITIC_INSTRUCTION_TRIVIAQA,
    critique_examples=TRIVIAQA_FEWSHOT_EXAMPLES_CRITIC,
    critique_prompt=CRITIC_CRITIQUE_INSTRUCTION_TRIVIAQA,
    additional_keys={},
    critique_additional_keys={},
    max_interactions=3,
    use_tool=use_tool,
    reset=True,
    # Extra keyword arguments (kwargs).
    **search_kwargs
)

In [14]:
# Display the list of CriticOutput records returned by the TriviaQA run above.
out

[CriticOutput(answer='Upton Sinclair', critique='\n\nThe question asks for the name of an American-born Sinclair who won the Nobel Prize for Literature in 1930, and the answer is "Upton Sinclair", which is a plausible name. So it\'s plausible.\n\n2. Truthfulness:\n\nLet\'s search the question in google:\n\n> Search Query: Which American-born Sinclair won the Nobel Prize for Literature in 1930?\n> Evidence: [Winners of the Nobel Prize in Literature - Wikipedia] The 1930 Nobel Prize in Literature was awarded to Sinclair Lewis, an American writer.\n\nThe evidence shows that Sinclair Lewis, not Upton Sinclair, won the Nobel Prize for Literature in 1930. Therefore, the proposed answer is incorrect.\n\nQuestion: Which American-born Sinclair won the Nobel Prize for Literature in 1930?\nHere\'s the most possible answer: Sinclair Lewis won the Nobel Prize for Literature in 1930. So the answer is: Sinclair Lewis.', external_tool_info={'search_query': 'Which American-born Sinclair won the Nobel P

# Math

### GSM8K

In [19]:
# GSM8K example: program-of-thought answer, critiqued with the tool enabled.
question = "Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with 4933828. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?"

agent = CriticAgent(llm=llm, mode={"math": "gsm8k"}, patience=2)

use_tool = True

# Choose the critique few-shot examples and instruction that match `use_tool`.
if use_tool:
    critique_examples = GSM8K_FEWSHOT_EXAMPLES_CRITIC
    critique_prompt = CRITIC_CRITIQUE_INSTRUCTION_GSM8K
else:
    critique_examples = GSM8K_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL
    critique_prompt = CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_GSM8K

out = agent.generate(
    question=question,
    examples=GSM8K_FEWSHOT_EXAMPLES_POT,
    prompt=CRITIC_POT_INSTRUCTION_GSM8K,
    critique_examples=critique_examples,
    critique_prompt=critique_prompt,
    additional_keys={},
    critique_additional_keys={},
    max_interactions=3,
    use_tool=use_tool,
    reset=True,
)


In [20]:
# Display the list of CriticOutput records returned by the GSM8K run above.
out

[CriticOutput(answer='eggs_laid_per_day = 16\neggs_used_for_breakfast = 3\neggs_used_for_muffins = 4933828\neggs_sold_per_day = eggs_laid_per_day - eggs_used_for_breakfast - eggs_used_for_muffins\nmoney_made_per_day = eggs_sold_per_day * 2\nanswer = money_made_per_day', critique="1. The amount of money made per day should be a positive number, -9867630 < 0, which is not reasonable.\n\n2. Let's check the code:\n\n- `eggs_laid_per_day = 16` - This defines the number of eggs laid per day, which is correct.\n- `eggs_used_for_breakfast = 3` - This defines the number of eggs used for breakfast, which is correct.\n- `eggs_used_for_muffins = 4933828` - This defines the number of eggs used for muffins, which seems extremely high and might be a mistake.\n- `eggs_sold_per_day = eggs_laid_per_day - eggs_used_for_breakfast - eggs_used_for_muffins` - This calculates the number of eggs sold per day, but the calculation might be incorrect due to the large number of eggs used for muffins.\n- `money_mad

### SVAMP

In [16]:
# SVAMP example: program-of-thought answer, critiqued with the tool enabled.
question = "There are 87 oranges and 290 bananas in Philip's collection. If the bananas are organized into 2 groups and oranges are organized into 93 groups. How big is each group of bananas?"

agent = CriticAgent(llm=llm, mode={"math": "svamp"}, patience=2)

use_tool = True

# Choose the critique few-shot examples and instruction that match `use_tool`.
if use_tool:
    critique_examples = SVAMP_FEWSHOT_EXAMPLES_CRITIC
    critique_prompt = CRITIC_CRITIQUE_INSTRUCTION_SVAMP
else:
    critique_examples = SVAMP_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL
    critique_prompt = CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_SVAMP

out = agent.generate(
    question=question,
    examples=SVAMP_FEWSHOT_EXAMPLES_POT,
    prompt=CRITIC_POT_INSTRUCTION_SVAMP,
    critique_examples=critique_examples,
    critique_prompt=critique_prompt,
    additional_keys={},
    critique_additional_keys={},
    max_interactions=3,
    use_tool=use_tool,
    reset=True,
)

In [18]:
# Display the list of CriticOutput records returned by the SVAMP run above.
out

[CriticOutput(answer='num_oranges = 87\nnum_bananas = 290\nnum_orange_groups = 93\nnum_banana_groups = 2\nanswer = num_bananas / num_banana_groups', critique="1. The answer, 145.0, is a reasonable result.\n2. Let's check the code:\n\nThe code correctly calculates the size of each group of bananas by dividing the total number of bananas by the number of banana groups. There is no problem with the above code.", external_tool_info={'execution_status': 'Done', 'code_answer': 145.0}),
 CriticOutput(answer='num_oranges = 87\nnum_bananas = 290\nnum_orange_groups = 93\nnum_banana_groups = 2\n\n# Calculate the size of each group of bananas\nbanana_group_size = num_bananas / num_banana_groups\nanswer = banana_group_size', critique="1. The answer, 145, is a reasonable result.\n2. Let's check the code:\n\nThe code correctly calculates the size of each group of bananas by dividing the total number of bananas by the number of banana groups. There is no problem with the above code. \n\nTherefore, the

### TabMWP

In [21]:
# TabMWP example: the table and its question are passed together as one prompt
# string; the answer is critiqued with the tool enabled.
question = """Read the following table regarding "Bowling Scores" and then write Python code to answer a question:

Name | Score
Amanda | 117
Sam | 236
Irma | 144
Mike | 164

Question: Some friends went bowling and kept track of their scores. How many more points did Mike score than Irma?"""

agent = CriticAgent(llm=llm, mode={"math": "tabmwp"}, patience=2)

use_tool = True

# Choose the critique few-shot examples and instruction that match `use_tool`.
if use_tool:
    critique_examples = TABMWP_FEWSHOT_EXAMPLES_CRITIC
    critique_prompt = CRITIC_CRITIQUE_INSTRUCTION_TABMWP
else:
    critique_examples = TABMWP_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL
    critique_prompt = CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_TABMWP

out = agent.generate(
    question=question,
    examples=TABMWP_FEWSHOT_EXAMPLES_POT,
    prompt=CRITIC_POT_INSTRUCTION_TABMWP,
    critique_examples=critique_examples,
    critique_prompt=critique_prompt,
    additional_keys={},
    critique_additional_keys={},
    max_interactions=3,
    use_tool=use_tool,
    reset=True,
)

20


In [22]:
# Display the list of CriticOutput records returned by the TabMWP run above.
out

[CriticOutput(answer='# Python Code, return answer\nmike_score = 164\nirma_score = 144\ndifference = mike_score - irma_score\ndifference', critique='There is no problem with the above code. It correctly calculates the difference in scores between Mike and Irma and stores it in the `difference` variable. The code answers the question accurately.', external_tool_info={'execution_status': 'Done', 'code_answer': ''}),
 CriticOutput(answer='mike_score = 164\nirma_score = 144\ndifference = mike_score - irma_score\nanswer = difference\nanswer', critique='There is no problem with the above code. It correctly calculates the difference in scores between Mike and Irma and provides the answer as output.', external_tool_info={'execution_status': 'Done', 'code_answer': 20}),
 CriticOutput(answer='# Given bowling scores\nmike_score = 164\nirma_score = 144\n\n# Calculate the difference in scores\ndifference = mike_score - irma_score\n\n# Store the answer in a variable\nanswer = difference\n\n# Print t

# Code

### HumanEval

In [23]:
# HumanEval example: a single benchmark record (task HumanEval/0), inlined.
inst = {
    "task_id": "HumanEval/0",
    "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n",
    "entry_point": "has_close_elements",
    "canonical_solution": "    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n",
    "test": "\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n    assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n",
}

# The function stub (signature + docstring) is what the agent must complete.
question = inst["prompt"]

# Test script: the record's test code followed by a call to check(<entry_point>).
tests = inst["test"] + "\ncheck(" + inst["entry_point"] + ")"

agent = CriticAgent(llm=llm, mode={"code": "humaneval"})

use_tool = False

# Choose the critique few-shot examples and instruction that match `use_tool`.
if use_tool:
    critique_examples = HUMANEVAL_FEWSHOT_EXAMPLES_CRITIC
    critique_prompt = CRITIC_CRITIQUE_INSTRUCTION_HUMANEVAL
else:
    critique_examples = HUMANEVAL_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL
    critique_prompt = CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_HUMANEVAL

out = agent.generate(
    question=question,
    examples=HUMANEVAL_FEWSHOT_EXAMPLES_POT,
    prompt=CRITIC_POT_INSTRUCTION_HUMANEVAL,
    critique_examples=critique_examples,
    critique_prompt=critique_prompt,
    additional_keys={},
    # Only the critique step receives the tests in this example.
    critique_additional_keys={"tests": tests},
    max_interactions=3,
    use_tool=use_tool,
    reset=True,
)

In [24]:
# Display the list of CriticOutput records returned by the HumanEval run above.
out

[CriticOutput(answer='def has_close_elements(numbers, threshold):\n    return any(abs(a - b) < threshold for i, a in enumerate(numbers) for b in numbers[i+1:])', critique='The code snippet provided has a logical error in the `has_close_elements` function implementation. ', external_tool_info={}),
 CriticOutput(answer='    return any(abs(a - b) < threshold for i, a in enumerate(numbers) for b in numbers[i+1:])', critique="The function `has_close_elements` has a logical flaw in how it checks for close elements in the list. \n\n1. The function uses a nested loop to compare each number with all subsequent numbers, which can lead to redundant comparisons and incorrect results.\n\n2. Let's analyze the code:\n\n- The function iterates over each number (a) in the list and then compares it with all subsequent numbers (b) in the list.\n\n- The condition `abs(a - b) < threshold` checks if the absolute difference between any two numbers is less than the threshold.\n\n- The function returns `True` 

### MBPP

In [25]:
# MBPP example: a natural-language task plus its assert-based test cases.
question = "Write a python function to find the first repeated character in a given string."

# One assert per line, joined into a single newline-separated test script.
tests = "\n".join(
    [
        'assert first_repeated_char("abcabc") == "a"',
        'assert first_repeated_char("abc") == None',
        'assert first_repeated_char("123123") == "1"',
    ]
)

agent = CriticAgent(llm=llm, mode={"code": "mbpp"})

use_tool = True

# Choose the critique few-shot examples and instruction that match `use_tool`.
if use_tool:
    critique_examples = MBPP_FEWSHOT_EXAMPLES_CRITIC
    critique_prompt = CRITIC_CRITIQUE_INSTRUCTION_MBPP
else:
    critique_examples = MBPP_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL
    critique_prompt = CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_MBPP

out = agent.generate(
    question=question,
    examples=MBPP_FEWSHOT_EXAMPLES_POT,
    prompt=CRITIC_POT_INSTRUCTION_MBPP,
    critique_examples=critique_examples,
    critique_prompt=critique_prompt,
    # Both the answer step and the critique step receive the tests here;
    # separate dict objects are passed so neither call shares state.
    additional_keys={"tests": tests},
    critique_additional_keys={"tests": tests},
    max_interactions=3,
    use_tool=use_tool,
    reset=True,
)

In [26]:
# Display the list of CriticOutput records returned by the MBPP run above.
out

[CriticOutput(answer='def first_repeated_char(s):\n    char_count = {}\n    for char in s:\n        if char in char_count:\n            return char\n        else:\n            char_count[char] = 1\n    return None\n\n# Testing the function with the provided test cases\nassert first_repeated_char("abcabc") == "a"\nassert first_repeated_char("abc") == None\nassert first_repeated_char("123123") == "1"', critique='There is no problem with the above code. The function `first_repeated_char` correctly finds the first repeated character in a given string by keeping track of character counts in a dictionary and returning the first character that is repeated. The function passes the provided test cases successfully.', external_tool_info={'execution_status': 'Done'})]