In [1]:
from agential.cog.agent.critic import CriticAgent
from langchain_community.utilities.google_search import GoogleSearchAPIWrapper

from langchain_community.chat_models.openai import ChatOpenAI

from agential.cog.prompts.benchmarks.hotpotqa import (
    HOTPOTQA_FEWSHOT_EXAMPLES_COT,
    HOTPOTQA_FEWSHOT_EXAMPLES_DIRECT,
    HOTPOTQA_FEWSHOT_EXAMPLES_REACT,
)
from agential.cog.prompts.benchmarks.fever import (
    FEVER_FEWSHOT_EXAMPLES_COT,
    FEVER_FEWSHOT_EXAMPLES_DIRECT,
    FEVER_FEWSHOT_EXAMPLES_REACT,
)
from agential.cog.prompts.benchmarks.triviaqa import (
    TRIVIAQA_FEWSHOT_EXAMPLES_COT,
    TRIVIAQA_FEWSHOT_EXAMPLES_DIRECT,
    TRIVIAQA_FEWSHOT_EXAMPLES_REACT,
)
from agential.cog.prompts.benchmarks.ambignq import (
    AMBIGNQ_FEWSHOT_EXAMPLES_COT,
    AMBIGNQ_FEWSHOT_EXAMPLES_DIRECT,
    AMBIGNQ_FEWSHOT_EXAMPLES_REACT,
)
from agential.cog.prompts.benchmarks.gsm8k import (
    GSM8K_FEWSHOT_EXAMPLES_POT,
)
from agential.cog.prompts.benchmarks.svamp import (
    SVAMP_FEWSHOT_EXAMPLES_POT,
)
from agential.cog.prompts.benchmarks.tabmwp import (
    TABMWP_FEWSHOT_EXAMPLES_POT,
)
from agential.cog.prompts.benchmarks.humaneval import (
    HUMANEVAL_FEWSHOT_EXAMPLES_POT,
)
from agential.cog.prompts.benchmarks.mbpp import (
    MBPP_FEWSHOT_EXAMPLES_POT,
)

from agential.cog.prompts.critic import (
    # QA.
    CRITIC_INSTRUCTION_HOTPOTQA,
    CRITIC_CRITIQUE_INSTRUCTION_HOTPOTQA,
    HOTPOTQA_FEWSHOT_EXAMPLES_CRITIC,

    CRITIC_INSTRUCTION_FEVER,
    CRITIC_CRITIQUE_INSTRUCTION_FEVER,
    FEVER_FEWSHOT_EXAMPLES_CRITIC,

    CRITIC_INSTRUCTION_AMBIGNQ,
    CRITIC_CRITIQUE_INSTRUCTION_AMBIGNQ,
    AMBIGNQ_FEWSHOT_EXAMPLES_CRITIC,

    CRITIC_INSTRUCTION_TRIVIAQA,
    CRITIC_CRITIQUE_INSTRUCTION_TRIVIAQA,
    TRIVIAQA_FEWSHOT_EXAMPLES_CRITIC,

    # Math.
    CRITIC_POT_INSTRUCTION_GSM8K,
    CRITIC_CRITIQUE_INSTRUCTION_GSM8K,
    GSM8K_FEWSHOT_EXAMPLES_CRITIC,
    CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_GSM8K,
    GSM8K_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL,

    CRITIC_POT_INSTRUCTION_SVAMP,
    CRITIC_CRITIQUE_INSTRUCTION_SVAMP,
    SVAMP_FEWSHOT_EXAMPLES_CRITIC,
    CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_SVAMP,
    SVAMP_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL,

    CRITIC_POT_INSTRUCTION_TABMWP,
    CRITIC_CRITIQUE_INSTRUCTION_TABMWP,
    TABMWP_FEWSHOT_EXAMPLES_CRITIC,
    CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_TABMWP,
    TABMWP_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL,

    # Code.
    CRITIC_POT_INSTRUCTION_HUMANEVAL,
    CRITIC_CRITIQUE_INSTRUCTION_HUMANEVAL,
    HUMANEVAL_FEWSHOT_EXAMPLES_CRITIC,
    CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_HUMANEVAL,
    HUMANEVAL_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL,

    CRITIC_POT_INSTRUCTION_MBPP,
    CRITIC_CRITIQUE_INSTRUCTION_MBPP,
    MBPP_FEWSHOT_EXAMPLES_CRITIC,
    CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_MBPP,
    MBPP_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL,
)

import dotenv
import warnings

# Pull variables from a local .env file into the process environment before any
# client is constructed (presumably the OpenAI / Google credentials -- confirm).
dotenv.load_dotenv()

# Blanket-suppress warnings for the rest of the session to keep cell output short.
warnings.filterwarnings("ignore")

# Shared clients reused by every example cell below.
search = GoogleSearchAPIWrapper()
llm = ChatOpenAI()

# QA

In [None]:
# HotpotQA example: QA-mode agent wired to the shared LLM and search wrapper.
use_tool = True
agent = CriticAgent(llm=llm, mode={"qa": "hotpotqa"}, search=search)

question = 'Who was once considered the best kick boxer in the world, however he has been involved in a number of controversies relating to his "unsportsmanlike conducts" in the sport and crimes of violence outside of the ring'

# `examples` can be swapped for HOTPOTQA_FEWSHOT_EXAMPLES_DIRECT or
# HOTPOTQA_FEWSHOT_EXAMPLES_REACT to try a different few-shot style.
out = agent.generate(
    question=question,
    prompt=CRITIC_INSTRUCTION_HOTPOTQA,
    examples=HOTPOTQA_FEWSHOT_EXAMPLES_COT,
    critique_prompt=CRITIC_CRITIQUE_INSTRUCTION_HOTPOTQA,
    critique_examples=HOTPOTQA_FEWSHOT_EXAMPLES_CRITIC,
    use_tool=use_tool,
    max_interactions=3,
)

In [None]:
# Display the value bound to `out` by the agent.generate call in the preceding cell.
out

In [None]:
# FEVER example: the "question" is a claim; tool use is switched off for this run.
use_tool = False
agent = CriticAgent(llm=llm, mode={"qa": "fever"}, search=search)

question = "Nikolaj Coster-Waldau worked with the Fox Broadcasting Company."

# `examples` can be swapped for FEVER_FEWSHOT_EXAMPLES_DIRECT or
# FEVER_FEWSHOT_EXAMPLES_REACT to try a different few-shot style.
out = agent.generate(
    question=question,
    prompt=CRITIC_INSTRUCTION_FEVER,
    examples=FEVER_FEWSHOT_EXAMPLES_COT,
    critique_prompt=CRITIC_CRITIQUE_INSTRUCTION_FEVER,
    critique_examples=FEVER_FEWSHOT_EXAMPLES_CRITIC,
    use_tool=use_tool,
    max_interactions=3,
)

In [None]:
# Display the value bound to `out` by the agent.generate call in the preceding cell.
out

In [None]:
# AmbigNQ example: QA-mode agent, tool use switched off for this run.
use_tool = False
agent = CriticAgent(llm=llm, mode={"qa": "ambignq"}, search=search)

question = "When did the simpsons first air on television?"

# `examples` can be swapped for AMBIGNQ_FEWSHOT_EXAMPLES_DIRECT or
# AMBIGNQ_FEWSHOT_EXAMPLES_REACT to try a different few-shot style.
out = agent.generate(
    question=question,
    prompt=CRITIC_INSTRUCTION_AMBIGNQ,
    examples=AMBIGNQ_FEWSHOT_EXAMPLES_COT,
    critique_prompt=CRITIC_CRITIQUE_INSTRUCTION_AMBIGNQ,
    critique_examples=AMBIGNQ_FEWSHOT_EXAMPLES_CRITIC,
    use_tool=use_tool,
    max_interactions=3,
)

In [None]:
# Display the value bound to `out` by the agent.generate call in the preceding cell.
out

In [None]:
# TriviaQA example: QA-mode agent, tool use switched off for this run.
use_tool = False
agent = CriticAgent(llm=llm, mode={"qa": "triviaqa"}, search=search)

question = "Which American-born Sinclair won the Nobel Prize for Literature in 1930?"

# `examples` can be swapped for TRIVIAQA_FEWSHOT_EXAMPLES_DIRECT or
# TRIVIAQA_FEWSHOT_EXAMPLES_REACT to try a different few-shot style.
out = agent.generate(
    question=question,
    prompt=CRITIC_INSTRUCTION_TRIVIAQA,
    examples=TRIVIAQA_FEWSHOT_EXAMPLES_COT,
    critique_prompt=CRITIC_CRITIQUE_INSTRUCTION_TRIVIAQA,
    critique_examples=TRIVIAQA_FEWSHOT_EXAMPLES_CRITIC,
    use_tool=use_tool,
    max_interactions=3,
)

In [None]:
# Display the value bound to `out` by the agent.generate call in the preceding cell.
out

# Math

In [4]:
# GSM8K example: math-mode agent (no search wrapper is passed for math tasks).
use_tool = True
agent = CriticAgent(llm=llm, mode={"math": "gsm8k"})

question = "Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with 4933828. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?"

# The critique prompt and its few-shot examples come in a tool and a no-tool
# variant; `use_tool` selects the matching pair.
out = agent.generate(
    question=question,
    prompt=CRITIC_POT_INSTRUCTION_GSM8K,
    examples=GSM8K_FEWSHOT_EXAMPLES_POT,
    critique_prompt=CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_GSM8K if not use_tool else CRITIC_CRITIQUE_INSTRUCTION_GSM8K,
    critique_examples=GSM8K_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL if not use_tool else GSM8K_FEWSHOT_EXAMPLES_CRITIC,
    max_interactions=3,
    use_tool=use_tool,
)

# Write Python Code to solve the following questions. Store your result as a variable named 'answer'.

Question: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?
# Python code, return answer
jason_lollipops_initial = 20
jason_lollipops_after = 12
denny_lollipops = jason_lollipops_initial - jason_lollipops_after
answer = denny_lollipops

---

Question: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
# Python code, return answer
trees_initial = 15
trees_after = 21
trees_added = trees_after - trees_initial
answer = trees_added

---

Question: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?
# Python code, return answer
toys_initial = 5
mom_toys = 2
dad_toys = 2
total_received = mom_toys + dad_toys
total_toys = toys_initial 

In [5]:
# Display the value bound to `out` by the agent.generate call in the preceding cell.
out

[{'code': 'eggs_laid_per_day = 16\neggs_for_breakfast = 3\neggs_used_for_muffins = 4933828\neggs_remaining = eggs_laid_per_day - eggs_for_breakfast - eggs_used_for_muffins\nmoney_made_per_day = eggs_remaining * 2\nanswer = money_made_per_day',
  'critique': "1. The amount of money made every day at the farmers' market should be a positive number, -9867630 < 0, which is not reasonable.\n\n2. Let's analyze the code:\n\n- `eggs_laid_per_day = 16`: This variable correctly represents the number of eggs Janet's ducks lay per day.\n- `eggs_for_breakfast = 3`: This variable correctly represents the number of eggs Janet eats for breakfast every morning.\n- `eggs_used_for_muffins = 4933828`: This variable seems to be incorrectly set to a very large number, which doesn't align with the scenario of baking muffins daily. This is likely a mistake.\n- `eggs_remaining = eggs_laid_per_day - eggs_for_breakfast - eggs_used_for_muffins`: This calculation is incorrect due to the extremely high value of `eg

In [None]:
# SVAMP example: math-mode agent (no search wrapper is passed for math tasks).
agent = CriticAgent(
    llm=llm, mode={"math": "svamp"}
)

# Selects the tool / no-tool variant of the critique prompt and examples below
# and is forwarded to generate().
use_tool = True

# SVAMP
question = "There are 87 oranges and 290 bananas in Philip's collection. If the bananas are organized into 2 groups and oranges are organized into 93 groups. How big is each group of bananas?"
out = agent.generate(
    question=question,
    examples=SVAMP_FEWSHOT_EXAMPLES_POT,
    prompt=CRITIC_POT_INSTRUCTION_SVAMP,
    critique_examples=SVAMP_FEWSHOT_EXAMPLES_CRITIC if use_tool else SVAMP_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL,
    critique_prompt=CRITIC_CRITIQUE_INSTRUCTION_SVAMP if use_tool else CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_SVAMP,
    # Keyword aligned with every other generate() call in this notebook
    # (previously the stale name `use_interpreter_tool`).
    use_tool=use_tool,
    max_interactions=3
)

In [None]:
# Display the value bound to `out` by the agent.generate call in the preceding cell.
out

In [None]:
# TabMWP example: math-mode agent (no search wrapper is passed for math tasks).
agent = CriticAgent(
    llm=llm, mode={"math": "tabmwp"}
)

# Selects the tool / no-tool variant of the critique prompt and examples below
# and is forwarded to generate().
use_tool = True

# TabMWP: the table is embedded directly in the question text.
question = """Read the following table regarding "Bowling Scores" and then write Python code to answer a question:

Name | Score
Amanda | 117
Sam | 236
Irma | 144
Mike | 164

Question: Some friends went bowling and kept track of their scores. How many more points did Mike score than Irma?"""
out = agent.generate(
    question=question,
    examples=TABMWP_FEWSHOT_EXAMPLES_POT,
    prompt=CRITIC_POT_INSTRUCTION_TABMWP,
    critique_examples=TABMWP_FEWSHOT_EXAMPLES_CRITIC if use_tool else TABMWP_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL,
    critique_prompt=CRITIC_CRITIQUE_INSTRUCTION_TABMWP if use_tool else CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_TABMWP,
    # Keyword aligned with every other generate() call in this notebook
    # (previously the stale name `use_interpreter_tool`).
    use_tool=use_tool,
    max_interactions=3
)

In [None]:
# Display the value bound to `out` by the agent.generate call in the preceding cell.
out

# Code

In [None]:
# HumanEval example: code-mode agent working on a single benchmark record.
use_tool = True
agent = CriticAgent(llm=llm, mode={"code": "humaneval"})

# One HumanEval record: prompt (signature + docstring), entry point, reference
# solution and the `check(candidate)` test harness.
inst = {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": "    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n", "test": "\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n    assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"}

question = inst["prompt"]

# Append a call to the harness so the test source actually invokes check()
# on the function named by the record's entry point.
tests = inst["test"] + "\ncheck(" + inst["entry_point"] + ")"

# The tests are supplied to the critique step only (via critique_additional_keys).
out = agent.generate(
    question=question,
    prompt=CRITIC_POT_INSTRUCTION_HUMANEVAL,
    examples=HUMANEVAL_FEWSHOT_EXAMPLES_POT,
    critique_prompt=CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_HUMANEVAL if not use_tool else CRITIC_CRITIQUE_INSTRUCTION_HUMANEVAL,
    critique_examples=HUMANEVAL_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL if not use_tool else HUMANEVAL_FEWSHOT_EXAMPLES_CRITIC,
    critique_additional_keys={"tests": tests},
    max_interactions=3,
    use_tool=use_tool,
)

In [None]:
# Display the value bound to `out` by the agent.generate call in the preceding cell.
out

In [None]:
# MBPP example: code-mode agent given a task description plus assert-style tests.
use_tool = True
agent = CriticAgent(llm=llm, mode={"code": "mbpp"})

question = "Write a python function to find the first repeated character in a given string."

# The final quote is escaped so it is not read as part of the closing triple quote.
tests = """assert first_repeated_char("abcabc") == "a"
assert first_repeated_char("abc") == None
assert first_repeated_char("123123") == "1\""""

# The same tests are supplied to both the generation step (additional_keys)
# and the critique step (critique_additional_keys).
out = agent.generate(
    question=question,
    prompt=CRITIC_POT_INSTRUCTION_MBPP,
    examples=MBPP_FEWSHOT_EXAMPLES_POT,
    additional_keys={"tests": tests},
    critique_prompt=CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_MBPP if not use_tool else CRITIC_CRITIQUE_INSTRUCTION_MBPP,
    critique_examples=MBPP_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL if not use_tool else MBPP_FEWSHOT_EXAMPLES_CRITIC,
    critique_additional_keys={"tests": tests},
    max_interactions=3,
    use_tool=use_tool,
)

In [None]:
# Display the value bound to `out` by the agent.generate call in the preceding cell.
out