In [None]:
from agential.cog.critic.agent import CriticAgent
from langchain_community.utilities.google_search import GoogleSearchAPIWrapper
from langchain_openai import ChatOpenAI

from agential.cog.fewshots.hotpotqa import (
    HOTPOTQA_FEWSHOT_EXAMPLES_COT,
    HOTPOTQA_FEWSHOT_EXAMPLES_DIRECT,
    HOTPOTQA_FEWSHOT_EXAMPLES_REACT,
)
from agential.cog.fewshots.fever import (
    FEVER_FEWSHOT_EXAMPLES_COT,
    FEVER_FEWSHOT_EXAMPLES_DIRECT,
    FEVER_FEWSHOT_EXAMPLES_REACT,
)
from agential.cog.fewshots.triviaqa import (
    TRIVIAQA_FEWSHOT_EXAMPLES_COT,
    TRIVIAQA_FEWSHOT_EXAMPLES_DIRECT,
    TRIVIAQA_FEWSHOT_EXAMPLES_REACT,
)
from agential.cog.fewshots.ambignq import (
    AMBIGNQ_FEWSHOT_EXAMPLES_COT,
    AMBIGNQ_FEWSHOT_EXAMPLES_DIRECT,
    AMBIGNQ_FEWSHOT_EXAMPLES_REACT,
)
from agential.cog.fewshots.gsm8k import (
    GSM8K_FEWSHOT_EXAMPLES_POT,
)
from agential.cog.fewshots.svamp import (
    SVAMP_FEWSHOT_EXAMPLES_POT,
)
from agential.cog.fewshots.tabmwp import (
    TABMWP_FEWSHOT_EXAMPLES_POT,
)
from agential.cog.fewshots.humaneval import (
    HUMANEVAL_FEWSHOT_EXAMPLES_POT,
)
from agential.cog.fewshots.mbpp import (
    MBPP_FEWSHOT_EXAMPLES_POT,
)

from agential.cog.critic.prompts import (
    # QA.
    CRITIC_INSTRUCTION_HOTPOTQA,
    CRITIC_CRITIQUE_INSTRUCTION_HOTPOTQA,
    HOTPOTQA_FEWSHOT_EXAMPLES_CRITIC,

    CRITIC_INSTRUCTION_FEVER,
    CRITIC_CRITIQUE_INSTRUCTION_FEVER,
    FEVER_FEWSHOT_EXAMPLES_CRITIC,

    CRITIC_INSTRUCTION_AMBIGNQ,
    CRITIC_CRITIQUE_INSTRUCTION_AMBIGNQ,
    AMBIGNQ_FEWSHOT_EXAMPLES_CRITIC,

    CRITIC_INSTRUCTION_TRIVIAQA,
    CRITIC_CRITIQUE_INSTRUCTION_TRIVIAQA,
    TRIVIAQA_FEWSHOT_EXAMPLES_CRITIC,

    # Math.
    CRITIC_POT_INSTRUCTION_GSM8K,
    CRITIC_CRITIQUE_INSTRUCTION_GSM8K,
    GSM8K_FEWSHOT_EXAMPLES_CRITIC,
    CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_GSM8K,
    GSM8K_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL,

    CRITIC_POT_INSTRUCTION_SVAMP,
    CRITIC_CRITIQUE_INSTRUCTION_SVAMP,
    SVAMP_FEWSHOT_EXAMPLES_CRITIC,
    CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_SVAMP,
    SVAMP_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL,

    CRITIC_POT_INSTRUCTION_TABMWP,
    CRITIC_CRITIQUE_INSTRUCTION_TABMWP,
    TABMWP_FEWSHOT_EXAMPLES_CRITIC,
    CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_TABMWP,
    TABMWP_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL,

    # Code.
    CRITIC_POT_INSTRUCTION_HUMANEVAL,
    CRITIC_CRITIQUE_INSTRUCTION_HUMANEVAL,
    HUMANEVAL_FEWSHOT_EXAMPLES_CRITIC,
    CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_HUMANEVAL,
    HUMANEVAL_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL,

    CRITIC_POT_INSTRUCTION_MBPP,
    CRITIC_CRITIQUE_INSTRUCTION_MBPP,
    MBPP_FEWSHOT_EXAMPLES_CRITIC,
    CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_MBPP,
    MBPP_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL,
)

import warnings

import dotenv

# Load variables from a local .env file into the process environment before
# any client is constructed (presumably the OpenAI / Google credentials — confirm).
dotenv.load_dotenv()

# Suppress every warning category for the rest of the notebook session.
warnings.filterwarnings("ignore")

# Shared objects reused by all example cells below.
llm = ChatOpenAI()
search = GoogleSearchAPIWrapper()

# QA

### HotpotQA

In [None]:
# HotpotQA example: run the CRITIC agent on one question with tool use enabled.
question = 'Who was once considered the best kick boxer in the world, however he has been involved in a number of controversies relating to his "unsportsmanlike conducts" in the sport and crimes of violence outside of the ring'
use_tool = True

# The same search-related settings are handed to both the constructor and
# generate(), so they are defined once and unpacked twice.
search_settings = {"evidence_length": 400, "num_results": 8}

agent = CriticAgent(llm=llm, benchmark="hotpotqa", search=search, **search_settings)

generate_args = {
    "question": question,
    # Other few-shot sets noted by the author:
    # HOTPOTQA_FEWSHOT_EXAMPLES_DIRECT, HOTPOTQA_FEWSHOT_EXAMPLES_REACT
    "examples": HOTPOTQA_FEWSHOT_EXAMPLES_COT,
    "prompt": CRITIC_INSTRUCTION_HOTPOTQA,
    "critique_examples": HOTPOTQA_FEWSHOT_EXAMPLES_CRITIC,
    "critique_prompt": CRITIC_CRITIQUE_INSTRUCTION_HOTPOTQA,
    "additional_keys": {},
    "critique_additional_keys": {},
    "max_interactions": 3,
    "use_tool": use_tool,
    "reset": True,
    **search_settings,
}
out = agent.generate(**generate_args)

In [None]:
# Display the value returned by agent.generate() for the HotpotQA example.
out

### FEVER

In [None]:
# FEVER example: run the CRITIC agent on one claim with tool use disabled.
question = "Nikolaj Coster-Waldau worked with the Fox Broadcasting Company."
use_tool = False

# The same search-related settings are handed to both the constructor and
# generate(), so they are defined once and unpacked twice.
search_settings = {"evidence_length": 400, "num_results": 8}

agent = CriticAgent(llm=llm, benchmark="fever", search=search, **search_settings)

generate_args = {
    "question": question,
    # Other few-shot sets noted by the author:
    # FEVER_FEWSHOT_EXAMPLES_DIRECT, FEVER_FEWSHOT_EXAMPLES_REACT
    "examples": FEVER_FEWSHOT_EXAMPLES_COT,
    "prompt": CRITIC_INSTRUCTION_FEVER,
    "critique_examples": FEVER_FEWSHOT_EXAMPLES_CRITIC,
    "critique_prompt": CRITIC_CRITIQUE_INSTRUCTION_FEVER,
    "additional_keys": {},
    "critique_additional_keys": {},
    "max_interactions": 3,
    "use_tool": use_tool,
    "reset": True,
    **search_settings,
}
out = agent.generate(**generate_args)

In [None]:
# Display the value returned by agent.generate() for the FEVER example.
out

### AmbigNQ

In [None]:
# AmbigNQ example: run the CRITIC agent on one question with tool use disabled.
question = "When did the simpsons first air on television?"
use_tool = False

# The same search-related settings are handed to both the constructor and
# generate(), so they are defined once and unpacked twice.
search_settings = {"evidence_length": 400, "num_results": 8}

agent = CriticAgent(llm=llm, benchmark="ambignq", search=search, **search_settings)

generate_args = {
    "question": question,
    # Other few-shot sets noted by the author:
    # AMBIGNQ_FEWSHOT_EXAMPLES_DIRECT, AMBIGNQ_FEWSHOT_EXAMPLES_REACT
    "examples": AMBIGNQ_FEWSHOT_EXAMPLES_COT,
    "prompt": CRITIC_INSTRUCTION_AMBIGNQ,
    "critique_examples": AMBIGNQ_FEWSHOT_EXAMPLES_CRITIC,
    "critique_prompt": CRITIC_CRITIQUE_INSTRUCTION_AMBIGNQ,
    "additional_keys": {},
    "critique_additional_keys": {},
    "max_interactions": 3,
    "use_tool": use_tool,
    "reset": True,
    **search_settings,
}
out = agent.generate(**generate_args)

In [None]:
# Display the value returned by agent.generate() for the AmbigNQ example.
out

### TriviaQA

In [None]:
# TriviaQA example: run the CRITIC agent on one question with tool use disabled.
question = "Which American-born Sinclair won the Nobel Prize for Literature in 1930?"
use_tool = False

# The same search-related settings are handed to both the constructor and
# generate(), so they are defined once and unpacked twice.
search_settings = {"evidence_length": 400, "num_results": 8}

agent = CriticAgent(llm=llm, benchmark="triviaqa", search=search, **search_settings)

generate_args = {
    "question": question,
    # Other few-shot sets noted by the author:
    # TRIVIAQA_FEWSHOT_EXAMPLES_DIRECT, TRIVIAQA_FEWSHOT_EXAMPLES_REACT
    "examples": TRIVIAQA_FEWSHOT_EXAMPLES_COT,
    "prompt": CRITIC_INSTRUCTION_TRIVIAQA,
    "critique_examples": TRIVIAQA_FEWSHOT_EXAMPLES_CRITIC,
    "critique_prompt": CRITIC_CRITIQUE_INSTRUCTION_TRIVIAQA,
    "additional_keys": {},
    "critique_additional_keys": {},
    "max_interactions": 3,
    "use_tool": use_tool,
    "reset": True,
    **search_settings,
}
out = agent.generate(**generate_args)

In [None]:
# Display the value returned by agent.generate() for the TriviaQA example.
out

# Math

### GSM8K

In [None]:
# GSM8K example: program-of-thought generation followed by CRITIC critique.
# NOTE(review): the literal 4933828 in the question looks like a deliberately
# perturbed (GSM-Hard-style) number rather than a typo — confirm it is intended.
question = "Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with 4933828. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?"
use_tool = True

# Choose the critique few-shots and instruction that match the tool setting.
if use_tool:
    critique_examples = GSM8K_FEWSHOT_EXAMPLES_CRITIC
    critique_prompt = CRITIC_CRITIQUE_INSTRUCTION_GSM8K
else:
    critique_examples = GSM8K_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL
    critique_prompt = CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_GSM8K

agent = CriticAgent(llm=llm, benchmark="gsm8k", patience=2)

out = agent.generate(
    question=question,
    examples=GSM8K_FEWSHOT_EXAMPLES_POT,
    prompt=CRITIC_POT_INSTRUCTION_GSM8K,
    critique_examples=critique_examples,
    critique_prompt=critique_prompt,
    additional_keys={},
    critique_additional_keys={},
    max_interactions=3,
    use_tool=use_tool,
    reset=True,
)

In [None]:
# Display the value returned by agent.generate() for the GSM8K example.
out

### SVAMP

In [None]:
# SVAMP example: program-of-thought generation followed by CRITIC critique.
question = "There are 87 oranges and 290 bananas in Philip's collection. If the bananas are organized into 2 groups and oranges are organized into 93 groups. How big is each group of bananas?"
use_tool = True

# Choose the critique few-shots and instruction that match the tool setting.
if use_tool:
    critique_examples = SVAMP_FEWSHOT_EXAMPLES_CRITIC
    critique_prompt = CRITIC_CRITIQUE_INSTRUCTION_SVAMP
else:
    critique_examples = SVAMP_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL
    critique_prompt = CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_SVAMP

agent = CriticAgent(llm=llm, benchmark="svamp", patience=2)

out = agent.generate(
    question=question,
    examples=SVAMP_FEWSHOT_EXAMPLES_POT,
    prompt=CRITIC_POT_INSTRUCTION_SVAMP,
    critique_examples=critique_examples,
    critique_prompt=critique_prompt,
    additional_keys={},
    critique_additional_keys={},
    max_interactions=3,
    use_tool=use_tool,
    reset=True,
)

In [None]:
# Display the value returned by agent.generate() for the SVAMP example.
out

### TabMWP

In [None]:
# TabMWP example: the question embeds a small table plus the task statement.
question = """Read the following table regarding "Bowling Scores" and then write Python code to answer a question:

Name | Score
Amanda | 117
Sam | 236
Irma | 144
Mike | 164

Question: Some friends went bowling and kept track of their scores. How many more points did Mike score than Irma?"""
use_tool = True

# Choose the critique few-shots and instruction that match the tool setting.
if use_tool:
    critique_examples = TABMWP_FEWSHOT_EXAMPLES_CRITIC
    critique_prompt = CRITIC_CRITIQUE_INSTRUCTION_TABMWP
else:
    critique_examples = TABMWP_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL
    critique_prompt = CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_TABMWP

agent = CriticAgent(llm=llm, benchmark="tabmwp", patience=2)

out = agent.generate(
    question=question,
    examples=TABMWP_FEWSHOT_EXAMPLES_POT,
    prompt=CRITIC_POT_INSTRUCTION_TABMWP,
    critique_examples=critique_examples,
    critique_prompt=critique_prompt,
    additional_keys={},
    critique_additional_keys={},
    max_interactions=3,
    use_tool=use_tool,
    reset=True,
)

In [None]:
# Display the value returned by agent.generate() for the TabMWP example.
out

# Code

### HumanEval

In [None]:
# HumanEval example: a single benchmark record (task HumanEval/0) kept inline.
inst = {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": "    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n", "test": "\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n    assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"}

# The function stub is the question; the test code is the record's check()
# definition followed by a call to it on the entry-point function name.
question = inst["prompt"]
test_body = inst["test"]
entry_point = inst["entry_point"]
tests = f"{test_body}\ncheck({entry_point})"

use_tool = False

# Choose the critique few-shots and instruction that match the tool setting.
if use_tool:
    critique_examples = HUMANEVAL_FEWSHOT_EXAMPLES_CRITIC
    critique_prompt = CRITIC_CRITIQUE_INSTRUCTION_HUMANEVAL
else:
    critique_examples = HUMANEVAL_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL
    critique_prompt = CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_HUMANEVAL

agent = CriticAgent(llm=llm, benchmark="humaneval")

# Only the critique step receives the tests here; additional_keys stays empty.
out = agent.generate(
    question=question,
    examples=HUMANEVAL_FEWSHOT_EXAMPLES_POT,
    prompt=CRITIC_POT_INSTRUCTION_HUMANEVAL,
    critique_examples=critique_examples,
    critique_prompt=critique_prompt,
    additional_keys={},
    critique_additional_keys={"tests": tests},
    max_interactions=3,
    use_tool=use_tool,
    reset=True,
)

In [None]:
# Display the value returned by agent.generate() for the HumanEval example.
out

### MBPP

In [None]:
# MBPP example: a natural-language task plus the assertions the code must pass.
question = "Write a python function to find the first repeated character in a given string."
# The trailing \" is an escaped quote so the literal can end in a double quote
# right before the closing triple quotes.
tests = """assert first_repeated_char("abcabc") == "a"
assert first_repeated_char("abc") == None
assert first_repeated_char("123123") == "1\""""

use_tool = True

# Choose the critique few-shots and instruction that match the tool setting.
if use_tool:
    critique_examples = MBPP_FEWSHOT_EXAMPLES_CRITIC
    critique_prompt = CRITIC_CRITIQUE_INSTRUCTION_MBPP
else:
    critique_examples = MBPP_FEWSHOT_EXAMPLES_CRITIC_NO_TOOL
    critique_prompt = CRITIC_CRITIQUE_NO_TOOL_INSTRUCTION_MBPP

agent = CriticAgent(llm=llm, benchmark="mbpp")

# Both the generation step and the critique step receive the tests.
# Separate dict objects are built for each, as in a direct call.
out = agent.generate(
    question=question,
    examples=MBPP_FEWSHOT_EXAMPLES_POT,
    prompt=CRITIC_POT_INSTRUCTION_MBPP,
    critique_examples=critique_examples,
    critique_prompt=critique_prompt,
    additional_keys={"tests": tests},
    critique_additional_keys={"tests": tests},
    max_interactions=3,
    use_tool=use_tool,
    reset=True,
)

In [None]:
# Display the value returned by agent.generate() for the MBPP example.
out