# Calculate Metrics on BIG-Bench

In [1]:
import pandas as pd
from collections import defaultdict
import random
import os
from openai import OpenAI
from google import genai
from google.genai import types
import anthropic
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
import re
import collections
import json
import copy

In [3]:
def load_json_file(file_path):
    """Load and parse a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or None when the file does not exist or
        cannot be decoded as UTF-8 JSON (an error message is printed in
        both cases).
    """
    try:
        # Read as UTF-8 explicitly instead of relying on the platform's
        # default text encoding, which differs between operating systems.
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
        return None
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Bytes that are not valid UTF-8 are reported the same way as
        # malformed JSON rather than escaping as an unhandled exception.
        print(f"Error: Invalid JSON format in: {file_path}")
        return None

# Load the augmented BIG-Bench records and preview the first one.
file_path = 'data/big_bench_augmented_with_steps.json'
data = load_json_file(file_path)

# load_json_file returns None on failure, so only preview when something was loaded.
if data:
    print(f"Question: {data[0]['question']}")
    print(f"Prompt CoT: {data[0]['prompt_cot']}")
    # record count, then the fields available on each record
    print(len(data), data[0].keys(), sep='\n')

Question: Keith is 5 feet tall so he is less likely to become an amateur basketball player than a horse jockey.
Prompt CoT: Evaluate if the following Q follows common sense. Answer 'True' or 'False'
Q: Keith is 5 feet tall so he is less likely to become an amateur basketball player than a horse jockey.
In answering this question each step should be on a separate line and start with a number and a period, followed by the reasoning. Finally the answer should be on a new line with the word 'Answer' proceeded by a colon.
A: Let's think step by step.
1200
dict_keys(['id', 'task', 'question', 'answer', 'prompt_direct', 'prompt_cot', 'response', 'steps'])


In [4]:
# System prompts for the LLM-as-judge tasks, keyed by the task name that is
# handed to query_llm_as_judge.
#
# These are plain (non-f) strings assembled by implicit concatenation of
# adjacent literals, so:
#   * every fragment must carry its own trailing space or newline, otherwise
#     neighbouring words/lines are glued together in the prompt;
#   * braces are written once -- "{{" is only an escape inside f-strings and
#     would reach the model here as two literal braces.
system_prompts = {
    # Classify a step as attribution, logical inference, or both.
    "type_label": (
        "You are an expert at analyzing each step in a multi-step answer. Given a step used to answer a question, "
        "your job is to decide whether it contains factual information that must be verified by external sources "
        "(Attribution step.), performs logical inference from past steps (Logical step.), or does both (Both.). "
        "Only output the answer, without explanation."
    ),
    # Few-shot judge: does the step (hypothesis) follow from the earlier steps (premise)?
    "correctness_label": (
        "You are a logic expert. Your job is to determine if a reasoning step (the hypothesis) follows logically "
        "from the previous steps (the premise) and the question. You can assume all statements are correct "
        "in previous steps. A logically correct step must be a valid inference from the given premises and question. "
        "If it is a valid logical inference, output 'Correct'. If it does not, output 'Incorrect'. "
        "Only output the answer, without explanation.\n\n"
        "Here are some examples:\n\n"
        "Human:\n"
        "Question: 'Is it possible to walk from France to Japan?'\n"
        "Premise: 'France and Japan are separated by thousands of kilometers and an ocean.'\n"
        "Hypothesis: 'No, you cannot walk from France to Japan.'\n"
        "Output: {Correct, Incorrect}\n"
        "Assistant:\n"
        "Correct\n\n"
        "Human:\n"
        "Question: 'Is this statement plausible: John threw a strike out at Little League on Sunday.'\n"
        "Premise: 'Little League is a youth baseball organization. Striking out is a common baseball term.'\n"
        "Hypothesis: 'No, the statement is not plausible.'\n"
        "Output: {Correct, Incorrect}\n"
        "Assistant:\n"
        "Incorrect"
    ),
    # Few-shot judge: does the step contribute to answering the question?
    "logic_relevance_label": (
        "You are a judge evaluating whether each step in a multi-step answer contributes "
        "meaningfully toward answering the question. Steps that are off-topic or irrelevant "
        "should be marked 'Not Relevant'. Others should be marked 'Relevant'. Only output the answer, without explanation.\n\n"
        "Here are some examples:\n\n"
        "Human:\n"
        "Question: 'Is it possible to walk from France to Japan?'\n"
        "Answer: 'France and Japan are separated by thousands of kilometers and an ocean. France has more walking areas than Japan. No, you cannot walk from France to Japan.'\n"
        "Step: France has more walking areas than Japan.\n"
        "Is this step relevant to answering the question? Output: {Relevant, Not Relevant}\n"
        "Assistant:\n"
        "Not Relevant\n\n"
        "Human:\n"
        "Question: 'Is this statement plausible: John threw a strike out at Little League on Sunday.'\n"
        "Answer: 'Little League is a youth baseball organization. Striking out is a common baseball term. No, the statement is not plausible.'\n"
        "Step: Striking out is a common baseball term.\n"
        "Is this step relevant to answering the question? Output: {Relevant, Not Relevant}\n"
        "Assistant:\n"
        "Relevant\n\n"
        "Human:\n"
        "Question: 'Is it possible to walk from France to Japan?'\n"
        "Answer: 'France and Japan are separated by thousands of kilometers and an ocean. France has more walking areas than Japan. So the answer is No.'\n"
        "Step: So the answer is No.\n"
        "Is this step relevant to answering the question? Output: {Relevant, Not Relevant}\n"
        "Assistant:\n"
        "Relevant\n\n"
    ),
    # Self-reflection variant of the relevance judge (free-form answer, label at the end).
    "logic_relevance_label_reflect": (
        "You are a logic expert reviewing your own reasoning. You must evaluate whether each step in a multi-step answer contributes "
        "meaningfully toward answering the question. Steps that are off-topic, redundant, or irrelevant "
        "should be marked 'Not Relevant'. Others should be marked 'Relevant'. You previously said this step is relevant. "
        "I want you to reflect carefully: Is this step relevant to answering the question? Briefly explain your answer then "
        "at the end, output your final judgment clearly as either: Relevant or Not Relevant."
    ),
    # Whole-answer check against supplied evidence.
    "answer_is_logically_correct": (
        "You are a fact-checking and logic expert. Given a question, a multi-step answer, and supporting evidence, "
        "determine whether the answer is fully correct both logically and factually. Refer to the provided evidence, if relevant. "
        "Output 'True' if correct or 'False' if incorrect. Only output the answer, without explanation."
    ),
    # Self-reflection variants of the correctness judge.
    "correctness_label_reflect": (
        "You are a logic expert reviewing your own reasoning. You claimed that the following reasoning step logically follows "
        "from earlier steps (the premise) and the question being asked. "
        "However, you may be wrong, and people are doubting your reasoning. I want you to reflect carefully: "
        "Does the hypothesis follow logically from the question and the premise? Assume all earlier steps are factually correct. "
        "Defend your reasoning if it is valid. Otherwise, admit the error. Don't overthink. "
        "At the end, output your final judgment clearly as either: Correct or Incorrect."
    ),
    "correctness_label_reflect_confidence": (
        "You are assessing the validity of your own logical step. Given the question and the preceding, "
        "factually sound steps (the premise), you previously stated that the following step logically follows. "
        "Now, critically evaluate the strength of this logical connection. On a scale of 1 to 10 (where 1 is "
        "'not at all' and 10 is 'absolutely certain'), how confident are you that the hypothesis logically follows? Is it "
        "a valid inference and is it doing everything correct? Explain your confidence score in 1-3 sentences. Finally, state your overall judgment: Correct or Incorrect."
    ),
    "correctness_label_reflect_law": (
        "You are defending yourself in court regarding a statement you made. You claimed that the following reasoning step logically follows "
        "from earlier steps (the premise) and the question being asked. "
        "Assuming that all earlier steps are factually correct, is the inference you made in this step correct? "
        "Provide your defense. Remember, you cannot lie on the stand. Defend your reasoning if it is valid. Otherwise, admit the error. "
        "Don't overthink. At the end, output your final judgment to the jury clearly as either: Correct or Incorrect. "
    ),
    # Does the final answer agree with the reasoning steps?
    "alignment": (
        "You are a judge evaluating whether a set of steps in a multi-step answer is in alignment "
        "with the answer to the question. Outputs where the answer is not derived from the steps should be marked "
        "as No, and outputs where the answer does come in alignment with the steps should be marked as Yes. Only output the answer "
        "without explanation."
    ),
    # Placeholder: the resilience judge prompt has not been written yet.
    "resilience": (
        "TODO"
    ),
}

In [11]:
def query_llm_as_judge(prompt, task, client, prompts, model_family = 'openai', model_type = 'gpt-3.5-turbo'):
    """Send one judge prompt to an LLM provider and return its text reply.

    Args:
        prompt: Text sent as the user turn.
        task: Key into `prompts` selecting the system prompt.
        client: API client object matching `model_family`.
        prompts: Mapping of task name -> system prompt text.
        model_family: Provider label. Matched case-insensitively by substring
            ('openai', 'anthropic' or 'gemini'), so labels such as 'openai2'
            are accepted.
        model_type: Model name forwarded to the provider.

    Returns:
        The reply with surrounding whitespace removed ('' when the provider
        returned no text), or None when `model_family` matches no known
        provider.

    Raises:
        KeyError: if `task` is not a key of `prompts`.
    """
    model_family = model_family.lower()
    system_prompt = prompts[task]

    if 'openai' in model_family:
        response = client.chat.completions.create(
            model=model_type,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ]
        )
        # content may be None (e.g. a refusal); handled by the normalisation below
        text = response.choices[0].message.content

    elif 'anthropic' in model_family:
        response = client.messages.create(
            model=model_type,
            max_tokens=1000,
            system=system_prompt,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        # guard against a reply that carries no content blocks
        text = response.content[0].text if response.content else None

    elif 'gemini' in model_family:
        response = client.models.generate_content(
            model=model_type,
            config=types.GenerateContentConfig(system_instruction=system_prompt),
            contents=prompt
        )
        text = response.text

    else:
        # unknown provider label: keep the original "no result" signal
        return None

    # Normalise identically for every provider so that labels from different
    # judges compare equal when they are tallied.
    return (text or "").strip()

In [6]:
def logic_correctness_prompt(question, previous_steps, step):
    """Build the user prompt asking whether `step` follows from `previous_steps`.

    Every line after the first carries 12 leading spaces; they are part of
    the text that is returned.
    """
    pad = " " * 12
    body = [
        f"Question: '{question}'",
        f"Premise: '{previous_steps}'",
        "Is the following hypothesis a correct logical inference based on the premise and question being asked?",
        f"Hypothesis: '{step}'",
        "Output: {Correct, Incorrect}",
    ]
    return "Logic Task\n" + "\n".join(pad + line for line in body)

def relevance_prompt(question, cot_answer, step):
    """Build the user prompt asking whether `step` is relevant to `question`.

    `cot_answer` is the full multi-step answer that `step` belongs to. Lines
    after the first are prefixed with 12 spaces, which are part of the
    returned text.
    """
    template = (
        "Relevance Task\n"
        "{pad}Question: {q}\n"
        "{pad}Answer: {a}\n"
        "{pad}Step: {s}\n"
        "{pad}Is this step relevant to answering the question?\n"
        "{pad}Output: {{Relevant, Not Relevant}}"
    )
    return template.format(pad=" " * 12, q=question, a=cot_answer, s=step)

def reasoning_answer_alignment_prompt(question, reasoning_steps, answer):
    """Build the user prompt asking whether `reasoning_steps` agree with `answer`.

    The text has two parts separated by one empty line: the task description
    (question, steps, answer) and the yes/no query. Lines after the first are
    prefixed with 12 spaces, which are part of the returned text.
    """
    pad = " " * 12
    description = (
        f"Reasoning Task\n"
        f"{pad}Question: '{question}'\n"
        f"{pad}Reasoning steps: '{reasoning_steps}'\n"
        f"{pad}Answer: '{answer}'\n"
    )
    query = (
        f"{pad}Are the reasoning steps in alignment with the answer?\n"
        f"{pad}Output: {{Yes, No}}"
    )
    return description + "\n" + query

def resilience_prompt(question, question_a, question_b, cot, cot_a, cot_b):
    """Placeholder for the resilience prompt builder.

    Not implemented yet: all arguments are ignored and the literal string
    "TODO" is returned.
    """
    placeholder = "TODO"
    return placeholder

In [7]:
def extract_answer_from_response(text, output_type='correctness'):
    """Extract the label that follows "Output:" in a free-form judge reply.

    Args:
        text: Raw reply text. None or an empty string yields None.
        output_type: 'correctness' to look for Correct / Incorrect,
            'relevance' to look for Relevant / Not Relevant.

    Returns:
        'Correct', 'Incorrect', 'Relevant' or 'Not Relevant' (always in this
        exact casing, whatever casing the reply used), or None when no label
        is found or `output_type` is not recognised.
    """
    patterns = {
        'correctness': r"Output:\s*(Correct|Incorrect)",
        'relevance': r"Output:\s*(Relevant|Not Relevant)",
    }
    pattern = patterns.get(output_type)
    if not text or pattern is None:
        return None

    match = re.search(pattern, text, re.IGNORECASE)
    if match is None:
        return None

    # title() rather than capitalize(): capitalize() lowercases everything
    # after the first character and would turn 'Not Relevant' into
    # 'Not relevant'.
    return match.group(1).title()

def evaluate_cot_steps_ensemble(data, clients, model_types):
    """
    Judge every chain-of-thought (CoT) step with an ensemble of LLM judges.

    Args:
        data: list of records; each record is a dict with 'id', 'question' and
            'steps' ({answer_model: [step, ...]}). Records whose id ends in 'a'
            or 'b' are augmented variants; each base record must be directly
            followed by two more records, which are used as its variants.
        clients: dict of {model_family: client}
        model_types: dict of {model_family: model_name}
    Returns:
        DataFrame with one row per (base record, answer model, step). Each row
        holds the raw per-judge predictions (dict keyed by model_family) and
        their majority vote for the step-level metrics (logic correctness,
        relevance) and for the chain-level metrics (alignment, resilience;
        repeated on every step row of the same chain).
    Raises:
        IndexError: if a base record is not followed by two more records.
    """

    def majority_vote(preds):
        # normalise before tallying: lowercase, drop punctuation, trim whitespace
        cleaned = [re.sub(r'[^\w\s]', '', v.lower()).strip() for v in preds]
        tally = collections.Counter(cleaned)

        if len(tally) == 0:
            return None
        return tally.most_common(1)[0][0]  # Most frequent prediction

    def collect_judgments(prompt_text, task):
        """Ask every judge in `clients` and return {model_family: reply} for non-empty replies."""
        preds = {}
        for model_family, client in clients.items():
            model_type = model_types[model_family]
            output = query_llm_as_judge(prompt_text, task, client, system_prompts, model_family, model_type)
            # When a self-reflection system prompt is used, the reply is free-form
            # and should first go through extract_answer_from_response.
            # A missing reply (None) is treated like an empty one instead of crashing.
            output = (output or "").replace("\n", "")
            if output:
                preds[model_family] = output
        return preds

    results = []

    for idx, record in enumerate(data):
        if idx % 10 == 0:
            print(f"Processing {idx}/{len(data)}")

        record_id = record['id']
        question = record['question']
        step_info = record['steps']

        if str(record_id).endswith(('a', 'b')):
            # augmented data: skip, it is consumed together with its base record
            continue

        if idx + 2 >= len(data):
            raise IndexError(
                f"Record {record_id!r} at position {idx} is not followed by its two augmented variants"
            )

        # store augmented data: used for resilience metric
        question_a = data[idx + 1]['question']
        question_b = data[idx + 2]['question']
        step_info_a = data[idx + 1]['steps']
        step_info_b = data[idx + 2]['steps']

        for answer_model, steps in step_info.items():  # outputs from different models
            if not steps:
                # nothing to judge, and steps[-1] below would fail
                continue

            full_cot = " ".join(steps)
            full_cot_a = " ".join(step_info_a[answer_model])
            full_cot_b = " ".join(step_info_b[answer_model])

            # step-level metrics: logic correctness and relevance
            step_rows = []
            for step_idx, step_text in enumerate(steps):
                prev_steps = ' '.join(steps[:step_idx])

                logic_preds = collect_judgments(
                    logic_correctness_prompt(question, prev_steps, step_text), "correctness_label"
                )
                relevance_preds = collect_judgments(
                    relevance_prompt(question, full_cot, step_text), "logic_relevance_label"
                )

                step_rows.append({
                    'id': record_id,
                    'question': question,
                    'answer_model': answer_model,
                    'step_index': step_idx,
                    'step': step_text,
                    'logic_preds': logic_preds,
                    'logic_correctness': majority_vote(list(logic_preds.values())),
                    'relevance_preds': relevance_preds,
                    'relevance': majority_vote(list(relevance_preds.values())),
                })

            # chain-level metric: do the steps before the last one support the last step?
            alignment_preds = collect_judgments(
                reasoning_answer_alignment_prompt(question, ' '.join(steps[:-1]), steps[-1]), "alignment"
            )

            # chain-level metric: resilience across the augmented questions.
            # NOTE: resilience_prompt and system_prompts["resilience"] are still
            # "TODO" placeholders, so these judgments are not meaningful yet.
            resilience_preds = collect_judgments(
                resilience_prompt(question, question_a, question_b, full_cot, full_cot_a, full_cot_b),
                "resilience"
            )

            chain_fields = {
                'alignment_preds': alignment_preds,
                'alignment': majority_vote(list(alignment_preds.values())),
                'resilience_preds': resilience_preds,
                'resilience': majority_vote(list(resilience_preds.values())),
            }
            results.extend({**row, **chain_fields} for row in step_rows)

    # build the frame once from the accumulated records
    return pd.DataFrame(results)

In [12]:
# Judge ensemble: label -> model name. Each label must contain the provider
# name ('openai', 'anthropic' or 'gemini') because query_llm_as_judge picks
# the API by substring match on the label.
models_dict = {
    'openai1': 'gpt-3.5-turbo',
    'openai2': 'gpt-4-turbo',
    'anthropic': 'claude-3-haiku-20240307',
    'gemini1': 'gemini-1.5-flash',
    'gemini2': 'gemini-2.0-flash'
}

# API Keys
# Read from the environment so that no secret is stored in the notebook.
# A missing variable raises KeyError here, before any API call is made.
# SECURITY: any key that was ever hard-coded in a saved or shared copy of this
# notebook must be treated as leaked: revoke it and issue a new one.
MY_OPENAI_KEY = os.environ['OPENAI_API_KEY']
MY_ANTHROPIC_KEY = os.environ['ANTHROPIC_API_KEY']
MY_GEMINI_KEY = os.environ['GEMINI_API_KEY']

# Connect to APIs
gpt_client = OpenAI(api_key = MY_OPENAI_KEY)
claude_client = anthropic.Anthropic(api_key = MY_ANTHROPIC_KEY)
gemini_client = genai.Client(api_key=MY_GEMINI_KEY)

# TODO: Change clients_dict and models_dict based on the LLMs you want to evaluate
# TODO: If no majority vote ensembling, just choose one model
# Labels here must match the keys of models_dict; labels of the same provider share one client.
clients_dict = {'openai1': gpt_client, 'openai2': gpt_client, 'anthropic': claude_client, 'gemini1': gemini_client, 'gemini2': gemini_client}

evaluate_cot_steps_ensemble(data, clients_dict, models_dict)

Processing 0/1200


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}