In [7]:
!pip install openai
!pip install anthropic
!pip install -q -U google-genai
!pip install -U "huggingface_hub[cli]"

Collecting huggingface_hub[cli]
  Downloading huggingface_hub-0.32.0-py3-none-any.whl (509 kB)
[K     |████████████████████████████████| 509 kB 2.8 MB/s eta 0:00:01
Collecting hf-xet<2.0.0,>=1.1.2
  Downloading hf_xet-1.1.2-cp37-abi3-macosx_10_12_x86_64.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 31.3 MB/s eta 0:00:01
Collecting fsspec>=2023.5.0
  Downloading fsspec-2025.5.1-py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 103.0 MB/s eta 0:00:01
Collecting InquirerPy==0.3.4
  Downloading InquirerPy-0.3.4-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 19.9 MB/s eta 0:00:01
Collecting pfzy<0.4.0,>=0.3.1
  Downloading pfzy-0.3.4-py3-none-any.whl (8.5 kB)
Installing collected packages: pfzy, hf-xet, fsspec, InquirerPy, huggingface-hub
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2021.8.1
    Uninstalling fsspec-2021.8.1:
      Successfully uninstalled fsspec-2021.8.1
Successfully installed I

In [13]:
import pandas as pd
from collections import defaultdict
import random
from openai import OpenAI
from google import genai
from google.genai import types
import anthropic
from sklearn.metrics import classification_report
import re
import collections
from huggingface_hub import notebook_login

## Load in REVEAL dataset

In [None]:
# Login using e.g. `huggingface-cli login` to access this dataset
# TODO: GET A TOKEN FOR HUGGINGFACE

# Use this if on Google Colab
!huggingface-cli login

In [14]:
# Use this if on VS Code / Jupyter
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
# Load in REVEAL dataset
splits = {'eval': 'eval/reveal_eval.csv', 'open': 'open/reveal_open.csv'}
df = pd.read_csv('hf://datasets/google/reveal/' + splits['eval'])
df = df.drop_duplicates(subset=['answer_id', 'step_idx']).reset_index(drop=True)

In [16]:
print(f"Datasets in eval set: {df['dataset'].unique()}")
print(f"Models tested in eval set: {df['answer_model'].unique()}")
print(f"Number of questions in eval set: {len(df['question_id'].unique())}")
print(f"Number of answers in eval set: {len(df['answer_id'].unique())}")

Datasets in eval set: ['fermi' 'musique' 'sports' 'strategy_qa']
Models tested in eval set: ['GPT-3' 'Flan-PaLM-540B' 'Flan-UL2-20B']
Number of questions in eval set: 704
Number of answers in eval set: 1002


In [17]:
# What are the values in the relevant columns?
group_keys = ['question_id', 'answer_id', 'answer_model']
step_keys = ['question', 'step', 'step_idx']
metric_cols = [
    'type_label', 'logic_relevance_label', 'attribution_relevance_label',
    'attribution_label', 'correctness_label', 'evidence',
    'answer_is_fully_attributable', 'answer_is_logically_correct'
]
justification_cols = ['logic_justifications', 'attribution_justifications']

for col in metric_cols:
  if col == 'evidence':
    continue
  print(f"{col}: {df[col].unique()}")

type_label: ['Attribution step.' 'Logical step.' 'Both.']
logic_relevance_label: ['Relevant' 'Not relevant']
attribution_relevance_label: ['Yes' nan 'No']
attribution_label: ['Partially' 'Unsupported' nan 'Fully' 'Contradictory']
correctness_label: [nan 'Incorrect' 'Correct']
answer_is_fully_attributable: [False  True]
answer_is_logically_correct: [False  True]


In [18]:
# Create nested dictionary. Each key is a tuple (question_id, answer_model, answer_id). Values are a list of steps and their labels.
grouped_data = defaultdict(list)
for _, row in df.iterrows():
    group_id = tuple(row[k] for k in group_keys)
    step_info = {k: row[k] for k in step_keys + metric_cols + justification_cols}
    grouped_data[group_id].append(step_info)

In [19]:
# Example of random question and CoT steps with annotations
random_key = random.choice(list(grouped_data.keys()))
step_info = grouped_data[random_key]

print(f"Question: {step_info[0]['question']}")
print(f"Answer Model: {random_key[1]}")

for i in range(len(step_info)):
  print('-----')
  print(f"Step {step_info[i]['step_idx']}")
  print(f"Step Text: {step_info[i]['step']}")

Question: Is the following sentence plausible? "David Luiz shot with the left foot."
Answer Model: sports/366//Flan-PaLM-540B
-----
Step 1
Step Text: David Luiz is a soccer player.
-----
Step 2
Step Text: Soccer players can shoot with the left foot.
-----
Step 3
Step Text: So the answer is yes.


In [20]:
# Example of random question and CoT steps with annotations
random_key = random.choice(list(grouped_data.keys()))
step_info = grouped_data[random_key]

print(f"Question: {step_info[0]['question']}")
print(f"Answer Model: {random_key[1]}")

for i in range(len(step_info)):
  print('-----')
  print(f"Step {step_info[i]['step_idx']}")
  print(f"Step Text: {step_info[i]['step']}")
  print(f"Step Type: {step_info[i]['type_label']}")
  # Does step i bring new information or describe a logical step?
  # (Attribution step. / Logical Step. / Both.)

  print('\n>> TASK 1: LOGIC ANNOTATION')
  print(f"- Logical Relevance to Question: {step_info[i]['logic_relevance_label']}")
  # Is this step relevant with respect to answering the question?
  # Irrelevant steps do not invalidate the chain's correctness.
  # (Relevant / Not Relevant)

  print(f"- Logical Correctness: {step_info[i]['correctness_label']}")
  # Considering only the logical inference done in step i, is it consistent with the previous steps?
  # (Correct / Incorrect)
  # The correctness of logical steps that follow incorrect logical steps is undefined.
  # Does not apply to attribution steps.

  print(f"- Logic Justification: {step_info[i]['logic_justifications']}")

  print('\n>> TASK 2: ATTRIBUTION ANNOTATION')
  print(f"- Evidence: {step_info[i].get('evidence', '[None]')}")
  # Retrieved Wikipedia paragraphs given as evidence if this is an attribution step.
  # Does not apply to logic steps.

  print(f"- Attribution Needed: {step_info[i]['attribution_relevance_label']}")
  # Should claim i be attributed? Does the claim have information that needs to be verified?
  # (Yes / No). Does not apply to logic steps.

  print(f"- Attribution Label: {step_info[i]['attribution_label']}")
  # To what extent can the information in claim i be verified by evidence j?
  # ('Partially'/ 'Unsupported' / 'Fully' / 'Contradictory')
  # Does not apply to logic steps.

  print(f"- Attribution Justification: {step_info[i]['attribution_justifications']}\n")

  # If this is the final step, print answer-level labels
  if i == len(step_info) - 1:
    print('-----')
    print("=== FINAL ANSWER ANNOTATIONS ===")
    print(f"- Fully Attributable: {step_info[i]['answer_is_fully_attributable']}")
    # Is the final answer fully attributable? (True / False)

    print(f"- Logically Correct: {step_info[i]['answer_is_logically_correct']}")
    # Is the answer fully logically correct? (True / False)


Question: Is the following sentence plausible? "Tobias Harris fumbled the ball in the NFC divisional round."
Answer Model: sports/137//Flan-UL2-20B
-----
Step 1
Step Text: Tobias Harris is a basketball player.
Step Type: Attribution step.

>> TASK 1: LOGIC ANNOTATION
- Logical Relevance to Question: Relevant
- Logical Correctness: nan
- Logic Justification: ['The question is about Tobias Harris, so it could be relevant that is a basketball player.\n', 'Step one states who Harris is ', 'brings new info on what sport Harris plays.', 'Relevant because it provides the sport that Harris plays, which is necessary to know to answer the question.', 'Step 1 adds that Tobias Harris is a basketball player.']

>> TASK 2: ATTRIBUTION ANNOTATION
- Evidence: Tobias Harris: Tobias Harris (born July 15, 1992) is an American professional basketball player for the Philadelphia 76ers of the National Basketball Association (NBA). He played one season of college basketball for the Tennessee Volunteers befor

## Set up OpenAI API

In [21]:
def query_llm_as_judge(prompt, task, client, prompts, model_family = 'openai', model_type = 'gpt-3.5-turbo'):
    res = None
    model_family = model_family.lower()
    system_prompt = prompts[task]
    if model_family == 'openai':
        response = client.chat.completions.create(
            model=model_type,
            messages= [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ]
        )
        res = response.choices[0].message.content.strip()

    elif model_family == 'anthropic':
        response = client.messages.create(
            model=model_type,
            max_tokens=1000,
            system=system_prompt,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        res = response.content[0].text

    elif model_family == 'gemini':
        response = client.models.generate_content(
            model=model_type,
            config=types.GenerateContentConfig(system_instruction=system_prompt),
            contents=prompt
        )
        res =  response.text

    return res

## Evaluate REVEAL

In [22]:
system_prompts = {
    "type_label":
      ("You are an expert at analyzing each step in a multi-step answer. Given a step used to answer a question,"
      "your job is to decide whether it contains factual information that must be verified by external sources"
      "(Attribution step.), performs logical inference from past steps (Logical step.), or does both (Both.). "
      "Only output the answer, without explanation."),
    "correctness_label":
      (
          "You are a logic expert. Your job is to determine if a reasoning step (the hypothesis) follows logically "
          "from the previous steps (the premise) and the question. You can assume all statements are correct "
          "in previous steps. A logically correct step must be a valid inference from the given premises and question. "
          "If it is a valid logical inference, output 'Correct'. If it does not, output 'Incorrect'. "
          "Only output the answer, without explanation.\n\n"
          "Here are some examples:\n\n"
          "Human:\n"
          "Question: 'Is it possible to walk from France to Japan?'\n"
          "Premise: 'France and Japan are separated by thousands of kilometers and an ocean.'\n"
          "Hypothesis: 'No, you cannot walk from France to Japan.'\n"
          "Output: {{Correct, Incorrect}}"
          "Assistant:\n"
          "Correct\n\n"
          "Human:\n"
          "Question: 'Is this statement plausible: John threw a strike out at Little League on Sunday.'\n"
          "Premise: 'Little League is a youth baseball organization. Striking out is a common baseball term.'\n"
          "Hypothesis: 'No, the statement is not plausible.'\n"
          "Output: {{Correct, Incorrect}}"
          "Assistant:\n"
          "Incorrect"
      ),
    "logic_relevance_label":
       (
          "You are a judge evaluating whether each step in a multi-step answer contributes "
          "meaningfully toward answering the question. Steps that are off-topic or irrelevant "
          "should be marked 'Not Relevant'. Others should be marked 'Relevant'. Only output the answer, without explanation."
          "Here are some examples:\n\n"
          "Human:\n"
          "Question: 'Is it possible to walk from France to Japan?'\n"
          "Answer: 'France and Japan are separated by thousands of kilometers and an ocean. France has more walking areas than Japan. No, you cannot walk from France to Japan.'\n"
          "Step: France has more walking areas than Japan."
          "Is this step relevant answering the question? Output: {{Relevant, Not Relevant}}"
          "Assistant:\n"
          "Not Relevant\n\n"
          "Human:\n"
          "Question: 'Is this statement plausible: John threw a strike out at Little League on Sunday.'\n"
          "Answer: 'Little League is a youth baseball organization. Striking out is a common baseball term. No, the statement is not plausible.'\n"
          "Step: Striking out is a common baseball term."
          "Is this step relevant to answering the question? Output: {{Relevant, Not Relevant}}"
          "Assistant:\n"
          "Relevant\n\n"
          "Question: 'Is it possible to walk from France to Japan?'\n"
          "Answer: 'France and Japan are separated by thousands of kilometers and an ocean. France has more walking areas than Japan. So the answer is No.'\n"
          "Step: So the answer is No."
          "Is this step relevant to answering the quesiton? Output: {{Relevant, Not Relevant}}"
          "Assistant:\n"
          "Relevant\n\n"
        ),
    "logic_relevance_label_reflect": (
         "You are a logic expert reviewing your own reasoning. You must evaluate whether each step in a multi-step answer contributes "
        "meaningfully toward answering the question. Steps that are off-topic, redundant, or irrelevant "
        "should be marked 'Not Relevant'. Others should be marked 'Relevant'. You previously said this step is relevant. "
        "I want you to reflect carefully: Is this step relevant to answering the question? Briefly explain your answer then "
        "at the end, output your final judgment clearly as either: Relevant or Not Relevant."
    ),
    "answer_is_logically_correct": (
        "You are a fact-checking and logic expert. Given a question, a multi-step answer, and supporting evidence, "
        "determine whether the answer is fully correct both logically and factually. Refer to the provided evidence, if relevant. "
        "Output 'True' if correct or 'False' if incorrect. Only output the answer, without explanation."
    ),
    "correctness_label_reflect": (
        "You are a logic expert reviewing your own reasoning. You claimed that the following reasoning step logically follows "
        "from earlier steps (the premise) and the question being asked. "
        "However, you may be wrong, and people are doubting your reasoning. I want you to reflect carefully: "
        "Does the hypothesis follow logically from the question and the premise? Assume all earlier steps are factually correct."
        "Defend your reasoning if it is valid. Otherwise, admit the error. Don't overthink. "
        "At the end, output your final judgment clearly as either: Correct or Incorrect."
    ),
    "correctness_label_reflect_confidence": (
        "You are assessing the validity of your own logical step. Given the question and the preceding, "
        "factually sound steps (the premise), you previously stated that the following step logically follows. "
        "Now, critically evaluate the strength of this logical connection. On a scale of 1 to 10 (where 1 is "
        "'not at all' and 10 is 'absolutely certain'), how confident are you that the hypothesis logically follows? Is it "
        "a valid inference and is it doing everything correct? Explain your confidence score in 1-3 sentences. Finally, state your overall judgment: Correct or Incorrect."
    ),
    "correctness_label_reflect_law": (
        "You are defending yourself in court regarding a statement you made. You claimed that the following reasoning step logically follows "
        "from earlier steps (the premise) and the question being asked. "
        "Assuming that all earlier steps are factually correct, is the inference you made in this step correct?"
        "Provide your defense. Remember, you cannot lie on the stand. Defend your reasoning if it is valid. Otherwise, admit the error. "
        "Don't overthink. At the end, output your final judgment to the jury clearly as either: Correct or Incorrect. "
    ),
    "alignment": (
        "You are a judge evaluating whether a set of steps in a multi-step answer is in alignment "
        "with the answer to the question. Outputs where the answer is not derived from the steps should be marked "
        "as No, and outputs where the answer does come in alignment with the steps should be marked as Yes. Only output the answer "
        "without explanation."
    ),
}

In [23]:
def step_type_prompt(question, previous_steps, step):
    return f"""Step Type Task
            Question: {question}
            Past Steps: {previous_steps}
            Step: {step}
            Step type: {{Attribution step., Logical step., Both.}}"""

def logic_correctness_prompt(question, previous_steps, step):
    return f"""Logic Task
            Question: '{question}'
            Premise: '{previous_steps}'
            Is the following hypothesis a correct logical inference based on the premise and question being asked?
            Hypothesis: '{step}'
            Output: {{Correct, Incorrect}}"""

def logic_correctness_reflect_prompt(question, previous_steps, step):
    return f"""Self-Critique Logic Task
            Question: '{question}'
            Premise: '{previous_steps}'

            Is the following a correct logical inference based on the premise and question being asked?
            You can use your external knowledge as long as it doesn't contradict the premise.
            Hypothesis: '{step}'

            Briefly explain your answer then clearly state your output in a new line in this format:
            Output: {{Correct, Incorrect}}"""

def relevance_prompt(question, cot_answer, step):
    return f"""Relevance Task
            Question: {question}
            Answer: {cot_answer}
            Step: {step}
            Is this step relevant to answering the question?
            Output: {{Relevant, Not Relevant}}"""

def relevance_reflect_prompt(question, cot_answer, step):
    return f"""Self-Critique Relevance Task
            Question: '{question}'
            Answer: '{cot_answer}'

            Is the following step relevant to the answer?
            Step: '{step}'

            Briefly explain your answer then clearly state your output in a new line in this format:
            Output: {{Relevant, Not Relevant}}"""

def cot_correctness_prompt(evidence_list, question, cot_answer):
    # Create formatted list of evidence blocks
    evidence_section = "\n".join([f"Evidence {i+1}: {e}" for i, e in enumerate(evidence_list)])
    return f"""CoT Correctness task
              {evidence_section}
              Question: {question}
              Answer: {cot_answer}
              The answer is: {{True, False}}"""

def reasoning_answer_alignment_prompt(question, reasoning_steps, answer):
    return f"""Reasoning Task
            Question: '{question}'
            Reasoning steps: '{reasoning_steps}'
            Answer: '{answer}'

            Are the reasoning steps in alignment with the answer?
            Output: {{Yes, No}}"""

In [24]:
def extract_answer_from_response(text, output_type='correctness'):
    if output_type == 'correctness':
        match = re.search(r"Output:\s*(Correct|Incorrect)", text, re.IGNORECASE)
        if match:
            return match.group(1).capitalize()  # Returns 'Correct' or 'Incorrect'
    elif output_type == 'relevance':
        match = re.search(r"Output:\s*(Relevant|Not Relevant)", text, re.IGNORECASE)
        if match:
            return match.group(1).capitalize()  # Returns 'Relevant' or 'Not Relevant'
    return None

def evaluate_cot_steps_ensemble(grouped_data, clients, model_types):
    """
    Args:
        grouped_data: dict with (question_id, answer_id, answer_model) -> steps
        clients: dict of {model_family: client}
        model_types: dict of {model_family: model_name}
    Returns:
        DataFrame with step-wise and final predictions using majority vote
    """

    def majority_vote(preds):
        # strip all punctuation including new lines and make lowercase
        preds = [v.lower() for v in preds]
        preds = [re.sub(r'[^\w\s]', '', v) for v in preds]
        tally = collections.Counter(preds)

        if len(tally) == 0:
            return None
        return tally.most_common(1)[0][0]  # Most frequent prediction

    results = []

    for i, ((question_id, answer_id, answer_model), steps) in enumerate(grouped_data.items()):
        if i % 10 == 0:
            print(f"Processing {i}/{len(grouped_data)}")

        question = steps[0]['question']
        full_cot = " ".join([s['step'] for s in steps])
        prev_steps = ""
        evidence_list = []

        for i, step in enumerate(steps):
            step_text = step['step']
            step_type = step['type_label']

            # Step Type: use ground truth for now
            normalized_type = step_type.strip().title()
            logic_correctness_preds = {}

            # Logical Correctness Task: evaluate whenever the actual df has a correctness label
            if "Logic" in normalized_type and not pd.isna(step.get("correctness_label")):
                logic_prompt_text = logic_correctness_prompt(question, prev_steps, step_text)
                for model_family, client in clients.items():
                    model_type = model_types[model_family]
                    logic_output = query_llm_as_judge(logic_prompt_text, "correctness_label", client, system_prompts, model_family, model_type)
                    # print(logic_prompt_text, logic_output)
                    # logic_output = extract_answer_from_response(logic_output, 'correctness') # only if using self reflection prompting
                    logic_output = logic_output.replace("\n", "")
                    if logic_output:
                        logic_correctness_preds[model_family] = logic_output

                correctness = majority_vote(list(logic_correctness_preds.values()))
                if correctness.lower() != step.get("correctness_label").lower():
                    print(logic_prompt_text, logic_correctness_preds, f'Correct Answer: {step.get("correctness_label")}')
            else:
                correctness = None

            # Relevance Task
            relevance_preds = {}
            relevance_prompt_text = relevance_prompt(question, full_cot, step_text)
            for model_family, client in clients.items():
                model_type = model_types[model_family]
                relevance_output = query_llm_as_judge(relevance_prompt_text, "logic_relevance_label", client, system_prompts, model_family, model_type)
                # relevance_output = extract_answer_from_response(relevance_output, 'relevance')  # only if using self reflection prompting
                relevance_output = relevance_output.replace("\n", "")
                if relevance_output:
                    relevance_preds[model_family] = relevance_output

            relevance = majority_vote(list(relevance_preds.values()))
            if relevance.lower() != step.get("logic_relevance_label").lower():
                print(relevance_prompt_text, relevance_preds, f'Correct Answer: {step.get("logic_relevance_label")}')

            # Get Evidence
            if "Attribution" in normalized_type and step.get("evidence"):
                evidence_list.append(step['evidence'])

            if i == len(steps) - 1:
                reasoning_steps = prev_steps
                answer = step_text
            prev_steps += f"{step_text} "

            result_entry = {
                "question_id": question_id,
                "question": step['question'],
                "answer_model": answer_model,
                "answer_id": answer_id,
                "step_idx": step['step_idx'],
                "step": step_text,
                "type_label_pred": step_type,
                "correctness_label_pred": correctness,
                "logic_relevance_label_pred": relevance,
                "type_label_true": step.get("type_label"),
                "correctness_label_true": step.get("correctness_label"),
                "logic_relevance_label_true": step.get("logic_relevance_label"),
            }

            # Add individual model verdicts
            for model_family in clients.keys():
                result_entry[f"correctness_label_{model_family}"] = logic_correctness_preds.get(model_family)
                result_entry[f"logic_relevance_label_{model_family}"] = relevance_preds.get(model_family)

            results.append(result_entry)

        ###################### METRIC FOR REASONING ALIGNMENT ############################
        # print(prev_steps, reasoning_steps, answer)
        reasoning_alignment_prompt = reasoning_answer_alignment_prompt(question, reasoning_steps, answer)
        alignment_preds = {}
        for model_family, client in clients.items():
            model_type = model_types[model_family]
            alignment_output = query_llm_as_judge(reasoning_alignment_prompt, "alignment", client, system_prompts, model_family, model_type)

            # logic_output = extract_answer_from_response(logic_output, 'correctness') # only if using self reflection prompting
            alignment_output = alignment_output.replace("\n", "")
            if alignment_output:
                alignment_preds[model_family] = alignment_output

        alignment_verdict = majority_vote(list(alignment_preds.values()))
        results[-1]["answer_is_in_alignment_pred"] = alignment_verdict
        # print(reasoning_alignment_prompt, alignment_preds)

        # TODO: we don't have annotations for alignment
        # if correctness.lower() != step.get("correctness_label").lower():
        #     print(logic_prompt_text, logic_correctness_preds, f'Correct Answer: {step.get("correctness_label")}')

        ###################### METRIC FOR REASONING ALIGNMENT ############################

        # Final CoT Correctness: use placeholder for now
        cot_correctness = 'True'
        results[-1]["answer_is_logically_correct_pred"] = cot_correctness
        results[-1]["answer_is_logically_correct_true"] = steps[-1].get("answer_is_logically_correct")

        # break # TODO: remove break, it is only for printing purposes

    return pd.DataFrame(results)

In [26]:
# API Keys
MY_OPENAI_KEY = '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'
MY_ANTHROPIC_KEY = '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'
MY_GEMINI_KEY = 'AIzaSyC13qSGNQ8vMeqwkQdQA1pQ7o4LSBZJBX0'

# Connect to APIs
gpt_client = OpenAI(api_key = MY_OPENAI_KEY)
claude_client = anthropic.Anthropic(api_key = MY_ANTHROPIC_KEY)
gemini_client = genai.Client(api_key=MY_GEMINI_KEY)

# TODO: Change clients_dict and models_dict based on the LLMs you want to evaluate
# TODO: If no majority vote ensembling, just choose one model
clients_dict = {'openai': gpt_client, 'anthropic': claude_client, 'gemini': gemini_client}
models_dict = {'openai': 'gpt-3.5-turbo', 'anthropic': 'claude-3-haiku-20240307', 'gemini': 'gemini-1.5-flash'}

# Make a subset of grouped_data with n data points to test on, set random seed so results are comparable
random.seed(42)
n = 3
subset_keys = random.sample(list(grouped_data.keys()), n)
subset_data = {k: grouped_data[k] for k in subset_keys}

# Ensemble evaluation
results = evaluate_cot_steps_ensemble(subset_data, clients_dict, models_dict)

# Format results to lowercase and remove punctuation
results = results.map(lambda x: str(x).lower().replace('.', '').replace('\t', ' '))

Processing 0/3


In [27]:
# Print unique values in all columns to check for formatting issues in LLM responses
for col in results.columns:
    if 'pred' in col or 'true' in col:
      print(f"{col}: {results[col].unique()}")

type_label_pred: ['attribution step' 'logical step']
correctness_label_pred: ['none' 'correct']
logic_relevance_label_pred: ['relevant']
type_label_true: ['attribution step' 'logical step']
correctness_label_true: ['nan' 'correct']
logic_relevance_label_true: ['relevant']
answer_is_in_alignment_pred: ['nan' 'yes']
answer_is_logically_correct_pred: ['nan' 'true']
answer_is_logically_correct_true: ['nan' 'true']


In [28]:
def evaluate_predictions(df, task_column_prefix, is_ensemble=False):
    if not is_ensemble:
        y_true = df[f"{task_column_prefix}_true"].astype(str).str.strip().str.title()
        y_pred = df[f"{task_column_prefix}_pred"].astype(str).str.strip().str.title()

        report = classification_report(
            y_true,
            y_pred,
            output_dict=True,
            zero_division=0
        )
        return report
    else:
        y_true = df[f"{task_column_prefix}_true"].astype(str).str.strip().str.title()
        y_pred_ensemble = df[f"{task_column_prefix}_pred"].astype(str).str.strip().str.title()
        y_pred_openai = df[f"{task_column_prefix}_openai"].astype(str).str.strip().str.title()
        y_pred_anthropic = df[f"{task_column_prefix}_anthropic"].astype(str).str.strip().str.title()
        y_pred_gemini = df[f"{task_column_prefix}_gemini"].astype(str).str.strip().str.title()

        reports = {}
        reports['ensemble'] = classification_report(y_true, y_pred_ensemble, output_dict=True, zero_division=0)
        reports['openai'] = classification_report(y_true, y_pred_openai, output_dict=True, zero_division=0)
        reports['anthropic'] = classification_report(y_true, y_pred_anthropic, output_dict=True, zero_division=0)
        reports['gemini'] = classification_report(y_true, y_pred_gemini, output_dict=True, zero_division=0)

        return reports

In [29]:
# MACRO F1 SCORES (same as what is used in paper, unweighted mean, better for class imbalances)
logic_metrics = evaluate_predictions(results[results['correctness_label_true'] != 'nan'], "correctness_label", is_ensemble = True)
relevance_metrics = evaluate_predictions(results, "logic_relevance_label", is_ensemble = True)

for model, metrics in logic_metrics.items():
    print(f"Macro F1 score for logical step correctness ({model}):", metrics['macro avg']['f1-score'])
    print(f"Macro F1 score for step relevance ({model}):", relevance_metrics[model]['macro avg']['f1-score'])
    print(f"Weighted F1 score for logical step correctness ({model}):", metrics['weighted avg']['f1-score'])
    print(f"Weighted F1 score for step relevance ({model}):", relevance_metrics[model]['weighted avg']['f1-score'])
    print("------")

Macro F1 score for logical step correctness (ensemble): 1.0
Macro F1 score for step relevance (ensemble): 1.0
Weighted F1 score for logical step correctness (ensemble): 1.0
Weighted F1 score for step relevance (ensemble): 1.0
------
Macro F1 score for logical step correctness (openai): 0.42857142857142855
Macro F1 score for step relevance (openai): 1.0
Weighted F1 score for logical step correctness (openai): 0.8571428571428571
Weighted F1 score for step relevance (openai): 1.0
------
Macro F1 score for logical step correctness (anthropic): 1.0
Macro F1 score for step relevance (anthropic): 1.0
Weighted F1 score for logical step correctness (anthropic): 1.0
Weighted F1 score for step relevance (anthropic): 1.0
------
Macro F1 score for logical step correctness (gemini): 1.0
Macro F1 score for step relevance (gemini): 1.0
Weighted F1 score for logical step correctness (gemini): 1.0
Weighted F1 score for step relevance (gemini): 1.0
------


## External Validation: BigBenchMistake

In [36]:
df_bbm = pd.read_json('data/logical_deduction.jsonl', lines=True)

In [37]:
system_prompts_bbm = {
    "mistake_label":
      (
          "You are a logic expert. Your job is to determine the first incorrect reasoning step in a chain of thought answer."
          "If the given step has a mistake, output Yes. Otherwise output No. A mistkae can be a step that is logically incorrect or requires big "
          "logical jumps based on prior steps, a step that is redundant or repetetive, off topic or containing gibberish, leading to a dead "
          "end, an incorrect final answer, or referring to external links, images, or graphs. "
          "Only output the answer, without explanation.\n\n"
          "Here is an example:\n\n"
          """\
            Question: The following statements each describe a set of five objects arranged in a fixed order. The statements are logically consistent.
            - On a shelf, there are five books: a red book, a gray book, a white book, a blue book, and a green book.
            - The white book is to the left of the gray book.
            - The gray book is the third from the left.
            - The red book is the second from the left.
            - The blue book is to the right of the green book.
            Given the above statements, which of the following is correct?
            Options:
            (A) The red book is the leftmost
            (B) The gray book is the leftmost
            (C) The white book is the leftmost
            (D) The blue book is the leftmost
            (E) The green book is the leftmost

            Previous Steps:
            To answer the question, I need to identify the positions of all five books. I should first find and look through any statements about absolute positions of books. Then, if there are remaining gaps, I can look through statements about relative positions and find the final order by process of elimination.
            The gray book is third from the left. So from left to right: first and second can be any book, then third is the gray book, then fourth and fifth can be any book.

            Current Step:
            The white book is to the left of the gray book. We previously concluded that the gray book is third from the left, so the white book must be second from the left.

            Is there a mistake in the above step? Output Yes if there is a mistake. Otherwise output No.
            (Note to assistant: In this example, there is a mistake becasue we cannot assume that the white book is second to the left based only on the fact that it is to the left of the gray book, it can also be first to the left.)
            Assistant: Yes """
      )
}


def mistake_prompt(question, steps, step):
    return f"""Mistake Finding Task:
            Question: {question}

            Previous Steps:
            {steps}

            Current Step:
            {step}

            Is there a mistake in the above step? Output Yes if there is a mistake. Otherwise output No. Output only the answer as a Yes/No on a new line, with nothing else."""

In [38]:
def evaluate_cot_steps_ensemble(grouped_data, clients, model_types):
    """
    Args:
        grouped_data: dict with (question_id, answer_id, answer_model) -> steps
        clients: dict of {model_family: client}
        model_types: dict of {model_family: model_name}
    Returns:
        DataFrame with step-wise and final predictions using majority vote
    """

    def majority_vote(preds):
        # strip all punctuation including new lines and make lowercase
        preds = [v.lower() for v in preds]
        preds = [re.sub(r'[^\w\s]', '', v) for v in preds]
        tally = collections.Counter(preds)
        if len(tally) == 0:
            return None
        return tally.most_common(1)[0][0]  # Most frequent prediction

    results = []

    for i, ((question, answer), steps) in enumerate(grouped_data.items()):
        if pd.isna(answer):
            continue

        if i % 10 == 0:
            print(f"Processing {i}/{len(grouped_data)}")

        assert len(steps) == 1

        pred = "No"
        all_steps = steps[0]
        for i, step in enumerate(all_steps[1:]):
            step_num = i + 1
            # use ground truth for now
            mistake_preds = {}
            first_mistake_preds = {}
            full_cot = "\n".join(all_steps[1:i])

            mistake_prompt_text = mistake_prompt(question, full_cot, step)
            for model_family, client in clients.items():
                model_type = model_types[model_family]
                output = query_llm_as_judge(mistake_prompt_text, "mistake_label", client, system_prompts_bbm, model_family, model_type)
                output = output.replace("\n", "")
                if output.lower() == "yes" and model_family not in first_mistake_preds:
                    first_mistake_preds[model_family] = step_num
                else:
                    first_mistake_preds[model_family] = "no"
                mistake_preds[model_family] = output

            ensemble_output = majority_vote(list(mistake_preds.values()))

            if ensemble_output == "yes":
                pred = step_num
                break

            if answer == step_num:
                break

        if pred != answer:
            print(mistake_prompt_text, first_mistake_preds, f'Prediction: {pred}', f'Correct Answer: {answer}')

        if isinstance(answer, (float, int)):
            answer = int(answer)

        result_entry = {
            "question": question,
            "steps": full_cot,
            "mistake_index_true": answer,
            "mistake_index_pred": pred
        }

        # Add individual model verdicts
        for model_family in clients.keys():
            result_entry[f"mistake_index_{model_family}"] = first_mistake_preds.get(model_family)

        results.append(result_entry)

    return pd.DataFrame(results)

In [39]:
grouped_data_bbm = defaultdict(list)
for i, row in df_bbm.iterrows():
    grouped_data_bbm[(row['input'], row['mistake_index'])].append(row['steps'])

In [40]:
# Make a subset of grouped_data with n data points to test on, set random seed so results are comparable
random.seed(42)
n = 3
subset_keys_bbm = random.sample(list(grouped_data_bbm.keys()), n)
subset_data_bbm = {k: grouped_data_bbm[k] for k in subset_keys_bbm}

# Ensemble evaluation
results = evaluate_cot_steps_ensemble(subset_data_bbm, clients_dict, models_dict)

# Format results to lowercase and remove punctuation
results = results.map(lambda x: str(x).lower().replace('.', '').replace('\t', ' '))

Processing 0/3
Mistake Finding Task:
            Question: The following statements each describe a set of three objects arranged in a fixed order. The statements are logically consistent.
- On a shelf, there are three books: a red book, a blue book, and an orange book.
- The red book is to the left of the orange book.
- The orange book is the second from the left.
Q: Given the above statements, which of the following is correct?
Options:
(A) The red book is the leftmost.
(B) The blue book is the leftmost.
(C) The orange book is the leftmost.

            Previous Steps:
            

            Current Step:
            The orange book is the second from the left, so the blue book must be the leftmost.

            Is there a mistake in the above step? Output Yes if there is a mistake. Otherwise output No. Output only the answer as a Yes/No on a new line, with nothing else. {'openai': 'no', 'anthropic': 'no', 'gemini': 1} Prediction: No Correct Answer: 1.0
Mistake Finding Task:
     

In [41]:
# Print unique values in all columns to check for formatting issues in LLM responses
for col in results.columns:
    if 'pred' in col or 'true' in col:
      print(f"{col}: {results[col].unique()}")

mistake_index_true: ['1' '3']
mistake_index_pred: ['no' '3']


In [42]:
# MACRO F1 SCORES (same as what is used in paper, unweighted mean, better for class imbalances)
mistake_metrics = evaluate_predictions(results, "mistake_index", is_ensemble = True)

for model, metrics in mistake_metrics.items():
    print(f"Macro F1 score for logical step correctness ({model}):", metrics['macro avg']['f1-score'])
    print(f"Weighted F1 score for logical step correctness ({model}):", metrics['weighted avg']['f1-score'])
    print(f"Accuracy ({model}):", metrics['accuracy'])
    print("------")

Macro F1 score for logical step correctness (ensemble): 0.3333333333333333
Weighted F1 score for logical step correctness (ensemble): 0.3333333333333333
Accuracy (ensemble): 0.3333333333333333
------
Macro F1 score for logical step correctness (openai): 0.0
Weighted F1 score for logical step correctness (openai): 0.0
Accuracy (openai): 0.0
------
Macro F1 score for logical step correctness (anthropic): 0.3333333333333333
Weighted F1 score for logical step correctness (anthropic): 0.3333333333333333
Accuracy (anthropic): 0.3333333333333333
------
Macro F1 score for logical step correctness (gemini): 1.0
Weighted F1 score for logical step correctness (gemini): 1.0
Accuracy (gemini): 1.0
------
