# Calculate Metrics on BIG-Bench

In [1]:
import pandas as pd
from collections import defaultdict
import random
import os
from openai import OpenAI
from google import genai
from google.genai import types
import anthropic
import re
import collections
import json
import copy

In [4]:
def load_json_file(file_path):
    try:
        with open(file_path, 'r') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in: {file_path}")
        return None

# Example usage:
file_path = 'data/big_bench_responses_with_steps_fixed.json'
data = load_json_file(file_path)

if data:
    print('Question:', data[0]['question'])
    print('Prompt CoT:', data[0]['prompt_cot'])
    print(len(data))
    print(data[0].keys())

Question: Keith is 5 feet tall so he is less likely to become an amateur basketball player than a horse jockey.
Prompt CoT: Evaluate if the following Q follows common sense. Answer 'True' or 'False'
Q: Keith is 5 feet tall so he is less likely to become an amateur basketball player than a horse jockey.
In answering this question each step should be on a separate line and start with a number and a period, followed by the reasoning. Finally the answer should be on a new line with the word 'Answer' proceeded by a colon.
A: Let's think step by step.
1200
dict_keys(['id', 'task', 'question', 'answer', 'prompt_direct', 'prompt_cot', 'response', 'steps'])


In [None]:
system_prompts = {
     "correctness_label":
      (
          "You are a logic expert. Your job is to determine if a reasoning step (the hypothesis) follows logically "
          "from the previous steps (the premise) and the question. You can assume that all statements provided in the "
          "previous steps are correct. A logically correct step must be a valid inference from the given premises and question. "
          "**Steps that introduce new, factual information that should be attributed to an external source (even if not strictly "
          "deduced from previous steps) should be marked as 'Correct'. A restatement of something given in the question should be marked as 'Correct'.** "
          "A mistake occurs only when a step is attempting a logical deduction from prior steps and that deduction is incorrect or "
          "requires large, unstated logical jumps. If the premise is empty, the hypothesis is usually a logically correct statement, unless it clearly "
          "doesn't make sense given the question. "
          "If it is a valid logical inference, output 'Correct'. If it does not, output 'Incorrect'. "
          "Only output the answer, without explanation.\n\n"
          "Here are some examples:\n\n"
          "Human:\n"
          "Question: 'Is it possible to walk from France to Japan?'\n"
          "Premise: 'France and Japan are separated by thousands of kilometers and an ocean.'\n"
          "Hypothesis: 'No, you cannot walk from France to Japan.'\n"
          "Output: {{Correct, Incorrect}}"
          "Assistant:\n"
          "Correct\n\n"
          "Human:\n"
          "Question: 'Is this statement plausible: John threw a strike out at Little League on Sunday.'\n"
          "Premise: 'Little League is a youth baseball organization. Striking out is a common baseball term.'\n"
          "Hypothesis: 'No, the statement is not plausible.'\n"
          "Output: {{Correct, Incorrect}}"
          "Assistant:\n"
          "Incorrect"
          "Human:\n"
          "Question: 'Evaluate if the following Q follows common sense. Q: Amanda had a $10 fee for being late to the reservation.'\n"
          "Premise: ''\n"
          "Is the following hypothesis a correct logical inference based on the premise and question being asked?\n"
          "Hypothesis: 'Restaurants may cancel reservations if the guests are late.'\n"
          "Output: {{Correct, Incorrect}}\n"
          "Assistant:\n"
          "Correct"
          "Human:\n"
          "Question: 'Evaluate if the following statement follows causality. Q: Amanda was late to the reservation because there was traffic.'\n"
          "Premise: 'Amanda was late to the reservation.'\n"
          "Is the following hypothesis a correct logical inference based on the premise and question being asked?\n"
          "Hypothesis: 'There was traffic'\n"
          "Output: {{Correct, Incorrect}}\n"
          "Assistant:\n"
          "Correct"
      ),
    
    "logic_relevance_label":
       (
          "You are a judge evaluating whether each step in a multi-step answer contributes "
          "meaningfully toward answering the question. Steps that are off-topic, irrelevant, or redundant "
          "should be marked 'Not Relevant'. Others should be marked 'Relevant'. Only output the answer, without explanation."
          "Here are some examples:\n\n"
          "Human:\n"
          "Question: 'Is it possible to walk from France to Japan?'\n"
          "Answer: 'France and Japan are separated by thousands of kilometers and an ocean. France has more walking areas than Japan. No, you cannot walk from France to Japan.'\n"
          "Step: France has more walking areas than Japan."
          "Is this step relevant answering the question? Output: {{Relevant, Not Relevant}}"
          "Assistant:\n"
          "Not Relevant\n\n"
          "Human:\n"
          "Question: 'Is this statement plausible: John threw a strike out at Little League on Sunday.'\n"
          "Answer: 'Little League is a youth baseball organization. Striking out is a common baseball term. No, the statement is not plausible.'\n"
          "Step: Striking out is a common baseball term."
          "Is this step relevant to answering the question? Output: {{Relevant, Not Relevant}}"
          "Assistant:\n"
          "Relevant\n\n"
          "Question: 'Is it possible to walk from France to Japan?'\n"
          "Answer: 'France and Japan are separated by thousands of kilometers and an ocean. France has more walking areas than Japan. So the answer is No.'\n"
          "Step: So the answer is No."
          "Is this step relevant to answering the quesiton? Output: {{Relevant, Not Relevant}}"
          "Assistant:\n"
          "Relevant\n\n"
          "Question: 'Evaluate if the following Q follows common sense. Q: Amanda had a $10 fee for being late to the reservation.'\n"
          "Answer: 'Restaurants may cancel or charge for reservations if the guests are late. A party of 5 people is bigger than a party of 2 people for a reservation. So the answer is Yes.'\n"
          "Step: A party of 5 people is bigger than a party of 2 people for a reservation."
          "Is this step relevant to answering the quesiton? Output: {{Relevant, Not Relevant}}"
          "Assistant:\n"
          "Not Relevant\n\n"
        ),
 
    "alignment": (
        "You are a judge evaluating whether a set of steps in a multi-step answer is in alignment "
        "with the answer to the question. Outputs where the answer is not derived from the steps should be marked "
        "as No, and outputs where the answer does come in alignment with the steps should be marked as Yes. Only output the answer, "
        "without explanation. "
        "Here are some examples:\n\n"
        "Human:\n"
        "Question: 'What is the main ingredent in guacamole?'\n"
        "Reasoning Steps: 'Guacamole is a popular Mexican dip. Tomatoes are red. The recipe calls for onion and cilantro.' \n"
        "Answer: The main ingredient in guacamole is avocado."
        "Are the reasoning steps in alignment with the answer? Output: {{Yes, No}}"
        "Assistant:\n"
        "No\n\n"
        "Human:\n"
        "Question: 'What is the square root of 9?'\n"
        "Reasoning Steps: 'The square root of a number is a value that, when multiplied by itself, gives the original number. 3 x 3 = 9'\n"
        "Answer: 3"
        "Are the reasoning steps in alignment with the answer? Output: {{Yes, No}}"
        "Assistant:\n"
        "Yes\n\n"
    ),
    "resilience": (
        "You are a judge evaluating whether or not an LLM's output remains consistent after a prompt is perturbed in a way that preserves"
        "its initial meaning. This essentially evaluates a model's resilience to variation in the way a user may input the question."
        "If the LLM's outputs do not vary significantly even after the prompt perturbations, then output Yes. If the"
        "LLM's outputs change significantly with the prompt perturbations, then output No. Only output the answer, without explanation."
    ),
}

In [114]:
def query_llm_as_judge(prompt, task, client, prompts, model_family = 'openai', model_type = 'gpt-3.5-turbo'):
    res = None
    model_family = model_family.lower()
    system_prompt = prompts[task]
    if 'openai' in model_family:
        response = client.chat.completions.create(
            model=model_type,
            messages= [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ]
        )
        res = response.choices[0].message.content.strip()

    elif 'anthropic' in model_family:
        response = client.messages.create(
            model=model_type,
            max_tokens=1000,
            system=system_prompt,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        res = response.content[0].text

    elif 'gemini' in model_family:
        response = client.models.generate_content(
            model=model_type,
            config=types.GenerateContentConfig(system_instruction=system_prompt),
            contents=prompt
        )
        res =  response.text

    return res

In [None]:
def logic_correctness_prompt(question, previous_steps, step):
    return f"""Logic Task
            Question: '{question}'
            Premise: '{previous_steps}'
            Is the following hypothesis a correct logical inference based on the premise and question being asked?
            Hypothesis: '{step}'
            Output: {{Correct, Incorrect}}"""

def relevance_prompt(question, cot_answer, step):
    return f"""Relevance Task
            Question: {question}
            Answer: {cot_answer}
            Step: {step}
            Is this step relevant to answering the question?
            Output: {{Relevant, Not Relevant}}"""

def reasoning_answer_alignment_prompt(question, reasoning_steps, answer):
    return f"""Reasoning Task
            Question: '{question}'
            Reasoning steps: '{reasoning_steps}'
            Answer: '{answer}'

            Are the reasoning steps in alignment with the answer?
            Output: {{Yes, No}}"""

def resilience_prompt(question, question_a, question_b, cot, cot_a, cot_b):
    return f"""Resilience Task
            Original question: {question}
            Original output: {cot}

            Perturbed question #1: {question_a}
            Corresponding output: {cot_a}

            Perturbed question #2: {question_b}
            Corresponding output: {cot_b}

            Are the outputs consistent with each other despite small perturbations in wording/sentence structure?
            Output: {{Yes, No}}
            """

In [None]:
def extract_answer_from_response(text, output_type='correctness'):
    if output_type == 'correctness':
        match = re.search(r"Output:\s*(Correct|Incorrect)", text, re.IGNORECASE)
        if match:
            return match.group(1).capitalize()  # Returns 'Correct' or 'Incorrect'
    elif output_type == 'relevance':
        match = re.search(r"Output:\s*(Relevant|Not Relevant)", text, re.IGNORECASE)
        if match:
            return match.group(1).capitalize()  # Returns 'Relevant' or 'Not Relevant'
    return None

def evaluate_cot_steps_ensemble(data, clients, model_types, models_to_evaluate, all_results = []):
    """
    Args:
        grouped_data: dict with (question_id, answer_id, answer_model) -> steps
        clients: dict of {model_family: client}
        model_types: dict of {model_family: model_name}
        models_to_evaluate: list of model names to evaluate the responses of
    Returns:
        DataFrame with step-wise and final predictions using majority vote
    """

    def majority_vote(preds):
        # strip all punctuation including new lines and make lowercase
        preds = [v.lower() for v in preds]
        preds = [re.sub(r'[^\w\s]', '', v) for v in preds]
        tally = collections.Counter(preds)

        if len(tally) == 0:
            return None
        return tally.most_common(1)[0][0]  # Most frequent prediction

    for i in range(len(data)):
        if i % 10 == 0:
            print(f"Processing {i}/{len(data)}")
        
        id = data[i]['id']
        task = data[i]['task']
        question = data[i]['prompt_direct']
        question = question.replace('\nA:', '')
        step_info = data[i]['steps']
        true_answer = data[i]['answer']
        
        # if id[-1] == 'a' or id[-1] == 'b':
        #     # augmented data: skip, it will be used later for resilience
        #     continue

        # store augmented data: used for resilience metric
        # question_a = data[i + 1]['question']
        # question_b = data[i + 2]['question']
        # step_info_a = data[i + 1]['steps']
        # step_info_b = data[i + 2]['steps']

        for model in models_to_evaluate: # loop through outputs from different models
            steps = step_info[model]
            pred_answer = steps[-1]
            # steps_a = step_info_a[model]
            # steps_b = step_info_b[model]
            
            # Remove any None values from steps
            steps = [step for step in steps if step is not None]
            # steps_a = [step for step in steps_a if step is not None]
            # steps_b = [step for step in steps_b if step is not None]

            full_cot = " ".join(steps)
            # full_cot_a = " ".join(steps_a)
            # full_cot_b = " ".join(steps_b)
            correctness_annotations = []
            relevance_annotations = []
            
            for i, step in enumerate(steps):
                prev_steps = ' '.join(steps[:i])
                step_text = step

                # logic metric
                logic_prompt_text = logic_correctness_prompt(question, prev_steps, step_text)
                logic_correctness_preds = {}
                for model_family, client in clients.items():
                    if step_text.lower().replace('.', '') in question:
                        logic_correctness_preds[model_family] = 'correct'
                        continue
                    model_type = model_types[model_family]
                    logic_output = query_llm_as_judge(logic_prompt_text, "correctness_label", client, system_prompts, model_family, model_type)
                    logic_output = logic_output.replace("\n", "")
                    if logic_output:
                        logic_correctness_preds[model_family] = logic_output
                
                # relevance metric
                relevance_preds = {}
                relevance_prompt_text = relevance_prompt(question, full_cot, step_text)
                for model_family, client in clients.items():
                    model_type = model_types[model_family]
                    relevance_output = query_llm_as_judge(relevance_prompt_text, "logic_relevance_label", client, system_prompts, model_family, model_type)
                    relevance_output = relevance_output.replace("\n", "")
                    if relevance_output:
                        relevance_preds[model_family] = relevance_output

                correctness_annotations.append(majority_vote(list(logic_correctness_preds.values())))
                relevance_annotations.append(majority_vote(list(relevance_preds.values())))

                if correctness_annotations[-1].lower() == 'incorrect':
                    print(logic_prompt_text, logic_correctness_preds)
                if relevance_annotations[-1].lower() == 'not relevant':
                    print(relevance_prompt_text, relevance_preds)
            
            # reasoning alignment metric
            reasoning_steps = ' '.join(steps[:-1])
            answer = steps[-1]
            reasoning_alignment_prompt = reasoning_answer_alignment_prompt(question, reasoning_steps, answer)
            alignment_preds = {}
            for model_family, client in clients.items():
                model_type = model_types[model_family]
                alignment_output = query_llm_as_judge(reasoning_alignment_prompt, "alignment", client, system_prompts, model_family, model_type)
                alignment_output = alignment_output.replace("\n", "")
                if alignment_output:
                    alignment_preds[model_family] = alignment_output

            alignment_annotations = majority_vote(list(alignment_preds.values()))
            if alignment_annotations.lower() == 'no':
                print(reasoning_alignment_prompt, alignment_preds)
            
            # # resilience metric
            # resilience_text = resilience_prompt(question, question_a, question_b, full_cot, full_cot_a, full_cot_b)
            # resilience_preds = {}
            # for model_family, client in clients.items():
            #     model_type = model_types[model_family]
            #     resilience_output = query_llm_as_judge(resilience_text, "resilience", client, system_prompts, model_family, model_type)

            #     # logic_output = extract_answer_from_response(logic_output, 'correctness') # only if using self reflection prompting
            #     resilience_output = resilience_output.replace("\n", "")
            #     if resilience_output:
            #         resilience_preds[model_family] = resilience_output
                    
            all_results.append({
                'id': id,
                'task': task,
                'model': model,
                'prompt_direct': question,
                'question': data[i]['question'],
                'cot_steps': steps,
                'correctness_annotations': correctness_annotations,
                'relevance_annotations': relevance_annotations,
                'answer_in_alignment': alignment_annotations,
                # 'resilience_preds': resilience_preds,
                'true_answer': true_answer,
                'pred_answer': pred_answer,
            })

    return all_results.copy()

def calculate_resilience(all_results, models_to_evaluate):
    resilience_results = []
    STEP_SIZE = len(models_to_evaluate) * 3 # 3 since there is original question, then a, then b
    for i in range(0, len(all_results), STEP_SIZE):
        for j in range(len(models_to_evaluate)):
            original_entry = all_results[i]
            entry_a = all_results[i + j + len(models_to_evaluate)]
            entry_b = all_results[i + j + 2 * len(models_to_evaluate)]

            # info we need: question, steps for prompt that I wrote
            question = original_entry['question']
            question_a = entry_a['question']
            question_b = entry_b['question']

            steps = original_entry['cot_steps']
            steps_a = entry_a['cot_steps']
            steps_b = entry_b['cot_steps']

            # correctness annotation, relevance, alignment
            correctness_list = original_entry['correctness_annotations']
            correctness_list_a = entry_a['correctness_annotations']
            correctness_list_b = entry_b['correctness_annotations']

            relevance_list = original_entry['relevance_annotations']
            relevance_list_a = entry_a['relevance_annotations']
            relevance_list_b = entry_b['relevance_annotations']

            alignment = original_entry['answer_in_alignment']
            alignment_a = entry_a['answer_in_alignment']
            alignment_b = entry_b['answer_in_alignment']

            # somehow calculate resilience given all this info
            # the thing is, the number of CoT steps may be different from the augmented prompts
            # as a result, you cannot compare correctness_lists directly
            # i say we compare the percentage of the list that is correct
            resilience_metric = 0 # TODO

            resilience_results.append({
                'id': original_entry['id'],
                'task': original_entry['task'],
                'model': models_to_evaluate[j],
                'question': question,
                'question_a': question_a,
                'question_b': question_b,
                'cot_steps': steps,
                'cot_steps_a': steps_a,
                'cot_steps_b': steps_b,
                'correctness_annotations': correctness_list,
                'correctness_annotations_a': correctness_list_a,
                'correctness_annotations_b': correctness_list_b,
                'relevance_annotations': relevance_list,
                'relevance_annotations_a': relevance_list_a,
                'relevance_annotations_b': relevance_list_b,
                'answer_in_alignment': alignment,
                'answer_in_alignment_a': alignment_a,
                'answer_in_alignment_b': alignment_b,
                'resilience_metric': resilience_metric
            })

In [123]:
res = []

In [121]:
models_to_evaluate = ['gpt-3.5-turbo', 'gpt-4-turbo', 'gemini-1.5-flash'] # we can choose a subset of model responses to evaluate

# API Keys
MY_OPENAI_KEY = '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'
MY_ANTHROPIC_KEY = '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'
MY_GEMINI_KEY = 'AIzaSyC13qSGNQ8vMeqwkQdQA1pQ7o4LSBZJBX0'

# Connect to APIs
gpt_client = OpenAI(api_key = MY_OPENAI_KEY)
claude_client = anthropic.Anthropic(api_key = MY_ANTHROPIC_KEY)
gemini_client = genai.Client(api_key=MY_GEMINI_KEY)

# No majority vote ensembling
clients_dict = {'gemini': gemini_client}
models_dict = {'gemini': 'gemini-1.5-flash'}

cause_effect_data = [data[i] for i in range(len(data)) if data[i]['task'] == 'cause_effect']
n = 50
data_subset = cause_effect_data[:10]

# Modifies res in place, also returns a copy
res_copy = evaluate_cot_steps_ensemble(data_subset, clients_dict, models_dict, models_to_evaluate, res)

Processing 0/10
Logic Task
            Question: 'Evaluate if the following Q follows causality. Answer 'True' or 'False'
Q: The person ascended in through the window because the entrance was secured.'
            Premise: 'The person ascended in through the window. The entrance was secured. The person chose to go through the window instead of finding another entrance.'
            Is the following hypothesis a correct logical inference based on the premise and question being asked?
            Hypothesis: 'There is no direct causal link between the entrance being secured and the person ascending through the window.'
            Output: {Correct, Incorrect} {'gemini': 'Incorrect'}
Logic Task
            Question: 'Evaluate if the following Q follows causality. Answer 'True' or 'False'
Q: The man robbed a gas station because the man went to jail.'
            Premise: ''
            Is the following hypothesis a correct logical inference based on the premise and question being asked?
  

In [96]:
# convert results to dataframe
results_df = pd.DataFrame(res)
results_df

Unnamed: 0,id,task,model,prompt_direct,question,cot_steps,correctness_annotations,relevance_annotations,answer_in_alignment,true_answer,pred_answer
0,com2sense_0,com2sense,gpt-3.5-turbo,Evaluate if the following Q follows common sen...,Sally needs to be at work in 5 minutes while M...,[Keith's height of 5 feet may make it more dif...,"[correct, correct, correct, correct]","[relevant, relevant, relevant, relevant]",yes,True,True
1,com2sense_0,com2sense,gpt-4-turbo,Evaluate if the following Q follows common sen...,If Carl adores apple pie it is probable that h...,[Consider the typical heights for athletes in ...,"[correct, correct, correct, correct, correct, ...","[relevant, relevant, relevant, relevant, relev...",yes,True,True
2,com2sense_0,com2sense,gemini-1.5-flash,Evaluate if the following Q follows common sen...,Sally needs to be at work in 5 minutes while M...,[Being 5 feet tall is short for an amateur bas...,"[correct, correct, correct, correct]","[relevant, relevant, relevant, relevant]",yes,True,True
3,com2sense_0a,com2sense,gpt-3.5-turbo,Evaluate if the following Q follows common sen...,Sally needs to be at work in 5 minutes while M...,[Being 5 feet tall does not automatically disq...,"[correct, incorrect, correct, correct]","[relevant, relevant, relevant, relevant]",yes,True,False
4,com2sense_0a,com2sense,gpt-4-turbo,Evaluate if the following Q follows common sen...,With Sally needing to be at work in 5 minutes ...,[Height can influence proficiency and selectio...,"[correct, correct, correct, correct, correct, ...","[relevant, relevant, relevant, relevant, relev...",yes,True,True
...,...,...,...,...,...,...,...,...,...,...,...
895,com2sense_99a,com2sense,gpt-4-turbo,Evaluate if the following Q follows common sen...,Washing five cups is easier than washing one cup.,"[If classes begin next week, it implies a limi...","[correct, correct, correct, correct, correct]","[relevant, relevant, relevant, relevant, relev...",yes,False,True
896,com2sense_99a,com2sense,gemini-1.5-flash,Evaluate if the following Q follows common sen...,"James works six miles from his house, and he n...","[The statement implies that preparing ""in seve...","[correct, correct, correct, correct, correct, ...","[relevant, relevant, relevant, relevant, relev...",yes,False,True
897,com2sense_99b,com2sense,gpt-3.5-turbo,Evaluate if the following Q follows common sen...,It is simpler to clean five cups than to clean...,[School starting next week implies that you wi...,"[correct, correct, correct, correct]","[relevant, relevant, relevant, relevant]",yes,False,True
898,com2sense_99b,com2sense,gpt-4-turbo,Evaluate if the following Q follows common sen...,Washing five cups is easier than washing one cup.,[Understanding the scenario: The question ment...,"[correct, correct, correct, correct, correct]","[relevant, relevant, relevant, relevant, relev...",yes,False,True


In [97]:
def save_json_to_filepath(data, filepath):
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

save_json_to_filepath(res, 'data/com2sense_results.json')

In [48]:
for i in range(len(results_df)):
    assert len(results_df.loc[i, 'correctness_annotations']) == len(results_df.loc[i, 'relevance_annotations'])
    # count how many 'incorrect' are in correctness annotations
    incorrect_count = 0
    irrelevant_count = 0
    for j, correctness in enumerate(results_df.loc[i, 'correctness_annotations']):
        if correctness == 'incorrect':
            print('Question: ', results_df['question'][i])
            print([step for step in results_df['cot_steps'][i]])
            print('Incorrect step: ', j+1)

Question:  Evaluate if the following Q follows common sense. Answer 'True' or 'False'
Keith is 5 feet tall so he is less likely to become an amateur basketball player than a horse jockey.
['Being 5 feet tall is short for an amateur basketball player.', 'Being 5 feet tall is relatively tall for a horse jockey.', 'Therefore, a 5-foot-tall person is less likely to become a successful amateur basketball player than a horse jockey.', 'True']
Incorrect step:  2
Question:  Evaluate if the following Q follows common sense. Answer 'True' or 'False'
Keith is 5 feet tall so he is less likely to become a novice basketball player than a horse rider.
['Being 5 feet tall does not automatically disqualify someone from being a basketball player.', 'Horse riding does not have a strict height requirement like basketball does.', 'Therefore, it does not make sense to say that Keith being 5 feet tall makes him less likely to become a novice basketball player than a horse rider.', 'False']
Incorrect step:  2