In [1]:
import pandas as pd
from collections import defaultdict
import random
import os
from openai import OpenAI
from google import genai
from google.genai import types
import anthropic
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
import re
import collections
import json
import copy

In [2]:
def load_json_file(file_path):
    """Load and return the parsed contents of a JSON file.

    The file is read as UTF-8 so that it round-trips with
    ``save_json_to_filepath``, which writes UTF-8 with ``ensure_ascii=False``
    (i.e. non-ASCII characters are stored verbatim rather than escaped).

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` if the file does not exist or
        does not contain valid JSON. In both failure cases a message is
        printed instead of raising.
    """
    try:
        # Explicit encoding: without it the platform locale decides how the
        # bytes are decoded, which can corrupt or reject non-ASCII text.
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in: {file_path}")
        return None

# Load the benchmark responses and preview the first record.
# `data` is reused by the cells below, so it must stay a module-level name.
file_path = 'data/big_bench_responses_fixed.json'
data = load_json_file(file_path)

# load_json_file returns None on a missing/invalid file (and an empty
# container is falsy too), so the preview is skipped in those cases.
if data:
    print('Question:', data[0]['question'])
    print('Prompt CoT:', data[0]['prompt_cot'])
    # Number of records loaded.
    print(len(data))

def save_json_to_filepath(data, filepath):
    """Write ``data`` to ``filepath`` as pretty-printed UTF-8 JSON.

    The output is indented by two spaces and non-ASCII characters are written
    verbatim rather than as ``\\uXXXX`` escapes. An existing file at
    ``filepath`` is overwritten.

    Args:
        data: Any JSON-serializable value.
        filepath: Destination path of the JSON file.
    """
    dump_options = {"indent": 2, "ensure_ascii": False}
    with open(filepath, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, **dump_options)

Question: Keith is 5 feet tall so he is less likely to become an amateur basketball player than a horse jockey.
Prompt CoT: Evaluate if the following Q follows common sense. Answer 'True' or 'False'
Q: Keith is 5 feet tall so he is less likely to become an amateur basketball player than a horse jockey.
In answering this question each step should be on a separate line and start with a number and a period, followed by the reasoning. Finally the answer should be on a new line with the word 'Answer' proceeded by a colon.
A: Let's think step by step.
1200


In [3]:
# Print every entry of the first record's 'response' mapping. Judging by the
# cell output, keys are model names and values are the raw model responses.
for k, v in data[0]['response'].items():
    print(k, v)

gpt-3.5-turbo 1. Keith's height of 5 feet may make it more difficult for him to play basketball at a competitive level because he is shorter than the average basketball player.
2. A horse jockey typically needs to be smaller in stature in order to ride a horse effectively.
3. Therefore, Keith is more likely to become a horse jockey than an amateur basketball player.
Answer: True
gpt-4-turbo 1. Consider the typical heights for athletes in various sports.
2. Basketball players are generally quite tall because height is a distinct advantage in the game.
3. The average height of a professional basketball player is around 6 feet 7 inches.
4. Keith, being 5 feet tall, is significantly shorter than the typical basketball player, which could disadvantage him in the sport, particularly at competitive levels.
5. Horse jockeys, on the other hand, are typically shorter and lighter, which is an advantage in horse racing.
6. The typical height for a horse jockey ranges from about 4 feet 10 inches to

In [4]:
# Short aliases -> model identifiers. The identifiers (the values) are the
# keys used to look up each model's output in a record's 'response' mapping.
models_dict = {
    'openai1': 'gpt-3.5-turbo', 
    'openai2': 'gpt-4-turbo', 
    'gemini1': 'gemini-1.5-flash'
}

# parse CoT into steps
def split_cot(output: str) -> "tuple[list[str], str | None]":
    """
    Split a numbered Chain-of-Thought response into its steps and final answer.

    A line counts as a step when, after optional leading whitespace, it starts
    with a marker such as '3.', '3)', '3:', '(3)' or 'Step 3:' / 'Step 3.'
    (case-insensitive). The marker is removed and the rest of the line is
    stripped. A line of the form 'Answer: <text>' supplies the answer; if
    several such lines exist, the last one wins. Every other line is ignored,
    so text that wraps onto an unnumbered continuation line is not captured.

    Args:
        output: Raw model response. ``None`` or an empty string is treated as
            a response with no steps and no answer.

    Returns:
        A ``(steps, answer)`` tuple. ``steps`` is the list of step strings in
        order of appearance; ``answer`` is the stripped answer text, or
        ``None`` when no answer line was found.
    """
    pat = re.compile(r"^\s*(?:\(?\s*(\d+)\s*[.):]|\bStep\s+(\d+)\s*[:.])", re.I)
    answer_pat = re.compile(r"^\s*Answer\s*:\s*(.+)", re.I)
    steps = []
    answer = None

    # A missing response (None) would otherwise fail on .splitlines().
    if not output:
        return steps, answer

    for line in output.splitlines():
        step_match = pat.match(line)
        if step_match:
            # Drop the matched marker; it is anchored at the start of the line.
            steps.append(line[step_match.end():].strip())
            continue

        answer_match = answer_pat.match(line)
        if answer_match:
            answer = answer_match.group(1).strip()

    return steps, answer

# Build a copy of every record with an added 'steps' entry that holds, per
# model, the parsed CoT steps followed by the parsed final answer.
step_data = []
for record in data:
    # Look the responses up by key rather than unpacking record.values():
    # positional unpacking depends on key order and exact key count, and it
    # shadowed the builtin `id` as well as the record's own `answer`.
    response = record['response']
    temp_data = copy.deepcopy(record)
    step_info = {}

    # Same parsing for every configured model (order follows models_dict).
    for model_name in models_dict.values():
        steps, answer = split_cot(response[model_name])
        # The answer is stored as the last element; it is None when the
        # response contained no 'Answer:' line.
        step_info[model_name] = steps + [answer]

    temp_data['steps'] = step_info
    step_data.append(temp_data)


save_json_to_filepath(step_data, 'data/big_bench_responses_with_steps_fixed.json')