In [1]:
import pandas as pd
from collections import defaultdict
import random
import os
from openai import OpenAI
from google import genai
from google.genai import types
import anthropic
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
import re
import collections
import json

In [2]:
def load_json_file(file_path):
    """Read and parse a JSON file, reporting failures instead of raising.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON value, or None when the file does not exist or its
        content is not valid JSON (an error line is printed in both cases).
    """
    try:
        # Decode explicitly as UTF-8: save_json_to_filepath writes UTF-8 with
        # ensure_ascii=False, so relying on the platform default encoding
        # would fail to round-trip non-ASCII text on non-UTF-8 locales.
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in: {file_path}")
        return None

# Load the source dataset and preview its first record.
# `data` is the list every later cell iterates over.
file_path = 'data/big_bench_augmented_verified.json'
data = load_json_file(file_path)

if data:
    # Only preview when loading succeeded and produced at least one record.
    first_item = data[0]
    print('Question:', first_item['question'])
    print('Prompt CoT:', first_item['prompt_cot'])
    print(len(data))

Question: Keith is 5 feet tall so he is less likely to become an amateur basketball player than a horse jockey.
Prompt CoT: Evaluate if the following Q follows common sense. Answer 'True' or 'False'
Q: Keith is 5 feet tall so he is less likely to become an amateur basketball player than a horse jockey.
In answering this question each step should be on a separate line and start with a number and a period, followed by the reasoning. Finally the answer should be on a new line with the word 'Answer' proceeded by a colon.
A: Let's think step by step.
1200


In [10]:
# API keys
# Credentials are read from the environment so they never live in the notebook,
# its saved outputs, or version control. Export OPENAI_API_KEY,
# ANTHROPIC_API_KEY and GEMINI_API_KEY before starting the kernel.
# NOTE(review): any key that was ever stored literally in this cell must be
# treated as compromised and rotated with the provider.
def _require_env(var_name):
    """Return the value of environment variable `var_name` or raise a clear error."""
    value = os.environ.get(var_name)
    if not value:
        raise RuntimeError(
            f"Environment variable {var_name} is not set; export it before running this cell."
        )
    return value


MY_OPENAI_KEY = _require_env('OPENAI_API_KEY')
MY_ANTHROPIC_KEY = _require_env('ANTHROPIC_API_KEY')
MY_GEMINI_KEY = _require_env('GEMINI_API_KEY')

# Connect to APIs
gpt_client = OpenAI(api_key=MY_OPENAI_KEY)
claude_client = anthropic.Anthropic(api_key=MY_ANTHROPIC_KEY)
gemini_client = genai.Client(api_key=MY_GEMINI_KEY)

# Provider family -> client instance.
clients_dict = {'openai': gpt_client, 'anthropic': claude_client, 'gemini': gemini_client}

# Short alias -> provider model identifier used when requesting completions.
models_dict = {
    'openai1': 'gpt-3.5-turbo',
    'openai2': 'gpt-4-turbo',
    'anthropic': 'claude-3-haiku-20240307',
    'gemini1': 'gemini-1.5-flash',
    'gemini2': 'gemini-2.0-flash'
}

In [6]:
def get_response_object_from_llm(prompt, client, model_family = 'openai', model_type = 'gpt-3.5-turbo'):
    """Send `prompt` as a single user message to one provider and return the reply text.

    Args:
        prompt: Text sent as the only user message.
        client: Provider client matching `model_family`; it must expose
            `chat.completions.create` (openai), `messages.create` (anthropic)
            or `models.generate_content` (gemini).
        model_family: 'openai', 'anthropic' or 'gemini' (case-insensitive).
        model_type: Model identifier forwarded to the provider.

    Returns:
        The reply text. OpenAI replies are whitespace-stripped; Anthropic and
        Gemini replies are returned exactly as the client exposes them.
        None is returned when an OpenAI reply carries no text content.

    Raises:
        ValueError: If `model_family` is not one of the supported families.
    """
    model_family = model_family.lower()

    if model_family == 'openai':
        response = client.chat.completions.create(
            model=model_type,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        content = response.choices[0].message.content
        # The message content may be absent; avoid calling .strip() on None.
        return content.strip() if content is not None else None

    if model_family == 'anthropic':
        response = client.messages.create(
            model=model_type,
            max_tokens=1000,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        return response.content[0].text

    if model_family == 'gemini':
        response = client.models.generate_content(
            model=model_type,
            contents=prompt
        )
        return response.text

    # Fail loudly: silently returning None here would be stored as a "response".
    raise ValueError(
        f"Unsupported model_family: {model_family!r} (expected 'openai', 'anthropic' or 'gemini')"
    )

In [7]:
def save_json_to_filepath(data, filepath):
    """Write `data` to `filepath` as indented UTF-8 JSON, replacing the file atomically.

    The JSON is first written to a sibling "<filepath>.tmp" file and then moved
    over the destination, so a failure or interrupt while serializing never
    leaves a truncated file where a previous good checkpoint used to be.

    Args:
        data: JSON-serializable value to store.
        filepath: Destination path (str or path-like).

    Raises:
        TypeError: If `data` contains a value json cannot serialize.
        OSError: If the temporary file cannot be written or moved into place.
    """
    tmp_path = f"{os.fspath(filepath)}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        # Replaces any existing destination file in a single step.
        os.replace(tmp_path, filepath)
    except BaseException:
        # Do not leave a half-written temp file behind; the old file is untouched.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

In [17]:
# Build (or resume building) the dataset that pairs each item with its CoT responses.
import copy
import os  # also imported in the first cell; repeated so this cell is self-contained
# Relies on load_json_file and save_json_to_filepath defined in earlier cells.

output_filepath = 'data/big_bench_augmented_with_responses.json'
new_data = []  # one enriched entry per processed item of `data`

# Reuse a previous run's output, when present, so finished items are not re-queried.
if not os.path.exists(output_filepath):
    print(f"No existing data file found at {output_filepath}. Starting fresh.")
else:
    print(f"Attempting to load existing data from {output_filepath}...")
    loaded_data = load_json_file(output_filepath)
    if isinstance(loaded_data, list):
        new_data = loaded_data
        print(f"Successfully loaded {len(new_data)} existing entries.")
    elif loaded_data is None:
        # load_json_file returns None for a missing or unparsable file.
        print(f"Failed to load from {output_filepath} (file might be empty/corrupt or not found by load_json_file). Starting fresh.")
        new_data = []
    else:
        print(f"Loaded data from {output_filepath} is not a list (type: {type(loaded_data)}). Starting fresh.")
        new_data = []

# Entries are appended in input order, so the count already stored is the next index to process.
start_index = len(new_data)

if start_index == 0:
    print(f"Starting processing from the beginning for {len(data)} items.")
else:
    print(f"Resuming processing. {start_index} items already processed and loaded.")
    if start_index < len(data):
        print(f"Will process items from index {start_index} to {len(data) - 1}.")
    else:
        print(f"All {len(data)} items appear to be processed already. Check {output_filepath}.")

if start_index < len(data):  # Only run the loop if there's data to process
    # One (family, client, model name) triple per model queried for every item,
    # listed in the order the responses are stored in each entry.
    model_runs = [
        ('openai', gpt_client, models_dict['openai1']),
        ('openai', gpt_client, models_dict['openai2']),
        ('anthropic', claude_client, models_dict['anthropic']),
        ('gemini', gemini_client, models_dict['gemini1']),
        ('gemini', gemini_client, models_dict['gemini2']),
    ]

    # Entries loaded from the output file are already on disk.
    saved_count = start_index

    try:
        for i in range(start_index, len(data)):
            # Progress indicator: first item of this run, then every 50 items.
            if (i - start_index) % 50 == 0:
                print(f'Processing item {i + 1}/{len(data)} (Overall index {i})')

            # Look the prompt up by key rather than by position in .values(),
            # so extra or reordered keys in an item cannot break the run.
            prompt_cot_val = data[i]['prompt_cot']

            # Copy so the source item is never mutated by adding 'response'.
            temp_data = copy.deepcopy(data[i])

            # Model name -> reply text, one entry per configured model.
            response_obj = {}
            for model_family, model_client, model_name in model_runs:
                response_obj[model_name] = get_response_object_from_llm(
                    prompt_cot_val, model_client, model_family, model_name
                )

            temp_data['response'] = response_obj
            new_data.append(temp_data)

            # Checkpoint every 10 items (by overall index) and after the last item.
            if (i + 1) % 10 == 0 or i == len(data) - 1:
                print(f"Processed up to item {i + 1}/{len(data)}. Saving {len(new_data)} items to {output_filepath}...")
                save_json_to_filepath(new_data, output_filepath)
                saved_count = len(new_data)
                print(f"Save complete. {len(new_data)} items are now in {output_filepath}.")
    finally:
        # If an API error or interrupt stopped the loop between checkpoints,
        # persist the items completed so far; the resume logic restarts from
        # however many entries the output file contains.
        if len(new_data) > saved_count:
            print(f"Run stopped early. Saving {len(new_data)} completed items to {output_filepath}...")
            save_json_to_filepath(new_data, output_filepath)
            print(f"Save complete. {len(new_data)} items are now in {output_filepath}.")

elif start_index == len(data) and len(data) > 0:
    print(f"All {len(data)} items were already processed. Data is in {output_filepath}.")
elif len(data) == 0:
    print("Input data is empty. Nothing to process.")
else:
    # start_index > len(data): the output file holds more entries than the input.
    print(f"Warning: {output_filepath} contains {start_index} entries but the input has only {len(data)} items. Nothing processed; check that the files match.")

Attempting to load existing data from data/big_bench_augmented_with_responses.json...
Successfully loaded 1050 existing entries.
Resuming processing. 1050 items already processed and loaded.
Will process items from index 1050 to 1199.
Processing item 1051/1200 (Overall index 1050)
Processed up to item 1060/1200. Saving 1060 items to data/big_bench_augmented_with_responses.json...
Save complete. 1060 items are now in data/big_bench_augmented_with_responses.json.
Processed up to item 1060/1200. Saving 1060 items to data/big_bench_augmented_with_responses.json...
Save complete. 1060 items are now in data/big_bench_augmented_with_responses.json.
Processed up to item 1070/1200. Saving 1070 items to data/big_bench_augmented_with_responses.json...
Save complete. 1070 items are now in data/big_bench_augmented_with_responses.json.
Processed up to item 1070/1200. Saving 1070 items to data/big_bench_augmented_with_responses.json...
Save complete. 1070 items are now in data/big_bench_augmented_wit