In [None]:
# import modules

import re
import os
import json
import time
import torch
from dotenv import load_dotenv
import pandas as pd
from tqdm import tqdm
from google import genai
from openai import OpenAI
from google.genai import types
from datasets import load_dataset
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer
import requests
import gzip
from pprint import pprint

In [None]:
# load API keys
load_dotenv()

# Configure OpenAI
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')

# Configure Gemini
from google import genai
api_key=os.getenv('GEMINI_API')

client = genai.Client(api_key=api_key)


In [82]:
# load dataset

# 1. CSQA – CommonsenseQA (train/dev/test splits)
csqa = load_dataset("commonsense_qa")          

# 2. StrategyQA – yes/no questions requiring multi-hop reasoning
strategyqa = load_dataset("metaeval/strategy-qa")

# 3. Date Understanding
date_url = "https://raw.githubusercontent.com/google/BIG-bench/main/bigbench/benchmark_tasks/date_understanding/task.json"
date_response = requests.get(date_url)
date = json.loads(date_response.text)

# 4. Sports Understanding – sports-related questions
sports_url = "https://raw.githubusercontent.com/google/BIG-bench/main/bigbench/benchmark_tasks/sports_understanding/task.json"
sports_response = requests.get(sports_url)
sports = json.loads(sports_response.text)

# 5. SayCan – Language→robot-action mapping (Google Robotics, 2022)
saycan = load_dataset("chiayewken/saycan")

Using the latest cached version of the dataset since commonsense_qa couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/thyag/.cache/huggingface/datasets/commonsense_qa/default/0.0.0/94630fe30dad47192a8546eb75f094926d47e155 (last modified on Sat Jul 12 21:50:08 2025).
Using the latest cached version of the dataset since metaeval/strategy-qa couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/thyag/.cache/huggingface/datasets/metaeval___strategy-qa/default/0.0.0/f0e71799fcfdee70d1618765168904370cfec189 (last modified on Sat Jul 12 21:50:17 2025).
Using the latest cached version of the dataset since chiayewken/saycan couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/thyag/.cache/huggingface/datasets/chiayewken___saycan/default/0.0.0/4cc33f0ebe8b27733035e097b7bd4976fc1a786d (last modified on Sat Jul 12 22:01:56 2025).


In [83]:
# template functions for commonsense reasoning tasks

def csqa_template(example: dict) -> tuple:
    """
    Converts a CommonsenseQA example dict to the question template.
    Args:
        example (dict): Dictionary with 'question', 'choices', and 'answerKey'.
    Returns:
        tuple: (question string, answer string)
    """
    labels = [label.lower() for label in example['choices']['label']]
    texts = example['choices']['text']
    choices_str = " ".join([f"({label}) {text}" for label, text in zip(labels, texts)])
    question_template = f"{example['question']} Answer Choices: {choices_str}"
    answer = f"The answer is ({example['answerKey'].lower()})"
    return question_template, answer

def strategyqa_template(example: dict) -> tuple:
    """
    Converts a StrategyQA example dict to the question template.
    Args:
        example (dict): Dictionary with 'question' and 'answer'.
    Returns:
        tuple: (question string, answer string)
    """
    question = example['question']
    answer = "yes" if example['answer'] else "no"
    answer_text = f"The answer is {answer}"
    return question, answer_text

def date_template(example: dict) -> tuple:
    """
    Converts a date understanding example dict to the question template.
    Args:
        example (dict): Dictionary with 'question' and 'target_scores'.
    Returns:
        tuple: (question string, answer string)
    """
    question = example['input']
    correct_answer = str(next((k for k, v in example['target_scores'].items() if v == 1), None))
    answer = f"The answer is {correct_answer}"
    return question, answer

def sports_template(example: dict) -> tuple:
    """
    Converts a sports understanding example dict to the question template.
    Args:
        example (dict): Dictionary with 'input' and 'target_scores'.
    Returns:
        tuple: (question string, answer string)
    """
    question = f'Is the following sentence plausible? "{example["input"]}"'
    answer = "yes" if example["target_scores"].get("plausible", 0) == 1 else "no"
    answer_text = f"The answer is {answer}"
    return question, answer_text

def saycan_template(example: dict) -> tuple:
    """
    Converts a SayCan example dict to the question template.
    Args:
        example (dict): Dictionary with 'instruction' and 'plan'.
    Returns:
        tuple: (question string, answer string)
    """
    question = example['INPUT']
    answer = f"The answer is {example['OUTPUT']}"
    return question, answer
  

In [84]:
saycan

DatasetDict({
    test: Dataset({
        features: ['INPUT', 'OUTPUT'],
        num_rows: 99
    })
})

In [85]:
# quick smoke test for templates

REAL_EXAMPLES = {
    "csqa": {
        "question": "Where would you find a hamburger?",
        "choices": {
            "label": ["A", "B", "C", "D", "E"],
            "text": ["fast food restaurant", "pizza", "kitchen", "grocery store", "farms"]
        },
        "answerKey": "A"
    },
    "strategyqa": {
        "question": "Would a person with a high fever typically sweat?",
        "answer": True
    },
    "date": {
        "input": "What day of the week was 2020-02-02?",
        "target_scores": {"Sunday": 1, "Monday": 0, "Tuesday": 0}
    },
    "sports": {
        "input": "The quarterback threw a touchdown pass.",
        "target_scores": {"plausible": 1}
    },
    "saycan": {
        "instruction": "Bring me a cold drink.",
        "plan": "open fridge, pick drink, close fridge, bring drink"
    }
}

def test_smoke():
    """Sanity check with the hand-crafted examples above."""
    templates = {
        "csqa": csqa_template,
        "strategyqa": strategyqa_template,
        "date": date_template,
        "sports": sports_template,
        "saycan": saycan_template,
    }

    for dataset, tpl in templates.items():
        print(f"\n--- {dataset} ---")
        q, a = tpl(REAL_EXAMPLES[dataset])
        print("Question:", q)
        print("Answer  :", a)
        # basic assertions
        assert isinstance(q, str) and q.strip()
        assert isinstance(a, str) and a.strip()

test_smoke()


--- csqa ---
Question: Where would you find a hamburger? Answer Choices: (a) fast food restaurant (b) pizza (c) kitchen (d) grocery store (e) farms
Answer  : The answer is (a)

--- strategyqa ---
Question: Would a person with a high fever typically sweat?
Answer  : The answer is yes

--- date ---
Question: What day of the week was 2020-02-02?
Answer  : The answer is Sunday

--- sports ---
Question: Is the following sentence plausible? "The quarterback threw a touchdown pass."
Answer  : The answer is yes

--- saycan ---


KeyError: 'INPUT'

In [86]:
# prompt template for CSQA

CSQA_PROMPT = """

Q: What do people use to absorb extra ink from a fountain pen? Answer Choices: (a) shirt pocket (b)
calligrapher’s hand (c) inkwell (d) desk drawer (e) blotter
A: The answer must be an item that can absorb ink. Of the above choices, only blotters are used to absorb ink.
So the answer is (e).
Q: What home entertainment equipment requires cable?
Answer Choices: (a) radio shack (b) substation (c) television (d) cabinet
A: The answer must require cable. Of the above choices, only television requires cable. So the answer is (c).
Q: The fox walked from the city into the forest, what was it looking for? Answer Choices: (a) pretty flowers (b)
hen house (c) natural habitat (d) storybook
A: The answer must be something in the forest. Of the above choices, only natural habitat is in the forest. So the
answer is (b).
Q: Sammy wanted to go to where the people were. Where might he go? Answer Choices: (a) populated areas
(b) race track (c) desert (d) apartment (e) roadblock
A: The answer must be a place with a lot of people. Of the above choices, only populated areas have a lot of
people. So the answer is (a).
Q: Where do you put your grapes just before checking out? Answer Choices: (a) mouth (b) grocery cart (c)super
market (d) fruit basket (e) fruit market
A: The answer should be the place where grocery items are placed before checking out. Of the above choices,
grocery cart makes the most sense for holding grocery items. So the answer is (b).
Q: Google Maps and other highway and street GPS services have replaced what? Answer Choices: (a) united
states (b) mexico (c) countryside (d) atlas
A: The answer must be something that used to do what Google Maps and GPS services do, which is to give
directions. Of the above choices, only atlases are used to give directions. So the answer is (d).
Q: Before getting a divorce, what did the wife feel who was doing all the work? Answer Choices: (a) harder (b)
anguish (c) bitterness (d) tears (e) sadness
A: The answer should be the feeling of someone getting divorced who was doing all the work. Of the above
choices, the closest feeling is bitterness. So the answer is (c).
Q: {Question}
A:

"""

In [87]:
# prompt for StrategyQA

StrategyQA_PROMPT = """

Q: Do hamsters provide food for any animals?
A: Hamsters are prey animals. Prey are food for predators. Thus, hamsters provide food for some animals. So
the answer is yes.
Q: Could Brooke Shields succeed at University of Pennsylvania?
A: Brooke Shields went to Princeton University. Princeton University is about as academically rigorous as the
University of Pennsylvania. Thus, Brooke Shields could also succeed at the University of Pennsylvania. So the
answer is yes.
Q: Yes or no: Hydrogen’s atomic number squared exceeds number of Spice Girls?
A: Hydrogen has an atomic number of 1. 1 squared is 1. There are 5 Spice Girls. Thus, Hydrogen’s atomic
number squared is less than 5. So the answer is no.
Q: Yes or no: Is it common to see frost during some college commencements?
A: College commencement ceremonies can happen in December, May, and June. December is in the winter, so
there can be frost. Thus, there could be frost at some commencements. So the answer is yes.
Q: Yes or no: Could a llama birth twice during War in Vietnam (1945-46)?
A: The War in Vietnam was 6 months. The gestation period for a llama is 11 months, which is more than 6
months. Thus, a llama could not give birth twice during the War in Vietnam. So the answer is no.
Q: Yes or no: Would a pear sink in water?
A: The density of a pear is about 0.6g/cm3, which is less than water. Objects less dense than water float. Thus,
a pear would float. So the answer is no.
Q: {Question}
A:

"""

In [88]:
# prompt for date understanding 

Date_PROMPT = """

Q: 2015 is coming in 36 hours. What is the date one week from today in MM/DD/YYYY?
A: If 2015 is coming in 36 hours, then it is coming in 2 days. 2 days before 01/01/2015 is 12/30/2014, so today
is 12/30/2014. So one week from today will be 01/05/2015. So the answer is 01/05/2015.
Q: The first day of 2019 is a Tuesday, and today is the first Monday of 2019. What is the date today in
MM/DD/YYYY?
A: If the first day of 2019 was Tuesday, then 01/01/2019 was a Tuesday. Today is the first monday, would be six
days later. So today is 01/07/2019. So the answer is 01/07/2019.
Q: The concert was scheduled to be on 06/01/1943, but was delayed by one day to today. What is the date 10
days ago in MM/DD/YYYY?
A: One day after 06/01/1943 is 06/02/1943, so today is 06/02/1943. 10 days before today is 05/23/1943. So the
answer is 05/23/1943.
Q: It is 4/19/1969 today. What is the date 24 hours later in MM/DD/YYYY?
A: Today is 04/19/1969. 24 hours later is one day after today, which would be 04/20/1969. So the answer is
04/20/1969.
Q: Jane thought today is 3/11/2002, but today is in fact Mar 12, which is 1 day later. What is the date 24 hours
later in MM/DD/YYYY?
A: Today is 03/12/2002. So the date 24 hours later will be 03/13/2002. So the answer is 03/13/2002.
Q: Jane was born on the last day of Feburary in 2001. Today is her 16-year-old birthday. What is the date
yesterday in MM/DD/YYYY?
A: The last day of February is the 28th, so Jane was born on 02/28/2001. Today is her 16-year old birthday, so
today is 02/28/2017. So yesterday was 02/27/2017. So the answer is 02/27/2017
Q: {Question}
A:

"""


In [89]:
# prompt for sports understanding

Sports_PROMPT = """

Q: Is the following sentence plausible? “Kyle Palmieri was called for slashing.”
A: Kyle Palmieri is a hockey player. Being called for slashing is part of hockey. So the answer is yes.
Q: Is the following sentence plausible? “Joao Moutinho caught the screen pass in the NFC championship.”
A: Joao Moutinho is a soccer player. The NFC championship is part of American football, not soccer. So the
answer is no.
Q: Is the following sentence plausible? “Carson Wentz set the pick and roll.”
A: Carson Wentz is an American football player. Pick and roll is part of basketball, not football. So the answer
is no.
Q: Is the following sentence plausible? “Jonas Valanciunas beat the buzzer.”
A: Jonas Valanciunas is a basketball player. Beating the buzzer is part of basketball. So the answer is yes.
Q: Is the following sentence plausible? “Jamel Murray was perfect from the line.”
A: Jamal Murray is a basketball player. Being perfect from the line is part of basketball. So the answer is yes.
Q: Is the following sentence plausible? “Sam Darnold passed the puck.”
A: Sam Darnold is a American football player. Passing the puck is part of hockey, not American football. So the
answer is no.
Q: Is the following sentence plausible? “Draymond Green threw a touchdown.”
A: Draymond Green is an basketball player. Throwing a touchdown is part of football, not basketball. So the
answer is no.
Q: Is the following sentence plausible? “Malcolm Brogdon banked the shot in.”
A: Malcolm Brogdon is a basketball player. Banking the shot in is part of basketball. So the answer is yes.
Q: {Question}
A:

"""

In [90]:
# prompt for saycan 

SayCan_PROMPT = """

Locations = [counter, table, user, trash, bowl].
Objects = [7up, apple, kettle chips, tea, multigrain chips, coke, lime soda, jalapeno chips, rice chips, orange,
grapefruit soda, pepsi, redbull, energy bar, sponge, water].
The robot can pick up items with pick(object) and put down items with put(object) as well as find objects or
locations with find(). The robot can only understand the explicit locations and objects listed.
Human: I’m hungry, can you bring me some chips.
Explanation: The user is hungry and has asked for chips. There are several types of chips available, I will bring
the user the kettle chips.
Plan: 1. find(kettle chips), 2. pick(kettle chips), 3. find(user), 4. put(kettle chips), 5. done().
Human: How would you move the grapefruit drink from the table to the counter?
Explanation: The user has asked me to move the grapefruit drink to the counter.
Plan: 1. find(grapefruit soda), 2. pick(grapefruit soda), 3. find(counter), 4. put(grapefruit soda), 5. done().
Human: How would you bring me some snacks?
Explanation: The user has asked for snacks, I will choose two items and bring them. I will bring jalapeno chips
and an apple.
Plan: 1. find(jalapeno chips), 2. pick(jalapeno chips), 3. find(user), 4. put(jalapeno chips), 5. find(apple), 6.
pick(apple), 7. find(user), 8. put(apple), 9. done().
Human: How would you bring me something to eat that isn’t a fruit?
Explanation: The user has asked for a food that isn’t a fruit, I will bring an energy bar to them.
Plan: 1. find(energy bar), 2. pick(energy bar), 3. find(user), 4. put(energy bar), 5. done().
Human: How would you put the rice chips in the bowl and then move the tea to the table?
Explanation: The user has asked me to do two tasks, I will do one and then the other.
Plan: 1. find(rice chips), 2. pick(rice chips), 3. find(bowl), 4. put(rice chips), 5. find(tea), 6. pick(tea), 7.
find(table), 8. put(tea), 9. done().
Human: How would you throw away a redbull?
Explanation: The user has asked me to throw away the redbull, I will move it to the trash.
Plan: 1. find(redbull), 2. pick(redbull), 3. find(trash), 4. put(redbull), 5. done().
Human: Bring me a drink.
Explanation: The user has asked for a drink and there are many options. I will bring them a water.
Plan: 1. find(water), 2. pick(water), 3. find(user), 4. put(water), 5. done().


Human: {Question}
Explanation:
Plan:
"""

In [91]:
strategyqa

DatasetDict({
    train: Dataset({
        features: ['qid', 'term', 'description', 'question', 'answer', 'facts', 'decomposition'],
        num_rows: 2290
    })
})

In [92]:
saycan

DatasetDict({
    test: Dataset({
        features: ['INPUT', 'OUTPUT'],
        num_rows: 99
    })
})

In [115]:
# Set up template functions
template_functions = {
    #"csqa": csqa_template,
    "strategyqa": strategyqa_template,
    "date": date_template,
    "sports": sports_template,
    "saycan": saycan_template
}

# Set up prompt templates
prompt_templates = {
    #"csqa": CSQA_PROMPT,
    "strategyqa": StrategyQA_PROMPT,
    "date": Date_PROMPT,
    "sports": Sports_PROMPT,
    "saycan": SayCan_PROMPT
}

# Set up datasets
datasets = {
    #"csqa": csqa,
    "strategyqa": strategyqa,
    "date": date,
    "sports": sports,
    "saycan": saycan
}

# Define models and system prompts
openai_models = [] #"gpt-4.1-nano", "gpt-4o-mini-2024-07-18"
gemini_models = [ "gemini-1.5-flash"]
system_prompts = {
    "few_shot": "You are a helpful assistant. Carefully follow the reasoning and answer format shown in the few-shot examples provided by the user. Use the same step-by-step explanation style and answer format.",
    "zero_shot": "You are a helpful assistant. First, provide a detailed answer to the question. Then, on a new line, clearly state your final answer in the format: The answer is {answer}. Only include the final answer in that line, without any explanation."
}


In [None]:

# load API keys
load_dotenv()

# Configure OpenAI
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')

# Configure Gemini
from google import genai
client = genai.Client(api_key="xx")


# Add this to check if API keys loaded successfully
def check_api_keys():
    # Check keys
    if os.getenv('OPENAI_API_KEY'):
        print("✅ OpenAI API key found")
    else:
        print("❌ OpenAI API key missing")
        
    if os.getenv('GEMINI_API'):
        print("✅ Gemini API key found")
    else:
        print("❌ Gemini API key missing")

check_api_keys()

✅ OpenAI API key found
✅ Gemini API key found


In [117]:
import os
import json
import time
from tqdm import tqdm
from typing import Dict, List, Tuple, Callable, Any
from openai import OpenAI
from google import genai
from google.genai import types
from google.genai.types import GenerateContentConfig

def test_models(openai_models: List[str], gemini_models: List[str]) -> Dict[str, bool]:
    """
    Test if all models are accessible and working properly.
    """
    results = {}
    
    # Initialize clients properly
    openai_client = OpenAI()
    
    # Simple test prompt
    test_prompt = "Complete this sentence: The sky is"
    
    print("Testing OpenAI models...")
    for model in openai_models:
        print(f"  Testing {model}...", end="", flush=True)
        try:
            response = openai_client.responses.create(
                model=model,
                instructions="You are a helpful assistant.",
                input=test_prompt,
                temperature=0.0
            )
            print(f" ✓ Success: {response.output_text[:30]}...")
            results[model] = True
        except Exception as e:
            print(f" ❌ Failed: {str(e)[:100]}...")
            results[model] = False
    
    print("\nTesting Gemini models...")
    for model in gemini_models:
        print(f"  Testing {model}...", end="", flush=True)
        try:
            response = client.models.generate_content(
                model=model,
                contents=test_prompt,
                config=types.GenerateContentConfig(temperature=0)
            )
            print(f" ✓ Success: {response.text[:30]}...")
            results[model] = True
        except Exception as e:
            print(f" ❌ Failed: {str(e)[:100]}...")
            results[model] = False
        
        # Add small delay between tests
        time.sleep(1)
    
    return results

class BenchmarkRunner:
    def __init__(
        self, 
        datasets: Dict[str, any],
        template_functions: Dict[str, Callable],
        prompt_templates: Dict[str, str],
        openai_models: List[str],
        gemini_models: List[str],
        system_prompts: Dict[str, str],
        output_dir: str = "results",
        sample_size: int = 20
    ):
        """
        Initialize the benchmark runner for commonsense reasoning tasks.
        
        Args:
            datasets: Dictionary mapping dataset names to dataset objects
            template_functions: Dictionary mapping dataset names to template functions
            prompt_templates: Dictionary mapping dataset names to prompt templates
            openai_models: List of OpenAI model names to test
            gemini_models: List of Gemini model names to test
            system_prompts: Dictionary with system prompts for few_shot and zero_shot modes
            output_dir: Directory to save results
            sample_size: Number of examples to run per dataset
        """
        self.datasets = datasets
        self.template_functions = template_functions
        self.prompt_templates = prompt_templates
        self.openai_models = openai_models
        self.gemini_models = gemini_models
        self.system_prompts = system_prompts
        self.output_dir = output_dir
        self.sample_size = sample_size
        
        # Initialize clients
        self.openai_client = OpenAI()
        
        # Get Gemini API key from environment
        # We don't need to initialize Gemini client here anymore
        # since we're using the globally configured genai module
        
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
    
    def run_openai(self, prompt_type: str, dataset_name: str, model_name: str, examples: List[Dict]):
        """
        Run benchmarks with OpenAI models using their specific API format
        
        Args:
            prompt_type: "few_shot" or "zero_shot"
            dataset_name: Name of the dataset
            model_name: Name of the model
            examples: List of processed examples with questions and expected answers
        """
        chat_responses = []
        
        for i in tqdm(range(min(self.sample_size, len(examples))), desc=f"OpenAI {model_name} on {dataset_name}"):
            question, original_answer = examples[i]
            
            if prompt_type == "few_shot":
                prompt = self.prompt_templates[dataset_name].replace("{Question}", question)
                instructions = self.system_prompts["few_shot"]
            else:  # zero_shot
                prompt = question
                instructions = self.system_prompts["zero_shot"]
            
            try:
                response = self.openai_client.responses.create(
                    model=model_name,
                    instructions=instructions,
                    input=prompt,
                    temperature=0.0
                )
                model_answer = response.output_text
            except Exception as e:
                print(f"Error with OpenAI model {model_name}: {e}")
                model_answer = f"ERROR: {str(e)}"
            
            chat_responses.append({
                "question": question,
                "original_answer": original_answer,
                "answer_text": model_answer
            })
        
        # Save results
        output_filename = os.path.join(
            self.output_dir,
            f"{dataset_name}-{prompt_type}-{model_name.replace('.', '-')}.json"
        )
        with open(output_filename, "w", encoding="utf-8") as f:
            json.dump(chat_responses, f, ensure_ascii=False, indent=4)
        
        print(f"Results saved to {output_filename}")
    
    def run_gemini(self, prompt_type: str, dataset_name: str, model_name: str, examples: List[Dict]):
        """
        Run benchmarks with Gemini models using their specific API format
        """
        chat_responses = []
        
        for i in tqdm(range(min(self.sample_size, len(examples))), desc=f"Gemini {model_name} on {dataset_name}"):
            question, original_answer = examples[i]
            
            if prompt_type == "few_shot":
                prompt = self.prompt_templates[dataset_name].replace("{Question}", question)
                instructions = self.system_prompts["few_shot"]
            else:  # zero_shot
                prompt = question
                instructions = self.system_prompts["zero_shot"]
            
            try:
                # Use the global genai module instead of self.gemini_client
                response = client.models.generate_content(
                    model=model_name,
                    config=types.GenerateContentConfig(
                        system_instruction=instructions
                    ),
                    contents=prompt
                )
                model_answer = response.text
            except Exception as e:
                print(f"Error with Gemini model {model_name}: {e}")
                model_answer = f"ERROR: {str(e)}"
            
            chat_responses.append({
                "question": question,
                "original_answer": original_answer,
                "answer_text": model_answer
            })
            
            # Add larger delay to avoid rate limiting
            time.sleep(15)
        
        # Save results
        output_filename = os.path.join(
            self.output_dir,
            f"{dataset_name}-{prompt_type}-{model_name.replace('.', '-')}.json"
        )
        with open(output_filename, "w", encoding="utf-8") as f:
            json.dump(chat_responses, f, ensure_ascii=False, indent=4)
        
        print(f"Results saved to {output_filename}")

    def test_models(self) -> Dict[str, bool]:
        """Test if all models configured in the runner are working."""
        return test_models(self.openai_models, self.gemini_models)
        
    def prepare_examples(self, dataset_name: str):
        """
        Process examples from the dataset using the appropriate template function
        
        Args:
            dataset_name: Name of the dataset
            
        Returns:
            List of (question, answer) tuples
        """
        examples = []
        
        # if dataset_name == "csqa":
        #     dataset_examples = self.datasets["csqa"]["train"]
        if dataset_name == "strategyqa":
            dataset_examples = self.datasets["strategyqa"]["train"]
        elif dataset_name == "date":
            dataset_examples = self.datasets["date"]["examples"]
        elif dataset_name == "sports":
            dataset_examples = self.datasets["sports"]["examples"]
        elif dataset_name == "saycan":
            dataset_examples = self.datasets["saycan"]["test"]
        else:
            raise ValueError(f"Unknown dataset: {dataset_name}")
        
            
        template_func = self.template_functions[dataset_name]
        
        # Process at most sample_size examples
        for i in range(min(self.sample_size, len(dataset_examples))):
            example = dataset_examples[i]
            question, answer = template_func(example)
            examples.append((question, answer))
            
        return examples
    
    def run_all_benchmarks(self, modes=["few_shot", "zero_shot"], test_first=True):
        """
        Run benchmarks for all datasets, models and specified modes
        
        Args:
            modes: List of modes to run ("few_shot", "zero_shot")
            test_first: Whether to test models before running benchmarks
        """
        if test_first:
            model_status = self.test_models()
            working_models = [model for model, status in model_status.items() if status]
            failed_models = [model for model, status in model_status.items() if not status]
            
            if failed_models:
                print(f"⚠️ WARNING: The following models failed the test: {', '.join(failed_models)}")
                proceed = input("Do you want to proceed with the working models only? (y/n): ")
                if proceed.lower() != 'y':
                    print("Benchmark run canceled.")
                    return
                
                # Filter models to only include working ones
                self.openai_models = [model for model in self.openai_models if model in working_models]
                self.gemini_models = [model for model in self.gemini_models if model in working_models]
                
                print(f"Continuing with working models:")
                print(f"  OpenAI models: {self.openai_models}")
                print(f"  Gemini models: {self.gemini_models}")
        
        for dataset_name in self.prompt_templates.keys():
            # Prepare examples for this dataset
            examples = self.prepare_examples(dataset_name)
            
            for mode in modes:
                # Run OpenAI models
                for model_name in self.openai_models:
                    self.run_openai(mode, dataset_name, model_name, examples)
                
                # Run Gemini models
                for model_name in self.gemini_models:
                    self.run_gemini(mode, dataset_name, model_name, examples)

In [119]:
# Usage:
if __name__ == "__main__":
    # Create benchmark runner with validated models
    runner = BenchmarkRunner(
        datasets=datasets,
        template_functions=template_functions,
        prompt_templates=prompt_templates,
        openai_models=openai_models,
        gemini_models=gemini_models,
        system_prompts=system_prompts,
        output_dir="results",
        sample_size=20
    )
    
    # Run benchmarks (test_first=False since we already tested)
    runner.run_all_benchmarks(modes=["zero_shot"], test_first=False)

Gemini gemini-1.5-flash on strategyqa:  10%|█         | 2/20 [01:27<13:44, 45.83s/it]

Error with Gemini model gemini-1.5-flash: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'The model is overloaded. Please try again later.', 'status': 'UNAVAILABLE'}}


Gemini gemini-1.5-flash on strategyqa:  15%|█▌        | 3/20 [01:46<09:27, 33.36s/it]

Error with Gemini model gemini-1.5-flash: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'The model is overloaded. Please try again later.', 'status': 'UNAVAILABLE'}}


Gemini gemini-1.5-flash on strategyqa:  20%|██        | 4/20 [02:03<07:09, 26.82s/it]

Error with Gemini model gemini-1.5-flash: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'The model is overloaded. Please try again later.', 'status': 'UNAVAILABLE'}}


Gemini gemini-1.5-flash on strategyqa:  25%|██▌       | 5/20 [02:24<06:13, 24.93s/it]

Error with Gemini model gemini-1.5-flash: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'The model is overloaded. Please try again later.', 'status': 'UNAVAILABLE'}}


Gemini gemini-1.5-flash on strategyqa:  30%|███       | 6/20 [03:29<08:09, 34.97s/it]


KeyboardInterrupt: 

In [97]:
csqa

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 9741
    })
    validation: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1221
    })
    test: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1140
    })
})

In [None]:
# Full benchmark runner with OpenAI and Gemini client support + Gemini key rotation

import os
import time
import json
from typing import List, Dict, Callable
from dotenv import load_dotenv
from tqdm import tqdm

# Load environment variables
load_dotenv()

# Install dependencies if needed:
# pip install openai google-generativeai tqdm python-dotenv

# OpenAI Setup
from openai import OpenAI
openai_api_key = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=openai_api_key)

# Gemini Setup
from google import genai
from google.api_core.exceptions import GoogleAPICallError, RetryError, InvalidArgument

# Load Gemini API keys
def load_gemini_keys(file_path="gemini_keys.txt"):
    with open(file_path, "r") as f:
        keys = [line.strip() for line in f if line.strip()]
    if not keys:
        raise ValueError("No Gemini API keys found in file.")
    return keys

# Rotating Gemini client
class GeminiClientWithRotation:
    def __init__(self, keys: List[str], model_name: str = "gemini-pro"):
        self.keys = keys
        self.model_name = model_name
        self.current_key_index = 0
        self.configure_genai(self.keys[self.current_key_index])

    def configure_genai(self, key: str):
        genai.configure(api_key=key)
        self.model = genai.GenerativeModel(self.model_name)

    def rotate_key(self):
        self.current_key_index += 1
        if self.current_key_index >= len(self.keys):
            raise RuntimeError("All Gemini API keys failed.")
        self.configure_genai(self.keys[self.current_key_index])

    def generate(self, prompt: str, instructions: str = "", temperature: float = 0.0):
        retries = 0
        while retries < len(self.keys):
            try:
                response = self.model.generate_content(
                    prompt,
                    generation_config=genai.types.GenerationConfig(temperature=temperature),
                    safety_settings=None,
                    system_instruction=instructions
                )
                return response.text
            except (GoogleAPICallError, RetryError, InvalidArgument) as e:
                print(f"❌ Gemini Key {self.keys[self.current_key_index][:8]}... failed: {type(e).__name__}")
                retries += 1
                self.rotate_key()
                time.sleep(1)
        raise RuntimeError("Failed to generate response with all Gemini API keys.")

# Test OpenAI and Gemini models
def test_models(openai_models, gemini_models, gemini_keys):
    results = {}
    test_prompt = "Complete this sentence: The sky is"

    print("Testing OpenAI models...")
    for model in openai_models:
        print(f"  Testing {model}...", end="", flush=True)
        try:
            response = openai_client.responses.create(
                model=model,
                instructions="You are a helpful assistant.",
                input=test_prompt,
                temperature=0.0
            )
            print(f" ✓ {response.output_text[:30]}...")
            results[model] = True
        except Exception as e:
            print(f" ❌ Failed: {str(e)[:100]}")
            results[model] = False

    print("\nTesting Gemini models...")
    for model in gemini_models:
        print(f"  Testing {model}...", end="", flush=True)
        try:
            client = GeminiClientWithRotation(gemini_keys, model_name=model)
            response = client.generate(test_prompt)
            print(f" ✓ {response[:30]}...")
            results[model] = True
        except Exception as e:
            print(f" ❌ Failed: {str(e)[:100]}")
            results[model] = False

    return results

# Benchmark Runner
class BenchmarkRunner:
    def __init__(
        self,
        datasets: Dict[str, any],
        template_functions: Dict[str, Callable],
        prompt_templates: Dict[str, str],
        openai_models: List[str],
        gemini_models: List[str],
        system_prompts: Dict[str, str],
        gemini_keys: List[str],
        output_dir: str = "results",
        sample_size: int = 20
    ):
        self.datasets = datasets
        self.template_functions = template_functions
        self.prompt_templates = prompt_templates
        self.openai_models = openai_models
        self.gemini_models = gemini_models
        self.system_prompts = system_prompts
        self.gemini_keys = gemini_keys
        self.output_dir = output_dir
        self.sample_size = sample_size

        os.makedirs(output_dir, exist_ok=True)

    def prepare_examples(self, dataset_name: str):
        examples = []
        if dataset_name in self.datasets:
            dataset_examples = self.datasets[dataset_name]
        else:
            raise ValueError(f"Unknown dataset: {dataset_name}")

        template_func = self.template_functions[dataset_name]
        for i in range(min(self.sample_size, len(dataset_examples))):
            example = dataset_examples[i]
            question, answer = template_func(example)
            examples.append((question, answer))

        return examples

    def run_openai(self, prompt_type, dataset_name, model_name, examples):
        results = []

        for i in tqdm(range(len(examples)), desc=f"OpenAI {model_name} on {dataset_name}"):
            question, original_answer = examples[i]

            if prompt_type == "few_shot":
                prompt = self.prompt_templates[dataset_name].replace("{Question}", question)
                instructions = self.system_prompts["few_shot"]
            else:
                prompt = question
                instructions = self.system_prompts["zero_shot"]

            try:
                response = openai_client.responses.create(
                    model=model_name,
                    instructions=instructions,
                    input=prompt,
                    temperature=0.0
                )
                answer = response.output_text
            except Exception as e:
                answer = f"ERROR: {str(e)}"

            results.append({
                "question": question,
                "original_answer": original_answer,
                "answer_text": answer
            })

        with open(os.path.join(self.output_dir, f"{dataset_name}-{prompt_type}-{model_name}.json"), "w") as f:
            json.dump(results, f, indent=4, ensure_ascii=False)

    def run_gemini(self, prompt_type, dataset_name, model_name, examples):
        results = []
        client = GeminiClientWithRotation(self.gemini_keys, model_name=model_name)

        for i in tqdm(range(len(examples)), desc=f"Gemini {model_name} on {dataset_name}"):
            question, original_answer = examples[i]

            if prompt_type == "few_shot":
                prompt = self.prompt_templates[dataset_name].replace("{Question}", question)
                instructions = self.system_prompts["few_shot"]
            else:
                prompt = question
                instructions = self.system_prompts["zero_shot"]

            try:
                answer = client.generate(prompt, instructions=instructions)
            except Exception as e:
                answer = f"ERROR: {str(e)}"

            results.append({
                "question": question,
                "original_answer": original_answer,
                "answer_text": answer
            })
            time.sleep(2)

        with open(os.path.join(self.output_dir, f"{dataset_name}-{prompt_type}-{model_name}.json"), "w") as f:
            json.dump(results, f, indent=4, ensure_ascii=False)

    def run_all_benchmarks(self, modes=["few_shot", "zero_shot"], test_first=True):
        if test_first:
            model_status = test_models(self.openai_models, self.gemini_models, self.gemini_keys)
            self.openai_models = [m for m in self.openai_models if model_status.get(m, False)]
            self.gemini_models = [m for m in self.gemini_models if model_status.get(m, False)]

        for dataset_name in self.prompt_templates:
            examples = self.prepare_examples(dataset_name)
            for mode in modes:
                for model in self.openai_models:
                    self.run_openai(mode, dataset_name, model, examples)
                for model in self.gemini_models:
                    self.run_gemini(mode, dataset_name, model, examples)

