In [None]:
# Standard library imports
import glob
import gzip
import itertools
import json
import os
import re
import time
from pprint import pprint
from typing import Any, Callable, Dict, List, Optional, Tuple

# Third-party imports
import torch
import requests
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from datasets import load_dataset

# Model and AI libraries
from openai import OpenAI
from google import genai
from google.genai import types
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

In [None]:
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# The OpenAI client is later constructed without arguments, so it depends on
# OPENAI_API_KEY being present in the environment. Fail fast with a readable
# message instead of the TypeError that assigning None into os.environ raises.
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it or add it to a .env file "
        "before running the OpenAI benchmarks."
    )

In [None]:
# Load the five benchmark datasets used throughout the notebook.

# 1. CSQA – CommonsenseQA (train/dev/test splits)
csqa = load_dataset("commonsense_qa")

# 2. StrategyQA – yes/no questions requiring multi-hop reasoning
strategyqa = load_dataset("metaeval/strategy-qa")

# 3. Date Understanding – BIG-bench task file fetched as raw JSON.
#    A timeout keeps the cell from hanging, and raise_for_status() turns an HTTP
#    error page into an explicit exception instead of a confusing JSON decode error.
date_url = "https://raw.githubusercontent.com/google/BIG-bench/main/bigbench/benchmark_tasks/date_understanding/task.json"
date_response = requests.get(date_url, timeout=30)
date_response.raise_for_status()
date = json.loads(date_response.text)

# 4. Sports Understanding – sports-related questions (same BIG-bench JSON layout)
sports_url = "https://raw.githubusercontent.com/google/BIG-bench/main/bigbench/benchmark_tasks/sports_understanding/task.json"
sports_response = requests.get(sports_url, timeout=30)
sports_response.raise_for_status()
sports = json.loads(sports_response.text)

# 5. SayCan – Language→robot-action mapping (Google Robotics, 2022)
saycan = load_dataset("chiayewken/saycan")

In [None]:
# template functions for commonsense reasoning tasks

def csqa_template(example: dict) -> tuple:
    """
    Render one CommonsenseQA record as a (question, answer) pair.

    Args:
        example (dict): Record with 'question', 'choices' (a dict holding
            parallel 'label' and 'text' lists) and 'answerKey'.
    Returns:
        tuple: (question followed by the lower-cased labelled choices,
                "The answer is (<lower-cased answerKey>)")
    """
    choices = example['choices']
    lowered = [tag.lower() for tag in choices['label']]

    # One "(label) text" fragment per option, in dataset order.
    rendered = []
    for tag, body in zip(lowered, choices['text']):
        rendered.append(f"({tag}) {body}")

    prompt = f"{example['question']} Answer Choices: {' '.join(rendered)}"
    gold = f"The answer is ({example['answerKey'].lower()})"
    return prompt, gold

def strategyqa_template(example: dict) -> tuple:
    """
    Render one StrategyQA record as a (question, answer) pair.

    Args:
        example (dict): Record with 'question' and a truthy/falsy 'answer'.
    Returns:
        tuple: (the question unchanged, "The answer is yes" or "The answer is no")
    """
    prompt = example['question']
    if example['answer']:
        verdict = "yes"
    else:
        verdict = "no"
    return prompt, f"The answer is {verdict}"

def date_template(example: dict) -> tuple:
    """
    Render one date-understanding record as a (question, answer) pair.

    Args:
        example (dict): Record with 'input' (the question text) and
            'target_scores' (a mapping from candidate answer to score).
    Returns:
        tuple: (the question unchanged, "The answer is <candidate>") where the
            candidate is the first key whose score equals 1. If no candidate
            has score 1 the answer text is "The answer is None".
    """
    prompt = example['input']

    # First candidate scored 1 wins; None is kept as the "not found" marker.
    winner = None
    for candidate, score in example['target_scores'].items():
        if score == 1:
            winner = candidate
            break

    gold = "The answer is " + str(winner)
    return prompt, gold

def sports_template(example: dict) -> tuple:
    """
    Render one sports-understanding record as a (question, answer) pair.

    Args:
        example (dict): Record with 'input' (the sentence to judge) and
            'target_scores' (a mapping that may contain a 'plausible' score).
    Returns:
        tuple: (the sentence wrapped in the plausibility question,
                "The answer is yes" when the 'plausible' score equals 1,
                otherwise "The answer is no")
    """
    prompt = 'Is the following sentence plausible? "{}"'.format(example["input"])

    # A missing 'plausible' entry is treated as score 0, i.e. "no".
    plausible_score = example["target_scores"].get("plausible", 0)
    if plausible_score == 1:
        verdict = "yes"
    else:
        verdict = "no"

    return prompt, "The answer is " + verdict

def saycan_template(example: dict) -> tuple:
    """
    Render one SayCan record as a (question, answer) pair.

    Args:
        example (dict): Record with 'INPUT' (the instruction text) and
            'OUTPUT' (the reference output for that instruction).
    Returns:
        tuple: (the 'INPUT' value unchanged, "The answer is <OUTPUT>")
    """
    instruction = example['INPUT']
    gold = "The answer is {}".format(example['OUTPUT'])
    return instruction, gold
  

In [None]:
# quick smoke test for templates

# Hand-written examples, one per dataset, shaped like the records each template
# function reads. Keys must match what the templates index into.
REAL_EXAMPLES = {
    "csqa": {
        "question": "Where would you find a hamburger?",
        "choices": {
            "label": ["A", "B", "C", "D", "E"],
            "text": ["fast food restaurant", "pizza", "kitchen", "grocery store", "farms"]
        },
        "answerKey": "A"
    },
    "strategyqa": {
        "question": "Would a person with a high fever typically sweat?",
        "answer": True
    },
    "date": {
        "input": "What day of the week was 2020-02-02?",
        "target_scores": {"Sunday": 1, "Monday": 0, "Tuesday": 0}
    },
    "sports": {
        "input": "The quarterback threw a touchdown pass.",
        "target_scores": {"plausible": 1}
    },
    "saycan": {
        "INPUT": "Bring me a cold drink.",
        # saycan_template reads example['OUTPUT']; the previous "PlAN" key made
        # the smoke test fail with a KeyError.
        "OUTPUT": "open fridge, pick drink, close fridge, bring drink"
    }
}

def test_smoke():
    """Run every template on its hand-crafted example and check the output shape.

    For each dataset the rendered question and answer are printed, then both
    are asserted to be non-blank strings.
    """
    cases = (
        ("csqa", csqa_template),
        ("strategyqa", strategyqa_template),
        ("date", date_template),
        ("sports", sports_template),
        ("saycan", saycan_template),
    )

    for name, render in cases:
        print(f"\n--- {name} ---")
        question, answer = render(REAL_EXAMPLES[name])
        print("Question:", question)
        print("Answer  :", answer)
        # basic assertions: question first, then answer
        for rendered in (question, answer):
            assert isinstance(rendered, str) and rendered.strip()

test_smoke()

In [None]:
# Few-shot chain-of-thought prompt for CommonsenseQA: seven worked exemplars
# followed by a literal "{Question}" placeholder. The placeholder is filled in
# with str.replace, so no other brace escaping is needed.
# Fix: the fox exemplar reasons to "natural habitat", which is choice (c); it
# previously concluded "(b)", contradicting its own explanation.

CSQA_PROMPT = """

Q: What do people use to absorb extra ink from a fountain pen? Answer Choices: (a) shirt pocket (b)
calligrapher’s hand (c) inkwell (d) desk drawer (e) blotter
A: The answer must be an item that can absorb ink. Of the above choices, only blotters are used to absorb ink.
So the answer is (e).
Q: What home entertainment equipment requires cable?
Answer Choices: (a) radio shack (b) substation (c) television (d) cabinet
A: The answer must require cable. Of the above choices, only television requires cable. So the answer is (c).
Q: The fox walked from the city into the forest, what was it looking for? Answer Choices: (a) pretty flowers (b)
hen house (c) natural habitat (d) storybook
A: The answer must be something in the forest. Of the above choices, only natural habitat is in the forest. So the
answer is (c).
Q: Sammy wanted to go to where the people were. Where might he go? Answer Choices: (a) populated areas
(b) race track (c) desert (d) apartment (e) roadblock
A: The answer must be a place with a lot of people. Of the above choices, only populated areas have a lot of
people. So the answer is (a).
Q: Where do you put your grapes just before checking out? Answer Choices: (a) mouth (b) grocery cart (c)super
market (d) fruit basket (e) fruit market
A: The answer should be the place where grocery items are placed before checking out. Of the above choices,
grocery cart makes the most sense for holding grocery items. So the answer is (b).
Q: Google Maps and other highway and street GPS services have replaced what? Answer Choices: (a) united
states (b) mexico (c) countryside (d) atlas
A: The answer must be something that used to do what Google Maps and GPS services do, which is to give
directions. Of the above choices, only atlases are used to give directions. So the answer is (d).
Q: Before getting a divorce, what did the wife feel who was doing all the work? Answer Choices: (a) harder (b)
anguish (c) bitterness (d) tears (e) sadness
A: The answer should be the feeling of someone getting divorced who was doing all the work. Of the above
choices, the closest feeling is bitterness. So the answer is (c).
Q: {Question}
A:

"""

In [None]:
# Few-shot chain-of-thought prompt for StrategyQA: six worked yes/no exemplars
# followed by a literal "{Question}" placeholder (filled in with str.replace).
# Every exemplar ends with "So the answer is yes." or "So the answer is no.".
# NOTE(review): only the last four exemplars carry the "Yes or no:" prefix;
# confirm that mix is intended before normalising it.

StrategyQA_PROMPT = """

Q: Do hamsters provide food for any animals?
A: Hamsters are prey animals. Prey are food for predators. Thus, hamsters provide food for some animals. So
the answer is yes.
Q: Could Brooke Shields succeed at University of Pennsylvania?
A: Brooke Shields went to Princeton University. Princeton University is about as academically rigorous as the
University of Pennsylvania. Thus, Brooke Shields could also succeed at the University of Pennsylvania. So the
answer is yes.
Q: Yes or no: Hydrogen’s atomic number squared exceeds number of Spice Girls?
A: Hydrogen has an atomic number of 1. 1 squared is 1. There are 5 Spice Girls. Thus, Hydrogen’s atomic
number squared is less than 5. So the answer is no.
Q: Yes or no: Is it common to see frost during some college commencements?
A: College commencement ceremonies can happen in December, May, and June. December is in the winter, so
there can be frost. Thus, there could be frost at some commencements. So the answer is yes.
Q: Yes or no: Could a llama birth twice during War in Vietnam (1945-46)?
A: The War in Vietnam was 6 months. The gestation period for a llama is 11 months, which is more than 6
months. Thus, a llama could not give birth twice during the War in Vietnam. So the answer is no.
Q: Yes or no: Would a pear sink in water?
A: The density of a pear is about 0.6g/cm3, which is less than water. Objects less dense than water float. Thus,
a pear would float. So the answer is no.
Q: {Question}
A:

"""

In [None]:
# Few-shot chain-of-thought prompt for date understanding: six worked exemplars
# followed by a literal "{Question}" placeholder (filled in with str.replace).
# Fix: the last exemplar's final answer now ends with a period like the other
# five, so the demonstrated answer format is consistent.

Date_PROMPT = """

Q: 2015 is coming in 36 hours. What is the date one week from today in MM/DD/YYYY?
A: If 2015 is coming in 36 hours, then it is coming in 2 days. 2 days before 01/01/2015 is 12/30/2014, so today
is 12/30/2014. So one week from today will be 01/05/2015. So the answer is 01/05/2015.
Q: The first day of 2019 is a Tuesday, and today is the first Monday of 2019. What is the date today in
MM/DD/YYYY?
A: If the first day of 2019 was Tuesday, then 01/01/2019 was a Tuesday. Today is the first monday, would be six
days later. So today is 01/07/2019. So the answer is 01/07/2019.
Q: The concert was scheduled to be on 06/01/1943, but was delayed by one day to today. What is the date 10
days ago in MM/DD/YYYY?
A: One day after 06/01/1943 is 06/02/1943, so today is 06/02/1943. 10 days before today is 05/23/1943. So the
answer is 05/23/1943.
Q: It is 4/19/1969 today. What is the date 24 hours later in MM/DD/YYYY?
A: Today is 04/19/1969. 24 hours later is one day after today, which would be 04/20/1969. So the answer is
04/20/1969.
Q: Jane thought today is 3/11/2002, but today is in fact Mar 12, which is 1 day later. What is the date 24 hours
later in MM/DD/YYYY?
A: Today is 03/12/2002. So the date 24 hours later will be 03/13/2002. So the answer is 03/13/2002.
Q: Jane was born on the last day of Feburary in 2001. Today is her 16-year-old birthday. What is the date
yesterday in MM/DD/YYYY?
A: The last day of February is the 28th, so Jane was born on 02/28/2001. Today is her 16-year old birthday, so
today is 02/28/2017. So yesterday was 02/27/2017. So the answer is 02/27/2017.
Q: {Question}
A:

"""


In [None]:
# Few-shot chain-of-thought prompt for sports understanding: eight worked
# plausibility exemplars followed by a literal "{Question}" placeholder
# (filled in with str.replace).
# NOTE(review): the exemplars quote sentences with curly quotes, whereas
# sports_template wraps the sentence in straight double quotes — confirm the
# mismatch is acceptable.
# NOTE(review): the fifth exemplar spells the player "Jamel" in the question
# and "Jamal" in the answer — confirm whether this is copied verbatim from the
# source prompt before correcting it.

Sports_PROMPT = """

Q: Is the following sentence plausible? “Kyle Palmieri was called for slashing.”
A: Kyle Palmieri is a hockey player. Being called for slashing is part of hockey. So the answer is yes.
Q: Is the following sentence plausible? “Joao Moutinho caught the screen pass in the NFC championship.”
A: Joao Moutinho is a soccer player. The NFC championship is part of American football, not soccer. So the
answer is no.
Q: Is the following sentence plausible? “Carson Wentz set the pick and roll.”
A: Carson Wentz is an American football player. Pick and roll is part of basketball, not football. So the answer
is no.
Q: Is the following sentence plausible? “Jonas Valanciunas beat the buzzer.”
A: Jonas Valanciunas is a basketball player. Beating the buzzer is part of basketball. So the answer is yes.
Q: Is the following sentence plausible? “Jamel Murray was perfect from the line.”
A: Jamal Murray is a basketball player. Being perfect from the line is part of basketball. So the answer is yes.
Q: Is the following sentence plausible? “Sam Darnold passed the puck.”
A: Sam Darnold is a American football player. Passing the puck is part of hockey, not American football. So the
answer is no.
Q: Is the following sentence plausible? “Draymond Green threw a touchdown.”
A: Draymond Green is an basketball player. Throwing a touchdown is part of football, not basketball. So the
answer is no.
Q: Is the following sentence plausible? “Malcolm Brogdon banked the shot in.”
A: Malcolm Brogdon is a basketball player. Banking the shot in is part of basketball. So the answer is yes.
Q: {Question}
A:

"""

In [None]:
# prompt for saycan 

SayCan_PROMPT = """

Locations = [counter, table, user, trash, bowl].
Objects = [7up, apple, kettle chips, tea, multigrain chips, coke, lime soda, jalapeno chips, rice chips, orange,
grapefruit soda, pepsi, redbull, energy bar, sponge, water].
The robot can pick up items with pick(object) and put down items with put(object) as well as find objects or
locations with find(). The robot can only understand the explicit locations and objects listed.
Human: I’m hungry, can you bring me some chips.
Explanation: The user is hungry and has asked for chips. There are several types of chips available, I will bring
the user the kettle chips.
Plan: 1. find(kettle chips), 2. pick(kettle chips), 3. find(user), 4. put(kettle chips), 5. done().
Human: How would you move the grapefruit drink from the table to the counter?
Explanation: The user has asked me to move the grapefruit drink to the counter.
Plan: 1. find(grapefruit soda), 2. pick(grapefruit soda), 3. find(counter), 4. put(grapefruit soda), 5. done().
Human: How would you bring me some snacks?
Explanation: The user has asked for snacks, I will choose two items and bring them. I will bring jalapeno chips
and an apple.
Plan: 1. find(jalapeno chips), 2. pick(jalapeno chips), 3. find(user), 4. put(jalapeno chips), 5. find(apple), 6.
pick(apple), 7. find(user), 8. put(apple), 9. done().
Human: How would you bring me something to eat that isn’t a fruit?
Explanation: The user has asked for a food that isn’t a fruit, I will bring an energy bar to them.
Plan: 1. find(energy bar), 2. pick(energy bar), 3. find(user), 4. put(energy bar), 5. done().
Human: How would you put the rice chips in the bowl and then move the tea to the table?
Explanation: The user has asked me to do two tasks, I will do one and then the other.
Plan: 1. find(rice chips), 2. pick(rice chips), 3. find(bowl), 4. put(rice chips), 5. find(tea), 6. pick(tea), 7.
find(table), 8. put(tea), 9. done().
Human: How would you throw away a redbull?
Explanation: The user has asked me to throw away the redbull, I will move it to the trash.
Plan: 1. find(redbull), 2. pick(redbull), 3. find(trash), 4. put(redbull), 5. done().
Human: Bring me a drink.
Explanation: The user has asked for a drink and there are many options. I will bring them a water.
Plan: 1. find(water), 2. pick(water), 3. find(user), 4. put(water), 5. done().


Human: {Question}
Explanation:
Plan:
"""

In [None]:
# Registries keyed by dataset name. The three dicts share the same keys, in the
# same order, so a dataset name selects its renderer, its few-shot prompt and
# its loaded data.

# Dataset name -> function turning a raw record into (question, answer)
template_functions = dict(
    csqa=csqa_template,
    strategyqa=strategyqa_template,
    date=date_template,
    sports=sports_template,
    saycan=saycan_template,
)

# Dataset name -> few-shot prompt containing the "{Question}" placeholder
prompt_templates = dict(
    csqa=CSQA_PROMPT,
    strategyqa=StrategyQA_PROMPT,
    date=Date_PROMPT,
    sports=Sports_PROMPT,
    saycan=SayCan_PROMPT,
)

# Dataset name -> loaded dataset object
datasets = dict(
    csqa=csqa,
    strategyqa=strategyqa,
    date=date,
    sports=sports,
    saycan=saycan,
)

# Models to benchmark, grouped by provider
openai_models = [
    "gpt-4.1-nano",
    "gpt-4o-mini-2024-07-18",
]
gemini_models = [
    "gemini-1.5-flash",
    "gemini-2.0-flash",
]

# Prompting mode -> system instruction sent with every request in that mode
system_prompts = {
    "few_shot": (
        "You are a helpful assistant. "
        "Carefully follow the reasoning and answer format shown in the few-shot examples provided by the user. "
        "Use the same step-by-step explanation style and answer format."
    ),
    "zero_shot": (
        "You are a helpful assistant. "
        "First, provide a detailed answer to the question. "
        "Then, on a new line, clearly state your final answer in the format: The answer is {answer}. "
        "Only include the final answer in that line, without any explanation."
    ),
}


In [None]:
# Benchmark runner for the OpenAI and Gemini APIs (the local Gemma model is handled by a separate runner further down)

gemini_keys = [
# add gemini keys here
]

import os
import json
import time
import itertools
from typing import Dict, List, Tuple, Callable, Any
from tqdm import tqdm
from openai import OpenAI
from google import genai
from google.genai import types

class BenchmarkRunner:
    """Run the few-shot and zero-shot benchmarks against OpenAI and Gemini models.

    For every dataset the runner renders up to ``sample_size`` examples with the
    dataset's template function, sends each question to each model, and writes
    one JSON file per (dataset, prompt type, model) into ``output_dir``.
    A request that fails is recorded as an answer starting with "ERROR:"
    instead of aborting the run.

    NOTE(review): Gemini requests do not set a temperature while OpenAI
    requests use temperature=0.0 — confirm whether that asymmetry is intended.
    """

    # Where each dataset keeps its examples: a split name for the Hugging Face
    # datasets, or the top-level JSON key for the BIG-bench task files.
    _EXAMPLE_SOURCES = {
        "csqa": "train",
        "strategyqa": "train",
        "date": "examples",
        "sports": "examples",
        "saycan": "test",
    }

    def __init__(self,
                 datasets: Dict[str, Any],
                 template_functions: Dict[str, Callable],
                 prompt_templates: Dict[str, str],
                 openai_models: List[str],
                 gemini_models: List[str],
                 system_prompts: Dict[str, str],
                 gemini_keys: List[str],
                 output_dir: str = "results",
                 sample_size: int = 20):
        """Store the configuration, create the OpenAI client and the output directory.

        Args:
            datasets: Dataset name -> loaded dataset.
            template_functions: Dataset name -> callable returning (question, answer).
            prompt_templates: Dataset name -> few-shot prompt with a "{Question}" placeholder.
            openai_models: OpenAI model names to benchmark.
            gemini_models: Gemini model names to benchmark.
            system_prompts: Prompt type ("few_shot"/"zero_shot") -> system instruction.
            gemini_keys: Gemini API keys, used in rotation.
            output_dir: Directory receiving the result files (created if missing).
            sample_size: Maximum number of examples taken from each dataset.
        """
        self.datasets = datasets
        self.template_functions = template_functions
        self.prompt_templates = prompt_templates
        self.openai_models = openai_models
        self.gemini_models = gemini_models
        self.system_prompts = system_prompts
        self.gemini_keys = gemini_keys
        self.key_cycle = itertools.cycle(gemini_keys)
        self.openai_client = OpenAI()
        self.output_dir = output_dir
        self.sample_size = sample_size
        os.makedirs(output_dir, exist_ok=True)

    def _get_next_key(self) -> str:
        """Return the next Gemini API key in the rotation.

        Raises:
            RuntimeError: If no keys are configured (an empty cycle would
                otherwise leak a bare StopIteration).
        """
        if not self.gemini_keys:
            raise RuntimeError("No Gemini API keys configured")
        return next(self.key_cycle)

    def _save_results(self, results: List[Dict], dataset_name: str, prompt_type: str, model_name: str):
        """Write ``results`` to ``<output_dir>/<dataset>-<prompt_type>-<model>.json``."""
        path = os.path.join(self.output_dir, f"{dataset_name}-{prompt_type}-{model_name}.json")
        with open(path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print("Saved:", path)

    def run_openai(self, prompt_type: str, dataset_name: str, model_name: str, examples: List[Tuple[str, Any]]):
        """Query one OpenAI model with every example and save the answers.

        Args:
            prompt_type: "few_shot" or "zero_shot"; selects the system prompt
                and whether the few-shot template is applied.
            dataset_name: Dataset the examples belong to.
            model_name: OpenAI model to call.
            examples: (question, reference answer) pairs.
        """
        results = []
        instructions = self.system_prompts[prompt_type]
        for question, original in tqdm(examples, desc=f"OpenAI {model_name}"):
            prompt = self._prepare_prompt(prompt_type, dataset_name, question)
            try:
                resp = self.openai_client.responses.create(
                    model=model_name,
                    instructions=instructions,
                    input=prompt,
                    temperature=0.0
                )
                answer = resp.output_text
            except Exception as e:
                # Record the failure and keep going so one bad request does not lose the run.
                answer = f"ERROR: {e}"
            results.append({"question": question, "original": original, "answer": answer})
        self._save_results(results, dataset_name, prompt_type, model_name)

    def _gemini_generate(self, model_name: str, prompt: str, instructions: str) -> str:
        """Send one prompt to Gemini, trying each configured key at most once.

        Returns:
            The response text of the first successful call, or a string starting
            with "ERROR:" describing the last failure. When no keys are configured
            an "ERROR:" string is returned as well, so the caller never sees an
            unbound or stale answer.
        """
        answer = "ERROR: no Gemini API keys configured"
        for _ in range(len(self.gemini_keys)):
            key = self._get_next_key()
            client = genai.Client(api_key=key)
            try:
                resp = client.models.generate_content(
                    model=model_name,
                    contents=prompt,
                    config=types.GenerateContentConfig(system_instruction=instructions)
                )
                return resp.text
            except Exception as e:
                answer = f"ERROR: {e}"
                time.sleep(1)  # brief pause before rotating to the next key
        return answer

    def run_gemini(self, prompt_type: str, dataset_name: str, model_name: str, examples: List[Tuple[str, Any]]):
        """Query one Gemini model with every example and save the answers.

        Args:
            prompt_type: "few_shot" or "zero_shot"; selects the system prompt
                and whether the few-shot template is applied.
            dataset_name: Dataset the examples belong to.
            model_name: Gemini model to call.
            examples: (question, reference answer) pairs.
        """
        results = []
        instructions = self.system_prompts[prompt_type]
        for question, original in tqdm(examples, desc=f"Gemini {model_name}"):
            prompt = self._prepare_prompt(prompt_type, dataset_name, question)
            answer = self._gemini_generate(model_name, prompt, instructions)
            results.append({"question": question, "original": original, "answer": answer})
        self._save_results(results, dataset_name, prompt_type, model_name)

    def _prepare_prompt(self, prompt_type: str, dataset_name: str, question: str) -> str:
        """Return the few-shot prompt with the question substituted, or the bare question.

        str.replace is used instead of str.format because the prompt is only
        meant to have its "{Question}" placeholder substituted.
        """
        if prompt_type == "few_shot":
            return self.prompt_templates[dataset_name].replace("{Question}", question)
        return question

    def prepare_examples(self, dataset_name: str):
        """Render the first ``sample_size`` examples of a dataset.

        Args:
            dataset_name: One of the keys of ``_EXAMPLE_SOURCES``.
        Returns:
            A list of (question, answer) tuples produced by the dataset's template function.
        Raises:
            ValueError: If ``dataset_name`` is not a known dataset.
        """
        if dataset_name not in self._EXAMPLE_SOURCES:
            raise ValueError(f"Unknown dataset: {dataset_name}")
        dataset_examples = self.datasets[dataset_name][self._EXAMPLE_SOURCES[dataset_name]]
        template_func = self.template_functions[dataset_name]

        # Index one row at a time rather than slicing, so each element handed to
        # the template is a single record whatever container holds the examples.
        limit = min(self.sample_size, len(dataset_examples))
        return [tuple(template_func(dataset_examples[i])) for i in range(limit)]

    def run_all_benchmarks(self, modes: Optional[List[str]] = None, test_first: bool = True):
        """Run every dataset x mode x model combination.

        Args:
            modes: Prompt types to run; defaults to ["few_shot", "zero_shot"].
            test_first: When True, probe each model once and drop the ones that fail.
        """
        if modes is None:
            modes = ["few_shot", "zero_shot"]

        if test_first:
            working_models = self._test_models()
            self.openai_models = [m for m in self.openai_models if m in working_models]
            self.gemini_models = [m for m in self.gemini_models if m in working_models]

        for dataset_name in self.prompt_templates.keys():
            examples = self.prepare_examples(dataset_name)
            for mode in modes:
                for model_name in self.openai_models:
                    self.run_openai(mode, dataset_name, model_name, examples)
                for model_name in self.gemini_models:
                    self.run_gemini(mode, dataset_name, model_name, examples)

    def _test_models(self) -> List[str]:
        """Send a tiny prompt to every configured model and return those that responded.

        Failures are printed rather than silently swallowed, and only
        ``Exception`` is caught so that Ctrl-C still interrupts the probe.
        """
        test_prompt = "Complete this sentence: The sky is"
        working_models = []
        print("Testing OpenAI models...")
        for model in self.openai_models:
            try:
                self.openai_client.responses.create(
                    model=model,
                    instructions="You are a helpful assistant.",
                    input=test_prompt,
                    temperature=0.0
                )
                working_models.append(model)
            except Exception as e:
                print(f"  {model} failed: {e}")

        print("Testing Gemini models...")
        for model in self.gemini_models:
            for _ in range(len(self.gemini_keys)):
                key = self._get_next_key()
                client = genai.Client(api_key=key)
                try:
                    client.models.generate_content(
                        model=model,
                        contents=test_prompt,
                        config=types.GenerateContentConfig(temperature=0.0)
                    )
                    working_models.append(model)
                    break
                except Exception as e:
                    print(f"  {model} failed with one key: {e}")
                    continue
        return working_models

In [None]:
# Usage
# Build the OpenAI/Gemini runner from the notebook-level configuration.
runner = BenchmarkRunner(
    # what to evaluate
    datasets=datasets,
    template_functions=template_functions,
    prompt_templates=prompt_templates,
    system_prompts=system_prompts,
    # which models, and the credentials for Gemini
    openai_models=openai_models,
    gemini_models=gemini_models,
    gemini_keys=gemini_keys,
    # where to write results and how many examples per dataset
    sample_size=20,
    output_dir="results",
)

# test_first=False skips the per-model probe request before the run.
runner.run_all_benchmarks(test_first=False, modes=["zero_shot", "few_shot"])

In [None]:
# Retry the Gemini 1.5 Flash queries whose saved answer is an "ERROR: ..." entry, rotating through the API keys

def fix_gemini_errors(results_dir: str = "results", gemini_keys: Optional[List[str]] = None):
    """
    Find and fix errors in Gemini 1.5 Flash JSON files by retrying with key rotation.

    Every ``*gemini-1.5-flash*.json`` file in ``results_dir`` is scanned for
    items whose answer starts with "ERROR:". Each such question is sent again,
    trying every key once, and the file is rewritten only when at least one
    item was fixed.

    Args:
        results_dir: Directory containing the result files.
        gemini_keys: API keys to rotate through. When omitted, the placeholder
            list inside the function is used.
    Returns:
        None. Progress is reported with print().
    """
    if gemini_keys is None:
        # Gemini API keys for rotation
        gemini_keys = [
            # Add your Gemini API keys here
        ]

    # Find all Gemini files
    gemini_files = glob.glob(os.path.join(results_dir, "*gemini-1.5-flash*.json"))
    print(f"Found {len(gemini_files)} Gemini 1.5 Flash result files.")

    # Without keys no request can be made; stop before touching any file.
    if not gemini_keys:
        print("No Gemini API keys configured; nothing can be retried.")
        return

    key_index = 0

    # Define system prompts
    system_prompts = {
        "few_shot": "You are a helpful assistant. Carefully follow the reasoning and answer format shown in the few-shot examples provided by the user. Use the same step-by-step explanation style and answer format.",
        "zero_shot": "You are a helpful assistant. First, provide a detailed answer to the question. Then, on a new line, clearly state your final answer in the format: The answer is {answer}. Only include the final answer in that line, without any explanation."
    }

    # Load prompt templates from notebook
    prompt_templates = {
        "csqa": CSQA_PROMPT,
        "strategyqa": StrategyQA_PROMPT,
        "date": Date_PROMPT,
        "sports": Sports_PROMPT,
        "saycan": SayCan_PROMPT
    }

    def get_next_key():
        """Return the next API key, wrapping around at the end of the list."""
        nonlocal key_index
        key = gemini_keys[key_index]
        key_index = (key_index + 1) % len(gemini_keys)
        return key

    def is_error(item) -> bool:
        """True when the saved answer is an "ERROR: ..." string.

        A missing or null answer (not a string) is not treated as an error
        entry and no longer crashes the scan.
        """
        answer = item.get("answer")
        return isinstance(answer, str) and answer.startswith("ERROR:")

    def retry_query(question, dataset_name=None, prompt_type=None):
        """Re-send one question, trying each key once; returns (text, error)."""
        last_error = None
        for attempt in range(len(gemini_keys)):
            key = get_next_key()
            client = genai.Client(api_key=key)

            try:
                # Determine if this is few-shot or zero-shot
                if prompt_type == "few_shot" and dataset_name in prompt_templates and prompt_templates[dataset_name]:
                    prompt = prompt_templates[dataset_name].replace("{Question}", question)
                    instructions = system_prompts["few_shot"]
                else:
                    prompt = question
                    instructions = system_prompts["zero_shot"]

                resp = client.models.generate_content(
                    model="gemini-1.5-flash",
                    contents=prompt,
                    config=types.GenerateContentConfig(
                        system_instruction=instructions
                    )
                )
                return resp.text, None
            except Exception as e:
                print(f"Key {attempt+1} failed: {str(e)[:100]}...")
                last_error = e
                time.sleep(2)  # Wait before trying the next key

        return None, f"ERROR: All keys failed. Last error: {last_error}"

    for file_path in gemini_files:
        file_name = os.path.basename(file_path)
        print(f"\nProcessing: {file_name}")

        # Extract dataset name and prompt type from filename
        # (layout: <dataset>-<prompt_type>-<model>.json)
        parts = file_name.replace(".json", "").split("-")
        if len(parts) < 3:
            print(f"  Skipping: Cannot parse filename format for {file_name}")
            continue

        dataset_name = parts[0]
        prompt_type = parts[1]  # "few_shot" or "zero_shot"

        # Load the JSON data
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Count errors
        error_count = sum(1 for item in data if is_error(item))
        if error_count == 0:
            print(f"  No errors found in {file_name}")
            continue

        print(f"  Found {error_count} errors to fix in {file_name}")

        # Process each error
        fixed_count = 0
        for i, item in enumerate(tqdm(data, desc=f"Fixing errors in {file_name}")):
            if is_error(item):
                question = item["question"]
                print(f"\n  Retrying question: {question[:50]}...")

                # Retry the query
                new_answer, error = retry_query(question, dataset_name, prompt_type)

                if error:
                    print(f"  Failed to fix item {i+1} after trying all keys")
                    continue

                # Update the item with the new answer
                data[i]["answer"] = new_answer
                fixed_count += 1
                print(f"  Fixed item {i+1}")
                time.sleep(1)  # Rate limiting

        # Save the updated file
        if fixed_count > 0:
            with open(file_path, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            print(f"  Saved {fixed_count}/{error_count} fixed items in {file_name}")
        else:
            print(f"  No items were fixed in {file_name}")

# Run the script
fix_gemini_errors()

In [None]:
# Run the benchmarks against a Gemma model served locally through Ollama

import os
import json
import time
import requests
from typing import Dict, List, Tuple, Callable, Any
from tqdm import tqdm

class BenchmarkRunner:
    """Run the few-shot and zero-shot benchmarks against a model served over HTTP by Ollama.

    For every dataset the runner renders up to ``sample_size`` examples, posts
    each prompt to ``ollama_url`` and writes one JSON file per
    (dataset, prompt type) into ``output_dir``. A request that fails is
    recorded as an answer starting with "ERROR:" instead of aborting the run.

    NOTE: this class has the same name as the OpenAI/Gemini runner defined
    earlier in the notebook; running this cell rebinds ``BenchmarkRunner``.
    """

    # Where each dataset keeps its examples: a split name for the Hugging Face
    # datasets, or the top-level JSON key for the BIG-bench task files.
    _EXAMPLE_SOURCES = {
        "csqa": "train",
        "strategyqa": "train",
        "date": "examples",
        "sports": "examples",
        "saycan": "test",
    }

    def __init__(self,
                 datasets: Dict[str, Any],
                 template_functions: Dict[str, Callable],
                 prompt_templates: Dict[str, str],
                 system_prompts: Dict[str, str],
                 output_dir: str = "results",
                 sample_size: int = 20,
                 ollama_url: str = "http://localhost:11434/api/generate",
                 gemma_model: str = "gemma:3b",
                 request_timeout: Optional[float] = 600.0):
        """Store the configuration and create the output directory.

        Args:
            datasets: Dataset name -> loaded dataset.
            template_functions: Dataset name -> callable returning (question, answer).
            prompt_templates: Dataset name -> few-shot prompt with a "{Question}" placeholder.
            system_prompts: Prompt type -> system instruction prepended to the prompt.
            output_dir: Directory receiving the result files (created if missing).
            sample_size: Maximum number of examples taken from each dataset.
            ollama_url: URL the generation requests are posted to.
            gemma_model: Model name sent in the request and used in file names.
            request_timeout: Seconds to wait for each HTTP response; None waits
                forever (the previous behaviour).
        """
        self.datasets = datasets
        self.template_functions = template_functions
        self.prompt_templates = prompt_templates
        self.system_prompts = system_prompts
        self.output_dir = output_dir
        self.sample_size = sample_size
        self.ollama_url = ollama_url
        self.gemma_model = gemma_model
        self.request_timeout = request_timeout
        os.makedirs(output_dir, exist_ok=True)

    def _save_results(self, results: List[Dict], dataset_name: str, prompt_type: str):
        """Write ``results`` to ``<output_dir>/<dataset>-<prompt_type>-<model>.json``.

        The ':' in the model name is replaced by '_' in the file name.
        """
        path = os.path.join(self.output_dir, f"{dataset_name}-{prompt_type}-{self.gemma_model.replace(':', '_')}.json")
        with open(path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print("Saved:", path)

    def _prepare_prompt(self, prompt_type: str, dataset_name: str, question: str, system_prompt: str) -> str:
        """Build the full prompt text sent to the model.

        For "zero_shot" the bare question is used; otherwise the dataset's
        few-shot template with "{Question}" replaced. A non-empty system prompt
        is prepended, separated by a blank line. Both parts are stripped.
        """
        if prompt_type == "zero_shot":
            prompt = question
        else:
            prompt = self.prompt_templates[dataset_name].replace("{Question}", question)
        if system_prompt:
            return f"{system_prompt.strip()}\n\n{prompt.strip()}"
        return prompt.strip()

    def run_gemma(self, prompt_type: str, dataset_name: str, examples: List[Tuple[str, Any]]):
        """Query the model with every example and save the answers.

        Args:
            prompt_type: Key into ``system_prompts``; "zero_shot" skips the few-shot template.
            dataset_name: Dataset the examples belong to.
            examples: (question, reference answer) pairs.
        """
        results = []
        system_prompt = self.system_prompts.get(prompt_type, "")

        for question, original in tqdm(examples, desc=f"Gemma {self.gemma_model}"):
            prompt = self._prepare_prompt(prompt_type, dataset_name, question, system_prompt)
            try:
                # The timeout keeps a stalled server from blocking the whole run;
                # a timed-out request is recorded as an ERROR answer below.
                response = requests.post(
                    self.ollama_url,
                    json={
                        "model": self.gemma_model,
                        "prompt": prompt,
                        "stream": False
                    },
                    timeout=self.request_timeout,
                )
                response.raise_for_status()
                answer = response.json().get("response", "")
            except Exception as e:
                answer = f"ERROR: {e}"
            results.append({"question": question, "original": original, "answer": answer})

        self._save_results(results, dataset_name, prompt_type)

    def prepare_examples(self, dataset_name: str):
        """Render the first ``sample_size`` examples of a dataset.

        Args:
            dataset_name: One of the keys of ``_EXAMPLE_SOURCES``.
        Returns:
            A list of (question, answer) tuples produced by the dataset's template function.
        Raises:
            ValueError: If ``dataset_name`` is not a known dataset.
        """
        if dataset_name not in self._EXAMPLE_SOURCES:
            raise ValueError(f"Unknown dataset: {dataset_name}")
        dataset_examples = self.datasets[dataset_name][self._EXAMPLE_SOURCES[dataset_name]]
        template_func = self.template_functions[dataset_name]

        # Index one row at a time rather than slicing, so each element handed to
        # the template is a single record whatever container holds the examples.
        limit = min(self.sample_size, len(dataset_examples))
        return [tuple(template_func(dataset_examples[i])) for i in range(limit)]

    def run_all_benchmarks(self, modes: Optional[List[str]] = None):
        """Run every dataset x mode combination against the configured model.

        Args:
            modes: Prompt types to run; defaults to ["few_shot", "zero_shot"].
        """
        if modes is None:
            modes = ["few_shot", "zero_shot"]
        for dataset_name in self.prompt_templates.keys():
            examples = self.prepare_examples(dataset_name)
            for mode in modes:
                self.run_gemma(mode, dataset_name, examples)

In [None]:
if __name__ == "__main__":
    # Build the Ollama-backed runner from the notebook-level configuration
    # and run every dataset in both prompting modes (the method's default).
    runner = BenchmarkRunner(
        # which model to query — or another variant like gemma:2b-it
        gemma_model="gemma3:1b",
        # what to evaluate
        datasets=datasets,
        template_functions=template_functions,
        prompt_templates=prompt_templates,
        system_prompts=system_prompts,
        # where to write results and how many examples per dataset
        sample_size=20,
        output_dir="results",
    )
    runner.run_all_benchmarks()