In [2]:
# import modules

import re
import os
import json
import time
import torch
from dotenv import load_dotenv
import pandas as pd
from tqdm import tqdm
from google import genai
from openai import OpenAI
from google.genai import types
from datasets import load_dataset
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer
import requests
import gzip
from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# load API keys
load_dotenv()

# Configure OpenAI
# load_dotenv() has already copied any value found in .env into os.environ, so
# there is nothing to re-assign. The previous
# `os.environ[...] = os.getenv(...)` was a no-op when the key existed and
# raised "TypeError: str expected, not NoneType" when it did not; fail with an
# actionable message instead.
if os.getenv('OPENAI_API_KEY') is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to your .env file."
    )

# Configure Gemini
# NOTE(review): this import rebinds the name `genai`, which the import cell
# already bound via `from google import genai`. From here on `genai` refers to
# `google.generativeai` — confirm that is the SDK the rest of the notebook
# expects, and consider moving this import to the import cell.
import google.generativeai as genai
genai.configure(api_key=os.getenv('GEMINI_API'))

In [4]:
# load dataset

# Seconds to wait for each HTTP request before giving up, so a stalled
# connection cannot hang the kernel indefinitely.
REQUEST_TIMEOUT_S = 30

# 1. CSQA – CommonsenseQA (train/dev/test splits)
csqa = load_dataset("commonsense_qa")

# 2. StrategyQA – yes/no questions requiring multi-hop reasoning
strategyqa = load_dataset("metaeval/strategy-qa")

# 3. Date Understanding
date_url = "https://raw.githubusercontent.com/google/BIG-bench/main/bigbench/benchmark_tasks/date_understanding/task.json"
date_response = requests.get(date_url, timeout=REQUEST_TIMEOUT_S)
# Fail loudly on 4xx/5xx instead of trying to JSON-decode an error page.
date_response.raise_for_status()
date = date_response.json()

# 4. Sports Understanding – sports-related questions
sports_url = "https://raw.githubusercontent.com/google/BIG-bench/main/bigbench/benchmark_tasks/sports_understanding/task.json"
sports_response = requests.get(sports_url, timeout=REQUEST_TIMEOUT_S)
sports_response.raise_for_status()
sports = sports_response.json()

# 5. SayCan – Language→robot-action mapping (Google Robotics, 2022)
saycan = load_dataset("chiayewken/saycan")

In [44]:
# template functions for commonsense reasoning tasks

def csqa_template(example: dict) -> tuple:
    """
    Render a CommonsenseQA record as a (question, answer) pair of strings.
    Args:
        example (dict): Record holding 'question', 'choices' (with parallel
            'label' and 'text' lists) and 'answerKey'.
    Returns:
        tuple: (question followed by its lettered choices, answer sentence);
            choice letters and the answer key are lower-cased.
    """
    choices = example['choices']
    letters = [raw.lower() for raw in choices['label']]
    options = choices['text']

    # Build "(a) first (b) second ..." one option at a time.
    rendered = []
    for letter, option in zip(letters, options):
        rendered.append(f"({letter}) {option}")
    choice_block = " ".join(rendered)

    prompt = f"{example['question']} Answer Choices: {choice_block}"
    gold_letter = example['answerKey'].lower()
    return prompt, f"The answer is ({gold_letter})"

def strategyqa_template(example: dict) -> tuple:
    """
    Render a StrategyQA record as a (question, answer) pair of strings.
    Args:
        example (dict): Record holding 'question' and a truthy/falsy 'answer'.
    Returns:
        tuple: (the question unchanged, "The answer is yes" or "The answer is no")
    """
    prompt = example['question']
    # Any truthy 'answer' maps to "yes"; everything else maps to "no".
    if example['answer']:
        verdict = "yes"
    else:
        verdict = "no"
    return prompt, "The answer is {}".format(verdict)

def date_template(example: dict) -> tuple:
    """
    Converts a date understanding example dict to the question template.
    Args:
        example (dict): Dictionary with 'input' (the question text) and
            'target_scores' (mapping of candidate answer -> score, where the
            correct candidate has score 1).
    Returns:
        tuple: (question string, answer string)
    Raises:
        ValueError: If no candidate in 'target_scores' has score 1. Previously
            this case silently produced the gold label "The answer is None".
    """
    question = example['input']
    # Sentinel distinguishes "no correct candidate" from any real key.
    missing = object()
    # If several candidates are scored 1, the first in mapping order wins.
    correct_answer = next(
        (candidate for candidate, score in example['target_scores'].items() if score == 1),
        missing,
    )
    if correct_answer is missing:
        raise ValueError(
            f"No candidate with score 1 in target_scores for question: {question!r}"
        )
    answer = f"The answer is {correct_answer}"
    return question, answer

def sports_template(example: dict) -> tuple:
    """
    Render a sports understanding record as a (question, answer) pair of strings.
    Args:
        example (dict): Record holding 'input' (the sentence to judge) and
            'target_scores' (a mapping that may contain a 'plausible' score).
    Returns:
        tuple: (plausibility question quoting the sentence, answer sentence);
            the answer is "yes" only when the 'plausible' score equals 1.
    """
    sentence = example["input"]
    prompt = f'Is the following sentence plausible? "{sentence}"'

    # A missing 'plausible' key is treated as a score of 0, i.e. "no".
    plausible_score = example["target_scores"].get("plausible", 0)
    if plausible_score == 1:
        verdict = "yes"
    else:
        verdict = "no"
    return prompt, f"The answer is {verdict}"

def saycan_template(example: dict) -> tuple:
    """
    Render a SayCan record as a (question, answer) pair of strings.
    Args:
        example (dict): Record holding 'instruction' and 'plan'.
    Returns:
        tuple: (the instruction unchanged, "The answer is <plan>")
    """
    instruction = example['instruction']
    plan = example['plan']
    return instruction, "The answer is {}".format(plan)
  

In [47]:
# quick smoke test for templates

# One fixture record per template function, keyed by a short dataset name.
# Each record carries exactly the keys its template reads.
# NOTE(review): despite the name, test_smoke's docstring describes these as
# hand-crafted — confirm whether they were written by hand or sampled from the
# real datasets.
REAL_EXAMPLES = {
    # Read by csqa_template: 'question', 'choices' ('label' + 'text'), 'answerKey'.
    "csqa": {
        "question": "Where would you find a hamburger?",
        "choices": {
            "label": ["A", "B", "C", "D", "E"],
            "text": ["fast food restaurant", "pizza", "kitchen", "grocery store", "farms"]
        },
        "answerKey": "A"
    },
    # Read by strategyqa_template: 'question' and a boolean 'answer'.
    "strategyqa": {
        "question": "Would a person with a high fever typically sweat?",
        "answer": True
    },
    # Read by date_template: 'input' and 'target_scores' (correct candidate scored 1).
    "date": {
        "input": "What day of the week was 2020-02-02?",
        "target_scores": {"Sunday": 1, "Monday": 0, "Tuesday": 0}
    },
    # Read by sports_template: 'input' and 'target_scores' with a 'plausible' score.
    "sports": {
        "input": "The quarterback threw a touchdown pass.",
        "target_scores": {"plausible": 1}
    },
    # Read by saycan_template: 'instruction' and 'plan'.
    "saycan": {
        "instruction": "Bring me a cold drink.",
        "plan": "open fridge, pick drink, close fridge, bring drink"
    }
}

def test_smoke():
    """Run each template over its fixture in REAL_EXAMPLES, print and sanity-check the result."""
    cases = (
        ("csqa", csqa_template),
        ("strategyqa", strategyqa_template),
        ("date", date_template),
        ("sports", sports_template),
        ("saycan", saycan_template),
    )

    for name, render in cases:
        print(f"\n--- {name} ---")
        question, answer = render(REAL_EXAMPLES[name])
        print("Question:", question)
        print("Answer  :", answer)
        # Both halves must be non-blank strings.
        assert isinstance(question, str) and question.strip()
        assert isinstance(answer, str) and answer.strip()

test_smoke()


--- csqa ---
Question: Where would you find a hamburger? Answer Choices: (a) fast food restaurant (b) pizza (c) kitchen (d) grocery store (e) farms
Answer  : The answer is (a)

--- strategyqa ---
Question: Would a person with a high fever typically sweat?
Answer  : The answer is yes

--- date ---
Question: What day of the week was 2020-02-02?
Answer  : The answer is Sunday

--- sports ---
Question: Is the following sentence plausible? "The quarterback threw a touchdown pass."
Answer  : The answer is yes

--- saycan ---
Question: Bring me a cold drink.
Answer  : The answer is open fridge, pick drink, close fridge, bring drink


In [None]:
# Spot-check sports_template on the first entry of the downloaded sports task's 'examples' list.
sports_template(sports['examples'][0])

('Is the following sentence plausible? "Jamal Murray was perfect from the line"',
 'The answer is yes')

In [None]:
# prompt template for CSQA

# Few-shot chain-of-thought exemplars for CommonsenseQA. The `{Question}` slot
# is presumably filled with str.format(Question=...) — confirm against the caller.
# Each question and each answer sits on a single line: the earlier hard wraps
# (left over from copying out of a PDF) put line breaks in the middle of answer
# choices and of "So the answer is ...", which the model would imitate.
# The fox exemplar now concludes (c), the letter of "natural habitat"; it
# previously said (b), contradicting its own reasoning.
CSQA_PROMPT = """

Q: What do people use to absorb extra ink from a fountain pen? Answer Choices: (a) shirt pocket (b) calligrapher’s hand (c) inkwell (d) desk drawer (e) blotter
A: The answer must be an item that can absorb ink. Of the above choices, only blotters are used to absorb ink. So the answer is (e).
Q: What home entertainment equipment requires cable? Answer Choices: (a) radio shack (b) substation (c) television (d) cabinet
A: The answer must require cable. Of the above choices, only television requires cable. So the answer is (c).
Q: The fox walked from the city into the forest, what was it looking for? Answer Choices: (a) pretty flowers (b) hen house (c) natural habitat (d) storybook
A: The answer must be something in the forest. Of the above choices, only natural habitat is in the forest. So the answer is (c).
Q: Sammy wanted to go to where the people were. Where might he go? Answer Choices: (a) populated areas (b) race track (c) desert (d) apartment (e) roadblock
A: The answer must be a place with a lot of people. Of the above choices, only populated areas have a lot of people. So the answer is (a).
Q: Where do you put your grapes just before checking out? Answer Choices: (a) mouth (b) grocery cart (c) super market (d) fruit basket (e) fruit market
A: The answer should be the place where grocery items are placed before checking out. Of the above choices, grocery cart makes the most sense for holding grocery items. So the answer is (b).
Q: Google Maps and other highway and street GPS services have replaced what? Answer Choices: (a) united states (b) mexico (c) countryside (d) atlas
A: The answer must be something that used to do what Google Maps and GPS services do, which is to give directions. Of the above choices, only atlases are used to give directions. So the answer is (d).
Q: Before getting a divorce, what did the wife feel who was doing all the work? Answer Choices: (a) harder (b) anguish (c) bitterness (d) tears (e) sadness
A: The answer should be the feeling of someone getting divorced who was doing all the work. Of the above choices, the closest feeling is bitterness. So the answer is (c).
Q: {Question}
A:

"""

In [None]:
# prompt for StrategyQA

# Few-shot chain-of-thought exemplars for StrategyQA. The `{Question}` slot is
# presumably filled with str.format(Question=...) — confirm against the caller.
# Each question and each answer sits on a single line: the earlier hard wraps
# (left over from copying out of a PDF) split sentences such as
# "So the answer is yes." across two lines, which the model would imitate.
StrategyQA_PROMPT = """

Q: Do hamsters provide food for any animals?
A: Hamsters are prey animals. Prey are food for predators. Thus, hamsters provide food for some animals. So the answer is yes.
Q: Could Brooke Shields succeed at University of Pennsylvania?
A: Brooke Shields went to Princeton University. Princeton University is about as academically rigorous as the University of Pennsylvania. Thus, Brooke Shields could also succeed at the University of Pennsylvania. So the answer is yes.
Q: Yes or no: Hydrogen’s atomic number squared exceeds number of Spice Girls?
A: Hydrogen has an atomic number of 1. 1 squared is 1. There are 5 Spice Girls. Thus, Hydrogen’s atomic number squared is less than 5. So the answer is no.
Q: Yes or no: Is it common to see frost during some college commencements?
A: College commencement ceremonies can happen in December, May, and June. December is in the winter, so there can be frost. Thus, there could be frost at some commencements. So the answer is yes.
Q: Yes or no: Could a llama birth twice during War in Vietnam (1945-46)?
A: The War in Vietnam was 6 months. The gestation period for a llama is 11 months, which is more than 6 months. Thus, a llama could not give birth twice during the War in Vietnam. So the answer is no.
Q: Yes or no: Would a pear sink in water?
A: The density of a pear is about 0.6g/cm3, which is less than water. Objects less dense than water float. Thus, a pear would float. So the answer is no.
Q: {Question}
A:

"""

In [None]:
# prompt for date understanding 

# Few-shot chain-of-thought exemplars for date understanding. The `{Question}`
# slot is presumably filled with str.format(Question=...) — confirm against the caller.
# Each question and each answer sits on a single line: the earlier hard wraps
# (left over from copying out of a PDF) split questions and answers
# mid-sentence, which the model would imitate.
# The last exemplar now ends with a period like every other exemplar, so the
# demonstrated answer format is uniform.
Date_PROMPT = """

Q: 2015 is coming in 36 hours. What is the date one week from today in MM/DD/YYYY?
A: If 2015 is coming in 36 hours, then it is coming in 2 days. 2 days before 01/01/2015 is 12/30/2014, so today is 12/30/2014. So one week from today will be 01/05/2015. So the answer is 01/05/2015.
Q: The first day of 2019 is a Tuesday, and today is the first Monday of 2019. What is the date today in MM/DD/YYYY?
A: If the first day of 2019 was Tuesday, then 01/01/2019 was a Tuesday. Today is the first monday, would be six days later. So today is 01/07/2019. So the answer is 01/07/2019.
Q: The concert was scheduled to be on 06/01/1943, but was delayed by one day to today. What is the date 10 days ago in MM/DD/YYYY?
A: One day after 06/01/1943 is 06/02/1943, so today is 06/02/1943. 10 days before today is 05/23/1943. So the answer is 05/23/1943.
Q: It is 4/19/1969 today. What is the date 24 hours later in MM/DD/YYYY?
A: Today is 04/19/1969. 24 hours later is one day after today, which would be 04/20/1969. So the answer is 04/20/1969.
Q: Jane thought today is 3/11/2002, but today is in fact Mar 12, which is 1 day later. What is the date 24 hours later in MM/DD/YYYY?
A: Today is 03/12/2002. So the date 24 hours later will be 03/13/2002. So the answer is 03/13/2002.
Q: Jane was born on the last day of Feburary in 2001. Today is her 16-year-old birthday. What is the date yesterday in MM/DD/YYYY?
A: The last day of February is the 28th, so Jane was born on 02/28/2001. Today is her 16-year old birthday, so today is 02/28/2017. So yesterday was 02/27/2017. So the answer is 02/27/2017.
Q: {Question}
A:

"""


In [None]:
# prompt for sports understanding

# Few-shot chain-of-thought exemplars for sports understanding. The
# `{Question}` slot is presumably filled with str.format(Question=...) —
# confirm against the caller.
# Changes from the pasted text:
#   * each question/answer sits on a single line (the PDF hard wraps split
#     "So the answer is no." across lines, which the model would imitate);
#   * sentences are wrapped in straight double quotes, matching what
#     sports_template emits for the real questions;
#   * "Jamel Murray" -> "Jamal Murray", so the question agrees with its answer;
#   * article typos fixed ("a American" -> "an American",
#     "an basketball" -> "a basketball").
Sports_PROMPT = """

Q: Is the following sentence plausible? "Kyle Palmieri was called for slashing."
A: Kyle Palmieri is a hockey player. Being called for slashing is part of hockey. So the answer is yes.
Q: Is the following sentence plausible? "Joao Moutinho caught the screen pass in the NFC championship."
A: Joao Moutinho is a soccer player. The NFC championship is part of American football, not soccer. So the answer is no.
Q: Is the following sentence plausible? "Carson Wentz set the pick and roll."
A: Carson Wentz is an American football player. Pick and roll is part of basketball, not football. So the answer is no.
Q: Is the following sentence plausible? "Jonas Valanciunas beat the buzzer."
A: Jonas Valanciunas is a basketball player. Beating the buzzer is part of basketball. So the answer is yes.
Q: Is the following sentence plausible? "Jamal Murray was perfect from the line."
A: Jamal Murray is a basketball player. Being perfect from the line is part of basketball. So the answer is yes.
Q: Is the following sentence plausible? "Sam Darnold passed the puck."
A: Sam Darnold is an American football player. Passing the puck is part of hockey, not American football. So the answer is no.
Q: Is the following sentence plausible? "Draymond Green threw a touchdown."
A: Draymond Green is a basketball player. Throwing a touchdown is part of football, not basketball. So the answer is no.
Q: Is the following sentence plausible? "Malcolm Brogdon banked the shot in."
A: Malcolm Brogdon is a basketball player. Banking the shot in is part of basketball. So the answer is yes.
Q: {Question}
A:

"""

In [None]:
# prompt for saycan 

# Few-shot exemplars for SayCan planning (Human -> Explanation -> Plan). The
# `{Question}` slot is presumably filled with str.format(Question=...) —
# confirm against the caller.
# Changes from the pasted text:
#   * the prompt now ends at "Explanation:" so the model writes its explanation
#     first and then the plan, as in every exemplar; the previous ending listed
#     empty "Explanation:" and "Plan:" headers back to back, which cues the
#     model to skip the explanation;
#   * each Objects/Explanation/Plan entry sits on a single line (the PDF hard
#     wraps split plans mid-step, e.g. "6.\npick(apple)");
#   * the stray blank lines before the final query are removed so every turn is
#     separated the same way.
SayCan_PROMPT = """

Locations = [counter, table, user, trash, bowl].
Objects = [7up, apple, kettle chips, tea, multigrain chips, coke, lime soda, jalapeno chips, rice chips, orange, grapefruit soda, pepsi, redbull, energy bar, sponge, water].
The robot can pick up items with pick(object) and put down items with put(object) as well as find objects or locations with find(). The robot can only understand the explicit locations and objects listed.
Human: I’m hungry, can you bring me some chips.
Explanation: The user is hungry and has asked for chips. There are several types of chips available, I will bring the user the kettle chips.
Plan: 1. find(kettle chips), 2. pick(kettle chips), 3. find(user), 4. put(kettle chips), 5. done().
Human: How would you move the grapefruit drink from the table to the counter?
Explanation: The user has asked me to move the grapefruit drink to the counter.
Plan: 1. find(grapefruit soda), 2. pick(grapefruit soda), 3. find(counter), 4. put(grapefruit soda), 5. done().
Human: How would you bring me some snacks?
Explanation: The user has asked for snacks, I will choose two items and bring them. I will bring jalapeno chips and an apple.
Plan: 1. find(jalapeno chips), 2. pick(jalapeno chips), 3. find(user), 4. put(jalapeno chips), 5. find(apple), 6. pick(apple), 7. find(user), 8. put(apple), 9. done().
Human: How would you bring me something to eat that isn’t a fruit?
Explanation: The user has asked for a food that isn’t a fruit, I will bring an energy bar to them.
Plan: 1. find(energy bar), 2. pick(energy bar), 3. find(user), 4. put(energy bar), 5. done().
Human: How would you put the rice chips in the bowl and then move the tea to the table?
Explanation: The user has asked me to do two tasks, I will do one and then the other.
Plan: 1. find(rice chips), 2. pick(rice chips), 3. find(bowl), 4. put(rice chips), 5. find(tea), 6. pick(tea), 7. find(table), 8. put(tea), 9. done().
Human: How would you throw away a redbull?
Explanation: The user has asked me to throw away the redbull, I will move it to the trash.
Plan: 1. find(redbull), 2. pick(redbull), 3. find(trash), 4. put(redbull), 5. done().
Human: Bring me a drink.
Explanation: The user has asked for a drink and there are many options. I will bring them a water.
Plan: 1. find(water), 2. pick(water), 3. find(user), 4. put(water), 5. done().
Human: {Question}
Explanation:
"""