In [None]:

import random
import os
from dotenv import load_dotenv
# Load variables from a .env file (if one is found) into the process
# environment; the next cell reads OPENAI_KEY from there via os.getenv.
load_dotenv()

In [None]:
import json
from openai import OpenAI

# ── 1) Configure & instantiate OpenAI ───────────────────────────────────────
# Read the API key from the environment; os.getenv returns None when the
# variable is unset.
OPENAI_KEY = os.getenv('OPENAI_KEY')
# NOTE: the client instance is bound to the name `openai`; later cells call
# `openai.chat.completions.create(...)` on this object.
openai = OpenAI(api_key=OPENAI_KEY)

# ── 2) Load your data ────────────────────────────────────────────────────────
# The processing loop below reads the 'task', 'question' and 'id' keys of each
# loaded record.
with open('data/tasks_qas_with_prompts.json', 'r', encoding='utf-8') as f:
    examples = json.load(f)

# Output accumulator: the processing loop appends every original example
# followed by its rewritten variants.
augmented = []

# ── Task-specific prefixes and suffixes ─────────────────────────────────────
# Instruction line placed at the top of every prompt, keyed by task name.
TASK_PREFIXES = dict(
    com2sense="Evaluate if the following Q follows common sense. Answer 'True' or 'False'",
    cause_effect="Evaluate if the following Q follows causality. Answer 'True' or 'False'",
    web_of_lies="Answer 'Yes' or 'No'",
    arithmetic="Evaluate the following expression for arithmetic. Output the numerical result, no comma.",
)

# Output-format instructions shared by every chain-of-thought suffix.
_COT_FORMAT_RULES = " ".join([
    "In answering this question each step should be on a separate line and start with a number and a period, followed by the reasoning.",
    "Finally the answer should be on a new line with the word 'Answer' proceeded by a colon.",
])

# Chain-of-thought suffix used for every task except arithmetic.
BASE_COT_SUFFIX = _COT_FORMAT_RULES + "\nA: Let's think step by step."

# Arithmetic variant: same format rules, different closing cue.
ARITHMETIC_COT_SUFFIX = _COT_FORMAT_RULES + "\nA: Let's think step-by-step in 4 operations."

def get_task_prefix(task):
    """Return the instruction prefix registered for `task`.

    Tasks that have no entry in TASK_PREFIXES receive the "com2sense" prefix.
    """
    fallback = TASK_PREFIXES["com2sense"]
    return TASK_PREFIXES[task] if task in TASK_PREFIXES else fallback

def get_cot_suffix(task):
    """Return the chain-of-thought suffix for `task`.

    Only the "arithmetic" task has its own suffix; every other task shares
    BASE_COT_SUFFIX.
    """
    return ARITHMETIC_COT_SUFFIX if task == "arithmetic" else BASE_COT_SUFFIX

# ── 3) Process each example ─────────────────────────────────────────────────
# Instructions sent to the model ahead of the question text. Adjacent string
# literals are joined with no separator, so each sentence carries its own
# trailing space to keep the sentences from running together.
SYNONYM_INSTRUCTION = (
    "Replace important words/phrases in this text by synonyms. Do not swap for synonyms that will change the logical equivalence of the sentence. "
    "Do not do anything else:\n"
)
STRUCTURE_INSTRUCTION = (
    "Rewrite the sentence (s) with a different sentence structure. "
    "Do not change the logical ordering and reasoning behind the text, you are ONLY changing how the sentence sounds. If the sentence doesn't make sense, then don't make it make sense, just change how it sounds (cause and effect should stay the same). "
    "The meaning of the sentence should not change. "
    "Do not do anything else:\n"
)


def _rewrite_question(instruction, question):
    """Ask gpt-4o to rewrite `question` as described by `instruction`.

    Returns the reply text with surrounding whitespace stripped.
    Raises ValueError when the reply carries no text, so an empty question is
    never written into the dataset.
    """
    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": instruction + question}],
        temperature=0.2,
        max_tokens=1000,
    )
    content = response.choices[0].message.content
    if not content or not content.strip():
        raise ValueError(f"Model returned no text when rewriting question: {question!r}")
    return content.strip()


def _build_prompts(prefix, question, cot_suffix):
    """Return the (direct, chain-of-thought) prompt pair for one question."""
    return f"{prefix}\nQ: {question}\nA:", f"{prefix}\nQ: {question}\n{cot_suffix}"


def _make_variant(ex, id_suffix, question, prefix, cot_suffix):
    """Copy `ex` under a suffixed id with `question` and matching prompts."""
    variant = ex.copy()
    variant['id'] = ex['id'] + id_suffix
    variant['question'] = question
    variant['prompt_direct'], variant['prompt_cot'] = _build_prompts(prefix, question, cot_suffix)
    return variant


# enumerate() supplies the progress counter without shadowing the builtin iter()
for idx, ex in enumerate(examples):
    if idx % 100 == 0:
        print(f"Processing example {idx} of {len(examples)}")

    # 3a) Get task-specific prefix and suffix
    task = ex['task']
    prefix = get_task_prefix(task)
    cot_suffix = get_cot_suffix(task)
    q = ex['question']

    # 4a) Rewrite the ORIGINAL example's prompts with task-specific content
    ex['prompt_direct'], ex['prompt_cot'] = _build_prompts(prefix, q, cot_suffix)
    augmented.append(ex)

    # 3b) Synonym twin: id gets an 'a' suffix
    new_q = _rewrite_question(SYNONYM_INSTRUCTION, q)
    augmented.append(_make_variant(ex, 'a', new_q, prefix, cot_suffix))

    # 3c) Sentence-structure twin: id gets a 'b' suffix
    new_q_struct = _rewrite_question(STRUCTURE_INSTRUCTION, q)
    augmented.append(_make_variant(ex, 'b', new_q_struct, prefix, cot_suffix))

    print(f"\nTask: {task}\nQ: {ex['question']}\nSynonym: {new_q}\nStructure: {new_q_struct}")

# ── 4) Save everything back out ──────────────────────────────────────────────
with open('data/big_bench_augmented.json', 'w', encoding='utf-8') as out_file:
    json.dump(augmented, out_file, indent=2, ensure_ascii=False)

print(f"\nGenerated {len(augmented)} examples (original + augmented).")

# Tally records per task, reported in a fixed task order.
task_names = ('com2sense', 'cause_effect', 'web_of_lies', 'arithmetic')
per_task = {
    name: sum(1 for record in augmented if record['task'] == name)
    for name in task_names
}
print("Task distribution: " + ", ".join(f"{per_task[name]} {name}" for name in task_names))


Processing example 0 of 400

Task: com2sense
Q: Keith is 5 feet tall so he is less likely to become an amateur basketball player than a horse jockey.
Synonym: Keith is 5 feet tall so he is less likely to become a novice basketball player than a horse rider.
Structure: Standing at 5 feet tall, Keith is more likely to pursue a career as a horse jockey than as an amateur basketball player.

Task: com2sense
Q: Keith is 5 feet tall so he is less likely to become an amateur basketball player than a horse jockey.
Synonym: Keith is 5 feet tall so he is less likely to become a novice basketball player than a horse rider.
Structure: Standing at 5 feet tall, Keith is more likely to pursue a career as a horse jockey than as an amateur basketball player.

Task: com2sense
Q: Sally needs to be at work in 5 minutes while Mark has 10 to spare, so it makes more sense for Sally to rush and forget doing the dish.
Synonym: Sally needs to be at her job in 5 minutes while Mark has 10 to spare, so it makes mo

In [4]:
with open('data/big_bench_augmented.json', 'r', encoding='utf-8') as f:
    examples = json.load(f)

# Records are read in consecutive groups of three: the original example, its
# synonym rewrite, then its restructured rewrite. Fail before spending any API
# calls if the file cannot be split that way.
GROUP_SIZE = 3
SUMMARY_EVERY = 40  # print running totals every 40 groups
if len(examples) % GROUP_SIZE != 0:
    raise ValueError(
        f"Expected a multiple of {GROUP_SIZE} records (original, synonym, restructured); got {len(examples)}"
    )


def _new_task_counters():
    """Return a zeroed counter dict for one task type."""
    return {'not_eq_syn': 0, 'eq_syn': 0, 'not_eq_res': 0, 'eq_res': 0, 'err': 0}


def _ask_equivalence(prompt):
    """Send `prompt` to gpt-4o at temperature 0 and return the stripped reply.

    Returns an empty string when the reply has no text content, so the caller
    records it as a parse error instead of failing on `None.strip()`.
    """
    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
        max_tokens=1000,
    )
    return (response.choices[0].message.content or "").strip()


# Track statistics by task type
task_stats = {
    task: _new_task_counters()
    for task in ['com2sense', 'cause_effect', 'web_of_lies', 'arithmetic']
}

total_not_eq_syn = 0
total_eq_syn = 0
total_not_eq_res = 0
total_eq_res = 0
# Parse errors are counted per check so the final summary can report each
# check's own errors; total_err remains the combined count.
total_err_syn = 0
total_err_res = 0
total_err = 0

for i in range(0, len(examples), GROUP_SIZE):
    if (i // GROUP_SIZE) % SUMMARY_EVERY == 0:
        print(f"\n\n---------- Iteration {i // GROUP_SIZE} Summary --------")
        print(f"Total: not equal syn {total_not_eq_syn}, eq syn {total_eq_syn}")
        print(f"Total: not equal res {total_not_eq_res}, eq res {total_eq_res}")
        print(f"Total: parse errors {total_err}")

    task = examples[i]['task']
    q = examples[i]['question']
    q_syn = examples[i + 1]['question']
    q_res = examples[i + 2]['question']
    # Tasks outside the predefined list get their own counters on first sight
    stats = task_stats.setdefault(task, _new_task_counters())

    # verifying for synonyms
    # (each literal ends with a space: adjacent literals join with no separator)
    prompt = (f"Are these two logically equivalent, answer 1 for yes, answer 0 for no. "
              f"Only answer 0 if you are certain they are unequal in logical reasoning. The phrasing and syntax will always be different between the sentences.\n{q}\n{q_syn}")
    resp = _ask_equivalence(prompt)

    if resp == "0":
        stats['not_eq_syn'] += 1
        total_not_eq_syn += 1
        print(f"\n[{task}] Not Equivalent for Original: {q}\n Synonyms: {q_syn}")
    elif resp == "1":
        stats['eq_syn'] += 1
        total_eq_syn += 1
    else:
        stats['err'] += 1
        total_err_syn += 1
        total_err += 1
        print(f"\n[{task}] MAJOR ISSUE WITH VERIFICATION PARSING for \nquestion: {q}\n synonym: {q_syn}")

    # ------ NOW DEAL WITH RESTRUCTURING LOGICAL EQUIVALENCE
    res_prompt = f"are these two logically equivalent, answer 1 for yes, answer 0 for no.\n{q}\n{q_res}"
    resp = _ask_equivalence(res_prompt)

    if resp == "0":
        stats['not_eq_res'] += 1
        total_not_eq_res += 1
        print(f"\n[{task}] Not Equivalent for Original: {q}\n Restructured: {q_res}")
    elif resp == "1":
        stats['eq_res'] += 1
        total_eq_res += 1
    else:
        stats['err'] += 1
        total_err_res += 1
        total_err += 1
        print(f"\n[{task}] MAJOR ISSUE WITH VERIFICATION PARSING for \nquestion: {q}\n restructured: {q_res}")

print("\n\n=== FINAL SUMMARY ===")
# Each line reports the parse errors of its own check only
print(f"Total: not equal syn {total_not_eq_syn}, eq syn {total_eq_syn}, errors {total_err_syn}")
print(f"Total: not equal res {total_not_eq_res}, eq res {total_eq_res}, errors {total_err_res}")

print("\n=== BY TASK TYPE ===")
for task, stats in task_stats.items():
    print(f"{task}: syn_neq={stats['not_eq_syn']}, syn_eq={stats['eq_syn']}, res_neq={stats['not_eq_res']}, res_eq={stats['eq_res']}, errors={stats['err']}")







---------- Iteration 0 Summary --------
Total: not equal syn 0, eq syn 0
Total: not equal res 0, eq res 0
Total: parse errors 0

[com2sense] Not Equivalent for Original: 1 day after it is born, a cat is physically capable of reproducing.
 Restructured: A cat is physically capable of reproducing just one day after it is born.

[com2sense] Not Equivalent for Original: 1 day after it is born, a cat is physically capable of reproducing.
 Restructured: A cat is physically capable of reproducing just one day after it is born.

[com2sense] Not Equivalent for Original: Tim was about to play basketball so he needed to stretch for 10 minutes.
 Synonyms: Tim was about to play basketball so he needed to warm up for 10 minutes.

[com2sense] Not Equivalent for Original: Tim was about to play basketball so he needed to stretch for 10 minutes.
 Synonyms: Tim was about to play basketball so he needed to warm up for 10 minutes.


---------- Iteration 40 Summary --------
Total: not equal syn 1, eq syn 

In [5]:
def sample_examples(json_path: str, k: int = 1, seed: int = None):
    """
    Load a list of examples from `json_path` and return k random samples.

    Args:
        json_path: Path to a UTF-8 JSON file whose top level is a list.
        k: Number of items to draw without replacement.
        seed: If provided, the draw is reproducible: the same seed always
            yields the same items in the same order.

    Returns:
        A list of `k` items drawn from the loaded data.

    Raises:
        ValueError: If `k` is negative or larger than the number of items
            (raised by the underlying `sample` call).
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if seed is None:
        return random.sample(data, k)

    # A private generator seeded with `seed` produces the same picks as seeding
    # the module-level generator, without resetting the global random state
    # that other code in the notebook may depend on.
    return random.Random(seed).sample(data, k)

# e.g. grab 20 examples reproducibly (fixed seed). The draw is made over the
# whole file, so it is not stratified by task type.
samples = sample_examples('data/big_bench_augmented.json', k=20, seed=123)
# Print each sampled record together with both of its stored prompts
for ex in samples:
    print(f"\nTask: {ex['task']} | ID: {ex['id']}")
    print(f"Question: {ex['question']}")
    print(f"Answer: {ex['answer']}")
    print(f"Direct Prompt: {ex['prompt_direct']}")
    print(f"COT Prompt: {ex['prompt_cot']}")
    print("-" * 80)


Task: com2sense | ID: com2sense_35b
Question: I would use my handkerchief to clean up the water if I spill it on the floor.
Answer: True
Direct Prompt: Evaluate if the following Q follows common sense. Answer 'True' or 'False'
Q: I would use my handkerchief to clean up the water if I spill it on the floor.
A:
COT Prompt: Evaluate if the following Q follows common sense. Answer 'True' or 'False'
Q: I would use my handkerchief to clean up the water if I spill it on the floor.
In answering this question each step should be on a separate line and start with a number and a period, followed by the reasoning. Finally the answer should be on a new line with the word 'Answer' proceeded by a colon.
A: Let's think step by step.
--------------------------------------------------------------------------------

Task: cause_effect | ID: cause_effect_92b
Question: Because she didn't wear her sweater, the girl caught a cold.
Answer: True
Direct Prompt: Evaluate if the following Q follows causality. An

In [6]:
# Preview the stored prompts for one example of each task type
print("=== TASK-SPECIFIC PROMPT EXAMPLES ===")
with open('data/big_bench_augmented.json', 'r', encoding='utf-8') as fh:
    examples = json.load(fh)

# Keep the first example encountered per task; stop once four tasks are seen
task_examples = {}
for record in examples:
    task_examples.setdefault(record['task'], record)
    if len(task_examples) == 4:
        break

for task_name, sample in task_examples.items():
    print(f"\n{task_name.upper()} TASK:")
    print(f"Question: {sample['question']}")
    print(f"Answer: {sample['answer']}")
    print(f"Direct Prompt:\n{sample['prompt_direct']}")
    print(f"\nCOT Prompt:\n{sample['prompt_cot']}")
    print("=" * 80)

=== TASK-SPECIFIC PROMPT EXAMPLES ===

COM2SENSE TASK:
Question: Keith is 5 feet tall so he is less likely to become an amateur basketball player than a horse jockey.
Answer: True
Direct Prompt:
Evaluate if the following Q follows common sense. Answer 'True' or 'False'
Q: Keith is 5 feet tall so he is less likely to become an amateur basketball player than a horse jockey.
A:

COT Prompt:
Evaluate if the following Q follows common sense. Answer 'True' or 'False'
Q: Keith is 5 feet tall so he is less likely to become an amateur basketball player than a horse jockey.
In answering this question each step should be on a separate line and start with a number and a period, followed by the reasoning. Finally the answer should be on a new line with the word 'Answer' proceeded by a colon.
A: Let's think step by step.

CAUSE_EFFECT TASK:
Question: The man climbed in through the window because the door was locked.
Answer: True
Direct Prompt:
Evaluate if the following Q follows causality. Answer 'T