In [2]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))
import pandas as pd
import json
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
from src.extract import get_response_text, extract_results
from src.clients import get_azure_openai_client
%load_ext autoreload
%autoreload 2

# Use the tqdm notebook magic to automatically patch tqdm to use notebook progress bars
%load_ext tqdm.notebook

  from .autonotebook import tqdm as notebook_tqdm


The tqdm.notebook module is not an IPython extension.


In [3]:
# Extract results
# Configure which experiment run to post-process. `split` is reused further down
# when loading the GSM8K dataset, so it must match the split the run was generated on.
split = "train"
experiment_id = f"{split}_ollama_qwen2.5:1.5b"

# Raw model outputs are read from `input_dir`; extracted answers are written to a
# sibling "<experiment_id>_extracted" directory. Both paths are relative to the
# notebook's working directory.
input_dir = Path(f"results/gsm8k/{experiment_id}")
output_dir = Path(f"results/gsm8k/{experiment_id}_extracted")

# NOTE(review): a client is created even though extraction runs in "regex" mode;
# presumably it is only needed for an LLM-based extraction mode — confirm against
# src.extract.extract_results before removing.
client = get_azure_openai_client()
extract_results(client, input_dir, output_dir, "regex")

Extracting cot answers: 100%|██████████| 7124/7124 [00:00<00:00, 77036.69it/s]
Extracting debate answers: 100%|██████████| 7124/7124 [00:00<00:00, 77863.57it/s]


In [4]:
# Ground truth: pull the requested GSM8K split and derive the gold answer,
# i.e. the text following the "#### " marker in each reference solution.
gsm = load_dataset("gsm8k", "main", split=split)
gsm_df = gsm.to_pandas().assign(
    gold_answer=lambda frame: frame['answer'].str.split("#### ").str.get(1).str.strip()
)

In [5]:
# Load extracted answers AND raw results for all methods
EXTRACTED_DIR = output_dir
RAW_DIR = input_dir
# One sub-directory per method; the loop variable deliberately avoids shadowing the builtin `dir`.
methods = [method_path.name for method_path in EXTRACTED_DIR.iterdir() if method_path.is_dir()]


def _answer_field(answers, position, field):
    """Return ``answers[position][field]`` from a debate answer list.

    ``answers`` is expected to be a list of ``[answer, turn_index]`` pairs.
    Returns None when it is empty or not a list at all (e.g. the NaN that
    ``Index.map`` produces for question indices without a stored list), instead
    of raising ``TypeError`` on the truthy NaN.
    """
    if isinstance(answers, list) and answers:
        return answers[position][field]
    return None


extracted_results = {}
raw_results = {}
debate_answers_list = {}  # Store full list of answers for debate

for method in methods:
    # Load extracted results: one JSON file per question, named "<question_idx>.json"
    method_dir = EXTRACTED_DIR / method
    method_results = {}

    for json_file in method_dir.glob("*.json"):
        question_idx = int(json_file.stem)
        with open(json_file, 'r') as f:
            data = json.load(f)
        extracted_answer = data['extracted_answer']

        if method == "debate" and isinstance(extracted_answer, list):
            # For debate, store the full list separately and extract final answer
            debate_answers_list[question_idx] = extracted_answer
            # The final answer is the answer part of the last pair (None if no turn had one)
            method_results[question_idx] = extracted_answer[-1][0] if extracted_answer else None
        else:
            # For zero_shot and cot, use the answer directly
            method_results[question_idx] = extracted_answer

    extracted_results[method] = method_results

    # Load raw results and flatten each one to text via the project helper
    raw_method_dir = RAW_DIR / method
    raw_method_results = {}

    for json_file in raw_method_dir.glob("*.json"):
        question_idx = int(json_file.stem)
        with open(json_file, 'r') as f:
            raw_data = json.load(f)
        raw_method_results[question_idx] = get_response_text(method, raw_data)

    raw_results[method] = raw_method_results
    print(f"Loaded {len(method_results)} results for method: {method}")

# Convert to DataFrames (rows: question index, columns: method)
extracted_results = pd.DataFrame(extracted_results)
raw_results = pd.DataFrame(raw_results)

# Both frames share the same method column names, so the suffixes tell them apart.
df_results = pd.merge(extracted_results, raw_results, left_index=True, right_index=True, how='left', suffixes=('_extracted', '_raw'))
# Match on the suffix (as done for '_extracted' elsewhere) so a method name that merely
# contains "_raw" is not mistaken for a raw column.
raw_cols = [col for col in df_results.columns if col.endswith('_raw')] # ['debate_raw', 'zero_shot_raw', 'cot_raw']
# Keep only questions that have a raw result for every method
df_results = df_results.dropna(subset=raw_cols)
# Number of speaker-tagged turns in the flattened debate transcript
df_results['debate_length'] = df_results['debate_raw'].apply(lambda x: x.count('Solver:') + x.count('Critic:'))

# Add debate answers list as a separate column for later analysis
df_results['debate_answers_list'] = df_results.index.map(debate_answers_list)
for column, position, field in [
    ('debate_first_answer', 0, 0),
    ('debate_first_answer_idx', 0, 1),
    ('debate_last_answer', -1, 0),
    ('debate_last_answer_idx', -1, 1),
]:
    df_results[column] = df_results['debate_answers_list'].apply(_answer_field, args=(position, field))

# Merge with GSM8K dataset (inner join on the question index)
df = gsm_df.merge(df_results, left_index=True, right_index=True)

Loaded 7124 results for method: debate
Loaded 7124 results for method: cot


In [6]:
# Reshape to long format: one row per (question, method), carrying the raw text along.
id_columns = ['index', 'question', 'gold_answer', 'answer', 'debate_length', 'debate_first_answer', 'debate_answers_list', 'debate_first_answer_idx', 'debate_last_answer', 'debate_last_answer_idx']
extracted_columns = [name for name in df.columns if name.endswith('_extracted')]

df_long = df.reset_index(names=['index']).melt(
    id_vars=id_columns,
    value_vars=extracted_columns,
    var_name='method',
    value_name='extracted_answer',
)

# Strip the merge suffix so `method` holds the bare method name
df_long['method'] = df_long['method'].str.replace('_extracted', '')

# Attach each method's raw response text to the rows belonging to that method
for method in methods:
    raw_col = f"{method}_raw"
    if raw_col not in df.columns:
        continue
    is_method = df_long['method'] == method
    df_long.loc[is_method, 'raw_result'] = df.loc[df_long.loc[is_method, 'index'], raw_col].values

# An answer counts as correct only on exact equality with the gold answer
df_long['correct'] = df_long['extracted_answer'] == df_long['gold_answer']

# Accuracy per method
df_long.groupby('method')['correct'].mean()

method
cot       0.196519
debate    0.334784
Name: correct, dtype: float64

## Inspection

In [7]:
# Does debating help? Score the first extracted answer of each debate against
# the last one, using exact equality with the gold answer.
first_answer_hits = df['debate_first_answer'] == df['gold_answer']
df['debate_first_answer_correct'] = first_answer_hits
print(f"Debate first answer accuracy: {first_answer_hits.mean()}")


last_answer_hits = df['debate_last_answer'] == df['gold_answer']
df['debate_last_answer_correct'] = last_answer_hits
print(f"Debate last answer accuracy: {last_answer_hits.mean()}")

Debate first answer accuracy: 0.3020774845592364
Debate last answer accuracy: 0.3347838293093768


In [8]:
# Debate rows whose final answer is correct although the first answer was not
is_debate = df_long['method'] == 'debate'
first_was_wrong = df_long['debate_first_answer'] != df_long['gold_answer']
dft = df_long[df_long['correct'] & is_debate & first_was_wrong]
dft

Unnamed: 0,index,question,gold_answer,answer,debate_length,debate_first_answer,debate_answers_list,debate_first_answer_idx,debate_last_answer,debate_last_answer_idx,method,extracted_answer,raw_result,correct
9,9,Tina makes $18.00 an hour. If she works more ...,990,She works 8 hours a day for $18 per hour so sh...,9,450,"[[450, 1], [234, 3], [787.5, 5], [990, 7]]",1.0,990,7.0,debate,990,Critic: Tina makes $18.00 an hour. If she wor...,True
16,16,The profit from a business transaction is shar...,800,"According to the ratio, for every 5 parts that...",7,2555.56,"[[2555.56, 1], [1514.29, 3], [800, 5]]",1.0,800,5.0,debate,800,Critic: The profit from a business transaction...,True
40,40,A concert ticket costs $40. Mr. Benson bought ...,476,Mr. Benson had a 5% discount for each of the 1...,10,0,"[[0, 1], [0, 7], [476, 9]]",1.0,476,9.0,debate,476,Critic: A concert ticket costs $40. Mr. Benson...,True
50,50,Gerald spends $100 a month on baseball supplie...,5,He needs to save up $400 because 4 x 100 = <<4...,7,15,"[[15, 1], [16, 3], [5, 5]]",1.0,5,5.0,debate,5,Critic: Gerald spends $100 a month on baseball...,True
97,97,"Nancy, the librarian, is shelving books from t...",46,Half of the books on the bottom section of the...,9,47,"[[47, 1], [47, 3], [58, 5], [46, 7]]",1.0,46,7.0,debate,46,"Critic: Nancy, the librarian, is shelving book...",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7074,7273,"Louie sells Dodge, Hyundai, and Kia vehicles, ...",100,Half of the 400 vehicles for sale on the lot a...,9,300,"[[300, 1], [200, 3], [200, 5], [100, 7]]",1.0,100,7.0,debate,100,"Critic: Louie sells Dodge, Hyundai, and Kia ve...",True
7084,7283,A farmer plants his vegetables in neat rows. H...,14,There are 64/8 = <<64/8=8>>8 rows of bean seed...,10,28,"[[28, 1], [28, 3], [98, 5], [98, 7], [14, 9]]",1.0,14,9.0,debate,14,Critic: A farmer plants his vegetables in neat...,True
7085,7284,Talia is playing football with her friends. Th...,16,Talia first drives 5 miles to the park + 3 mil...,7,17,"[[17, 1], [16, 3], [16, 5]]",1.0,16,5.0,debate,16,Critic: Talia is playing football with her fri...,True
7088,7287,Lena has 16 candy bars. She needs 5 more candy...,5,Kevin has (16 + 5)/3 = <<(16+5)/3=7>>7 candy b...,10,14,"[[14, 1], [11, 3], [5, 5], [5, 7], [5, 9]]",1.0,5,9.0,debate,5,Critic: Lena has 16 candy bars. She needs 5 mo...,True


In [9]:
# Print one fixed-by-debate example (positional row of `dft`) with its full transcript
idx = 54
example = dft.iloc[idx]
print(f"Index: {example['index']}\nGold Answer: {example['gold_answer']}\n\n{example['raw_result']}")

Index: 1070
Gold Answer: 42

Critic: Every morning, Jason eats an omelet made using three eggs.  How many eggs will Jason consume in two weeks?
Solver: To determine how many eggs Jason will consume in two weeks, we need to follow these steps:

1. Calculate the number of days in two weeks.
2. Determine how many times Jason uses an egg each day.
3. Multiply the total number of use instances by the number of eggs used per instance.

Let's break this down:

**Step 1: Number of days in two weeks**
- There are 7 days in a week.
- Therefore, two weeks would be \(2 \times 7 = 14\) days.

**Step 2: Eggs used each day**
- Jason uses one egg per omelet he makes.
- He eats one omelet every morning and has two eggs for breakfast.

So, he consumes:
\[ 1 \text{ egg/day} \]

**Step 3: Total usage instances in two weeks**
- Multiply the number of days by the number of eggs used each day:
\[ 14 \text{ days} \times 1 \text{ egg/day} = 14 \text{ use instances} \]

Therefore, Jason will consume:

{"answer"

## Write debate data as JSONL file
This clean JSONL file is later used to generate the training data for our model.

In [10]:
# Export one JSON line per debate question that produced at least one extracted answer.
df_long_debate = df_long[df_long['method'] == 'debate']
json_list = []
removed = 0

for _, row in df_long_debate.iterrows():
    answers = row['debate_answers_list']
    # Skip rows without a usable answer list: an empty list means no answer was found
    # in any turn; a non-list (e.g. NaN) means no list was stored for this question.
    if not isinstance(answers, list) or not answers:
        removed += 1
        continue

    # tag question as: incorrect, already_correct, fixed_by_debate
    if row['extracted_answer'] != row['gold_answer']:
        trace_type = "INCORRECT"
    elif (row['debate_first_answer'] == row['gold_answer']) and (row['debate_first_answer_idx'] == 1):
        trace_type = "ALREADY_CORRECT" # solver's first answer is correct
    else:
        trace_type = "FIXED_BY_DEBATE"

    # load raw result (context manager so the file handle is closed for every row)
    with open(f'{RAW_DIR}/debate/{row["index"]}.json') as raw_file:
        raw_result = json.load(raw_file)

    # Trim the conversation to the first turn whose answer is correct. When no turn is
    # correct (INCORRECT traces) keep everything up to the last turn that had an answer.
    cutoff_turn = next(
        (turn_idx for ans, turn_idx in answers if ans == row['gold_answer']),
        answers[-1][1],
    )
    raw_result = raw_result[:cutoff_turn + 1]

    json_list.append({
        'question': row['question'],
        'gold_answer': row['gold_answer'],
        'trace_type': trace_type,
        'turns': raw_result,
    })

# Make sure the target directory exists so the export also works on a fresh checkout.
traces_path = Path('generated_data/debate_traces.jsonl')
traces_path.parent.mkdir(parents=True, exist_ok=True)
with open(traces_path, 'w') as f:
    for item in json_list:
        f.write(json.dumps(item) + '\n')

print(f"Removed {removed} rows with no answer found in any turn")

Removed 3798 rows with no answer found in any turn
