In [32]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))
import pandas as pd
import json
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
from src.extract import get_response_text, extract_results
from src.clients import get_azure_openai_client
%load_ext autoreload
%autoreload 2

# Use the tqdm notebook magic to automatically patch tqdm to use notebook progress bars
%load_ext tqdm.notebook

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The tqdm.notebook module is not an IPython extension.


In [33]:
# Extract results
split = "train"
experiment_id = f"{split}_ollama_llama3.1:8b"

# Both directories hang off the same results root: the experiment's own directory
# is the input, and a sibling "<experiment_id>_extracted" directory is the output.
results_root = Path("results/gsm8k")
input_dir = results_root / experiment_id
output_dir = results_root / f"{experiment_id}_extracted"

# NOTE(review): the last argument presumably selects regex-based extraction;
# whether the client is actually used in that mode is not visible here — confirm
# in src.extract.
client = get_azure_openai_client()
extract_results(client, input_dir, output_dir, "regex")

Extracting debate answers: 100%|██████████| 7473/7473 [00:00<00:00, 101342.88it/s]
Extracting cot answers:   0%|          | 0/7473 [00:00<?, ?it/s]

Extracting cot answers: 100%|██████████| 7473/7473 [00:00<00:00, 100455.85it/s]


In [34]:
# Load GSM8K dataset for ground truth answers
gsm = load_dataset("gsm8k", "main", split=split)
gsm_df = gsm.to_pandas()

# The gold answer is whatever follows the "#### " marker in the reference
# solution text, with surrounding whitespace removed.
answer_parts = gsm_df['answer'].str.split("#### ")
gsm_df['gold_answer'] = answer_parts.str.get(1).str.strip()

In [42]:
# Load extracted answers AND raw results for all methods
EXTRACTED_DIR = output_dir
RAW_DIR = input_dir
# One sub-directory per method; `entry` avoids shadowing the builtin `dir`.
methods = [entry.name for entry in EXTRACTED_DIR.iterdir() if entry.is_dir()]


def _debate_answer_field(answers, position, field):
    """Pick one field from one entry of a debate answer list.

    Args:
        answers: Value of the 'debate_answers_list' column for one question: a
            list of [answer, turn_idx] pairs, or NaN/None when no list was stored
            for that question.
        position: Entry to read (0 = first, -1 = last).
        field: 0 for the answer, 1 for the turn index.

    Returns:
        The requested value, or None when there is no list or the list is empty.
    """
    # NaN is truthy, so a plain `if answers` check would let it through and then
    # fail on indexing; require a real, non-empty list instead.
    if not isinstance(answers, list) or not answers:
        return None
    return answers[position][field]


extracted_results = {}
raw_results = {}
debate_answers_list = {}  # Store full list of answers for debate

for method in methods:
    # Load extracted results; each file is named "<question index>.json"
    method_dir = EXTRACTED_DIR / method
    method_results = {}

    for json_file in method_dir.glob("*.json"):
        question_idx = int(json_file.stem)
        with open(json_file, 'r') as f:
            data = json.load(f)
        extracted_answer = data['extracted_answer']

        if method == "debate" and isinstance(extracted_answer, list):
            # For debate, store the full list separately and extract final answer
            debate_answers_list[question_idx] = extracted_answer
            # Get the final answer (first element of the last entry in the list)
            final_answer = extracted_answer[-1][0] if extracted_answer else None
            method_results[question_idx] = final_answer
        else:
            # For the other methods, use the answer directly
            method_results[question_idx] = extracted_answer

    extracted_results[method] = method_results

    # Load raw results and flatten each one to text via the project helper
    raw_method_dir = RAW_DIR / method
    raw_method_results = {}

    for json_file in raw_method_dir.glob("*.json"):
        question_idx = int(json_file.stem)
        with open(json_file, 'r') as f:
            raw_data = json.load(f)
        raw_method_results[question_idx] = get_response_text(method, raw_data)

    raw_results[method] = raw_method_results
    print(f"Loaded {len(method_results)} results for method: {method}")

# Convert to DataFrames (rows: question index, columns: method)
extracted_results = pd.DataFrame(extracted_results)
raw_results = pd.DataFrame(raw_results)

# Both frames share the same method column names, so the suffixes tell them apart.
df_results = pd.merge(extracted_results, raw_results, left_index=True, right_index=True, how='left', suffixes=('_extracted', '_raw'))
# Match on the suffix only, so a method whose name merely contains "_raw" is not picked up.
raw_cols = [col for col in df_results.columns if col.endswith('_raw')]
# Keep only questions that have a raw result for every method.
df_results = df_results.dropna(subset=raw_cols)
# Debate length = number of speaker tags in the flattened debate text.
df_results['debate_length'] = df_results['debate_raw'].apply(lambda x: x.count('Solver:') + x.count('Critic:'))

# Add debate answers list as a separate column for later analysis.
# Questions without a stored list map to NaN, which the helper turns into None.
df_results['debate_answers_list'] = df_results.index.map(debate_answers_list)
df_results['debate_first_answer'] = df_results['debate_answers_list'].apply(lambda x: _debate_answer_field(x, 0, 0))
df_results['debate_first_answer_idx'] = df_results['debate_answers_list'].apply(lambda x: _debate_answer_field(x, 0, 1))
df_results['debate_last_answer'] = df_results['debate_answers_list'].apply(lambda x: _debate_answer_field(x, -1, 0))
df_results['debate_last_answer_idx'] = df_results['debate_answers_list'].apply(lambda x: _debate_answer_field(x, -1, 1))

# Merge with GSM8K dataset (inner join on the question index)
df = gsm_df.merge(df_results, left_index=True, right_index=True)

Loaded 7473 results for method: cot
Loaded 7473 results for method: debate


In [43]:
# Convert to long format including raw results:
# one row per (question, method) instead of one column per method.
id_columns = [
    'index', 'question', 'gold_answer', 'answer', 'debate_length',
    'debate_first_answer', 'debate_answers_list', 'debate_first_answer_idx',
    'debate_last_answer', 'debate_last_answer_idx',
]
extracted_columns = [col for col in df.columns if col.endswith('_extracted')]

df_long = df.reset_index(names=['index']).melt(
    id_vars=id_columns,
    value_vars=extracted_columns,
    var_name='method',
    value_name='extracted_answer',
)

# Clean method names
df_long['method'] = df_long['method'].str.replace('_extracted', '')

# Add raw results: for every method, look up its raw text in the wide frame
# by question index and write it into that method's rows.
for method in methods:
    raw_col = f"{method}_raw"
    if raw_col not in df.columns:
        continue
    is_method = df_long['method'] == method
    question_ids = df_long.loc[is_method, 'index']
    df_long.loc[is_method, 'raw_result'] = df.loc[question_ids, raw_col].values

# An answer counts as correct only when it equals the gold answer exactly.
df_long['correct'] = df_long['extracted_answer'].eq(df_long['gold_answer'])

# compute accuracy for each method
df_long.groupby('method')['correct'].mean()

method
cot       0.865248
debate    0.825907
Name: correct, dtype: float64

Inspection

In [44]:
# Check whether debate helps by comparing the first and last answer
gold_answers = df['gold_answer']

# Flag, per question, whether the first / last debate answer equals the gold answer.
df['debate_first_answer_correct'] = df['debate_first_answer'].eq(gold_answers)
df['debate_last_answer_correct'] = df['debate_last_answer'].eq(gold_answers)

print(f"Debate first answer accuracy: {df['debate_first_answer_correct'].mean()}")
print(f"Debate last answer accuracy: {df['debate_last_answer_correct'].mean()}")

Debate first answer accuracy: 0.8107854944466747
Debate last answer accuracy: 0.8259065970828315


In [45]:
# Debate rows whose final answer is correct although the first answer was not.
fixed_mask = (
    df_long['correct']
    & df_long['method'].eq('debate')
    & df_long['debate_first_answer'].ne(df_long['gold_answer'])
)
dft = df_long[fixed_mask]
dft.head()

Unnamed: 0,index,question,gold_answer,answer,debate_length,debate_first_answer,debate_answers_list,debate_first_answer_idx,debate_last_answer,debate_last_answer_idx,method,extracted_answer,raw_result,correct
7494,21,"Each bird eats 12 beetles per day, each snake ...",1080,First find the total number of snakes eaten: 5...,9,360,"[[360, 1], [1080, 3], [1080, 5], [1080, 7]]",1.0,1080,7.0,debate,1080,"Critic: Each bird eats 12 beetles per day, eac...",True
7516,43,An earthquake caused four buildings to collaps...,60,The second earthquake caused 2 * 4 = <<2*4=8>>...,5,53,"[[53, 1], [60, 3]]",1.0,60,3.0,debate,60,Critic: An earthquake caused four buildings to...,True
7534,61,A bear is preparing to hibernate for the winte...,200,The bear gained 1 / 5 * 1000 = <<1/5*1000=200>...,5,250,"[[250, 1], [200, 3]]",1.0,200,3.0,debate,200,Critic: A bear is preparing to hibernate for t...,True
7540,67,Jesse and Mia are competing in a week long rac...,6,Jesse runs 2 miles in the first three days bec...,5,2,"[[2, 1], [6, 3]]",1.0,6,3.0,debate,6,Critic: Jesse and Mia are competing in a week ...,True
7558,85,Four classmates were comparing their ages base...,5,Jolyn is 2 + 5 = <<2+5=7>>7 months older than ...,7,3,"[[3, 1], [7, 3], [5, 5]]",1.0,5,5.0,debate,5,Critic: Four classmates were comparing their a...,True


In [46]:
# Print one of the filtered debate rows in full; idx is its position within dft.
idx = 1
example = dft.iloc[idx]
print(f"Index: {example['index']}\nGold Answer: {example['gold_answer']}\n\n{example['raw_result']}")

Index: 43
Gold Answer: 60

Critic: An earthquake caused four buildings to collapse. Experts predicted that each following earthquake would have double the number of collapsing buildings as the previous one, since each one would make the foundations less stable. After three more earthquakes, how many buildings had collapsed including those from the first earthquake?
Solver: {"answer": 53} 

Let's break it down step by step:

1. The first earthquake caused 4 buildings to collapse.
2. The second earthquake caused double the number of collapsing buildings as the previous one: 4 x 2 = 8
3. The third earthquake caused double the number of collapsing buildings as the previous one: 8 x 2 = 16
4. The fourth earthquake caused double the number of collapsing buildings as the previous one: 16 x 2 = 32
5. The fifth earthquake caused double the number of collapsing buildings as the previous one: 32 x 2 = 64

Adding up all the collapsed buildings from each earthquake: 
4 + 8 + 16 + 32 + 64 = 124 

Ho

## Write debate data as JSONL file
We write the debate traces to a clean JSONL file, which is later used to generate training data for our model.

In [47]:
# Build one JSONL record per debate question: the question, its gold answer,
# a trace tag, and the conversation trimmed to the turn that first gave the gold answer.
df_long_debate = df_long[df_long['method'] == 'debate']
json_list = []

for _, row in df_long_debate.iterrows():
    gold_answer = row['gold_answer']

    # tag question as: incorrect, already_correct, fixed_by_debate
    if row['extracted_answer'] != gold_answer:
        trace_type = "INCORRECT"
    elif (row['debate_first_answer'] == gold_answer) and (row['debate_first_answer_idx'] == 1):
        trace_type = "ALREADY_CORRECT"  # solver's first answer is correct
    else:
        trace_type = "FIXED_BY_DEBATE"

    # load raw result; the context manager closes the file handle right away
    with open(f'{RAW_DIR}/debate/{row["index"]}.json', 'r') as f:
        raw_result = json.load(f)

    # The column holds a list of (answer, turn_idx) pairs, but may be empty or
    # NaN/None when no answer was extracted for this question.
    answers = row['debate_answers_list']
    if not isinstance(answers, list):
        answers = []

    # need to go through the conversation and trim it to the first time the answer is correct.
    # If no answer matches, the cut-off stays at the last extracted answer's turn.
    # cutoff_idx is reset for every row, so a row with no answers can never
    # inherit the cut-off of the previous row.
    cutoff_idx = None
    for ans, turn_idx in answers:
        cutoff_idx = turn_idx
        if ans == gold_answer:
            break
    if cutoff_idx is not None:
        # assumes raw_result is a sequence of turns indexed by turn_idx — TODO confirm
        raw_result = raw_result[:cutoff_idx + 1]
    # else: no extracted answers at all, so the conversation is kept untrimmed

    d = {
        'question': row['question'],
        'gold_answer': gold_answer,
        'trace_type': trace_type,
        'turns': raw_result
    }
    json_list.append(d)

# Create the target directory if needed so the write does not fail on a fresh checkout.
output_path = Path('generated_data/debate_traces.jsonl')
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w') as f:
    for item in json_list:
        f.write(json.dumps(item) + '\n')