In [51]:
import pandas as pd
import json
from pathlib import Path
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
from extract import get_response_text

In [57]:
# Ground truth: GSM8K test split, with the final answer pulled out of each
# reference solution (the text that follows the "#### " marker).
gsm = load_dataset("gsm8k", "main", split="test")
gsm_df = gsm.to_pandas().assign(
    gold_answer=lambda frame: frame['answer'].str.split("#### ").str.get(1).str.strip()
)

In [58]:
# Load extracted answers AND raw results for all methods.
#
# Layout read by this cell:
#   results/gsm8k-extracted/<method>/<question_idx>.json  -> uses the 'extracted_answer' key
#   results/gsm8k/<method>/<question_idx>.json            -> passed to get_response_text(method, payload)
EXTRACTED_DIR = Path("results/gsm8k-extracted")
RAW_DIR = Path("results/gsm8k")

# Path.iterdir() yields entries in arbitrary order; sort so the method order
# (and therefore the DataFrame column order) is the same on every run.
methods = sorted(entry.name for entry in EXTRACTED_DIR.iterdir() if entry.is_dir())


def _load_per_question(directory, parse):
    """Return {question_idx: parse(payload)} for every `<question_idx>.json` in `directory`.

    The file stem must be an integer (int() raises ValueError otherwise).
    Files are decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.
    """
    loaded = {}
    for json_path in directory.glob("*.json"):
        with open(json_path, 'r', encoding='utf-8') as handle:
            loaded[int(json_path.stem)] = parse(json.load(handle))
    return loaded


extracted_by_method = {}
raw_by_method = {}

for method in methods:
    # Extracted answers: one value per question
    extracted_by_method[method] = _load_per_question(
        EXTRACTED_DIR / method,
        lambda payload: payload['extracted_answer'],
    )

    # Raw results: whatever get_response_text returns for this method's payload.
    # `method=method` binds the current loop value into the lambda.
    raw_by_method[method] = _load_per_question(
        RAW_DIR / method,
        lambda payload, method=method: get_response_text(method, payload),
    )

    print(f"Loaded {len(extracted_by_method[method])} results for method: {method}")

# Convert to DataFrames: rows are question indices, one column per method
extracted_results = pd.DataFrame(extracted_by_method)
raw_results = pd.DataFrame(raw_by_method)

Loaded 100 results for method: cot
Loaded 100 results for method: debate
Loaded 100 results for method: debate-both35
Loaded 100 results for method: zero_shot


In [59]:
# Merge with GSM8K dataset.
# Both result frames use the method names as columns, so the second merge
# disambiguates them as '<method>_extracted' / '<method>_raw'.
df = gsm_df.merge(extracted_results, left_index=True, right_index=True)
df = df.merge(raw_results, left_index=True, right_index=True, suffixes=('_extracted', '_raw'))

df_indexed = df.reset_index(names=['index'])

# Long format: one row per (question, method) with the extracted answer
df_long = df_indexed.melt(
    id_vars=['index', 'question', 'gold_answer', 'answer'],
    value_vars=[col for col in df.columns if col.endswith('_extracted')],
    var_name='method',
    value_name='extracted_answer'
)

# Clean method names: drop only the trailing suffix, so a method whose own
# name happens to contain '_extracted' is left intact.
df_long['method'] = df_long['method'].str.removesuffix('_extracted')

# Add raw results: reshape the '<method>_raw' columns the same way and join
# on (index, method), which aligns rows by key rather than by position.
raw_cols = [f"{method}_raw" for method in methods if f"{method}_raw" in df.columns]
if raw_cols:
    raw_long = df_indexed.melt(
        id_vars=['index'],
        value_vars=raw_cols,
        var_name='method',
        value_name='raw_result'
    )
    raw_long['method'] = raw_long['method'].str.removesuffix('_raw')
    df_long = df_long.merge(raw_long, on=['index', 'method'], how='left', validate='one_to_one')

# Exact equality against the gold answer; a missing extracted answer (NaN)
# compares unequal and is therefore scored as incorrect.
# NOTE(review): no numeric normalisation is applied here, so values such as
# '160.0' vs '160' would be scored wrong - confirm the extractor emits
# answers in the same canonical form as gold_answer.
df_long['correct'] = df_long['extracted_answer'] == df_long['gold_answer']

In [60]:
# Accuracy per method = mean of the boolean 'correct' flag within each method
df_long['correct'].groupby(df_long['method']).mean()

method
cot              0.85
debate           0.92
debate-both35    0.80
zero_shot        0.87
Name: correct, dtype: float64

In [41]:
# Rows of the 'debate' method that were scored incorrect, kept for manual inspection
is_debate = df_long['method'] == 'debate'
dft = df_long[~df_long['correct'] & is_debate]

In [48]:
# Show one failing debate example: question index, gold answer, raw response
idx = 1
failure = dft.iloc[idx]
print(failure['index'], failure['gold_answer'], failure['raw_result'])

7 160 Agent 1: Carla is downloading a 200 GB file. Normally she can download 2 GB/minute, but 40% of the way through the download, Windows forces a restart to install updates, which takes 20 minutes. Then Carla has to restart the download from the beginning. How load does it take to download the file?
Agent 2: To solve this problem, we can break it down into two parts: the time it takes to download 40% of the file and the time it takes to download the remaining 60% of the file.

First, we find the time it takes to download 40% of the file:
40% of 200 GB = 0.4 * 200 = 80 GB
Time to download 80 GB at 2 GB/minute = 80 / 2 = 40 minutes

Then, we add the 20 minutes it takes for the Windows update:
40 minutes + 20 minutes = 60 minutes

So, it takes Carla 60 minutes to download 40% of the file.

Now, Carla has to restart the download from the beginning, so she needs to download the remaining 60% of the file:
60% of 200 GB = 0.6 * 200 = 120 GB
Time to download 120 GB at 2 GB/minute = 120 / 2 =

In [50]:
# Full reference solution for question 7 (the example inspected in the previous cell's output)
gsm_df.at[7, 'answer']

'First find how many gigabytes are in 40% of the file: 200 GB * 40% = <<200*40*.01=80>>80 GB\nThen divide that number by the download rate to find the time until Windows restarts: 80 GB / 2 GB/minute = <<80/2=40>>40 minutes\nThen find the time to download the whole file after the restart: 200 GB / 2 GB/minute = <<200/2=100>>100 minutes\nThen add the time to download 40% of the file, to download the whole file, and to wait for Windows to update: 40 minutes + 100 minutes + 20 minutes = <<40+100+20=160>>160 minutes\n#### 160'