In [27]:
import pandas as pd
import json
from pathlib import Path
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
from src.extract import get_response_text, extract_results
from src.clients import get_azure_openai_client
%load_ext autoreload
%autoreload 2

# Use the tqdm notebook magic to automatically patch tqdm to use notebook progress bars
%load_ext tqdm.notebook

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The tqdm.notebook module is not an IPython extension.


In [28]:
# Run answer extraction for a single experiment
experiment_id = "train_ollama_llama3.1:8b"

# Both directories live under results/gsm8k and are derived from the experiment id;
# they are handed to extract_results as (input, output) in that order.
output_dir = Path(f"results/gsm8k/{experiment_id}_extracted")
input_dir = Path(f"results/gsm8k/{experiment_id}")

client = get_azure_openai_client()
extract_results(client, input_dir, output_dir)

Extracting zero_shot answers: 100%|██████████| 7473/7473 [00:00<00:00, 73891.15it/s]
Extracting cot answers: 100%|██████████| 7473/7473 [00:00<00:00, 72817.57it/s]
Extracting debate answers: 100%|██████████| 7473/7473 [00:00<00:00, 31373.05it/s]


In [29]:
# Ground truth: GSM8K train split as a DataFrame.
# `gold_answer` is the text that follows the "#### " marker in each reference
# solution, with surrounding whitespace removed.
gsm = load_dataset("gsm8k", "main", split="train")
gsm_df = gsm.to_pandas().assign(
    gold_answer=lambda frame: frame['answer'].str.split("#### ").str[1].str.strip()
)

In [30]:
# Load extracted answers AND raw results for all methods
EXTRACTED_DIR = output_dir
RAW_DIR = input_dir


def _load_results_by_question(results_dir, parse):
    """Read every ``*.json`` file in ``results_dir`` into a dict keyed by question index.

    Args:
        results_dir: Directory containing one JSON file per question. The file
            stem is converted with ``int()`` and used as the question index.
        parse: Callable applied to the decoded JSON payload; its return value
            is what gets stored for that question.

    Returns:
        Dict mapping question index (int) to ``parse(payload)``.

    Raises:
        ValueError: If a file stem cannot be converted to ``int``.
    """
    results = {}
    for json_path in results_dir.glob("*.json"):
        # Explicit UTF-8 so decoding does not depend on the platform's default
        # encoding (model transcripts can contain non-ASCII characters).
        with open(json_path, 'r', encoding='utf-8') as f:
            results[int(json_path.stem)] = parse(json.load(f))
    return results


# Sorted so that column order (and everything derived from it) does not depend
# on the order in which the filesystem happens to list the directories.
methods = sorted(entry.name for entry in EXTRACTED_DIR.iterdir() if entry.is_dir())

# Intermediate dicts get their own names; `extracted_results` / `raw_results`
# are only ever DataFrames.
extracted_by_method = {}
raw_by_method = {}

for method in methods:
    # Extracted answers: one value per question, taken from the 'extracted_answer' key
    extracted_by_method[method] = _load_results_by_question(
        EXTRACTED_DIR / method,
        lambda payload: payload['extracted_answer'],
    )

    # Raw results: response text recovered from the raw payload for this method
    raw_by_method[method] = _load_results_by_question(
        RAW_DIR / method,
        lambda payload: get_response_text(method, payload),
    )

    print(f"Loaded {len(extracted_by_method[method])} results for method: {method}")

# Convert to DataFrames (rows: question index, columns: method)
extracted_results = pd.DataFrame(extracted_by_method)
raw_results = pd.DataFrame(raw_by_method)

Loaded 7473 results for method: zero_shot
Loaded 7473 results for method: cot
Loaded 7473 results for method: debate


In [31]:
# Merge with GSM8K dataset
# Each results frame is suffixed explicitly. merge(suffixes=...) only renames
# columns whose names collide, so relying on it silently leaves columns
# unsuffixed whenever the two frames do not share a method name.
df = (
    gsm_df
    .join(extracted_results.add_suffix('_extracted'), how='left')
    .join(raw_results.add_suffix('_raw'), how='left')
)

# Convert to long format: one row per (question, method)
df_wide = df.reset_index(names=['index'])

df_long = df_wide.melt(
    id_vars=['index', 'question', 'gold_answer', 'answer'],
    value_vars=[col for col in df_wide.columns if col.endswith('_extracted')],
    var_name='method',
    value_name='extracted_answer',
)
# Strip only the trailing suffix (str.replace would also hit '_extracted'
# occurring elsewhere in a method name).
df_long['method'] = df_long['method'].str.removesuffix('_extracted')

# Add raw results: melt them the same way and join on (index, method) so the
# alignment is by key rather than by row position.
raw_long = df_wide.melt(
    id_vars=['index'],
    value_vars=[col for col in df_wide.columns if col.endswith('_raw')],
    var_name='method',
    value_name='raw_result',
)
raw_long['method'] = raw_long['method'].str.removesuffix('_raw')

df_long = df_long.merge(
    raw_long,
    on=['index', 'method'],
    how='left',
    validate='one_to_one',  # fail loudly if a (question, method) pair is duplicated
)


def _canonical_answer(value):
    """Reduce an answer to a comparable form.

    The value is converted to text, stripped, and has ',' and '$' removed.
    If the remaining text parses as a number it is returned as ``float``,
    otherwise the cleaned text is returned.

    Missing values (NaN) become ``float('nan')``, which never compares equal
    to anything, so a missing answer is never counted as correct.
    """
    text = str(value).strip().replace(',', '').replace('$', '')
    try:
        return float(text)
    except ValueError:
        return text


# An answer is correct when it equals the gold answer after canonicalisation.
# Plain string equality would mark formatting-only differences such as
# "1,080" vs "1080" or "4.0" vs "4" as wrong.
df_long['correct'] = pd.Series(
    [
        _canonical_answer(predicted) == _canonical_answer(gold)
        for predicted, gold in zip(df_long['extracted_answer'], df_long['gold_answer'])
    ],
    index=df_long.index,
    dtype=bool,
)

In [32]:
# Accuracy per method: mean of the boolean `correct` flag within each method
df_long['correct'].groupby(df_long['method']).mean()

method
cot          0.892948
debate       0.906062
zero_shot    0.886391
Name: correct, dtype: float64

In [24]:
# Rows of the 'debate' method that were not marked correct, kept for manual inspection
dft = df_long[~df_long['correct'] & (df_long['method'] == 'debate')]

In [26]:
# Show question index, gold answer and raw result for one row of `dft`
idx = 23
print(*dft.iloc[idx][['index', 'gold_answer', 'raw_result']])

266 4 Critic: Johns goes to the gym 3 times a week.  He spends 1 hour each day lifting weight. Additionally, he also spends a third of his weightlifting time warming up and doing cardio each day.  How many hours does he spend at the gym a week?
Solver: Let's break this down step by step:

1. John goes to the gym 3 times a week.
2. He spends 1 hour lifting weights each day, so that's 1 hour x 3 days = 3 hours of weightlifting per week.
3. He also spends a third of his weightlifting time warming up and doing cardio each day. Since he lifts for 1 hour per day, a third of that is:
   1/3 x 1 hour = 0.33 hours (or approximately 20 minutes)
4. Therefore, the total time spent on weightlifting and cardio/warming up per week is:
   3 hours (weightlifting) + 3 days x 0.33 hours (cardio/warming up) ≈ 3 + 1 hour
5. Adding these together, John spends approximately 4 hours at the gym each week.

Am I correct?
Critic: Your calculation has a flaw in step 4. While you correctly calculated the cardio/wa

In [50]:
# Full reference solution text for the question with index label 7
gsm_df.loc[7, 'answer']

'First find how many gigabytes are in 40% of the file: 200 GB * 40% = <<200*40*.01=80>>80 GB\nThen divide that number by the download rate to find the time until Windows restarts: 80 GB / 2 GB/minute = <<80/2=40>>40 minutes\nThen find the time to download the whole file after the restart: 200 GB / 2 GB/minute = <<200/2=100>>100 minutes\nThen add the time to download 40% of the file, to download the whole file, and to wait for Windows to update: 40 minutes + 100 minutes + 20 minutes = <<40+100+20=160>>160 minutes\n#### 160'