# Set up

## Import libraries

In [1]:
import argparse
import glob
import importlib
import json
import os
import statistics
import sys
import time
from concurrent.futures import ProcessPoolExecutor
from functools import partial

import numpy as np
import pandas as pd
from openai import OpenAI
from pydantic import ValidationError
from sklearn.metrics import (accuracy_score, f1_score,
                             precision_score, recall_score)
from tqdm import tqdm

# Ensure project root is on sys.path (required for papermill fresh kernels)
current_dir = os.getcwd()
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)

import llm_calls.deepseek_evaluator as etb
from llm_calls.deepseek_evaluator import EntailmentEvaluator
from llm_calls.prompts import *

## Declare paths 

# Execute Entailment API calls 

In [2]:
# Check API key and environment
print("Validating environment setup...")

# Check for OpenAI/DeepSeek API key
api_key = os.environ.get('OPENAI_API_KEY') or os.environ.get('DEEPSEEK_API_KEY')
if api_key:
    print(f"✓ API key found (length: {len(api_key)} chars)")
else:
    print("⚠ WARNING: No API key found in environment variables")
    print("  Looking for OPENAI_API_KEY or DEEPSEEK_API_KEY")
    print("  LLM API calls will likely fail without proper credentials")

# Check if deepseek_evaluator module is importable
try:
    import llm_calls.deepseek_evaluator as etb
    print(f"✓ deepseek_evaluator module loaded from: {etb.__file__}")
except Exception as e:
    print(f"✗ ERROR: Cannot import deepseek_evaluator: {e}")
    raise

# Check OpenAI library
try:
    from openai import OpenAI
    print(f"✓ OpenAI library available")
except Exception as e:
    print(f"⚠ WARNING: OpenAI library issue: {e}")

print("\nEnvironment validation complete.\n")


Validating environment setup...
✓ API key found (length: 35 chars)
✓ deepseek_evaluator module loaded from: c:\Users\aesteva\Dropbox\Culture\3_data_processing\10_Argumentation\Entailment\CODE\free_entailment_algorithm\fea_project\llm_calls\deepseek_evaluator.py
✓ OpenAI library available

Environment validation complete.



In [3]:
llm_model = "deepseek-reasoner"
input_file = "fea_iterations/loop_data/df_to_llm_iter_0.csv"
args_file = "ArgLevel_ClauseIds_df.xlsx"
prompt = "test_prompt_tot_json2"
output = "labeled_pairs/Results_DS_BtoS_iteration_1.csv"
previous_input_file = "Results_DS_BtoS_iteration_0.csv"

In [4]:
# Parameters
llm_model = "deepseek-reasoner"
input_file = "fea_iterations\\loop_data/df_to_llm_iter_0.csv"
args_file = "ArgLevel_ClauseIds_df.xlsx"
prompt = "test_prompt_tot_json2"
output = "labeled_pairs/Results_DS_BtoS_iteration_1_one_way"
previous_input_file = ""


In [5]:
# Validate input files exist
print(f"\n{'='*60}")
print("INPUT FILE VALIDATION")
print(f"{'='*60}")

print(f"Checking input_file: {input_file}")
if os.path.exists(input_file):
    df_input = pd.read_csv(input_file)
    print(f"✓ Input file found: {len(df_input)} rows")
    print(f"  Columns: {list(df_input.columns)}")
else:
    raise FileNotFoundError(f"Input file not found: {input_file}")

print(f"\nChecking args_file: {args_file}")
if os.path.exists(args_file):
    if args_file.endswith('.csv'):
        df_args = pd.read_csv(args_file)
    elif args_file.endswith('.xlsx'):
        df_args = pd.read_excel(args_file)
    else:
        raise ValueError(f"Unsupported file format: {args_file}")
    print(f"✓ Args file found: {len(df_args)} rows")
else:
    raise FileNotFoundError(f"Args file not found: {args_file}")

if previous_input_file:
    print(f"\nChecking previous_input_file: {previous_input_file}")
    if os.path.exists(previous_input_file):
        df_prev = pd.read_csv(previous_input_file)
        print(f"✓ Previous input file found: {len(df_prev)} rows")
    else:
        print(f"⚠ WARNING: Previous input file not found: {previous_input_file}")
        print(f"  Continuing without merging previous results")

# Create output directory if needed
output_dir = os.path.dirname(output)
if output_dir and not os.path.exists(output_dir):
    os.makedirs(output_dir, exist_ok=True)
    print(f"\n✓ Created output directory: {output_dir}")

print(f"{'='*60}\n")



INPUT FILE VALIDATION
Checking input_file: fea_iterations\loop_data/df_to_llm_iter_0.csv
✓ Input file found: 1000 rows
  Columns: ['sentence_id_2', 'sentence_id_1', 'sentence_text_2', 'argument_id_2', 'sentence_text_1', 'argument_id_1', 'score']

Checking args_file: ArgLevel_ClauseIds_df.xlsx


✓ Args file found: 4015 rows



In [6]:
# Strip .csv extension since evaluator adds it automatically
output_arg = output.replace('.csv', '') if output.endswith('.csv') else output

sys.argv = [
    "deepseek_evaluator.py",
    "--model", llm_model,
    "--file", input_file,
    "--external", args_file,
    "--prompt", prompt,
    "--output", output_arg
]

print(f"\n{'='*60}")
print("EXECUTING LLM API CALLS")
print(f"{'='*60}")
print(f"Model: {llm_model}")
print(f"Input: {input_file}")
print(f"Output (arg): {output_arg}")
print(f"Expected file: {output}")
print(f"{'='*60}\n")

try:
    etb.main()
    print(f"\n✓ LLM API calls completed")
except Exception as e:
    print(f"\n✗ ERROR during LLM API execution: {e}")
    print(f"Error type: {type(e).__name__}")
    import traceback
    traceback.print_exc()
    raise

# Check for output file - evaluator adds .csv extension
if not os.path.exists(output):
    # Check if double extension was created
    double_ext = output + '.csv' if not output.endswith('.csv') else output.replace('.csv', '') + '.csv.csv'
    if os.path.exists(double_ext):
        print(f"⚠ WARNING: Output created with double extension: {double_ext}")
        print(f"  Renaming to: {output}")
        os.rename(double_ext, output)
    else:
        # List files in output directory for debugging
        output_dir = os.path.dirname(output) or '.'
        print(f"⚠ Files in {output_dir}:")
        for f in os.listdir(output_dir):
            print(f"  - {f}")
        raise FileNotFoundError(f"Output file was not created: {output}")

print(f"\n{'='*60}")
print("VALIDATING OUTPUT")
print(f"{'='*60}")

df_new = pd.read_csv(output)
print(f"✓ Output file loaded: {len(df_new)} rows")

# Check for required columns
required_cols = ['sentence_id_1', 'sentence_id_2']
missing_cols = [col for col in required_cols if col not in df_new.columns]
if missing_cols:
    print(f"⚠ WARNING: Missing expected columns: {missing_cols}")
    print(f"  Available columns: {list(df_new.columns)}")

# Check for errors in the data
if 'ERROR' in df_new.columns:
    error_count = df_new['ERROR'].notna().sum()
    if error_count > 0:
        print(f"⚠ WARNING: {error_count}/{len(df_new)} rows contain errors")
        print(f"  First error: {df_new[df_new['ERROR'].notna()]['ERROR'].iloc[0]}")

# Check for empty/null critical fields
null_counts = df_new[required_cols].isnull().sum()
if null_counts.any():
    print(f"⚠ WARNING: Null values found in critical columns:")
    for col, count in null_counts[null_counts > 0].items():
        print(f"  {col}: {count}/{len(df_new)} rows")

print(f"✓ Output validation complete\n")

# Merge with previous results
if previous_input_file and os.path.exists(previous_input_file):
    print(f"{'='*60}")
    print("MERGING WITH PREVIOUS RESULTS")
    print(f"{'='*60}")
    
    df_previous = pd.read_csv(previous_input_file)
    print(f"✓ Loaded previous results: {len(df_previous)} rows")
    
    df_merged = pd.concat([df_previous, df_new], ignore_index=True)
    print(f"✓ Merged: {len(df_previous)} previous + {len(df_new)} new = {len(df_merged)} total")
    
    # Validate merged data
    if len(df_merged) != len(df_previous) + len(df_new):
        print(f"⚠ WARNING: Merged row count doesn't match expected sum")
    
    df_merged.to_csv(output, index=False)
    print(f"✓ Saved merged results to {output}")
    print(f"{'='*60}\n")
else:
    print(f"\n✓ No previous input file to merge")
    print(f"✓ Output saved to {output}\n")



EXECUTING LLM API CALLS
Model: deepseek-reasoner
Input: fea_iterations\loop_data/df_to_llm_iter_0.csv
Output (arg): labeled_pairs/Results_DS_BtoS_iteration_1_one_way
Expected file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way

Loading data from fea_iterations\loop_data/df_to_llm_iter_0.csv...
Loading data from ArgLevel_ClauseIds_df.xlsx...


Loaded 1000 sentence pairs


Using model: deepseek-reasoner
Using prompt type: test_prompt_tot_json2
Running batch evaluation...


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 5/1000 [00:00<00:21, 46.35it/s]

  1%|          | 10/1000 [00:00<00:21, 45.70it/s]

  2%|▏         | 15/1000 [00:00<00:23, 41.19it/s]

  2%|▏         | 20/1000 [00:00<00:25, 37.89it/s]

  2%|▏         | 24/1000 [00:00<00:25, 37.82it/s]

  3%|▎         | 28/1000 [00:00<00:30, 31.68it/s]

  3%|▎         | 32/1000 [00:00<00:28, 33.46it/s]

  4%|▎         | 36/1000 [00:01<00:39, 24.33it/s]

  4%|▍         | 39/1000 [00:01<00:40, 23.58it/s]

  4%|▍         | 42/1000 [00:01<00:40, 23.48it/s]

  4%|▍         | 45/1000 [00:01<00:41, 22.98it/s]

  5%|▍         | 48/1000 [00:01<00:42, 22.34it/s]

  5%|▌         | 51/1000 [00:01<00:45, 21.08it/s]

  5%|▌         | 54/1000 [00:01<00:43, 21.69it/s]

  6%|▌         | 57/1000 [00:02<00:50, 18.56it/s]

  6%|▌         | 60/1000 [00:02<00:45, 20.73it/s]

  6%|▋         | 63/1000 [00:02<00:42, 22.27it/s]

  7%|▋         | 66/1000 [00:02<00:39, 23.51it/s]

  7%|▋         | 69/1000 [00:02<00:38, 24.31it/s]

  7%|▋         | 72/1000 [00:02<00:37, 24.65it/s]

  8%|▊         | 75/1000 [00:02<00:37, 24.69it/s]

  8%|▊         | 78/1000 [00:02<00:36, 25.57it/s]

  8%|▊         | 81/1000 [00:03<00:34, 26.35it/s]

  8%|▊         | 84/1000 [00:03<00:34, 26.50it/s]

  9%|▊         | 87/1000 [00:03<00:34, 26.67it/s]

  9%|▉         | 90/1000 [00:03<00:33, 27.56it/s]

  9%|▉         | 93/1000 [00:03<00:32, 27.89it/s]

 10%|▉         | 97/1000 [00:03<00:31, 28.77it/s]

 10%|█         | 100/1000 [00:03<00:33, 26.96it/s]

 10%|█         | 103/1000 [00:03<00:33, 26.89it/s]

 11%|█         | 107/1000 [00:04<00:30, 29.24it/s]

 11%|█         | 111/1000 [00:04<00:28, 31.25it/s]

 12%|█▏        | 115/1000 [00:04<00:26, 33.31it/s]

 12%|█▏        | 119/1000 [00:04<00:26, 33.57it/s]

 12%|█▏        | 123/1000 [00:04<00:25, 34.31it/s]

 13%|█▎        | 127/1000 [00:04<00:24, 34.99it/s]

 13%|█▎        | 131/1000 [00:04<00:24, 35.32it/s]

 14%|█▎        | 135/1000 [00:04<00:24, 34.85it/s]

 14%|█▍        | 140/1000 [00:04<00:22, 38.59it/s]

 15%|█▍        | 146/1000 [00:05<00:20, 41.99it/s]

 15%|█▌        | 151/1000 [00:05<00:19, 43.34it/s]

 16%|█▌        | 156/1000 [00:05<00:18, 44.88it/s]

 16%|█▌        | 161/1000 [00:05<00:18, 46.09it/s]

 17%|█▋        | 166/1000 [00:05<00:18, 46.31it/s]

 17%|█▋        | 171/1000 [00:05<00:17, 46.46it/s]

 18%|█▊        | 176/1000 [00:05<00:17, 46.64it/s]

 18%|█▊        | 181/1000 [00:05<00:17, 46.03it/s]

 19%|█▊        | 186/1000 [00:05<00:18, 44.43it/s]

 19%|█▉        | 192/1000 [00:06<00:17, 46.85it/s]

 20%|█▉        | 197/1000 [00:06<00:17, 46.05it/s]

 20%|██        | 203/1000 [00:06<00:16, 47.61it/s]

 21%|██        | 208/1000 [00:06<00:16, 48.20it/s]

 21%|██▏       | 214/1000 [00:06<00:15, 50.67it/s]

 22%|██▏       | 220/1000 [00:06<00:15, 49.57it/s]

 23%|██▎       | 226/1000 [00:06<00:15, 51.55it/s]

 23%|██▎       | 232/1000 [00:06<00:16, 46.30it/s]

 24%|██▍       | 238/1000 [00:06<00:15, 48.29it/s]

 24%|██▍       | 243/1000 [00:07<00:16, 47.29it/s]

 25%|██▍       | 249/1000 [00:07<00:15, 48.94it/s]

 25%|██▌       | 254/1000 [00:07<00:15, 47.94it/s]

 26%|██▌       | 260/1000 [00:07<00:15, 48.73it/s]

 26%|██▋       | 265/1000 [00:07<00:15, 48.32it/s]

 27%|██▋       | 271/1000 [00:07<00:14, 49.74it/s]

 28%|██▊       | 276/1000 [00:07<00:14, 49.60it/s]

 28%|██▊       | 281/1000 [00:07<00:14, 48.32it/s]

 29%|██▊       | 286/1000 [00:07<00:14, 47.85it/s]

 29%|██▉       | 292/1000 [00:08<00:14, 50.30it/s]

 30%|██▉       | 298/1000 [00:08<00:14, 49.61it/s]

 30%|███       | 304/1000 [00:08<00:13, 50.74it/s]

 31%|███       | 310/1000 [00:08<00:13, 49.51it/s]

 32%|███▏      | 316/1000 [00:08<00:13, 51.85it/s]

 32%|███▏      | 322/1000 [00:08<00:13, 50.48it/s]

 33%|███▎      | 328/1000 [00:08<00:13, 50.55it/s]

 33%|███▎      | 334/1000 [00:08<00:14, 45.70it/s]

 34%|███▍      | 340/1000 [00:09<00:13, 47.97it/s]

 34%|███▍      | 345/1000 [00:09<00:13, 47.29it/s]

 35%|███▌      | 351/1000 [00:09<00:13, 49.23it/s]

 36%|███▌      | 356/1000 [00:09<00:13, 47.58it/s]

 36%|███▌      | 362/1000 [00:09<00:13, 48.83it/s]

 37%|███▋      | 367/1000 [00:09<00:13, 47.93it/s]

 37%|███▋      | 373/1000 [00:09<00:12, 50.60it/s]

 38%|███▊      | 379/1000 [00:09<00:14, 44.28it/s]

 38%|███▊      | 384/1000 [00:09<00:13, 44.59it/s]

 39%|███▉      | 389/1000 [00:10<00:13, 43.79it/s]

 39%|███▉      | 394/1000 [00:10<00:14, 43.00it/s]

 40%|████      | 400/1000 [00:10<00:12, 47.18it/s]

 40%|████      | 405/1000 [00:10<00:12, 46.14it/s]

 41%|████      | 411/1000 [00:10<00:12, 48.47it/s]

 42%|████▏     | 416/1000 [00:10<00:12, 48.33it/s]

 42%|████▏     | 423/1000 [00:10<00:11, 50.26it/s]

 43%|████▎     | 429/1000 [00:10<00:11, 48.66it/s]

 43%|████▎     | 434/1000 [00:11<00:11, 48.64it/s]

 44%|████▍     | 440/1000 [00:11<00:11, 50.22it/s]

 45%|████▍     | 446/1000 [00:11<00:11, 49.59it/s]

 45%|████▌     | 452/1000 [00:11<00:10, 51.09it/s]

 46%|████▌     | 458/1000 [00:11<00:10, 50.81it/s]

 46%|████▋     | 464/1000 [00:11<00:10, 50.56it/s]

 47%|████▋     | 470/1000 [00:11<00:10, 52.51it/s]

 48%|████▊     | 476/1000 [00:11<00:11, 45.89it/s]

 48%|████▊     | 481/1000 [00:11<00:11, 46.58it/s]

 49%|████▊     | 487/1000 [00:12<00:10, 49.78it/s]

 49%|████▉     | 493/1000 [00:12<00:09, 51.09it/s]

 50%|█████     | 500/1000 [00:12<00:09, 55.00it/s]

 51%|█████     | 506/1000 [00:12<00:09, 53.88it/s]

 51%|█████     | 512/1000 [00:12<00:09, 53.40it/s]

 52%|█████▏    | 518/1000 [00:12<00:09, 52.42it/s]

 52%|█████▏    | 524/1000 [00:12<00:09, 52.46it/s]

 53%|█████▎    | 530/1000 [00:12<00:10, 46.79it/s]

 54%|█████▎    | 535/1000 [00:13<00:09, 47.39it/s]

 54%|█████▍    | 541/1000 [00:13<00:09, 48.33it/s]

 55%|█████▍    | 547/1000 [00:13<00:09, 49.23it/s]

 55%|█████▌    | 553/1000 [00:13<00:08, 50.54it/s]

 56%|█████▌    | 559/1000 [00:13<00:08, 50.65it/s]

 57%|█████▋    | 566/1000 [00:13<00:08, 53.67it/s]

 57%|█████▋    | 572/1000 [00:13<00:08, 52.98it/s]

 58%|█████▊    | 578/1000 [00:13<00:08, 50.63it/s]

 58%|█████▊    | 584/1000 [00:13<00:08, 48.75it/s]

 59%|█████▉    | 589/1000 [00:14<00:08, 49.01it/s]

 59%|█████▉    | 594/1000 [00:14<00:08, 49.24it/s]

 60%|█████▉    | 599/1000 [00:14<00:08, 48.45it/s]

 60%|██████    | 604/1000 [00:14<00:08, 48.57it/s]

 61%|██████    | 609/1000 [00:14<00:08, 48.81it/s]

 62%|██████▏   | 615/1000 [00:14<00:07, 49.44it/s]

 62%|██████▏   | 620/1000 [00:14<00:07, 48.49it/s]

 62%|██████▎   | 625/1000 [00:14<00:08, 46.58it/s]

 63%|██████▎   | 631/1000 [00:14<00:07, 49.59it/s]

 64%|██████▍   | 640/1000 [00:15<00:06, 59.89it/s]

 65%|██████▌   | 650/1000 [00:15<00:04, 70.26it/s]

 66%|██████▌   | 661/1000 [00:15<00:04, 80.49it/s]

 67%|██████▋   | 672/1000 [00:15<00:03, 88.89it/s]

 68%|██████▊   | 684/1000 [00:15<00:03, 96.94it/s]

 70%|██████▉   | 696/1000 [00:15<00:02, 103.34it/s]

 71%|███████   | 708/1000 [00:15<00:02, 108.04it/s]

 72%|███████▏  | 721/1000 [00:15<00:02, 113.01it/s]

 73%|███████▎  | 733/1000 [00:15<00:02, 112.26it/s]

 74%|███████▍  | 745/1000 [00:16<00:02, 105.64it/s]

 76%|███████▌  | 757/1000 [00:16<00:02, 107.97it/s]

 77%|███████▋  | 769/1000 [00:16<00:02, 110.69it/s]

 78%|███████▊  | 781/1000 [00:16<00:01, 112.50it/s]

 79%|███████▉  | 793/1000 [00:16<00:02, 87.83it/s] 

 80%|████████  | 803/1000 [00:16<00:02, 74.26it/s]

 81%|████████  | 812/1000 [00:16<00:02, 64.21it/s]

 82%|████████▏ | 820/1000 [00:17<00:03, 56.79it/s]

 83%|████████▎ | 827/1000 [00:17<00:03, 54.86it/s]

 83%|████████▎ | 833/1000 [00:17<00:03, 54.21it/s]

 84%|████████▍ | 839/1000 [00:17<00:03, 53.11it/s]

 84%|████████▍ | 845/1000 [00:17<00:03, 51.48it/s]

 85%|████████▌ | 851/1000 [00:17<00:02, 51.25it/s]

 86%|████████▌ | 857/1000 [00:17<00:02, 50.33it/s]

 86%|████████▋ | 863/1000 [00:18<00:02, 46.87it/s]

 87%|████████▋ | 868/1000 [00:18<00:02, 46.41it/s]

 87%|████████▋ | 873/1000 [00:18<00:02, 47.23it/s]

 88%|████████▊ | 878/1000 [00:18<00:02, 47.94it/s]

 88%|████████▊ | 884/1000 [00:18<00:02, 48.99it/s]

 89%|████████▉ | 890/1000 [00:18<00:02, 50.20it/s]

 90%|████████▉ | 896/1000 [00:18<00:02, 50.01it/s]

 90%|█████████ | 902/1000 [00:18<00:01, 50.60it/s]

 91%|█████████ | 908/1000 [00:18<00:01, 50.80it/s]

 91%|█████████▏| 914/1000 [00:19<00:01, 47.01it/s]

 92%|█████████▏| 919/1000 [00:19<00:01, 47.19it/s]

 92%|█████████▏| 924/1000 [00:19<00:01, 46.94it/s]

 93%|█████████▎| 929/1000 [00:19<00:01, 39.76it/s]

 93%|█████████▎| 934/1000 [00:19<00:01, 35.77it/s]

 94%|█████████▍| 939/1000 [00:19<00:01, 38.08it/s]

 94%|█████████▍| 944/1000 [00:19<00:01, 40.46it/s]

 95%|█████████▍| 949/1000 [00:19<00:01, 41.15it/s]

 95%|█████████▌| 954/1000 [00:20<00:01, 41.70it/s]

 96%|█████████▌| 960/1000 [00:20<00:00, 44.01it/s]

 97%|█████████▋| 966/1000 [00:20<00:00, 46.35it/s]

 97%|█████████▋| 972/1000 [00:20<00:00, 47.64it/s]

 98%|█████████▊| 977/1000 [00:20<00:00, 47.87it/s]

 98%|█████████▊| 982/1000 [00:20<00:00, 47.83it/s]

 99%|█████████▊| 987/1000 [00:20<00:00, 47.40it/s]

 99%|█████████▉| 993/1000 [00:20<00:00, 48.93it/s]

100%|█████████▉| 998/1000 [00:20<00:00, 48.95it/s]

100%|██████████| 1000/1000 [00:20<00:00, 47.62it/s]




[DEBUG] content length: 1438, reasoning_content length: 4766
[DEBUG] content preview: {
  "sentence_id_1": "B0416010p",
  "sentence_id_2": "S0019226008p",
  "answers": "NO, NO, NO",
  "reasoning": "1. The statements originate from entirely distinct historical and political contexts: Statement 1 discusses the governance structure of the Holy Roman Empire in the 16th century, while Sta
[DEBUG] reasoning preview: We are given two statements derived from two different arguments. We need to determine if Statement 1 entails Statement 2. That is, does the truth of Statement 1 logically imply the truth of Statement 2? Entailment means that if Statement 1 is true, then Statement 2 must also be true, given the cont


Saving progress at batch 100...


Saving progress at batch 200...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way_progress_batch_100.csv


Saving progress at batch 300...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way_progress_batch_200.csv


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Saving progress at batch 400...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way_progress_batch_300.csv


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Connection error.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Saving progress at batch 500...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way_progress_batch_400.csv


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Saving progress at batch 600...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way_progress_batch_500.csv


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Saving progress at batch 700...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way_progress_batch_600.csv


Saving progress at batch 800...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way_progress_batch_700.csv


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Saving progress at batch 900...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way_progress_batch_800.csv


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Saving progress at batch 1000...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way_progress_batch_900.csv
Saved results to labeled_pairs/Results_DS_BtoS_iteration_1_one_way.csv

✓ LLM API calls completed
  Renaming to: labeled_pairs/Results_DS_BtoS_iteration_1_one_way

VALIDATING OUTPUT
✓ Output file loaded: 1000 rows
  sentence_id_1: 31/1000 rows
  sentence_id_2: 31/1000 rows
✓ Output validation complete


✓ No previous input file to merge
✓ Output saved to labeled_pairs/Results_DS_BtoS_iteration_1_one_way



In [7]:
# Strip .csv extension since evaluator adds it automatically
output_arg = output.replace('.csv', '') if output.endswith('.csv') else output

sys.argv = [
    "deepseek_evaluator.py",
    "--model", llm_model,
    "--file", input_file,
    "--external", args_file,
    "--prompt", prompt,
    "--output", output_arg
]

print(f"\n{'='*60}")
print("EXECUTING LLM API CALLS")
print(f"{'='*60}")
print(f"Model: {llm_model}")
print(f"Input: {input_file}")
print(f"Output (arg): {output_arg}")
print(f"Expected file: {output}")
print(f"{'='*60}\n")

try:
    etb.main()
    print(f"\n✓ LLM API calls completed")
except Exception as e:
    print(f"\n✗ ERROR during LLM API execution: {e}")
    print(f"Error type: {type(e).__name__}")
    import traceback
    traceback.print_exc()
    raise

# Resolve actual output path — evaluator adds .csv to the base name.
# Do NOT rename/move files — just find the correct path.
if os.path.exists(output):
    output_actual = output
elif os.path.exists(f"{output}.csv"):
    output_actual = f"{output}.csv"
    print(f"✓ Output found at {output_actual}")
elif output.endswith('.csv') and os.path.exists(output.replace('.csv', '') + '.csv'):
    output_actual = output
else:
    output_dir_check = os.path.dirname(output) or '.'
    print(f"⚠ Files in {output_dir_check}:")
    for f in os.listdir(output_dir_check):
        print(f"  - {f}")
    raise FileNotFoundError(f"Output file was not created: {output}")

print(f"\n{'='*60}")
print("VALIDATING OUTPUT")
print(f"{'='*60}")


EXECUTING LLM API CALLS
Model: deepseek-reasoner
Input: fea_iterations\loop_data/df_to_llm_iter_0.csv
Output (arg): labeled_pairs/Results_DS_BtoS_iteration_1_one_way
Expected file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way

Loading data from fea_iterations\loop_data/df_to_llm_iter_0.csv...
Loading data from ArgLevel_ClauseIds_df.xlsx...


Loaded 1000 sentence pairs


Using model: deepseek-reasoner
Using prompt type: test_prompt_tot_json2
Running batch evaluation...


  0%|          | 0/1000 [00:00<?, ?it/s]

  1%|          | 11/1000 [00:00<00:09, 106.50it/s]

  2%|▏         | 22/1000 [00:00<00:13, 75.22it/s] 

  3%|▎         | 33/1000 [00:00<00:11, 86.55it/s]

  4%|▍         | 45/1000 [00:00<00:09, 95.66it/s]

  6%|▌         | 56/1000 [00:00<00:09, 99.66it/s]

  7%|▋         | 67/1000 [00:00<00:09, 99.60it/s]

  8%|▊         | 78/1000 [00:00<00:09, 97.31it/s]

  9%|▉         | 88/1000 [00:00<00:09, 97.03it/s]

 10%|▉         | 98/1000 [00:01<00:09, 97.39it/s]

 11%|█         | 112/1000 [00:01<00:08, 107.00it/s]

 12%|█▎        | 125/1000 [00:01<00:07, 113.05it/s]

 14%|█▍        | 141/1000 [00:01<00:06, 124.80it/s]

 16%|█▌        | 157/1000 [00:01<00:06, 134.18it/s]

 17%|█▋        | 174/1000 [00:01<00:05, 143.38it/s]

 19%|█▉        | 190/1000 [00:01<00:05, 147.58it/s]

 21%|██        | 207/1000 [00:01<00:05, 153.69it/s]

 22%|██▏       | 224/1000 [00:01<00:04, 158.46it/s]

 24%|██▍       | 241/1000 [00:01<00:04, 159.65it/s]

 26%|██▌       | 259/1000 [00:02<00:04, 164.11it/s]

 28%|██▊       | 276/1000 [00:02<00:04, 159.96it/s]

 29%|██▉       | 293/1000 [00:02<00:04, 161.37it/s]

 31%|███       | 310/1000 [00:02<00:04, 159.35it/s]

 33%|███▎      | 327/1000 [00:02<00:04, 161.75it/s]

 34%|███▍      | 344/1000 [00:02<00:04, 158.35it/s]

 36%|███▌      | 360/1000 [00:02<00:04, 158.14it/s]

 38%|███▊      | 376/1000 [00:02<00:03, 156.15it/s]

 39%|███▉      | 392/1000 [00:02<00:03, 152.16it/s]

 41%|████      | 408/1000 [00:03<00:03, 153.13it/s]

 42%|████▎     | 425/1000 [00:03<00:03, 156.44it/s]

 44%|████▍     | 441/1000 [00:03<00:03, 154.95it/s]

 46%|████▌     | 458/1000 [00:03<00:03, 158.58it/s]

 48%|████▊     | 475/1000 [00:03<00:03, 161.74it/s]

 49%|████▉     | 493/1000 [00:03<00:03, 165.74it/s]

 51%|█████     | 510/1000 [00:03<00:02, 165.85it/s]

 53%|█████▎    | 527/1000 [00:03<00:02, 166.35it/s]

 55%|█████▍    | 545/1000 [00:03<00:02, 167.62it/s]

 56%|█████▌    | 562/1000 [00:03<00:02, 167.48it/s]

 58%|█████▊    | 580/1000 [00:04<00:02, 168.94it/s]

 60%|█████▉    | 597/1000 [00:04<00:02, 168.64it/s]

 61%|██████▏   | 614/1000 [00:04<00:02, 163.65it/s]

 63%|██████▎   | 632/1000 [00:04<00:02, 166.26it/s]

 65%|██████▌   | 650/1000 [00:04<00:02, 167.59it/s]

 67%|██████▋   | 668/1000 [00:04<00:01, 169.15it/s]

 68%|██████▊   | 685/1000 [00:04<00:01, 169.17it/s]

 70%|███████   | 703/1000 [00:04<00:01, 170.15it/s]

 72%|███████▏  | 721/1000 [00:04<00:01, 170.86it/s]

 74%|███████▍  | 739/1000 [00:04<00:01, 171.58it/s]

 76%|███████▌  | 757/1000 [00:05<00:01, 169.91it/s]

 77%|███████▋  | 774/1000 [00:05<00:01, 163.92it/s]

 79%|███████▉  | 791/1000 [00:05<00:01, 164.36it/s]

 81%|████████  | 808/1000 [00:05<00:01, 163.34it/s]

 83%|████████▎ | 826/1000 [00:05<00:01, 165.78it/s]

 84%|████████▍ | 843/1000 [00:05<00:00, 164.99it/s]

 86%|████████▌ | 860/1000 [00:05<00:00, 165.17it/s]

 88%|████████▊ | 877/1000 [00:05<00:00, 166.15it/s]

 89%|████████▉ | 894/1000 [00:05<00:00, 166.62it/s]

 91%|█████████ | 912/1000 [00:06<00:00, 168.13it/s]

 93%|█████████▎| 929/1000 [00:06<00:00, 162.62it/s]

 95%|█████████▍| 946/1000 [00:06<00:00, 159.14it/s]

 96%|█████████▋| 963/1000 [00:06<00:00, 161.43it/s]

 98%|█████████▊| 980/1000 [00:06<00:00, 158.50it/s]

100%|█████████▉| 997/1000 [00:06<00:00, 161.33it/s]

100%|██████████| 1000/1000 [00:06<00:00, 151.62it/s]




Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}
Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


[DEBUG] content length: 1194, reasoning_content length: 3158
[DEBUG] content preview: {
  "sentence_id_1": "B0978002p",
  "sentence_id_2": "S0017264004p",
  "answers": "NO, NO, NO",
  "reasoning": "1. NO: Statement 1 is a descriptive claim about historical attitudes towards female and foreign rule, while Statement 2 is a prescriptive recommendation for parliamentary conduct; there is
[DEBUG] reasoning preview: We are given two arguments and two statements derived from them. We need to determine if Statement 1 entails Statement 2. Statement 1: "The content clearly states that the nobles found it intolerable for a great kingdom to be governed by a woman, particularly one who was a stranger." This is from Ar


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Connection error.'}


Saving progress at batch 100...


Saving progress at batch 200...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way_progress_batch_100.csv


Saving progress at batch 300...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way_progress_batch_200.csv


Saving progress at batch 400...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way_progress_batch_300.csv


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Saving progress at batch 500...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way_progress_batch_400.csv


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Saving progress at batch 600...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way_progress_batch_500.csv


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Saving progress at batch 700...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way_progress_batch_600.csv


Saving progress at batch 800...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way_progress_batch_700.csv


Saving progress at batch 900...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way_progress_batch_800.csv


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}


Exception 'dict' object has no attribute 'sentence_id_1' in: 
 {'ERROR': 'API call failed: Request timed out.'}
Saving progress at batch 1000...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way_progress_batch_900.csv
Saved results to labeled_pairs/Results_DS_BtoS_iteration_1_one_way.csv

✓ LLM API calls completed

VALIDATING OUTPUT


In [8]:
# Summary Report
print(f"\n{'='*60}")
print("EXECUTION SUMMARY")
print(f"{'='*60}")

df_final = pd.read_csv(output_actual)
print(f"Total rows in final output: {len(df_final)}")
print(f"Output file: {output_actual}")

if 'llm_conclusion_12' in df_final.columns:
    conclusion_counts = df_final['llm_conclusion_12'].value_counts()
    print(f"\nLLM Conclusions:")
    for conclusion, count in conclusion_counts.items():
        print(f"  {conclusion}: {count} ({count/len(df_final)*100:.1f}%)")
    
    # Check for failed calls
    failed = df_final['llm_conclusion_12'].isnull().sum()
    if failed > 0:
        print(f"\n⚠ Failed API calls: {failed}/{len(df_final)} ({failed/len(df_final)*100:.1f}%)")
else:
    print("\n⚠ Column 'llm_conclusion_12' not found in output")

print(f"\n{'='*60}")
print("✓ Pipeline execution complete")
print(f"{'='*60}\n")


EXECUTION SUMMARY
Total rows in final output: 1000
Output file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way

LLM Conclusions:
  NO: 953 (95.3%)
  YES: 16 (1.6%)

⚠ Failed API calls: 31/1000 (3.1%)

✓ Pipeline execution complete

