# Set up

## Import libraries

In [1]:
import argparse
import glob
import importlib
import json
import os
import statistics
import sys
import time
from concurrent.futures import ProcessPoolExecutor
from functools import partial

import numpy as np
import pandas as pd
from openai import OpenAI
from pydantic import ValidationError
from sklearn.metrics import (accuracy_score, f1_score,
                             precision_score, recall_score)
from tqdm import tqdm

# Ensure project root is on sys.path (required for papermill fresh kernels)
current_dir = os.getcwd()
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)

import llm_calls.deepseek_evaluator as etb
from llm_calls.deepseek_evaluator import EntailmentEvaluator
from llm_calls.prompts import *

## Declare paths 

# Execute Entailment API calls 

In [2]:
# Check API key and environment
print("Validating environment setup...")

# Check for OpenAI/DeepSeek API key
api_key = os.environ.get('OPENAI_API_KEY') or os.environ.get('DEEPSEEK_API_KEY')
if api_key:
    print(f"✓ API key found (length: {len(api_key)} chars)")
else:
    print("⚠ WARNING: No API key found in environment variables")
    print("  Looking for OPENAI_API_KEY or DEEPSEEK_API_KEY")
    print("  LLM API calls will likely fail without proper credentials")

# Check if deepseek_evaluator module is importable
try:
    import llm_calls.deepseek_evaluator as etb
    print(f"✓ deepseek_evaluator module loaded from: {etb.__file__}")
except Exception as e:
    print(f"✗ ERROR: Cannot import deepseek_evaluator: {e}")
    raise

# Check OpenAI library
try:
    from openai import OpenAI
    print(f"✓ OpenAI library available")
except Exception as e:
    print(f"⚠ WARNING: OpenAI library issue: {e}")

print("\nEnvironment validation complete.\n")


Validating environment setup...
✓ API key found (length: 35 chars)
✓ deepseek_evaluator module loaded from: c:\Users\aesteva\Documents\GitHub\fea_project\llm_calls\deepseek_evaluator.py
✓ OpenAI library available

Environment validation complete.



In [3]:
llm_model = "deepseek-reasoner"
input_file = "fea_iterations/loop_data/df_to_llm_iter_0.csv"
args_file = "ArgLevel_ClauseIds_df.xlsx"
prompt = "test_prompt_tot_json2"
output = "labeled_pairs/Results_DS_BtoS_iteration_1.csv"
previous_input_file = "Results_DS_BtoS_iteration_0.csv"

In [4]:
# Parameters
llm_model = "deepseek-reasoner"
input_file = "fea_iterations\\loop_data/df_to_llm_iter_0.csv"
args_file = "ArgLevel_ClauseIds_df.xlsx"
prompt = "test_prompt_tot_json2"
output = "labeled_pairs/Results_DS_BtoS_iteration_1_one_way"
previous_input_file = ""


In [5]:
# Validate input files exist
print(f"\n{'='*60}")
print("INPUT FILE VALIDATION")
print(f"{'='*60}")

print(f"Checking input_file: {input_file}")
if os.path.exists(input_file):
    df_input = pd.read_csv(input_file)
    print(f"✓ Input file found: {len(df_input)} rows")
    print(f"  Columns: {list(df_input.columns)}")
else:
    raise FileNotFoundError(f"Input file not found: {input_file}")

print(f"\nChecking args_file: {args_file}")
if os.path.exists(args_file):
    if args_file.endswith('.csv'):
        df_args = pd.read_csv(args_file)
    elif args_file.endswith('.xlsx'):
        df_args = pd.read_excel(args_file)
    else:
        raise ValueError(f"Unsupported file format: {args_file}")
    print(f"✓ Args file found: {len(df_args)} rows")
else:
    raise FileNotFoundError(f"Args file not found: {args_file}")

if previous_input_file:
    print(f"\nChecking previous_input_file: {previous_input_file}")
    if os.path.exists(previous_input_file):
        df_prev = pd.read_csv(previous_input_file)
        print(f"✓ Previous input file found: {len(df_prev)} rows")
    else:
        print(f"⚠ WARNING: Previous input file not found: {previous_input_file}")
        print(f"  Continuing without merging previous results")

# Create output directory if needed
output_dir = os.path.dirname(output)
if output_dir and not os.path.exists(output_dir):
    os.makedirs(output_dir, exist_ok=True)
    print(f"\n✓ Created output directory: {output_dir}")

print(f"{'='*60}\n")



INPUT FILE VALIDATION
Checking input_file: fea_iterations\loop_data/df_to_llm_iter_0.csv
✓ Input file found: 1 rows
  Columns: ['sentence_id_2', 'sentence_id_1', 'sentence_text_2', 'argument_id_2', 'sentence_text_1', 'argument_id_1', 'score']

Checking args_file: ArgLevel_ClauseIds_df.xlsx


✓ Args file found: 6181 rows



In [6]:
# Strip .csv extension since evaluator adds it automatically
output_arg = output.replace('.csv', '') if output.endswith('.csv') else output

sys.argv = [
    "deepseek_evaluator.py",
    "--model", llm_model,
    "--file", input_file,
    "--external", args_file,
    "--prompt", prompt,
    "--output", output_arg
]

print(f"\n{'='*60}")
print("EXECUTING LLM API CALLS")
print(f"{'='*60}")
print(f"Model: {llm_model}")
print(f"Input: {input_file}")
print(f"Output (arg): {output_arg}")
print(f"Expected file: {output}")
print(f"{'='*60}\n")

try:
    etb.main()
    print(f"\n✓ LLM API calls completed")
except Exception as e:
    print(f"\n✗ ERROR during LLM API execution: {e}")
    print(f"Error type: {type(e).__name__}")
    import traceback
    traceback.print_exc()
    raise

# Resolve actual output path — evaluator adds .csv to the base name.
# Do NOT rename/move files — just find the correct path.
if os.path.exists(output):
    output_actual = output
elif os.path.exists(f"{output}.csv"):
    output_actual = f"{output}.csv"
    print(f"✓ Output found at {output_actual}")
elif output.endswith('.csv') and os.path.exists(output.replace('.csv', '') + '.csv'):
    output_actual = output
else:
    output_dir_check = os.path.dirname(output) or '.'
    print(f"⚠ Files in {output_dir_check}:")
    for f in os.listdir(output_dir_check):
        print(f"  - {f}")
    raise FileNotFoundError(f"Output file was not created: {output}")

print(f"\n{'='*60}")
print("VALIDATING OUTPUT")
print(f"{'='*60}")


EXECUTING LLM API CALLS
Model: deepseek-reasoner
Input: fea_iterations\loop_data/df_to_llm_iter_0.csv
Output (arg): labeled_pairs/Results_DS_BtoS_iteration_1_one_way
Expected file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way

Loading data from fea_iterations\loop_data/df_to_llm_iter_0.csv...
Loading data from ArgLevel_ClauseIds_df.xlsx...


Loaded 1 sentence pairs


Using model: deepseek-reasoner
Using prompt type: test_prompt_tot_json2
Running batch evaluation...


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 47.02it/s]




[DEBUG] content length: 919, reasoning_content length: 10431
[DEBUG] content preview: {
  "sentence_id_1": "B0653002sc",
  "sentence_id_2": "S0053530002sc",
  "answers": "YES, NO, NO",
  "reasoning": "1. (YES) In a representative democracy, parliament is the primary institution for expressing consent; thus, if consent is essential, parliament must accurately represent the will of the
[DEBUG] reasoning preview: We are asked to assess whether Statement 1 entails Statement 2. Statement 1: "Consent of the governed is essential for government effectiveness." Statement 2: "Parliament must accurately represent the will of the people."

We are given Argument 1 (from Juan de Mariana, 1598) and its context. Argumen
Saving progress at batch 1...
Saved results to labeled_pairs/Results_DS_BtoS_iteration_1_one_way.csv

✓ LLM API calls completed
✓ Output found at labeled_pairs/Results_DS_BtoS_iteration_1_one_way.csv

VALIDATING OUTPUT


In [7]:
# Summary Report
print(f"\n{'='*60}")
print("EXECUTION SUMMARY")
print(f"{'='*60}")

df_final = pd.read_csv(output_actual)
print(f"Total rows in final output: {len(df_final)}")
print(f"Output file: {output_actual}")

if 'llm_conclusion_12' in df_final.columns:
    conclusion_counts = df_final['llm_conclusion_12'].value_counts()
    print(f"\nLLM Conclusions:")
    for conclusion, count in conclusion_counts.items():
        print(f"  {conclusion}: {count} ({count/len(df_final)*100:.1f}%)")
    
    # Check for failed calls
    failed = df_final['llm_conclusion_12'].isnull().sum()
    if failed > 0:
        print(f"\n⚠ Failed API calls: {failed}/{len(df_final)} ({failed/len(df_final)*100:.1f}%)")
else:
    print("\n⚠ Column 'llm_conclusion_12' not found in output")

print(f"\n{'='*60}")
print("✓ Pipeline execution complete")
print(f"{'='*60}\n")


EXECUTION SUMMARY
Total rows in final output: 1
Output file: labeled_pairs/Results_DS_BtoS_iteration_1_one_way.csv

LLM Conclusions:
  NO: 1 (100.0%)

✓ Pipeline execution complete

