# Set up

## Import libraries

In [3]:
import argparse
import glob
import importlib
import json
import os
import statistics
import sys
import time
from concurrent.futures import ProcessPoolExecutor
from functools import partial

import numpy as np
import pandas as pd
from openai import OpenAI
from pydantic import ValidationError
from sklearn.metrics import (accuracy_score, f1_score,
                             precision_score, recall_score)
from tqdm import tqdm

import llm_calls.deepseek_evaluator as etb
from llm_calls.deepseek_evaluator import EntailmentEvaluator
from llm_calls.prompts import *

ModuleNotFoundError: No module named 'openai'

## Declare paths 

# Execute Entailment API calls 

In [None]:
# Pre-flight check: confirm credentials and dependencies before any API call.
print("Validating environment setup...")

# Credentials: OPENAI_API_KEY takes precedence, DEEPSEEK_API_KEY is the fallback.
api_key = os.environ.get('OPENAI_API_KEY')
if not api_key:
    api_key = os.environ.get('DEEPSEEK_API_KEY')

if not api_key:
    # Only a warning: the run is allowed to continue without a key.
    print("⚠ WARNING: No API key found in environment variables")
    print("  Looking for OPENAI_API_KEY or DEEPSEEK_API_KEY")
    print("  LLM API calls will likely fail without proper credentials")
else:
    # Report the key length only, never the key itself.
    print(f"✓ API key found (length: {len(api_key)} chars)")

# The evaluator module is mandatory: report the failure, then abort the cell.
try:
    import llm_calls.deepseek_evaluator as etb
    print(f"✓ deepseek_evaluator module loaded from: {etb.__file__}")
except Exception as import_error:
    print(f"✗ ERROR: Cannot import deepseek_evaluator: {import_error}")
    raise

# The OpenAI client is checked too, but a failure here is only reported.
try:
    from openai import OpenAI
    print("✓ OpenAI library available")
except Exception as import_error:
    print(f"⚠ WARNING: OpenAI library issue: {import_error}")

print("\nEnvironment validation complete.\n")


In [None]:
# Run configuration for this iteration. Every value below is read by the
# validation and execution cells that follow.

# Evaluator settings, forwarded as --model and --prompt.
llm_model = "deepseek-reasoner"
prompt = "test_prompt_tot_json2"

# Input tables, forwarded as --file and --external.
input_file = "fea_iterations/loop_data/df_to_llm_iter_0.csv"
args_file = "ArgLevel_ClauseIds_df.xlsx"

# Result files: new results are written to `output`; when the previous file
# exists, its rows are placed ahead of the new ones and saved back to `output`.
# NOTE(review): previous_input_file has no "labeled_pairs/" prefix, unlike
# `output`, so it is resolved against the working directory -- confirm.
output = "labeled_pairs/Results_DS_BtoS_iteration_1.csv"
previous_input_file = "Results_DS_BtoS_iteration_0.csv"

In [None]:
# Confirm that every file the run depends on exists and can be loaded,
# and make sure the output directory is in place.
print("\n" + "=" * 60)
print("INPUT FILE VALIDATION")
print("=" * 60)

# input_file is mandatory and is always read as CSV.
print(f"Checking input_file: {input_file}")
if not os.path.exists(input_file):
    raise FileNotFoundError(f"Input file not found: {input_file}")
df_input = pd.read_csv(input_file)
print(f"✓ Input file found: {len(df_input)} rows")
print(f"  Columns: {list(df_input.columns)}")

# args_file is mandatory; the reader is chosen from the file extension.
print(f"\nChecking args_file: {args_file}")
if not os.path.exists(args_file):
    raise FileNotFoundError(f"Args file not found: {args_file}")
if args_file.endswith('.csv'):
    df_args = pd.read_csv(args_file)
elif args_file.endswith('.xlsx'):
    df_args = pd.read_excel(args_file)
else:
    raise ValueError(f"Unsupported file format: {args_file}")
print(f"✓ Args file found: {len(df_args)} rows")

# previous_input_file is optional: a missing file only produces a warning.
if previous_input_file:
    print(f"\nChecking previous_input_file: {previous_input_file}")
    if not os.path.exists(previous_input_file):
        print(f"⚠ WARNING: Previous input file not found: {previous_input_file}")
        print("  Continuing without merging previous results")
    else:
        df_prev = pd.read_csv(previous_input_file)
        print(f"✓ Previous input file found: {len(df_prev)} rows")

# A bare filename has no directory component, so there is nothing to create.
output_dir = os.path.dirname(output)
if output_dir:
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
        print(f"\n✓ Created output directory: {output_dir}")

print("=" * 60 + "\n")


In [None]:
# Run the evaluator's command-line entry point from inside the notebook.
#
# The evaluator appends ".csv" to its --output value itself, so exactly one
# trailing ".csv" is removed here. Slicing is used instead of str.replace
# because replace() would also delete ".csv" occurring anywhere else in the
# path (for example in a directory name) and silently point at a wrong file.
output_arg = output[:-len('.csv')] if output.endswith('.csv') else output

# etb.main() is called without arguments, so the options are supplied through
# sys.argv (presumably parsed with argparse inside the evaluator -- confirm).
# NOTE: sys.argv remains overwritten after this cell has run.
sys.argv = [
    "deepseek_evaluator.py",
    "--model", llm_model,
    "--file", input_file,
    "--external", args_file,
    "--prompt", prompt,
    "--output", output_arg,
]

print(f"\n{'='*60}")
print("EXECUTING LLM API CALLS")
print(f"{'='*60}")
print(f"Model: {llm_model}")
print(f"Input: {input_file}")
print(f"Output (arg): {output_arg}")
print(f"Expected file: {output}")
print(f"{'='*60}\n")

# Any failure is reported with its type and full traceback, then re-raised so
# the cells that depend on the output file do not run on stale data.
try:
    etb.main()
    print("\n✓ LLM API calls completed")
except Exception as e:
    print(f"\n✗ ERROR during LLM API execution: {e}")
    print(f"Error type: {type(e).__name__}")
    import traceback
    traceback.print_exc()
    raise

# Confirm the evaluator produced the expected file, recovering from the one
# known naming mishap before giving up.
if not os.path.exists(output):
    # The evaluator appends ".csv" itself, so a misnamed result sits at
    # "<output>.csv". Appending is used rather than str.replace, which would
    # also strip ".csv" found elsewhere in the path and look in the wrong place.
    double_ext = output + '.csv'
    if os.path.exists(double_ext):
        print(f"⚠ WARNING: Output created with double extension: {double_ext}")
        print(f"  Renaming to: {output}")
        os.rename(double_ext, output)
    else:
        # List what is actually in the target directory to help debugging.
        output_dir = os.path.dirname(output) or '.'
        # Guard the listing: a missing directory would otherwise raise from
        # os.listdir and hide the more informative error below.
        if os.path.isdir(output_dir):
            print(f"⚠ Files in {output_dir}:")
            for f in os.listdir(output_dir):
                print(f"  - {f}")
        raise FileNotFoundError(f"Output file was not created: {output}")

# Load the evaluator's output and report structural problems. Every finding
# in this section is a warning; none of them stops the run.
print(f"\n{'='*60}")
print("VALIDATING OUTPUT")
print(f"{'='*60}")

df_new = pd.read_csv(output)
print(f"✓ Output file loaded: {len(df_new)} rows")

# Check for required columns
required_cols = ['sentence_id_1', 'sentence_id_2']
missing_cols = [col for col in required_cols if col not in df_new.columns]
if missing_cols:
    print(f"⚠ WARNING: Missing expected columns: {missing_cols}")
    print(f"  Available columns: {list(df_new.columns)}")

# Check for errors in the data
if 'ERROR' in df_new.columns:
    error_count = df_new['ERROR'].notna().sum()
    if error_count > 0:
        print(f"⚠ WARNING: {error_count}/{len(df_new)} rows contain errors")
        print(f"  First error: {df_new['ERROR'].dropna().iloc[0]}")

# Check for empty/null critical fields. Only columns that are actually present
# are inspected: selecting an absent column would raise KeyError and turn the
# missing-column warning above into a crash.
present_cols = [col for col in required_cols if col not in missing_cols]
null_counts = df_new[present_cols].isnull().sum()
if null_counts.any():
    print("⚠ WARNING: Null values found in critical columns:")
    for col, count in null_counts[null_counts > 0].items():
        print(f"  {col}: {count}/{len(df_new)} rows")

print("✓ Output validation complete\n")

# Merge with previous results: rows from the previous file are placed ahead of
# the new rows and the combined table overwrites `output`.
if previous_input_file and os.path.exists(previous_input_file):
    print(f"{'='*60}")
    print("MERGING WITH PREVIOUS RESULTS")
    print(f"{'='*60}")

    df_previous = pd.read_csv(previous_input_file)
    print(f"✓ Loaded previous results: {len(df_previous)} rows")

    df_merged = pd.concat([df_previous, df_new], ignore_index=True)
    print(f"✓ Merged: {len(df_previous)} previous + {len(df_new)} new = {len(df_merged)} total")

    # Validate merged data. A row-count comparison cannot fail for a plain
    # concatenation, so the columns are compared instead: pd.concat keeps the
    # union of columns and fills the gaps with NaN without any notice.
    only_previous = [col for col in df_previous.columns if col not in df_new.columns]
    only_new = [col for col in df_new.columns if col not in df_previous.columns]
    if only_previous or only_new:
        print("⚠ WARNING: Previous and new results have different columns")
        if only_previous:
            print(f"  Only in previous results: {only_previous}")
        if only_new:
            print(f"  Only in new results: {only_new}")

    df_merged.to_csv(output, index=False)
    print(f"✓ Saved merged results to {output}")
    print(f"{'='*60}\n")
else:
    print("\n✓ No previous input file to merge")
    print(f"✓ Output saved to {output}\n")


In [None]:
# Final report: re-read the saved output and summarise the LLM conclusions.
print("\n" + "=" * 60)
print("EXECUTION SUMMARY")
print("=" * 60)

df_final = pd.read_csv(output)
print(f"Total rows in final output: {len(df_final)}")

if 'llm_conclusion_12' not in df_final.columns:
    print("\n⚠ Column 'llm_conclusion_12' not found in output")
else:
    # Distribution of conclusions; percentages are relative to all rows,
    # including rows that have no conclusion.
    conclusion_counts = df_final['llm_conclusion_12'].value_counts()
    print("\nLLM Conclusions:")
    for conclusion, count in conclusion_counts.items():
        print(f"  {conclusion}: {count} ({count / len(df_final) * 100:.1f}%)")

    # Rows with a null conclusion are reported as failed API calls.
    failed = df_final['llm_conclusion_12'].isna().sum()
    if failed > 0:
        print(f"\n⚠ Failed API calls: {failed}/{len(df_final)} ({failed / len(df_final) * 100:.1f}%)")

print("\n" + "=" * 60)
print("✓ Pipeline execution complete")
print("=" * 60 + "\n")
