In [None]:
# import sys
# import os

# ### Installs all the packages in the install library (my_custom_libs) ##

# # --- 1. SETUP PATHS FIRST (you are going to need to change to whatever your path to the instal library is in) ---
# username = os.environ.get('USER')
# scratch_base = f"/scratch/midway3/{username}/fea_project"
# custom_lib_path = os.path.join(scratch_base, "my_custom_libs")
# hf_cache_path = os.path.join(scratch_base, "hf_cache")

# py_version = f"python{sys.version_info.major}.{sys.version_info.minor}"
# site_packages = os.path.join(custom_lib_path, "lib", py_version, "site-packages")
# sys.path.insert(0, site_packages)

# # FORCE custom path to the FRONT of the list
# if site_packages not in sys.path:
#     sys.path.insert(0, site_packages)

# print(f"Python is looking here first: {sys.path[0]}")

# # --- Ensure current directory is in sys.path for Windows ---
# current_dir = os.getcwd()
# if current_dir not in sys.path:
#     sys.path.insert(0, current_dir)
#     print(f"Added current directory to sys.path: {current_dir}")

# # --- 2. ENV VARIABLES ---
# os.environ['HF_HOME'] = hf_cache_path
# os.environ['TRANSFORMERS_CACHE'] = hf_cache_path
# os.environ['PYTHONUSERBASE'] = custom_lib_path

# # --- 3. NOW IMPORT LIBRARIES ---
# # Only import AFTER sys.path is updated
# import datasets 
# import sentence_transformers
# from sentence_transformers import SentenceTransformer

# # --- 4. VERIFY PATHS ---
# print(f"Datasets is loaded from: {os.path.dirname(datasets.__file__)}")

# if "software" in os.path.dirname(datasets.__file__):
#     print("FAILURE: Still loading system datasets.")
# else:
#     print("SUCCESS: All libraries isolated in scratch.")


Python is looking here first: /scratch/midway3/None/fea_project\my_custom_libs\lib\python3.13\site-packages
Added current directory to sys.path: c:\Users\aesteva\Documents\GitHub\fea_project


KeyboardInterrupt: 

In [65]:
import sys
import os
current_dir = os.getcwd()
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)
    print(f"Added current directory to sys.path: {current_dir}")

import pandas as pd
import numpy as np
import importlib
import free_entailments_algorithm_utils as fea
import plotly.express as px

importlib.reload(fea)

# Load premise and conclusion dataframes
df_p = pd.read_excel("ClauseLevel_df_p.xlsx")
df_sc = pd.read_excel("ClauseLevel_df_sc.xlsx")

print(f"Premises:    {len(df_p):,} rows")
print(f"Conclusions: {len(df_sc):,} rows")

Premises:    49,155 rows
Conclusions: 15,289 rows


In [67]:
# Count total valid BB and BS candidate pairs (before any threshold)
# Valid pairs are within-group only (premise-premise OR conclusion-conclusion, never cross)

for label, df_src in [("Premises", df_p), ("Conclusions", df_sc)]:
    book_ids = df_src.loc[df_src['sentence_id'].str.startswith('B'), 'sentence_id']
    speech_ids = df_src.loc[df_src['sentence_id'].str.startswith('S'), 'sentence_id']
    n_b, n_s = len(book_ids), len(speech_ids)
    bb = n_b * (n_b - 1) // 2
    bs = n_b * n_s
    print(f"{label}: {n_b:,} book IDs, {n_s:,} speech IDs → {bb:,} BB pairs, {bs:,} BS pairs")

# Total across both groups
total_bb = sum(
    len(df_src.loc[df_src['sentence_id'].str.startswith('B')]) *
    (len(df_src.loc[df_src['sentence_id'].str.startswith('B')]) - 1) // 2
    for df_src in [df_p, df_sc]
)
total_bs = sum(
    len(df_src.loc[df_src['sentence_id'].str.startswith('B')]) *
    len(df_src.loc[df_src['sentence_id'].str.startswith('S')])
    for df_src in [df_p, df_sc]
)
print(f"\nTotal candidate BB pairs: {total_bb:,}")
print(f"Total candidate BS pairs: {total_bs:,}")
print(f"Total:              {total_bb + total_bs:,}")

Premises: 9,564 book IDs, 39,591 speech IDs → 45,730,266 BB pairs, 378,648,324 BS pairs
Conclusions: 2,511 book IDs, 12,778 speech IDs → 3,151,305 BB pairs, 32,085,558 BS pairs

Total candidate BB pairs: 48,881,571
Total candidate BS pairs: 410,733,882
Total:              459,615,453


In [68]:
import pickle
with open("embedding_cache_finetuned.pkl", 'rb') as f:
    embedding_cache_finetuned = pickle.load(f)
print(f"Loaded embedding cache: {len(embedding_cache_finetuned)} embeddings")

Loaded embedding cache: 63909 embeddings


In [69]:
# Number of pairs per type
N = 100000

# Generate N Book-Book pairs and N Book-Speech pairs
df_pairs_bb, df_pairs_bs = fea.generate_valid_pairs_by_type(
    df_p, df_sc,
    n=N,
    id_col='sentence_id',
    text_col='sentence',
    random_seed=67
)

print(f"\ndf_pairs_bb shape: {df_pairs_bb.shape}")
print(f"df_pairs_bs shape: {df_pairs_bs.shape}")
df_pairs_bb.head()


=== Generating 100,000 Book-Book pairs ===
  Generated 100,000 Book-Book pairs

=== Generating 100,000 Book-Speech pairs ===
  Generated 100,000 Book-Speech pairs

=== SUMMARY ===
Book-Book pairs:   100,000
Book-Speech pairs: 100,000

df_pairs_bb shape: (100000, 4)
df_pairs_bs shape: (100000, 4)


Unnamed: 0,id1,id2,text1,text2
0,B0492001sc,B0674001sc,The preservation of freedom,Punishing instigators of riots is essential fo...
1,B0614002p,B0732004p,Historical perspectives from early Church Fath...,Subjects can withdraw their support from a tyr...
2,B0721003sc,B0818003sc,The magistrate must maintain order,A Democracy becomes corrupt
3,B0125001sc,B0360002sc,Ruler authority does not depend on the consent...,their rights are compromised
4,B0115005p,B0206005p,A constitution that is inherently flawed can b...,Historical examples illustrate that even the m...


In [70]:
# Compute cosine similarity using the fine-tuned bi-encoder model
df_pairs_bb = fea.generate_new_bert_results(
    df_pairs_bb,
    text_col1='text1',
    text_col2='text2',
    model_path='./fine_tuned_bi_model',
    new_col='cosine_sim',
    embedding_cache=embedding_cache_finetuned,
    id_col1='id1',
    id_col2='id2'
)

df_pairs_bs = fea.generate_new_bert_results(
    df_pairs_bs,
    text_col1='text1',
    text_col2='text2',
    model_path='./fine_tuned_bi_model',
    new_col='cosine_sim',
    embedding_cache=embedding_cache_finetuned,
    id_col1='id1',
    id_col2='id2'
)

print("Book-Book cosine_sim stats:")
print(df_pairs_bb['cosine_sim'].describe())
print("\nBook-Speech cosine_sim stats:")
print(df_pairs_bs['cosine_sim'].describe())

Using pre-computed embeddings from cache...
Using pre-computed embeddings from cache...
Book-Book cosine_sim stats:
count    100000.000000
mean          0.000000
std           0.000000
min           0.270020
25%           0.521973
50%           0.573242
75%           0.625488
max           1.000000
Name: cosine_sim, dtype: float64

Book-Speech cosine_sim stats:
count    100000.000000
mean          0.000000
std           0.000000
min           0.240967
25%           0.486328
50%           0.533691
75%           0.582520
max           0.881348
Name: cosine_sim, dtype: float64



overflow encountered in cast


overflow encountered in cast



In [71]:
# Plot cosine similarity distributions
fig_bb = px.histogram(
    df_pairs_bb, x='cosine_sim', nbins=200,
    title='Cosine Similarity Distribution — Book-Book Pairs',
    labels={'cosine_sim': 'Cosine Similarity'},
    opacity=0.75,
)
fig_bb.show()

fig_bs = px.histogram(
    df_pairs_bs, x='cosine_sim', nbins=200,
    title='Cosine Similarity Distribution — Book-Speech Pairs',
    labels={'cosine_sim': 'Cosine Similarity'},
    opacity=0.75,
)
fig_bs.show()

In [72]:
# Find the 99th-percentile threshold (top 1% = cos_sim above this value)
threshold_bb = np.percentile(df_pairs_bb['cosine_sim'].dropna(), 99)
threshold_bs = np.percentile(df_pairs_bs['cosine_sim'].dropna(), 99)

print(f"Book-Book   — top 1% threshold: {threshold_bb:.6f}")
print(f"Book-Speech — top 1% threshold: {threshold_bs:.6f}")

Book-Book   — top 1% threshold: 0.753906
Book-Speech — top 1% threshold: 0.707520


In [73]:
importlib.reload(fea)

threshold_stats = input("Want to see spread of thresholds over n trials? (y/n) ")

if threshold_stats.lower() == 'y':
# Run 20 trials with different random seeds to check threshold stability
    df_thresholds = fea.estimate_thresholds(
        df_p, df_sc,
        n_trials=10,
        n_pairs=100000,
        percentile=99.0,
        model_path='./fine_tuned_bi_model',
        embedding_cache=embedding_cache_finetuned,
    )

    print("\n=== Threshold Stability Summary ===")
    print(df_thresholds[['threshold_bb', 'threshold_bs']].describe())

    print(f"\nthreshold_bb  →  mean={df_thresholds['threshold_bb'].mean():.6f}  std={df_thresholds['threshold_bb'].std():.6f}")
    print(f"threshold_bs  →  mean={df_thresholds['threshold_bs'].mean():.6f}  std={df_thresholds['threshold_bs'].std():.6f}")

    # Use the mean thresholds for the final scan
    threshold_bb = df_thresholds['threshold_bb'].mean()
    threshold_bs = df_thresholds['threshold_bs'].mean()
    print(f"\nUsing mean thresholds: BB={threshold_bb:.6f}, BS={threshold_bs:.6f}")
else:
    print("Skipping threshold stability analysis.")

Skipping threshold stability analysis.


In [74]:
# Generate threshold-filtered pairs, keeping:
#   - 50k randomly sampled B-B pairs (from all ~350k above threshold)
#   - 50k randomly sampled B-S pairs (from top 1M above threshold by cos sim)
# BS results are trimmed per-group to cap memory during the scan.
importlib.reload(fea)

df_all_filtered = fea.generate_valid_pairs(
    df_p, df_sc,
    id_col='sentence_id',
    text_col='sentence',
    embedding_cache=embedding_cache_finetuned,
    threshold_bb=threshold_bb,
    threshold_bs=threshold_bs,
    batch_size=64,
    sample_n_bb=50_000,
    sample_n_bs=50_000,
    top_k_bs=1_000_000,
)

# Split by pair type
df_pairs_bb_final = df_all_filtered[df_all_filtered['pair_type'] == 'BB'].reset_index(drop=True)
df_pairs_bs_final = df_all_filtered[df_all_filtered['pair_type'] == 'BS'].reset_index(drop=True)

print(f"\nBook-Book pairs:   {len(df_pairs_bb_final):,}")
print(f"Book-Speech pairs: {len(df_pairs_bs_final):,}")

print("\n--- Book-Book sample ---")
display(df_pairs_bb_final.head())

print("\n--- Book-Speech sample ---")
display(df_pairs_bs_final.head())


=== Generating filtered premise-premise pairs ===
    [Premises] Scanning 45,730,266 B-B candidates in batches of 64...
    [Premises] Found 309,794 B-B pairs above threshold 0.7539
    [Premises] Scanning 378,648,324 B-S candidates  (book batch=64, speech chunk=2048)...
    [Premises] Found 1,842,492 B-S pairs above threshold 0.7075
    [Premises] Trimmed B-S to top 1,000,000 by cosine_sim

=== Generating filtered conclusion-conclusion pairs ===
    [Conclusions] Scanning 3,151,305 B-B candidates in batches of 64...
    [Conclusions] Found 44,580 B-B pairs above threshold 0.7539
    [Conclusions] Scanning 32,085,558 B-S candidates  (book batch=64, speech chunk=2048)...
    [Conclusions] Found 472,313 B-S pairs above threshold 0.7075
Randomly sampled 50,000 B-B pairs from 354,374
Randomly sampled 50,000 B-S pairs from top 1,000,000

=== SUMMARY (threshold-filtered) ===
Book-Book pairs:    50,000
Book-Speech pairs:  50,000
Total pairs:        100,000
No labeled pairs file found at labe

Unnamed: 0,id1,id2,cosine_sim,pair_type,text1,text2
0,B0295006p,B0506004p,0.754081,BB,Without the checks and balances provided by a ...,"Without established laws, individuals would be..."
1,B0837006p,B1150005p,0.765749,BB,"The king, as the constituted authority, posses...","In royal monarchy, a single man rules with the..."
2,B0083004p,B0132002p,0.882592,BB,The legitimacy of governance is rooted in the ...,The essence of legitimate governance lies in t...
3,B0376006p,B0737001p,0.765472,BB,The king's authority is limited by the need to...,The power of a monarch is not absolute and unr...
4,B0193007p,B0757002p,0.773724,BB,The power of the king is contingent upon adher...,It is essential for kings to be circumscribed ...



--- Book-Speech sample ---


Unnamed: 0,id1,id2,cosine_sim,pair_type,text1,text2
0,B1135002sc,S0018445003sc,0.803271,BS,Effective monarchy governance requires laws to...,The King must ensure governance
1,B0205003sc,S0000738001sc,0.745131,BS,A king must uphold governance,Parliament's sovereignty must be maintained
2,B0227008p,S0019155002p,0.728429,BS,The Parliament represents the interests of the...,Making these documents public allows the membe...
3,B0669001p,S0000720001p,0.740574,BS,It is necessary to hold public officials accou...,The necessity of upholding the principles of j...
4,B0086002sc,S0000962002sc,0.765238,BS,Subjects must safeguard freedoms,Parliament must ensure the kingdom's safety


In [75]:
merged_df = pd.concat([df_pairs_bs_final, df_pairs_bb_final], ignore_index=True)
merged_df = merged_df.drop(columns='pair_type')
merged_df = merged_df.rename(columns={'cosine_sim': 'score'})
merged_df.shape

(100000, 5)

In [76]:
df_round_0 = merged_df

df_round_0.head()

Unnamed: 0,id1,id2,score,text1,text2
0,B1135002sc,S0018445003sc,0.803271,Effective monarchy governance requires laws to...,The King must ensure governance
1,B0205003sc,S0000738001sc,0.745131,A king must uphold governance,Parliament's sovereignty must be maintained
2,B0227008p,S0019155002p,0.728429,The Parliament represents the interests of the...,Making these documents public allows the membe...
3,B0669001p,S0000720001p,0.740574,It is necessary to hold public officials accou...,The necessity of upholding the principles of j...
4,B0086002sc,S0000962002sc,0.765238,Subjects must safeguard freedoms,Parliament must ensure the kingdom's safety


In [77]:
df_round_0.to_excel('df_round_0.xlsx', index=False)

In [78]:
df_round_0 = pd.read_excel("df_round_0.xlsx")


In [80]:
row1 = df_round_0[(df_round_0['id1'] == 'B0083004p') & (df_round_0['id2'] == 'B0132002p')]

df_round_0_test = df_round_0.sample(n=5, random_state= 98)
df_round_0 = df_round_0_test
df_round_0 = pd.concat([df_round_0, row1], ignore_index=True)
df_round_0.head()

Unnamed: 0,id1,id2,score,text1,text2
0,B0674004p,S0051696006p,0.738866,Maintaining order and justice is essential for...,Maintaining democratic oversight is crucial to...
1,B0083004p,B0132002p,0.882592,The legitimacy of governance is rooted in the ...,The essence of legitimate governance lies in t...
2,B0448006p,B1089003p,0.763192,The emphasis on a singular sovereign power in ...,The accountability of the sovereign to God emp...
3,B0875001sc,B1099003sc,0.75528,The emperor's authority in temporal matters co...,Christian kings are accountable solely to divi...
4,B0427001sc,B0596001sc,0.765492,Popular Estates are essential for a just monar...,Lawful political authority is essential for so...


In [None]:
import os, sys

verification = input(f"Do you want to proceed with {len(df_round_0)} LLM calls on this sample? (y/n) ")
if verification.lower() != 'y':
    raise KeyError("LLM calls aborted by user")

# --- Set DeepSeek API key (always overwrite to ensure correct key is used) ---
os.environ["DEEPSEEK_API_KEY"] = "sk-"

# Format df_round_0 to match evaluator's expected schema
df_llm_input = df_round_0.rename(columns={'score': 'entailment_probability'})
df_llm_input = fea.format_df_to_llm(df_llm_input)
print(f"Formatted columns: {list(df_llm_input.columns)}")

# Save formatted CSV for the evaluator
temp_dir = "fea_iterations/temp_data"
os.makedirs(temp_dir, exist_ok=True)
input_csv = os.path.join(temp_dir, "df_round_0.csv")
df_llm_input.to_csv(input_csv, index=False)
print(f"Saved df_llm_input ({len(df_llm_input)} rows) to {input_csv}")

# Add llm_calls to sys.path so we can import the evaluator directly
llm_calls_dir = os.path.join(os.getcwd(), "llm_calls")
if llm_calls_dir not in sys.path:
    sys.path.insert(0, llm_calls_dir)

import importlib
import deepseek_evaluator as etb
importlib.reload(etb)
from llm_calls.prompts import *

# Configure arguments as if called from CLI
output_base = "labeled_pairs/Results_DS_BtoS_iteration_0_one_way"  # evaluator appends .csv
sys.argv = [
    "deepseek_evaluator.py",
    "--model", "deepseek-reasoner",
    "--file", input_csv,
    "--external", "ArgLevel_ClauseIds_df.xlsx",
    "--prompt", "test_prompt_tot_json2",
    "--output", output_base,
]

print(f"Running evaluator with {len(df_llm_input)} pairs...")
etb.main()
print("✓ Evaluator complete")


Formatted columns: ['sentence_id_2', 'sentence_id_1', 'sentence_text_2', 'argument_id_2', 'sentence_text_1', 'argument_id_1', 'score']
Saved df_llm_input (6 rows) to fea_iterations/temp_data\df_round_0.csv
Running evaluator with 6 pairs...
Loading data from fea_iterations/temp_data\df_round_0.csv...
Loading data from ArgLevel_ClauseIds_df.xlsx...
Loaded 6 sentence pairs
Using model: deepseek-reasoner
Using prompt type: test_prompt_tot_json2
Running batch evaluation...


100%|██████████| 6/6 [00:00<00:00, 86.64it/s]


[DEBUG] content length: 1009, reasoning_content length: 7575
[DEBUG] content preview: {
  "sentence_id_1": "B0083004p",
  "sentence_id_2": "B0132002p",
  "answers": "YES, YES, YES",
  "reasoning": "1. Logical entailment: If legitimacy is 'rooted in' consent, then consent is a fundamental basis, which aligns with being the 'essence'. Thus, Statement 1 implies Statement 2. 2. Contextua
[DEBUG] reasoning preview: We are given two statements derived from two arguments. The task is to determine if Statement 1 entails Statement 2. That is, if Statement 1 is true, does it logically imply that Statement 2 is true? We are to consider the context of the arguments and the authors' perspectives, but we are assessing 
Saving progress at batch 1...
Saving progress at batch 2...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_0_one_way_progress_batch_1.csv
Saving progress at batch 3...
Deleting previous file: labeled_pairs/Results_DS_BtoS_iteration_0_one_way_progress_batch_2.csv
Savin

In [83]:
importlib.reload(fea)

# Read one-way LLM results
df_one_way = pd.read_csv("labeled_pairs/Results_DS_BtoS_iteration_0_one_way.csv")
print(f"One-way results: {len(df_one_way)} rows")
df_one_way.head()

One-way results: 6 rows


Unnamed: 0,sentence_id_1,sentence_id_2,answers_12,reasonings_12,comment_12,llm_confidence_12,llm_conclusion_12
0,B0083004p,B0132002p,"YES, YES, YES",1. Logical entailment: If legitimacy is 'roote...,I am completely sure because Statement 1's phr...,4,YES
1,B0083004p,B0132002p,"YES, YES, YES",1. Logical analysis: Statement 1 asserts legit...,The statements are semantically and contextual...,4,YES
2,B0448006p,B1089003p,"YES, NO, NO",1. YES: Both statements reflect a concern with...,While both statements deal with sovereignty an...,3,NO
3,B0674004p,S0051696006p,"NO, NO, NO",1. Statement 1 is derived from a 16th-century ...,The two statements are conceptually distinct a...,4,NO
4,B0427001sc,B0596001sc,"NO, NO, NO",1. Statement 1 is a specific claim about the n...,The statements are logically independent; enta...,4,NO


In [None]:
# Process one-way results into bidirectional verdicts:
# 1) All pairs sent to LLM are recorded in llm_labeled_pairs.csv
# 2) Pairs where both (A,B) and (B,A) exist → immediate verdict
# 3) Pairs where (A,B) is YES but (B,A) is missing → send reverse to LLM
# 4) All results saved to Results_DS_BtoS_iteration_0.csv

verification = input(f"Do you want to proceed with at most {len(df_one_way)} LLM calls on this sample? (y/n) ")
if verification.lower() != 'y':
    raise KeyError("LLM calls aborted by user")

api_key = os.environ.get("DEEPSEEK_API_KEY", "sk-")

# Build df_clause for text lookup (needed for reverse-pair formatting)
df_clause = pd.concat([df_p, df_sc]).drop_duplicates(subset='sentence_id')

df_results_iter0 = fea.process_llm_results_bidirectional(
    df_one_way=df_one_way,
    df_clause=df_clause,
    results_output_path="labeled_pairs/Results_DS_BtoS_iteration_0.csv",
    model="deepseek-reasoner",
    prompt_type="test_prompt_tot_json2",
    args_file="ArgLevel_ClauseIds_df.xlsx",
    output_dir="labeled_pairs",
    batch_label="reverse_iter_0",
    deepseek_api_key=api_key,
    max_reverse_pairs=100_000,
)

print(f"\nFinal Results_DS_BtoS_iteration_0: {len(df_results_iter0)} rows")
df_results_iter0.head()

✓ LLM labeled pairs updated: 5 total in labeled_pairs/llm_labeled_pairs.csv

ONE-WAY RESULTS PROCESSING
Total input pairs: 6
Resolved rows: 8 (YES=0, NO=8)
  ↳ Inferred reverse NO (money saved): 4
Need reverse LLM call: 1

✓ LLM labeled pairs updated: 9 total in labeled_pairs/llm_labeled_pairs.csv
✓ Recorded 4 inferred reverse-NO pairs in labeled_pairs/llm_labeled_pairs.csv

SENDING 1 REVERSE PAIRS TO LLM
Model: deepseek-reasoner
Input: labeled_pairs\reverse_iter_0_input.csv
Output: labeled_pairs\reverse_iter_0_output.csv
Loading data from labeled_pairs\reverse_iter_0_input.csv...
Loading data from ArgLevel_ClauseIds_df.xlsx...
Loaded 1 sentence pairs
Using model: deepseek-reasoner
Using prompt type: test_prompt_tot_json2
Running batch evaluation...


100%|██████████| 1/1 [00:00<00:00, 91.90it/s]


[DEBUG] content length: 1297, reasoning_content length: 13639
[DEBUG] content preview: {
  "sentence_id_1": "B0132002p",
  "sentence_id_2": "B0083004p",
  "answers": "YES, YES, YES",
  "reasoning": "1. In the context of Sidney's arguments, consent is explicitly linked with the right to establish laws and hold leaders accountable, as seen in Argument 2 where he states that legitimacy i
[DEBUG] reasoning preview: We are given two arguments from the same book and author, Algernon Sidney's "Discourses Concerning Government" (1698). We need to determine if Statement 1 (from Argument 1) entails Statement 2 (from Argument 2). The statements are:

Statement 1: "The essence of legitimate governance lies in the cons
Saving progress at batch 1...
Saved results to labeled_pairs\reverse_iter_0_output.csv
✓ Reverse LLM evaluation complete
✓ LLM labeled pairs updated: 10 total in labeled_pairs/llm_labeled_pairs.csv

REVERSE RESULTS SUMMARY
Total reverse pairs resolved: 2
  Bidirectional YES: 2
  Bidi

Unnamed: 0,sentence_id_1,sentence_id_2,answers_12,reasonings_12,comment_12,llm_confidence_12,llm_conclusion_12,verdict
0,B0448006p,B1089003p,"YES, NO, NO",1. YES: Both statements reflect a concern with...,While both statements deal with sovereignty an...,3,NO,NO
1,B1089003p,B0448006p,"YES, NO, NO",1. YES: Both statements reflect a concern with...,While both statements deal with sovereignty an...,3,NO (inferred),NO
2,B0674004p,S0051696006p,"NO, NO, NO",1. Statement 1 is derived from a 16th-century ...,The two statements are conceptually distinct a...,4,NO,NO
3,S0051696006p,B0674004p,"NO, NO, NO",1. Statement 1 is derived from a 16th-century ...,The two statements are conceptually distinct a...,4,NO (inferred),NO
4,B0427001sc,B0596001sc,"NO, NO, NO",1. Statement 1 is a specific claim about the n...,The statements are logically independent; enta...,4,NO,NO


# Threshold Prediction (tau) for Results_DS_BtoS_iteration_0

Uses the labeled data from iteration 0 to train a model and find optimal thresholds,
replicating the approach from FreeEntailmentAlgorithm.ipynb.

In [None]:
importlib.reload(fea)

# Load the bidirectional results
df_iter0 = pd.read_csv("labeled_pairs/Results_DS_BtoS_iteration_0.csv")

# Build df_clause for merging
df_clause = pd.concat([df_p, df_sc]).drop_duplicates(subset='sentence_id')

# Step 1: Create labeled df with verdict (already has verdict from process_llm_results_bidirectional)
df_labeled = fea.merge_pairwise_texts(
    df1=df_clause,
    df2=df_iter0,
    df1_cols=['sentence_id', 'sentence'],
    df2_cols=['sentence_id_1', 'sentence_id_2', 'verdict']
)

# Step 2: Get entailed pairs and build equivalence classes
df_obs_ent = df_labeled.loc[df_labeled['verdict'] == 'YES']

df_labeled = fea.add_equivalents_from_pairs(
    df3=df_obs_ent, df4=df_labeled,
    df3_cols=["id1", "id2"], df4_cols=["id1", "id2"],
    new_cols=("equivalents1", "equivalents2"), include_self=False,
)
df_labeled = fea.add_alpha_weight_column(
    df=df_labeled, list_col1='equivalents1', list_col2='equivalents2', new_col="alpha"
)

# Step 3: Build crossed pairs
df_labeled_crossed = fea.build_equiv_pair_candidates(
    df=df_labeled, id1_col="id1", id2_col="id2",
    equiv1_col="equivalents1", equiv2_col="equivalents2",
)
df_labeled_crossed = fea.merge_pairwise_texts(
    df1=df_clause, df2=df_labeled_crossed,
    df1_cols=['sentence_id', 'sentence'], df2_cols=['id1', 'id2']
)

print(f"Labeled: {len(df_labeled)}, Entailed: {len(df_obs_ent)}, Crossed: {len(df_labeled_crossed)}")

In [None]:
# Step 4: Compute cosine similarity features
df_labeled = fea.generate_new_bert_results(
    df_labeled,
    text_col1='text1', text_col2='text2',
    model_path='./fine_tuned_bi_model',
    new_col='new_cos_sim_score',
    embedding_cache=embedding_cache_finetuned,
    id_col1='id1', id_col2='id2'
)

df_labeled_crossed = fea.generate_new_bert_results(
    df_labeled_crossed,
    text_col1='text1', text_col2='text2',
    model_path='./fine_tuned_bi_model',
    new_col='cosine_sim',
    embedding_cache=embedding_cache_finetuned,
    id_col1='id1', id_col2='id2'
)

# Step 5: Compute neighbor-weighted score
df_labeled = fea.compute_neighbor_weighted_score(
    df5=df_labeled_crossed, df6=df_labeled,
    id1_col="id1", id2_col="id2",
    cosim_df5_col="cosine_sim", cosim_df6_col="new_cos_sim_score",
    alpha_col="alpha", eq1_col="equivalents1", eq2_col="equivalents2",
    new_col="cos_sim_neighbor_score",
)

# Step 6: Graph features
df_labeled = fea.add_graph_features(
    df=df_labeled, entailment_df=df_obs_ent,
    id1_col="id1", id2_col="id2", verdict_col="verdict",
    positive_label="YES", decay=0.9, max_hops=5
)
df_labeled = df_labeled.dropna(subset=['cos_sim_neighbor_score'])

print(f"Labeled with features: {len(df_labeled)} rows")
df_labeled.head()

In [None]:
# Step 7: Train model and predict entailment probabilities
features = ['cos_sim_neighbor_score']
target = 'verdict'
positive_label = 'YES'

print(f"Training on {len(df_labeled)} labeled pairs...")

# Optimize hyperparameters
try:
    best_params = fea.optimize_boosting_hyperparameters(
        df=df_labeled, feature_cols=features, target_col=target,
        positive_label=positive_label, n_trials=30
    )
    if best_params:
        best_params['enforce_monotonicity'] = True
except Exception as e:
    print(f"Optimization skipped: {e}")
    best_params = {'learning_rate': 0.05, 'max_iter': 300, 'enforce_monotonicity': True}

# Compare models
comparison_df, best_model_name = fea.compare_entailment_models(
    df=df_labeled, feature_cols=features, target_col=target,
    model_names=["logistic", "spline", "tree", "boosting"],
    positive_label=positive_label, **best_params
)
print(f"\nSelected model: {best_model_name}")
display(comparison_df)

In [None]:
# Step 8: Find optimal threshold (tau)
best_pipeline = fea.train_entailment_model(
    df=df_labeled, feature_cols=features, target_col=target,
    method=best_model_name, positive_label=positive_label, **best_params
)

df_labeled_with_probs = fea.predict_entailment_probabilities(
    df_labeled, model_pipeline=best_pipeline,
    feature_cols=features, new_col='entailment_probability'
)

# Find best thresholds using labeled data
results = fea.find_best_thresholds(
    df=df_labeled_with_probs,
    score_col="entailment_probability",
    verdict_col="verdict",
    positive_label="YES"
)

print("Best tau (accuracy):", results["best_tau_accuracy"], "Accuracy:", results["best_accuracy"])
print("Best tau (F1):", results["best_tau_f1"], "F1:", results["best_f1"])
print("Best tau (precision):", results["best_tau_precision"], "Prec:", results["best_precision"])
print("Best tau (recall):", results["best_tau_recall"], "Rec:", results["best_recall"])

display(results["best_taus_table"])

In [None]:
# Step 9: Minimize FN threshold (cost-sensitive)
tau = fea.get_optimal_threshold_minimize_fn(strategy='cost', cost_fn=5.0)
print(f"Selected threshold tau = {tau:.4f}")
print(f"Logic: Send pairs with score > {tau:.4f} to LLM. Auto-reject scores ≤ {tau:.4f}.")

# Save threshold for use in FEA_Loop
import json
threshold_info = {
    'tau': tau,
    'best_tau_accuracy': results['best_tau_accuracy'],
    'best_tau_f1': results['best_tau_f1'],
    'best_model': best_model_name,
}
with open("labeled_pairs/threshold_info_iter0.json", 'w') as f:
    json.dump(threshold_info, f, indent=2)
print(f"✓ Saved threshold info to labeled_pairs/threshold_info_iter0.json")