# Set up 

## Import libraries 

In [1]:
# ALTERED: Removed STANDALONE_TEST feature
# ALTERED: Removed total_cost parameter (cost calculated in FEA_Loop only)
# Parameters - papermill will inject these values
# Tag this cell with "parameters" in the notebook

df_candidates_path = ""
df_crossed_path = ""
df_labeled_path = ""
df_labeled_crossed_path = ""
df_obs_ent_path = ""
df_clause_path = ""
embedding_cache_path = ""

In [2]:
# Parameters
df_candidates_path = "fea_iterations/temp_data/df_candidates.pkl"
df_crossed_path = "fea_iterations/temp_data/df_crossed.pkl"
df_labeled_path = "fea_iterations/temp_data/df_labeled.pkl"
df_labeled_crossed_path = "fea_iterations/temp_data/df_labeled_crossed.pkl"
df_obs_ent_path = "fea_iterations/temp_data/df_obs_ent.pkl"
df_clause_path = "fea_iterations/temp_data/df_clause.pkl"
embedding_cache_path = "fea_iterations/temp_data/embedding_cache.pkl"


In [3]:

import pandas as pd
import pickle
import importlib
import numpy as np
import sys
import os

current_dir = os.getcwd()
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)
    print(f"Added current directory to sys.path: {current_dir}")
import free_entailments_algorithm_utils as fea
import scrapbook as sb

print(f"Loading data from pickle files...")
print(f"  Loading from: {df_candidates_path}")

df_candidates = pd.read_pickle(df_candidates_path)
df_crossed = pd.read_pickle(df_crossed_path)
df_labeled = pd.read_pickle(df_labeled_path)
df_labeled_crossed = pd.read_pickle(df_labeled_crossed_path)
df_obs_ent = pd.read_pickle(df_obs_ent_path)
df_clause = pd.read_pickle(df_clause_path)

with open(embedding_cache_path, 'rb') as f:
    embedding_cache_finetuned = pickle.load(f)

print(f"✓ Successfully loaded all data:")
print(f"  - df_candidates: {len(df_candidates)} rows")
print(f"  - df_crossed: {len(df_crossed)} rows")
print(f"  - df_labeled: {len(df_labeled)} rows")
print(f"  - df_labeled_crossed: {len(df_labeled_crossed)} rows")
print(f"  - df_obs_ent: {len(df_obs_ent)} rows")
print(f"  - df_clause: {len(df_clause)} rows")
print(f"  - embedding_cache: {len(embedding_cache_finetuned)} entries")

Added current directory to sys.path: c:\Users\aesteva\Documents\GitHub\fea_project


Loading data from pickle files...
  Loading from: fea_iterations/temp_data/df_candidates.pkl


✓ Successfully loaded all data:
  - df_candidates: 999 rows
  - df_crossed: 1 rows
  - df_labeled: 7 rows
  - df_labeled_crossed: 2 rows
  - df_obs_ent: 1 rows
  - df_clause: 63909 rows
  - embedding_cache: 63909 entries


In [4]:
importlib.reload(fea)

<module 'free_entailments_algorithm_utils' from 'c:\\Users\\aesteva\\Documents\\GitHub\\fea_project\\free_entailments_algorithm_utils.py'>

# Calculate Similarity On LLM Results

In [5]:
# OPTIMIZED: Using fine-tuned embeddings from cache (instant!)
# Old way: Re-encoded all texts (~3-5 minutes)
# New way: Lookup from cache (< 1 second)

df_candidates_with_scores = fea.generate_new_bert_results(
    df_candidates,
    text_col1='text1',
    text_col2='text2',
    model_path="./fine_tuned_bi_model",
    new_col="new_cos_sim_score",
    # NEW PARAMETERS: Use the fine-tuned embedding cache!
    embedding_cache=embedding_cache_finetuned,
    id_col1='id1',
    id_col2='id2'
)
df_candidates_with_scores.head()

Using pre-computed embeddings from cache...


  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,text1,text2,verdict,equivalents1,equivalents2,alpha,new_cos_sim_score
0,B0628003p,B1143008p,The treasonous plot specifically involved an a...,This structure enhances the likelihood of wise...,,[],[],,0.353027
1,B0423006p,B0530003p,Magistrates are bound to obey the sovereign's ...,Usurpation is defined as a domestic conquest w...,,[],[],,0.527832
2,B0934002p,B1065002p,"Historically, the governance of the Kingdom wa...",Civil law is defined as the rules that the Com...,,[],[],,0.554199
3,B0716001p,B1125006p,The promise of obedience and compliance is ess...,"True peace is a harmonious society, not merely...",,[],[],,0.556152
4,B0560009p,B1204002p,The papal authority challenged the sanctity of...,The position of rulers is described as a deleg...,,[],[],,0.630371


In [6]:
df_labeled = fea.generate_new_bert_results(
    df_labeled,
    text_col1='text1',
    text_col2='text2',
    model_path="./fine_tuned_bi_model",
    new_col="new_cos_sim_score",
    embedding_cache=embedding_cache_finetuned,
    id_col1='id1',
    id_col2='id2'
)

df_labeled.head()

Using pre-computed embeddings from cache...


  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,text1,text2,verdict,equivalents1,equivalents2,alpha,new_cos_sim_score
0,B0674004p,S0051696006p,Maintaining order and justice is essential for...,Maintaining democratic oversight is crucial to...,NO,[],[],,0.73877
1,B1135002sc,S0018445003sc,Effective monarchy governance requires laws to...,The King must ensure governance,NO,[],[],,0.803223
2,B0448006p,B1089003p,The emphasis on a singular sovereign power in ...,The accountability of the sovereign to God emp...,NO,[],[],,0.763184
3,B0427001sc,B0596001sc,Popular Estates are essential for a just monar...,Lawful political authority is essential for so...,NO,[],[],,0.765625
4,B0083004p,B0132002p,The legitimacy of governance is rooted in the ...,The essence of legitimate governance lies in t...,YES,[B0132002p],[B0083004p],0.5,0.882324


In [7]:
## Takes a few minutes depending on computing power


# df_main = fea.add_cross_encoder_score(
#     df_main,                 
#     text_col1='text1',
#     text_col2='text2',
#     model_name="./fine_tuned_nli_model",  
#     new_col="nli_score",
#     batch_size=128              # Keep batch size lower for Cross-Encoders, go easy on your computer
# )

# df_main.head()
# df_main.shape

In [8]:
df_crossed = fea.add_cosine_similarity_from_text(
    df_crossed,
    text_col1="text1",
    text_col2="text2",
    model_name="./fine_tuned_bi_model",  # Ignored when cache provided
    batch_size=128,
    show_progress_bar=False,  # No need since we're using cache
    # NEW PARAMETERS: Use the fine-tuned embedding cache!
    embedding_cache=embedding_cache_finetuned,
    id_col1='id1',
    id_col2='id2'
)

df_crossed.head()

Using pre-computed embeddings from cache...


  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,text1,text2,verdict,cosine_sim
0,S13558006p,B0132002p,The proposed Bill disregards established legal...,The essence of legitimate governance lies in t...,,0.445068


In [9]:
df_labeled_crossed = fea.add_cosine_similarity_from_text(
    df_labeled_crossed,
    text_col1="text1",
    text_col2="text2",
    model_name="./fine_tuned_bi_model",  # Ignored when cache provided
    batch_size=128,
    show_progress_bar=False,  # No need since we're using cache
    # NEW PARAMETERS: Use the fine-tuned embedding cache!
    embedding_cache=embedding_cache_finetuned,
    id_col1='id1',
    id_col2='id2'
)

df_labeled_crossed.head()

Using pre-computed embeddings from cache...


  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,text1,text2,verdict,cosine_sim
0,B0083004p,B0083004p,The legitimacy of governance is rooted in the ...,The legitimacy of governance is rooted in the ...,,1.0
1,B0132002p,B0132002p,The essence of legitimate governance lies in t...,The essence of legitimate governance lies in t...,,1.0


# Features

## Compute Cos Sim Neighborhood Score

In [10]:
df_candidates = fea.compute_neighbor_weighted_score(
    df5 = df_crossed,
    df6 = df_candidates_with_scores,
    id1_col = "id1",
    id2_col = "id2",
    cosim_df5_col = "cosine_sim",
    cosim_df6_col = "new_cos_sim_score",
    alpha_col = "alpha",
    eq1_col = "equivalents1",
    eq2_col = "equivalents2",
    new_col = "cos_sim_neighbor_score",
)
df_candidates.head()

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,text1,text2,verdict,equivalents1,equivalents2,alpha,new_cos_sim_score,cos_sim_neighbor_score
0,B0628003p,B1143008p,The treasonous plot specifically involved an a...,This structure enhances the likelihood of wise...,,[],[],,0.353027,0.353027
1,B0423006p,B0530003p,Magistrates are bound to obey the sovereign's ...,Usurpation is defined as a domestic conquest w...,,[],[],,0.527832,0.527832
2,B0934002p,B1065002p,"Historically, the governance of the Kingdom wa...",Civil law is defined as the rules that the Com...,,[],[],,0.554199,0.554199
3,B0716001p,B1125006p,The promise of obedience and compliance is ess...,"True peace is a harmonious society, not merely...",,[],[],,0.556152,0.556152
4,B0560009p,B1204002p,The papal authority challenged the sanctity of...,The position of rulers is described as a deleg...,,[],[],,0.630371,0.630371


In [11]:
df_labeled = fea.compute_neighbor_weighted_score(
    df5 = df_labeled_crossed,
    df6 = df_labeled,
    id1_col = "id1",
    id2_col = "id2",
    cosim_df5_col = "cosine_sim",
    cosim_df6_col = "new_cos_sim_score",
    alpha_col = "alpha",
    eq1_col = "equivalents1",
    eq2_col = "equivalents2",
    new_col = "cos_sim_neighbor_score",
)
df_labeled.head()

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,text1,text2,verdict,equivalents1,equivalents2,alpha,new_cos_sim_score,cos_sim_neighbor_score
0,B0674004p,S0051696006p,Maintaining order and justice is essential for...,Maintaining democratic oversight is crucial to...,NO,[],[],,0.73877,0.73877
1,B1135002sc,S0018445003sc,Effective monarchy governance requires laws to...,The King must ensure governance,NO,[],[],,0.803223,0.803223
2,B0448006p,B1089003p,The emphasis on a singular sovereign power in ...,The accountability of the sovereign to God emp...,NO,[],[],,0.763184,0.763184
3,B0427001sc,B0596001sc,Popular Estates are essential for a just monar...,Lawful political authority is essential for so...,NO,[],[],,0.765625,0.765625
4,B0083004p,B0132002p,The legitimacy of governance is rooted in the ...,The essence of legitimate governance lies in t...,YES,[B0132002p],[B0083004p],0.5,0.882324,0.882324


In [12]:

df_candidates.shape, df_labeled.shape

((999, 10), (7, 10))

## Compute NLI Score 

In [13]:
# df_crossed = fea.add_cross_encoder_score(
#     df_crossed,
#     text_col1="text1",
#     text_col2="text2",
    
#     # FIX: Use the relative path with ./ just like before
#     model_name="./fine_tuned_nli_model", 
#     batch_size=128,
#     new_col="nli_score" 
# )

# # (add_cross_encoder_score adds the column in-place)
# if "nli_score" not in df_candidates.columns:
#     print("Scores added to df_crossed!")
    
# df_crossed.head()

# # We reuse the same function used for Cosine Similarity, but point to NLI columns.
# df_candidates = fea.compute_neighbor_weighted_score(
#     df5=df_crossed,
#     df6=df_candidates,
#     id1_col="id1",
#     id2_col="id2",
#     cosim_df5_col="nli_score",    # The 'sigma' lookup table uses NLI
#     cosim_df6_col="nli_score",    # The 'sigma_ij' value uses NLI
#     alpha_col="alpha",
#     eq1_col="equivalents1",
#     eq2_col="equivalents2",
#     new_col="nli_neighbor_score"
# )

# cols_to_show = ['text1', 'text2', 'nli_score', 'nli_neighbor_score', 'verdict']
# df_candidates[cols_to_show].head()

## Compute Transitivity Score

In [14]:
# We calculate graph-based scores (Transitivity).
# 'graph_entailment_score': A -> B (Path Decay applied)
# 'graph_equivalence_score': A <-> B (Stronger constraint)

df_candidates = fea.add_graph_features(
    df=df_candidates,
    entailment_df=df_obs_ent,
    id1_col="id1",
    id2_col="id2",
    verdict_col="verdict",
    positive_label="YES",
    decay=0.9,  # Confidence drops by 10% per extra hop
    max_hops=5
)

print("\nGraph Score Stats:")
print(df_candidates[['graph_entailment_score', 'graph_equivalence_score']].describe())

Building Directed Entailment Graph...
Computing graph features for 999 pairs...

Graph Score Stats:
       graph_entailment_score  graph_equivalence_score
count                   999.0                    999.0
mean                      0.0                      0.0
std                       0.0                      0.0
min                       0.0                      0.0
25%                       0.0                      0.0
50%                       0.0                      0.0
75%                       0.0                      0.0
max                       0.0                      0.0


In [15]:
df_labeled = fea.add_graph_features(
    df=df_labeled,
    entailment_df=df_obs_ent,
    id1_col="id1",
    id2_col="id2",
    verdict_col="verdict",
    positive_label="YES",
    decay=0.9,  # Confidence drops by 10% per extra hop
    max_hops=5
)

print("\nGraph Score Stats:")
print(df_candidates[['graph_entailment_score', 'graph_equivalence_score']].describe())

Building Directed Entailment Graph...
Computing graph features for 7 pairs...

Graph Score Stats:
       graph_entailment_score  graph_equivalence_score
count                   999.0                    999.0
mean                      0.0                      0.0
std                       0.0                      0.0
min                       0.0                      0.0
25%                       0.0                      0.0
50%                       0.0                      0.0
75%                       0.0                      0.0
max                       0.0                      0.0


In [16]:
df_candidates = df_candidates.dropna(subset=['cos_sim_neighbor_score'])
df_labeled = df_labeled.dropna(subset=['cos_sim_neighbor_score'])

print(f"After dropna: {len(df_candidates)} candidates, {len(df_labeled)} labeled")
if len(df_labeled) == 0:
    print("WARNING: No labeled rows with valid scores — model training will be skipped upstream.")

After dropna: 999 candidates, 7 labeled


# Predicting Entailment (can change model pipeline to something other than logistic)

In [17]:
# Feature Engineering & Model Training
# Updated features list to include robust graph metrics
features = [
    'cos_sim_neighbor_score', 
    #'nli_neighbor_score', 
    #'graph_entailment_score', 
    #'graph_equivalence_score'
]
target = 'verdict'
positive_label = 'YES'

# Generate BERT results for df_labeled


print(f"Training dataset: {len(df_labeled)} pairs with features and verdicts")
print(f"Prediction dataset: {len(df_candidates)} pairs with features (no verdicts)")

# 2. (Optional) Run Hyperparameter Optimization with Optuna
try:
    print("\n>>> Optimizing Boosting Hyperparameters with Optuna...")
    # This might take a minute but will find scientifically best parameters
    best_params = fea.optimize_boosting_hyperparameters(
        df=df_labeled,  # TRAIN ON LABELED DATA!
        feature_cols=features,
        target_col=target,
        positive_label=positive_label,
        n_trials=30 
    )
    
    # Add monotonic constraint assumption back if we believe in it
    # (Optuna doesn't optimize this structure, it optimizes numbers)
    if best_params:
        best_params['enforce_monotonicity'] = True 
        
except Exception as e:
    print(f"\nOptimization skipped or failed: {e}")
    print("Using conservative defaults.")
    best_params = {
        'learning_rate': 0.05,
        'max_iter': 300,
        'enforce_monotonicity': True
    }


# 3. Run Comparative Analysis using helper function
# Updated: Now uses Cross-Validation internally to prevent Overfitting!
comparison_df, best_model_name = fea.compare_entailment_models(
    df=df_labeled,  # TRAIN ON LABELED DATA!
    feature_cols=features,
    target_col=target,
    model_names=["logistic", "spline", "tree", "boosting"], 
    positive_label=positive_label,
    **best_params # Unpack the best parameters here
)

# --- Display Results ---
print("\nComparison Results (Sorted by ROC-AUC):")
display(comparison_df)

print(f"\n>>> Selected '{best_model_name}' model for downstream processing.")

Training dataset: 7 pairs with features and verdicts
Prediction dataset: 999 pairs with features (no verdicts)

>>> Optimizing Boosting Hyperparameters with Optuna...
Optuna not installed. Please run: pip install optuna
Running comparative analysis on 7 samples...
Features: ['cos_sim_neighbor_score']

--- Training logistic ---
Training Logistic Regression...
Model (logistic) Train Accuracy: 0.8571
  ⚠ Too few samples for CV (7 total, min class=1). Using train predictions.


--- Training spline ---
Training Spline Logistic Regression...
Model (spline) Train Accuracy: 1.0000
  ⚠ Too few samples for CV (7 total, min class=1). Using train predictions.
--- Training tree ---
Training Decision Tree Classifier...
Model (tree) Train Accuracy: 1.0000
  ⚠ Too few samples for CV (7 total, min class=1). Using train predictions.
--- Training boosting ---
Training Histogram Gradient Boosting Classifier (lr=0.05, iter=200)...


Model (boosting) Train Accuracy: 0.8571
  ⚠ Too few samples for CV (7 total, min class=1). Using train predictions.

Comparison Results (Sorted by ROC-AUC):


Unnamed: 0_level_0,ROC-AUC (CV),Log Loss,Separation,Mean Prob (YES),Mean Prob (NO)
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
logistic,1.0,0.3178057,0.591827,0.875899,0.284073
spline,1.0,0.4723039,0.26,0.638514,0.378514
tree,1.0,2.220446e-16,1.0,1.0,0.0
boosting,0.5,0.6931472,0.0,0.5,0.5



>>> Selected 'logistic' model for downstream processing.


In [18]:
print(f"Retraining '{best_model_name}' with optimized parameters on labeled data...")

# Train the model on df_labeled_with_features (which has verdicts)
best_pipeline = fea.train_entailment_model(
    df=df_labeled,  # TRAIN ON LABELED DATA!
    feature_cols=features,
    target_col=target,
    method=best_model_name,
    positive_label=positive_label,
    **best_params
)


df_candidates = fea.predict_entailment_probabilities(
    df_candidates,
    model_pipeline=best_pipeline,
    feature_cols=features,
    new_col='entailment_probability'
)

print(f"\nPrediction stats:")
print(f"  Min probability: {df_candidates['entailment_probability'].min():.4f}")
print(f"  Max probability: {df_candidates['entailment_probability'].max():.4f}")
print(f"  Mean probability: {df_candidates['entailment_probability'].mean():.4f}")

df_labeled.head()

Retraining 'logistic' with optimized parameters on labeled data...
Training Logistic Regression...
Model (logistic) Train Accuracy: 0.8571

Prediction stats:
  Min probability: 0.0000
  Max probability: 0.1675
  Mean probability: 0.0063


  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,text1,text2,verdict,equivalents1,equivalents2,alpha,new_cos_sim_score,cos_sim_neighbor_score,graph_entailment_score,graph_equivalence_score
0,B0674004p,S0051696006p,Maintaining order and justice is essential for...,Maintaining democratic oversight is crucial to...,NO,[],[],,0.73877,0.73877,0.0,0.0
1,B1135002sc,S0018445003sc,Effective monarchy governance requires laws to...,The King must ensure governance,NO,[],[],,0.803223,0.803223,0.0,0.0
2,B0448006p,B1089003p,The emphasis on a singular sovereign power in ...,The accountability of the sovereign to God emp...,NO,[],[],,0.763184,0.763184,0.0,0.0
3,B0427001sc,B0596001sc,Popular Estates are essential for a just monar...,Lawful political authority is essential for so...,NO,[],[],,0.765625,0.765625,0.0,0.0
4,B0083004p,B0132002p,The legitimacy of governance is rooted in the ...,The essence of legitimate governance lies in t...,YES,[B0132002p],[B0083004p],0.5,0.882324,0.882324,1.0,0.0


# Optimize threshold 

In [19]:
# Predict on labeled data to find optimal thresholds
print("Predicting on labeled data for threshold optimization...")
df_labeled_with_features = fea.predict_entailment_probabilities(
    df_labeled,
    model_pipeline=best_pipeline,
    feature_cols=features,
    new_col='entailment_probability'
)

# We use the generic 'entailment_probability' column which now holds the best model's output
importlib.reload(fea)

results = fea.find_best_thresholds(
    df=df_labeled_with_features,  # USE LABELED DATA FOR THRESHOLD TUNING!
    score_col="entailment_probability",
    verdict_col="verdict",
    positive_label="YES"
)

df_labeled_with_features.head()

Predicting on labeled data for threshold optimization...


  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,text1,text2,verdict,equivalents1,equivalents2,alpha,new_cos_sim_score,cos_sim_neighbor_score,graph_entailment_score,graph_equivalence_score,entailment_probability
0,B0674004p,S0051696006p,Maintaining order and justice is essential for...,Maintaining democratic oversight is crucial to...,NO,[],[],,0.73877,0.73877,0.0,0.0,0.182808
1,B1135002sc,S0018445003sc,Effective monarchy governance requires laws to...,The King must ensure governance,NO,[],[],,0.803223,0.803223,0.0,0.0,0.513062
2,B0448006p,B1089003p,The emphasis on a singular sovereign power in ...,The accountability of the sovereign to God emp...,NO,[],[],,0.763184,0.763184,0.0,0.0,0.286912
3,B0427001sc,B0596001sc,Popular Estates are essential for a just monar...,Lawful political authority is essential for so...,NO,[],[],,0.765625,0.765625,0.0,0.0,0.29907
4,B0083004p,B0132002p,The legitimacy of governance is rooted in the ...,The essence of legitimate governance lies in t...,YES,[B0132002p],[B0083004p],0.5,0.882324,0.882324,1.0,0.0,0.875899


In [20]:
print("Best tau (accuracy):", results["best_tau_accuracy"],
      "Accuracy:", results["best_accuracy"])

print("Best tau (F1):", results["best_tau_f1"],
      "F1:", results["best_f1"])

print("Best tau (TP):", results["best_tau_tp"],
      "TP:", results["max_true_positives"])

print("Best tau (precision):", results["best_tau_precision"],
      "prec:", results["best_precision"])

print("Best tau (recall):",    results["best_tau_recall"],
      "rec:",  results["best_recall"])

Best tau (accuracy): 0.5130621638053446 Accuracy: 1.0
Best tau (F1): 0.5130621638053446 F1: 1.0
Best tau (TP): 0.17251944134900796 TP: 1
Best tau (precision): 0.5130621638053446 prec: 1.0
Best tau (recall): 0.17251944134900796 rec: 1.0


In [21]:
results["best_taus_table"]

Unnamed: 0,tau,TP,TN,FP,FN,accuracy,precision,recall,f1
0,0.172519,1,1,5,0,0.285714,0.166667,1.0,0.285714
1,0.513062,1,6,0,0,1.0,1.0,1.0,1.0


In [22]:
import importlib
import plotly.io as pio
import free_entailments_algorithm_utils as fea
importlib.reload(fea) 

# Ensure Plotly renders appropriately for notebook/vscode context
pio.renderers.default = "notebook_connected" 

# This also calculates 'best_tau_low_send' (Top 1-5% Candidates)
# Use labeled data (with verdicts) for threshold finding!

results = fea.find_best_thresholds(
    df=df_labeled_with_features,  # FIX: Use labeled data, not candidates!
    score_col="entailment_probability", 
    verdict_col="verdict",
    positive_label="YES"
)


tau_low_send = results.get('best_tau_low_send', 0.95)

print("\nLow-Send Optimization (Candidate Selection):")
print(f"Selected Low-Send Threshold: {tau_low_send:.6f}")
if "low_send_table" in results:
    display(results["low_send_table"])

print("\n>>> Interactive Analysis: LLM Savings vs Threshold")

markers_to_show = {
    "Optimization (Top %)":    tau_low_send,
    "Max Accuracy":            results["best_tau_accuracy"],
    "Max F1":                  results["best_tau_f1"]
}


# Sent = Prob > Threshold
fig = fea.plot_llm_savings_over_thresholds(
    df=df_labeled_with_features,  # FIX: Use labeled data for plotting too!
    prob_col="entailment_probability",
    verdict_col="verdict",
    positive_label="YES",
    step=0.01,
    markers=markers_to_show
)
fig.show()


Low-Send Optimization (Candidate Selection):
Selected Low-Send Threshold: 0.854129


Unnamed: 0,target_percentile,tau,sent_rate,FN,TP,FP,TN
0,0.01,0.854129,0.142857,0,1,0,6
1,0.02,0.832359,0.142857,0,1,0,6
2,0.03,0.810589,0.142857,0,1,0,6
3,0.04,0.788819,0.142857,0,1,0,6
4,0.05,0.767048,0.142857,0,1,0,6



>>> Interactive Analysis: LLM Savings vs Threshold


In [23]:
import importlib
import free_entailments_algorithm_utils as fea
importlib.reload(fea)

print("--- Defining Threshold for LLM ---")

# Strategy: Send everything above a certain confidence threshold.
# We use the 'Minimize False Negatives' strategy (Cost Sensitive) to find a threshold
# effectively filtering out 'Definite Negatives' while keeping all potential Candidates.
# Cost Ratio 1:5 means we punish missing a Yes (FN) 5x more than sending a useless No (FP).

tau = fea.get_optimal_threshold_minimize_fn(strategy='cost', cost_fn=5.0)

print(f"Selected Threshold: {tau:.4f} (Send if Score > {tau:.4f})")
print(f"Logic: Minimize FN (Don't miss Entailments). Auto-Reject scores <= {tau:.4f}.")

# # 2. Estimate Cost
# print("\n--- Cost Analysis ---")
# cost = fea.estimate_deepseek_cost(
#     df=df_candidates, 
#     prob_col='entailment_probability', 
#     threshold=tau,
#     model="deepseek-reasoner"
# )

# 3. Generate Final DataFrame for the LLM
print("\n--- Generating File ---")
df_final = fea.generate_final_df(
    df=df_candidates, 
    prob_col='entailment_probability', 
    threshold=tau
)
# df_final.to_csv("llm_batch_final.csv", index=False)

--- Defining Threshold for LLM ---
Selected Threshold: 0.1667 (Send if Score > 0.1667)
Logic: Minimize FN (Don't miss Entailments). Auto-Reject scores <= 0.1667.

--- Generating File ---
--- Generating LLM Batch ---
Original Count: 999
Filtered Count: 1 (0.1%)
Condition:      P > 0.1667 (Send High Confidence Pairs)


In [24]:
# Record outputs for papermill using scrapbook
import scrapbook as sb

# Save df_final and cost for this iteration
sb.glue('df_final', df_final)

# Convert Plotly figure to HTML for serialization
fig_html = fig.to_html(include_plotlyjs='cdn')
sb.glue('fig_html', fig_html)

print("\n✓ Outputs recorded for papermill retrieval")


✓ Outputs recorded for papermill retrieval
