# Set up 

## Import libraries 

In [1]:
# ALTERED: Removed STANDALONE_TEST feature
# ALTERED: Removed total_cost parameter (cost calculated in FEA_Loop only)
# Parameters - papermill will inject these values
# Tag this cell with "parameters" in the notebook

df_candidates_path = ""
df_crossed_path = ""
df_labeled_path = ""
df_labeled_crossed_path = ""
df_obs_ent_path = ""
df_clause_path = ""
embedding_cache_path = ""

In [2]:
# Parameters
df_candidates_path = "fea_iterations/temp_data/df_candidates.pkl"
df_crossed_path = "fea_iterations/temp_data/df_crossed.pkl"
df_labeled_path = "fea_iterations/temp_data/df_labeled.pkl"
df_labeled_crossed_path = "fea_iterations/temp_data/df_labeled_crossed.pkl"
df_obs_ent_path = "fea_iterations/temp_data/df_obs_ent.pkl"
df_clause_path = "fea_iterations/temp_data/df_clause.pkl"
embedding_cache_path = "fea_iterations/temp_data/embedding_cache.pkl"


In [3]:
# ALTERED: Removed STANDALONE_TEST feature - always load from pickle files
import pandas as pd
import pickle
import importlib
import numpy as np
import free_entailments_algorithm_utils as fea
import scrapbook as sb

print(f"Loading data from pickle files...")
print(f"  Loading from: {df_candidates_path}")

df_candidates = pd.read_pickle(df_candidates_path)
df_crossed = pd.read_pickle(df_crossed_path)
df_labeled = pd.read_pickle(df_labeled_path)
df_labeled_crossed = pd.read_pickle(df_labeled_crossed_path)
df_obs_ent = pd.read_pickle(df_obs_ent_path)
df_clause = pd.read_pickle(df_clause_path)

with open(embedding_cache_path, 'rb') as f:
    embedding_cache_finetuned = pickle.load(f)

print(f"✓ Successfully loaded all data:")
print(f"  - df_candidates: {len(df_candidates)} rows")
print(f"  - df_crossed: {len(df_crossed)} rows")
print(f"  - df_labeled: {len(df_labeled)} rows")
print(f"  - df_labeled_crossed: {len(df_labeled_crossed)} rows")
print(f"  - df_obs_ent: {len(df_obs_ent)} rows")
print(f"  - df_clause: {len(df_clause)} rows")
print(f"  - embedding_cache: {len(embedding_cache_finetuned)} entries")

Loading data from pickle files...
  Loading from: fea_iterations/temp_data/df_candidates.pkl


✓ Successfully loaded all data:
  - df_candidates: 4818 rows
  - df_crossed: 879 rows
  - df_labeled: 5182 rows
  - df_labeled_crossed: 2427 rows
  - df_obs_ent: 657 rows
  - df_clause: 63909 rows
  - embedding_cache: 63909 entries


In [4]:
importlib.reload(fea)

<module 'free_entailments_algorithm_utils' from '/scratch/midway3/aesteva/fea_project/free_entailments_algorithm_utils.py'>

# Calculate Similarity On LLM Results

In [5]:
# OPTIMIZED: Using fine-tuned embeddings from cache (instant!)
# Old way: Re-encoded all texts (~3-5 minutes)
# New way: Lookup from cache (< 1 second)

df_candidates_with_scores = fea.generate_new_bert_results(
    df_candidates,
    text_col1='text1',
    text_col2='text2',
    model_path="./fine_tuned_bi_model",
    new_col="new_cos_sim_score",
    # NEW PARAMETERS: Use the fine-tuned embedding cache!
    embedding_cache=embedding_cache_finetuned,
    id_col1='id1',
    id_col2='id2'
)
df_candidates_with_scores.head()

Using pre-computed embeddings from cache...


Unnamed: 0,id1,id2,text1,text2,verdict,equivalents1,equivalents2,alpha,new_cos_sim_score
0,B0783006p,S0019961006p,The Parliament holds the power to regulate the...,"By promptly seeking the King's guidance, Parli...",,[S0003551004p],[],1.0,0.780273
1,B1129001sc,S0003329002sc,A monarchical dominion needs a clear hierarchy,to enhance the monarchy-Parliament relationship,,[],[],,0.694824
2,B0287002sc,S0019015004sc,A government must maintain societal integrity,Accountability ensures government integrity,,[],[],,0.862305
3,B1086002sc,S0023795002sc,threatens the stability of a Commonwealth,Preserving Parliament's authority is essential...,,[],[],,0.673828
4,B0336005p,S0016856004p,Such focused discussions would enable the coun...,Established procedures should guide discussion...,,[],[],,0.71582


In [6]:
df_labeled = fea.generate_new_bert_results(
    df_labeled,
    text_col1='text1',
    text_col2='text2',
    model_path="./fine_tuned_bi_model",
    new_col="new_cos_sim_score",
    embedding_cache=embedding_cache_finetuned,
    id_col1='id1',
    id_col2='id2'
)

df_labeled.head()

Using pre-computed embeddings from cache...


Unnamed: 0,id1,id2,text1,text2,verdict,equivalents1,equivalents2,alpha,new_cos_sim_score
0,B0860002sc,S0010771002sc,The king's support must match his responsibili...,clear evidence of acting against the interests...,NO,[],[],,0.695312
1,B1170001sc,S0020225001sc,Active governance by the prince is essential f...,Maintaining respect for the monarchy is essential,NO,[],[],,0.708008
2,B0454001p,S0004868005p,Agrarian laws can effectively prevent the rise...,This situation highlights the tension between ...,NO,[],"[B0203001p, B0278002p, B0314009p]",0.0,0.657715
3,B0227001sc,S0000883002sc,Parliament should hold the power to correct le...,Parliament must uphold the rule of law,YES,"[S0000883002sc, S0019473002sc, S0022873001sc]",[B0227001sc],0.5,0.821777
4,B0580002sc,S0023399001sc,The king's presence is essential for validatin...,Parliament must assert authority,NO,[],[],,0.744629


In [7]:
## Takes a few minutes depending on computing power


# df_main = fea.add_cross_encoder_score(
#     df_main,                 
#     text_col1='text1',
#     text_col2='text2',
#     model_name="./fine_tuned_nli_model",  
#     new_col="nli_score",
#     batch_size=128              # Keep batch size lower for Cross-Encoders, go easy on your computer
# )

# df_main.head()
# df_main.shape

In [8]:
df_crossed = fea.add_cosine_similarity_from_text(
    df_crossed,
    text_col1="text1",
    text_col2="text2",
    model_name="./fine_tuned_bi_model",  # Ignored when cache provided
    batch_size=128,
    show_progress_bar=False,  # No need since we're using cache
    # NEW PARAMETERS: Use the fine-tuned embedding cache!
    embedding_cache=embedding_cache_finetuned,
    id_col1='id1',
    id_col2='id2'
)

df_crossed.head()

Using pre-computed embeddings from cache...


Unnamed: 0,id1,id2,text1,text2,verdict,cosine_sim
0,B0663002p,B0382007p,A king's power and authority are best confirme...,The mutual obligation between the king and the...,,0.716797
1,B0383006p,B0087006p,"If a ruler acts as an enemy to their subjects,...",The overarching control of a single ruler can ...,,0.713379
2,B0383006p,B0260004p,"If a ruler acts as an enemy to their subjects,...",This demonstrates that authority is not an inh...,,0.667969
3,B0711002sc,B0204002sc,Laws establish a structured relationship with ...,Laws protect their rights in the political str...,,0.81543
4,B0711002sc,B0289001sc,Laws establish a structured relationship with ...,The legal system ensures justice through a col...,,0.78125


In [9]:
df_labeled_crossed = fea.add_cosine_similarity_from_text(
    df_labeled_crossed,
    text_col1="text1",
    text_col2="text2",
    model_name="./fine_tuned_bi_model",  # Ignored when cache provided
    batch_size=128,
    show_progress_bar=False,  # No need since we're using cache
    # NEW PARAMETERS: Use the fine-tuned embedding cache!
    embedding_cache=embedding_cache_finetuned,
    id_col1='id1',
    id_col2='id2'
)

df_labeled_crossed.head()

Using pre-computed embeddings from cache...


Unnamed: 0,id1,id2,text1,text2,verdict,cosine_sim
0,B0454001p,B0203001p,Agrarian laws can effectively prevent the rise...,Laws are necessary to limit the power of kings,,0.717773
1,B0454001p,B0278002p,Agrarian laws can effectively prevent the rise...,Allowing a King to have absolute power undermi...,,0.655273
2,B0454001p,B0314009p,Agrarian laws can effectively prevent the rise...,The historical context shows that the struggle...,,0.675293
3,B0227001sc,B0227001sc,Parliament should hold the power to correct le...,Parliament should hold the power to correct le...,,1.000977
4,B0244002sc,B0311001sc,Parliaments in England can create laws indepen...,The king requires parliamentary approval to im...,,0.727539


# Features

## Compute Cos Sim Neighborhood Score

In [10]:
df_candidates = fea.compute_neighbor_weighted_score(
    df5 = df_crossed,
    df6 = df_candidates_with_scores,
    id1_col = "id1",
    id2_col = "id2",
    cosim_df5_col = "cosine_sim",
    cosim_df6_col = "new_cos_sim_score",
    alpha_col = "alpha",
    eq1_col = "equivalents1",
    eq2_col = "equivalents2",
    new_col = "cos_sim_neighbor_score",
)
df_candidates.head()

Unnamed: 0,id1,id2,text1,text2,verdict,equivalents1,equivalents2,alpha,new_cos_sim_score,cos_sim_neighbor_score
0,B0783006p,S0019961006p,The Parliament holds the power to regulate the...,"By promptly seeking the King's guidance, Parli...",,[S0003551004p],[],1.0,0.780273,0.0
1,B1129001sc,S0003329002sc,A monarchical dominion needs a clear hierarchy,to enhance the monarchy-Parliament relationship,,[],[],,0.694824,0.694824
2,B0287002sc,S0019015004sc,A government must maintain societal integrity,Accountability ensures government integrity,,[],[],,0.862305,0.862305
3,B1086002sc,S0023795002sc,threatens the stability of a Commonwealth,Preserving Parliament's authority is essential...,,[],[],,0.673828,0.673828
4,B0336005p,S0016856004p,Such focused discussions would enable the coun...,Established procedures should guide discussion...,,[],[],,0.71582,0.71582


In [11]:
df_labeled = fea.compute_neighbor_weighted_score(
    df5 = df_labeled_crossed,
    df6 = df_labeled,
    id1_col = "id1",
    id2_col = "id2",
    cosim_df5_col = "cosine_sim",
    cosim_df6_col = "new_cos_sim_score",
    alpha_col = "alpha",
    eq1_col = "equivalents1",
    eq2_col = "equivalents2",
    new_col = "cos_sim_neighbor_score",
)
df_labeled.head()

Unnamed: 0,id1,id2,text1,text2,verdict,equivalents1,equivalents2,alpha,new_cos_sim_score,cos_sim_neighbor_score
0,B0860002sc,S0010771002sc,The king's support must match his responsibili...,clear evidence of acting against the interests...,NO,[],[],,0.695312,0.695312
1,B1170001sc,S0020225001sc,Active governance by the prince is essential f...,Maintaining respect for the monarchy is essential,NO,[],[],,0.708008,0.708008
2,B0454001p,S0004868005p,Agrarian laws can effectively prevent the rise...,This situation highlights the tension between ...,NO,[],"[B0203001p, B0278002p, B0314009p]",0.0,0.657715,0.449075
3,B0227001sc,S0000883002sc,Parliament should hold the power to correct le...,Parliament must uphold the rule of law,YES,"[S0000883002sc, S0019473002sc, S0022873001sc]",[B0227001sc],0.5,0.821777,0.41129
4,B0580002sc,S0023399001sc,The king's presence is essential for validatin...,Parliament must assert authority,NO,[],[],,0.744629,0.744629


In [12]:

df_candidates.shape, df_labeled.shape

((4818, 10), (5182, 10))

## Compute NLI Score 

In [13]:
# df_crossed = fea.add_cross_encoder_score(
#     df_crossed,
#     text_col1="text1",
#     text_col2="text2",
    
#     # FIX: Use the relative path with ./ just like before
#     model_name="./fine_tuned_nli_model", 
#     batch_size=128,
#     new_col="nli_score" 
# )

# # (add_cross_encoder_score adds the column in-place)
# if "nli_score" not in df_candidates.columns:
#     print("Scores added to df_crossed!")
    
# df_crossed.head()

# # We reuse the same function used for Cosine Similarity, but point to NLI columns.
# df_candidates = fea.compute_neighbor_weighted_score(
#     df5=df_crossed,
#     df6=df_candidates,
#     id1_col="id1",
#     id2_col="id2",
#     cosim_df5_col="nli_score",    # The 'sigma' lookup table uses NLI
#     cosim_df6_col="nli_score",    # The 'sigma_ij' value uses NLI
#     alpha_col="alpha",
#     eq1_col="equivalents1",
#     eq2_col="equivalents2",
#     new_col="nli_neighbor_score"
# )

# cols_to_show = ['text1', 'text2', 'nli_score', 'nli_neighbor_score', 'verdict']
# df_candidates[cols_to_show].head()

## Compute Transitivity Score

In [14]:
# We calculate graph-based scores (Transitivity).
# 'graph_entailment_score': A -> B (Path Decay applied)
# 'graph_equivalence_score': A <-> B (Stronger constraint)

df_candidates = fea.add_graph_features(
    df=df_candidates,
    entailment_df=df_obs_ent,
    id1_col="id1",
    id2_col="id2",
    verdict_col="verdict",
    positive_label="YES",
    decay=0.9,  # Confidence drops by 10% per extra hop
    max_hops=5
)

print("\nGraph Score Stats:")
print(df_candidates[['graph_entailment_score', 'graph_equivalence_score']].describe())

Building Directed Entailment Graph...
Computing graph features for 4818 pairs...

Graph Score Stats:
       graph_entailment_score  graph_equivalence_score
count                  4818.0                   4818.0
mean                      0.0                      0.0
std                       0.0                      0.0
min                       0.0                      0.0
25%                       0.0                      0.0
50%                       0.0                      0.0
75%                       0.0                      0.0
max                       0.0                      0.0


In [15]:
df_labeled = fea.add_graph_features(
    df=df_labeled,
    entailment_df=df_obs_ent,
    id1_col="id1",
    id2_col="id2",
    verdict_col="verdict",
    positive_label="YES",
    decay=0.9,  # Confidence drops by 10% per extra hop
    max_hops=5
)

print("\nGraph Score Stats:")
print(df_candidates[['graph_entailment_score', 'graph_equivalence_score']].describe())

Building Directed Entailment Graph...
Computing graph features for 5182 pairs...



Graph Score Stats:
       graph_entailment_score  graph_equivalence_score
count                  4818.0                   4818.0
mean                      0.0                      0.0
std                       0.0                      0.0
min                       0.0                      0.0
25%                       0.0                      0.0
50%                       0.0                      0.0
75%                       0.0                      0.0
max                       0.0                      0.0


In [16]:
df_candidates = df_candidates.dropna(subset=['cos_sim_neighbor_score'])
df_labeled = df_labeled.dropna(subset=['cos_sim_neighbor_score'])

# Predicting Entailment (can change model pipeline to something other than logistic)

In [17]:
# Feature Engineering & Model Training
# Updated features list to include robust graph metrics
features = [
    'cos_sim_neighbor_score', 
    #'nli_neighbor_score', 
    #'graph_entailment_score', 
    #'graph_equivalence_score'
]
target = 'verdict'
positive_label = 'YES'

# Generate BERT results for df_labeled


print(f"Training dataset: {len(df_labeled)} pairs with features and verdicts")
print(f"Prediction dataset: {len(df_candidates)} pairs with features (no verdicts)")

# 2. (Optional) Run Hyperparameter Optimization with Optuna
try:
    print("\n>>> Optimizing Boosting Hyperparameters with Optuna...")
    # This might take a minute but will find scientifically best parameters
    best_params = fea.optimize_boosting_hyperparameters(
        df=df_labeled,  # TRAIN ON LABELED DATA!
        feature_cols=features,
        target_col=target,
        positive_label=positive_label,
        n_trials=30 
    )
    
    # Add monotonic constraint assumption back if we believe in it
    # (Optuna doesn't optimize this structure, it optimizes numbers)
    if best_params:
        best_params['enforce_monotonicity'] = True 
        
except Exception as e:
    print(f"\nOptimization skipped or failed: {e}")
    print("Using conservative defaults.")
    best_params = {
        'learning_rate': 0.05,
        'max_iter': 300,
        'enforce_monotonicity': True
    }


# 3. Run Comparative Analysis using helper function
# Updated: Now uses Cross-Validation internally to prevent Overfitting!
comparison_df, best_model_name = fea.compare_entailment_models(
    df=df_labeled,  # TRAIN ON LABELED DATA!
    feature_cols=features,
    target_col=target,
    model_names=["logistic", "spline", "tree", "boosting"], 
    positive_label=positive_label,
    **best_params # Unpack the best parameters here
)

# --- Display Results ---
print("\nComparison Results (Sorted by ROC-AUC):")
display(comparison_df)

print(f"\n>>> Selected '{best_model_name}' model for downstream processing.")

Training dataset: 5182 pairs with features and verdicts
Prediction dataset: 4818 pairs with features (no verdicts)

>>> Optimizing Boosting Hyperparameters with Optuna...
Optuna not installed. Please run: pip install optuna
Running comparative analysis on 5182 samples...
Features: ['cos_sim_neighbor_score']

--- Training logistic ---
Training Logistic Regression...
Model (logistic) Train Accuracy: 0.7877


--- Training spline ---
Training Spline Logistic Regression...
Model (spline) Train Accuracy: 0.9446
--- Training tree ---
Training Decision Tree Classifier...
Model (tree) Train Accuracy: 0.9676
--- Training boosting ---
Training Histogram Gradient Boosting Classifier (lr=0.05, iter=200)...


Model (boosting) Train Accuracy: 0.9637



Comparison Results (Sorted by ROC-AUC):


Unnamed: 0_level_0,ROC-AUC (CV),Log Loss,Separation,Mean Prob (YES),Mean Prob (NO)
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
boosting,0.992226,0.104223,0.913715,0.954741,0.041026
spline,0.985724,0.136028,0.873589,0.937735,0.064146
tree,0.978038,0.192294,0.911665,0.947763,0.036097
logistic,0.77893,0.667906,0.199047,0.599758,0.400711



>>> Selected 'boosting' model for downstream processing.


In [18]:
print(f"Retraining '{best_model_name}' with optimized parameters on labeled data...")

# Train the model on df_labeled_with_features (which has verdicts)
best_pipeline = fea.train_entailment_model(
    df=df_labeled,  # TRAIN ON LABELED DATA!
    feature_cols=features,
    target_col=target,
    method=best_model_name,
    positive_label=positive_label,
    **best_params
)


df_candidates = fea.predict_entailment_probabilities(
    df_candidates,
    model_pipeline=best_pipeline,
    feature_cols=features,
    new_col='entailment_probability'
)

print(f"\nPrediction stats:")
print(f"  Min probability: {df_candidates['entailment_probability'].min():.4f}")
print(f"  Max probability: {df_candidates['entailment_probability'].max():.4f}")
print(f"  Mean probability: {df_candidates['entailment_probability'].mean():.4f}")

df_labeled.head()

Retraining 'boosting' with optimized parameters on labeled data...
Training Histogram Gradient Boosting Classifier (lr=0.05, iter=200)...


Model (boosting) Train Accuracy: 0.9637

Prediction stats:
  Min probability: 0.0002
  Max probability: 0.9952
  Mean probability: 0.0037


Unnamed: 0,id1,id2,text1,text2,verdict,equivalents1,equivalents2,alpha,new_cos_sim_score,cos_sim_neighbor_score,graph_entailment_score,graph_equivalence_score
0,B0860002sc,S0010771002sc,The king's support must match his responsibili...,clear evidence of acting against the interests...,NO,[],[],,0.695312,0.695312,0.0,0.0
1,B1170001sc,S0020225001sc,Active governance by the prince is essential f...,Maintaining respect for the monarchy is essential,NO,[],[],,0.708008,0.708008,0.0,0.0
2,B0454001p,S0004868005p,Agrarian laws can effectively prevent the rise...,This situation highlights the tension between ...,NO,[],"[B0203001p, B0278002p, B0314009p]",0.0,0.657715,0.449075,0.0,0.0
3,B0227001sc,S0000883002sc,Parliament should hold the power to correct le...,Parliament must uphold the rule of law,YES,"[S0000883002sc, S0019473002sc, S0022873001sc]",[B0227001sc],0.5,0.821777,0.41129,1.0,0.0
4,B0580002sc,S0023399001sc,The king's presence is essential for validatin...,Parliament must assert authority,NO,[],[],,0.744629,0.744629,0.0,0.0


# Optimize threshold 

In [19]:
# Predict on labeled data to find optimal thresholds
print("Predicting on labeled data for threshold optimization...")
df_labeled_with_features = fea.predict_entailment_probabilities(
    df_labeled,
    model_pipeline=best_pipeline,
    feature_cols=features,
    new_col='entailment_probability'
)

# We use the generic 'entailment_probability' column which now holds the best model's output
importlib.reload(fea)

results = fea.find_best_thresholds(
    df=df_labeled_with_features,  # USE LABELED DATA FOR THRESHOLD TUNING!
    score_col="entailment_probability",
    verdict_col="verdict",
    positive_label="YES"
)

df_labeled_with_features.head()

Predicting on labeled data for threshold optimization...


Unnamed: 0,id1,id2,text1,text2,verdict,equivalents1,equivalents2,alpha,new_cos_sim_score,cos_sim_neighbor_score,graph_entailment_score,graph_equivalence_score,entailment_probability
0,B0860002sc,S0010771002sc,The king's support must match his responsibili...,clear evidence of acting against the interests...,NO,[],[],,0.695312,0.695312,0.0,0.0,0.000223
1,B1170001sc,S0020225001sc,Active governance by the prince is essential f...,Maintaining respect for the monarchy is essential,NO,[],[],,0.708008,0.708008,0.0,0.0,0.000223
2,B0454001p,S0004868005p,Agrarian laws can effectively prevent the rise...,This situation highlights the tension between ...,NO,[],"[B0203001p, B0278002p, B0314009p]",0.0,0.657715,0.449075,0.0,0.0,0.696395
3,B0227001sc,S0000883002sc,Parliament should hold the power to correct le...,Parliament must uphold the rule of law,YES,"[S0000883002sc, S0019473002sc, S0022873001sc]",[B0227001sc],0.5,0.821777,0.41129,1.0,0.0,0.991445
4,B0580002sc,S0023399001sc,The king's presence is essential for validatin...,Parliament must assert authority,NO,[],[],,0.744629,0.744629,0.0,0.0,0.000223


In [20]:
print("Best tau (accuracy):", results["best_tau_accuracy"],
      "Accuracy:", results["best_accuracy"])

print("Best tau (F1):", results["best_tau_f1"],
      "F1:", results["best_f1"])

print("Best tau (TP):", results["best_tau_tp"],
      "TP:", results["max_true_positives"])

print("Best tau (precision):", results["best_tau_precision"],
      "prec:", results["best_precision"])

print("Best tau (recall):",    results["best_tau_recall"],
      "rec:",  results["best_recall"])

Best tau (accuracy): 0.8493446931802708 Accuracy: 0.9737553068313393
Best tau (F1): 0.8378699448241904 F1: 0.8976608187134502
Best tau (TP): 0.00022251254560292668 TP: 657
Best tau (precision): 0.9951546843846958 prec: 1.0
Best tau (recall): 0.00022251254560292668 rec: 1.0


In [21]:
results["best_taus_table"]

Unnamed: 0,tau,TP,TN,FP,FN,accuracy,precision,recall,f1
0,0.000223,657,3373,1152,0,0.777692,0.363184,1.0,0.532847
1,0.83787,614,4428,97,43,0.972983,0.863572,0.934551,0.897661
2,0.849345,596,4450,75,61,0.973755,0.888227,0.907154,0.89759
3,0.995155,161,4525,0,496,0.904284,1.0,0.245053,0.393643


In [22]:
import importlib
import plotly.io as pio
import free_entailments_algorithm_utils as fea
importlib.reload(fea) 

# Ensure Plotly renders appropriately for notebook/vscode context
pio.renderers.default = "notebook_connected" 

# This also calculates 'best_tau_low_send' (Top 1-5% Candidates)
# Use labeled data (with verdicts) for threshold finding!

results = fea.find_best_thresholds(
    df=df_labeled_with_features,  # FIX: Use labeled data, not candidates!
    score_col="entailment_probability", 
    verdict_col="verdict",
    positive_label="YES"
)


tau_low_send = results.get('best_tau_low_send', 0.95)

print("\nLow-Send Optimization (Candidate Selection):")
print(f"Selected Low-Send Threshold: {tau_low_send:.6f}")
if "low_send_table" in results:
    display(results["low_send_table"])

print("\n>>> Interactive Analysis: LLM Savings vs Threshold")

markers_to_show = {
    "Optimization (Top %)":    tau_low_send,
    "Max Accuracy":            results["best_tau_accuracy"],
    "Max F1":                  results["best_tau_f1"]
}


# Sent = Prob > Threshold
fig = fea.plot_llm_savings_over_thresholds(
    df=df_labeled_with_features,  # FIX: Use labeled data for plotting too!
    prob_col="entailment_probability",
    verdict_col="verdict",
    positive_label="YES",
    step=0.01,
    markers=markers_to_show
)
fig.show()


Low-Send Optimization (Candidate Selection):
Selected Low-Send Threshold: 0.992959


Unnamed: 0,target_percentile,tau,sent_rate,FN,TP,FP,TN
0,0.01,0.998846,0.0,657,0,0,4525
1,0.02,0.997781,0.019491,556,101,0,4525
2,0.03,0.995816,0.027403,515,142,0,4525
3,0.04,0.994486,0.038788,458,199,2,4523
4,0.05,0.992959,0.047086,417,240,4,4521



>>> Interactive Analysis: LLM Savings vs Threshold


In [23]:
import importlib
import free_entailments_algorithm_utils as fea
importlib.reload(fea)

print("--- Defining Threshold for LLM ---")

# Strategy: Send everything above a certain confidence threshold.
# We use the 'Minimize False Negatives' strategy (Cost Sensitive) to find a threshold
# effectively filtering out 'Definite Negatives' while keeping all potential Candidates.
# Cost Ratio 1:5 means we punish missing a Yes (FN) 5x more than sending a useless No (FP).

tau = fea.get_optimal_threshold_minimize_fn(strategy='cost', cost_fn=5.0)

print(f"Selected Threshold: {tau:.4f} (Send if Score > {tau:.4f})")
print(f"Logic: Minimize FN (Don't miss Entailments). Auto-Reject scores <= {tau:.4f}.")

# # 2. Estimate Cost
# print("\n--- Cost Analysis ---")
# cost = fea.estimate_deepseek_cost(
#     df=df_candidates, 
#     prob_col='entailment_probability', 
#     threshold=tau,
#     model="deepseek-reasoner"
# )

# 3. Generate Final DataFrame for the LLM
print("\n--- Generating File ---")
df_final = fea.generate_final_df(
    df=df_candidates, 
    prob_col='entailment_probability', 
    threshold=tau
)
# df_final.to_csv("llm_batch_final.csv", index=False)

--- Defining Threshold for LLM ---
Selected Threshold: 0.1667 (Send if Score > 0.1667)
Logic: Minimize FN (Don't miss Entailments). Auto-Reject scores <= 0.1667.

--- Generating File ---
--- Generating LLM Batch ---
Original Count: 4,818
Filtered Count: 30 (0.6%)
Condition:      P > 0.1667 (Send High Confidence Pairs)


In [24]:
# Record outputs for papermill using scrapbook
import scrapbook as sb

# Save df_final and cost for this iteration
sb.glue('df_final', df_final)

# Convert Plotly figure to HTML for serialization
fig_html = fig.to_html(include_plotlyjs='cdn')
sb.glue('fig_html', fig_html)

print("\n✓ Outputs recorded for papermill retrieval")


✓ Outputs recorded for papermill retrieval
