# Set up 

## Import libraries 

In [None]:
# Papermill parameters cell.
# ALTERED: Removed STANDALONE_TEST feature
# ALTERED: Removed total_cost parameter (cost calculated in FEA_Loop only)
# Parameters - papermill will inject these values
# Tag this cell with "parameters" in the notebook
#
# Each *_path below is read with pd.read_pickle in the data-loading cell.
# The empty strings are placeholders only; the notebook is not expected to
# work until real paths have been injected.

df_candidates_path = ""  # loaded as df_candidates; its directory is also where the outputs are written
df_crossed_path = ""  # loaded as df_crossed
df_labeled_path = ""  # loaded as df_labeled
df_labeled_crossed_path = ""  # loaded as df_labeled_crossed
df_obs_ent_path = ""  # loaded as df_obs_ent
df_clause_path = ""  # loaded as df_clause
embedding_cache_path = ""  # Optional: only needed if cosine sims are NOT pre-computed

=== STANDALONE TEST MODE ===
Loading test data for FreeEntailmentAlgorithm.ipynb...
⚠ Variables not found in workspace.
To test standalone: First run FEA_Pipeline.ipynb up to cell 16 (before the papermill call)
Then run this notebook with those variables in memory.


RuntimeError: Required variables not in workspace. Run FEA_Pipeline cells first.

In [None]:

import pandas as pd
import pickle
import importlib
import numpy as np
import sys
import os

# Put the working directory at the front of sys.path before the project-local
# module (free_entailments_algorithm_utils) is imported right after this.
# NOTE(review): presumably needed because the kernel may be started from a
# different location when executed via papermill — confirm.
current_dir = os.getcwd()
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)
    print(f"Added current directory to sys.path: {current_dir}")
import free_entailments_algorithm_utils as fea
import scrapbook as sb

# --- Load pipeline inputs ----------------------------------------------------
# Fail fast when the path parameters were never injected: the defaults are
# empty strings, and handing "" to pd.read_pickle produces an error that does
# not say which parameter is missing.
_required_paths = {
    "df_candidates_path": df_candidates_path,
    "df_crossed_path": df_crossed_path,
    "df_labeled_path": df_labeled_path,
    "df_labeled_crossed_path": df_labeled_crossed_path,
    "df_obs_ent_path": df_obs_ent_path,
    "df_clause_path": df_clause_path,
}
_missing_params = [name for name, path in _required_paths.items() if not path]
if _missing_params:
    raise ValueError(
        "Required path parameter(s) not set: " + ", ".join(_missing_params)
        + ". Inject them with papermill or fill in the parameters cell."
    )

print("Loading data from pickle files...")
print(f"  Loading from: {df_candidates_path}")

# NOTE(security): unpickling executes arbitrary code embedded in the file.
# Only point these parameters at pickles written by a trusted pipeline run.
df_candidates = pd.read_pickle(df_candidates_path)
df_crossed = pd.read_pickle(df_crossed_path)
df_labeled = pd.read_pickle(df_labeled_path)
df_labeled_crossed = pd.read_pickle(df_labeled_crossed_path)
df_obs_ent = pd.read_pickle(df_obs_ent_path)
df_clause = pd.read_pickle(df_clause_path)

# Embedding cache is optional — only load if cosine sims aren't pre-computed
embedding_cache_finetuned = None
if embedding_cache_path and os.path.exists(embedding_cache_path):
    with open(embedding_cache_path, 'rb') as f:
        embedding_cache_finetuned = pickle.load(f)
    print(f"  - embedding_cache: {len(embedding_cache_finetuned)} entries (will use for cosine sims)")
elif embedding_cache_path:
    # A path was supplied but nothing is there: say so instead of claiming
    # the similarities were pre-computed.
    print(f"  - embedding_cache: WARNING file not found at {embedding_cache_path}; continuing without cache")
else:
    print("  - embedding_cache: skipped (cosine sims pre-computed in Pipeline)")

# Build equiv_map for memory-efficient neighbor score computation
# (avoids requiring 'equivalents1'/'equivalents2' list columns in df_candidates)
equiv_map = fea.build_equiv_map(df_obs_ent, id1_col="id1", id2_col="id2", include_self=False)

# Above this row count the neighbour-score cell switches to the
# equiv_map-based implementation.
LARGE_SCALE = len(df_candidates) > 5_000_000

print("✓ Successfully loaded all data:")
print(f"  - df_candidates: {len(df_candidates):,} rows {'(LARGE-SCALE MODE)' if LARGE_SCALE else ''}")
print(f"  - df_crossed: {len(df_crossed):,} rows")
print(f"  - df_labeled: {len(df_labeled)} rows")
print(f"  - df_labeled_crossed: {len(df_labeled_crossed)} rows")
print(f"  - df_obs_ent: {len(df_obs_ent)} rows")
print(f"  - df_clause: {len(df_clause)} rows")
# `is not None` (not truthiness) so a loaded-but-empty cache reports 0 entries,
# matching the message printed when the cache was loaded.
print(f"  - embedding_cache: {len(embedding_cache_finetuned) if embedding_cache_finetuned is not None else 'N/A (pre-computed)'} entries")
print(f"  - equiv_map: {len(equiv_map)} IDs with equivalents")

In [2]:
# Re-import the project helper module so edits made to
# free_entailments_algorithm_utils.py are picked up without restarting the kernel.
importlib.reload(fea)

<module 'free_entailments_algorithm_utils' from 'c:\\Users\\agust\\Downloads\\fea_project\\fea_project\\free_entailments_algorithm_utils.py'>

# Calculate Similarity On LLM Results

In [None]:
# Cosine similarity for the candidate pairs.
# If the pipeline already attached 'new_cos_sim_score', reuse the frame as-is
# (df_candidates_with_scores is then just another name for df_candidates).
# Otherwise compute it via fea.generate_new_bert_results, handing over the
# embedding cache loaded earlier (which may be None).
if 'new_cos_sim_score' not in df_candidates.columns:
    df_candidates_with_scores = fea.generate_new_bert_results(
        df_candidates,
        text_col1='text1',
        text_col2='text2',
        id_col1='id1',
        id_col2='id2',
        model_path="./fine_tuned_bi_model",
        embedding_cache=embedding_cache_finetuned,
        new_col="new_cos_sim_score",
    )
else:
    print("✓ new_cos_sim_score already present in df_candidates — skipping")
    df_candidates_with_scores = df_candidates

df_candidates_with_scores.head()

Encoding unique sentences from text1 and text2...


Batches:   0%|          | 0/74 [00:00<?, ?it/s]

In [None]:
# Same cosine-similarity step for the labeled pairs: compute the column only
# when the pipeline has not already supplied it.
if 'new_cos_sim_score' not in df_labeled.columns:
    df_labeled = fea.generate_new_bert_results(
        df_labeled,
        text_col1='text1',
        text_col2='text2',
        id_col1='id1',
        id_col2='id2',
        model_path="./fine_tuned_bi_model",
        embedding_cache=embedding_cache_finetuned,
        new_col="new_cos_sim_score",
    )
else:
    print("✓ new_cos_sim_score already present in df_labeled — skipping")

df_labeled.head()

In [None]:
## Takes a few minutes depending on computing power


# df_main = fea.add_cross_encoder_score(
#     df_main,                 
#     text_col1='text1',
#     text_col2='text2',
#     model_name="./fine_tuned_nli_model",  
#     new_col="nli_score",
#     batch_size=128              # Keep batch size lower for Cross-Encoders, go easy on your computer
# )

# df_main.head()
# df_main.shape

The CrossEncoder `tokenizer_args` argument was renamed and is now deprecated, please use `tokenizer_kwargs` instead.


Loading Cross-Encoder model: ./fine_tuned_nli_model on cuda...
Predicting NLI scores for 44837 pairs (Bidirectional)...


Batches:   0%|          | 0/351 [00:00<?, ?it/s]

Batches:   0%|          | 0/351 [00:00<?, ?it/s]

(44837, 8)

In [None]:
# Make sure df_crossed carries a 'cosine_sim' column; it is computed from the
# text columns only when the pipeline has not already provided it.
if 'cosine_sim' not in df_crossed.columns:
    df_crossed = fea.add_cosine_similarity_from_text(
        df_crossed,
        text_col1="text1",
        text_col2="text2",
        id_col1='id1',
        id_col2='id2',
        model_name="./fine_tuned_bi_model",
        embedding_cache=embedding_cache_finetuned,
        batch_size=128,
        show_progress_bar=False,
    )
else:
    print("✓ cosine_sim already present in df_crossed — skipping")

df_crossed.head()

In [None]:
# Make sure df_labeled_crossed carries a 'cosine_sim' column; it is computed
# from the text columns only when the pipeline has not already provided it.
if 'cosine_sim' not in df_labeled_crossed.columns:
    df_labeled_crossed = fea.add_cosine_similarity_from_text(
        df_labeled_crossed,
        text_col1="text1",
        text_col2="text2",
        id_col1='id1',
        id_col2='id2',
        model_name="./fine_tuned_bi_model",
        embedding_cache=embedding_cache_finetuned,
        batch_size=128,
        show_progress_bar=False,
    )
else:
    print("✓ cosine_sim already present in df_labeled_crossed — skipping")

df_labeled_crossed.head()

# Features

## Compute Cos Sim Neighborhood Score

In [None]:
# Neighbour-weighted cosine score for the candidates.
# Two implementations write the same output column; LARGE_SCALE (set when the
# data was loaded) decides which one runs.
if not LARGE_SCALE:
    # Default path: relies on the 'equivalents1'/'equivalents2' columns.
    df_candidates = fea.compute_neighbor_weighted_score(
        df5=df_crossed,
        df6=df_candidates_with_scores,
        id1_col="id1",
        id2_col="id2",
        cosim_df5_col="cosine_sim",
        cosim_df6_col="new_cos_sim_score",
        alpha_col="alpha",
        eq1_col="equivalents1",
        eq2_col="equivalents2",
        new_col="cos_sim_neighbor_score",
    )
else:
    # Memory-efficient path: a sigma lookup built from df_crossed plus
    # equiv_map replace the per-row equivalents list columns.
    sigma_lookup = fea._build_sigma_lookup_from_df5(
        df_crossed,
        id1_col="id1",
        id2_col="id2",
        cosim_col="cosine_sim",
    )
    df_candidates = fea.compute_neighbor_score_efficient(
        sigma_lookup=sigma_lookup,
        df6=df_candidates_with_scores,
        equiv_map=equiv_map,
        id1_col="id1",
        id2_col="id2",
        cosim_col="new_cos_sim_score",
        alpha_col="alpha",
        new_col="cos_sim_neighbor_score",
    )

df_candidates.head()

In [None]:
# Neighbour-weighted cosine score for the labeled pairs, using
# df_labeled_crossed as the lookup frame.
# Note: df_labeled is both input and output here, so re-running this cell
# feeds the previous result back in.
df_labeled = fea.compute_neighbor_weighted_score(
    df5=df_labeled_crossed,
    df6=df_labeled,
    id1_col="id1",
    id2_col="id2",
    cosim_df5_col="cosine_sim",
    cosim_df6_col="new_cos_sim_score",
    alpha_col="alpha",
    eq1_col="equivalents1",
    eq2_col="equivalents2",
    new_col="cos_sim_neighbor_score",
)

df_labeled.head()

In [None]:

# Quick size check of both frames after the neighbour-score step.
df_candidates.shape, df_labeled.shape

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,text1,text2,verdict,cosine_sim,new_cos_sim_score,nli_score,equivalents1,equivalents2,alpha,cos_sim_neighbor_score
147326,B0269010p,S10891005p,The authority of kings is contingent upon thei...,Undermining the authority of the crown could d...,NO,0.667969,0.400879,2.079574e-08,"[B0082005p, B0194008p, B0268008p, B0318001p, B...",[],1.0,0.136711
313982,B0578011p,B0989004p,The king's power and influence over the kingdo...,This limited authority aligns with the princip...,NO,0.609375,0.328857,5.410718e-08,[],[],,0.328857
381168,B0247003p,B0710007p,The legitimacy of a government derives from th...,The power of governance is derived from the na...,YES,0.752441,0.968262,0.972468,"[B0256006p, B0530006p]","[B0040002p, B0256004p]",0.5,0.920511
349537,B0166005p,B0233005p,Any authority a prince has is derived from the...,Laws are products of the people's consent and ...,YES,0.6875,0.974609,0.9804142,"[B0076003p, B0312003p, B0350010p]","[B0186004p, B0311005p]",0.5,0.82312
168934,B0709001sc,B0757003sc,A king who ignores the law,The belief in kings' power above human laws is...,YES,0.708008,0.949707,0.9911806,[],[],,0.949707


## Compute NLI Score 

In [None]:
# df_crossed = fea.add_cross_encoder_score(
#     df_crossed,
#     text_col1="text1",
#     text_col2="text2",
    
#     # FIX: Use the relative path with ./ just like before
#     model_name="./fine_tuned_nli_model", 
#     batch_size=128,
#     new_col="nli_score" 
# )

# # (add_cross_encoder_score adds the column in-place)
# if "nli_score" not in df_candidates.columns:
#     print("Scores added to df_crossed!")
    
# df_crossed.head()

# # We reuse the same function used for Cosine Similarity, but point to NLI columns.
# df_candidates = fea.compute_neighbor_weighted_score(
#     df5=df_crossed,
#     df6=df_candidates,
#     id1_col="id1",
#     id2_col="id2",
#     cosim_df5_col="nli_score",    # The 'sigma' lookup table uses NLI
#     cosim_df6_col="nli_score",    # The 'sigma_ij' value uses NLI
#     alpha_col="alpha",
#     eq1_col="equivalents1",
#     eq2_col="equivalents2",
#     new_col="nli_neighbor_score"
# )

# cols_to_show = ['text1', 'text2', 'nli_score', 'nli_neighbor_score', 'verdict']
# df_candidates[cols_to_show].head()

The CrossEncoder `tokenizer_args` argument was renamed and is now deprecated, please use `tokenizer_kwargs` instead.


Loading Cross-Encoder model: ./fine_tuned_nli_model on cuda...
Predicting NLI scores for 90214 pairs (Bidirectional)...


Batches:   0%|          | 0/705 [00:00<?, ?it/s]

Batches:   0%|          | 0/705 [00:00<?, ?it/s]

Unnamed: 0,text1,text2,nli_score,nli_neighbor_score,verdict
147326,The authority of kings is contingent upon thei...,Undermining the authority of the crown could d...,2.079574e-08,7.207293e-16,NO
313982,The king's power and influence over the kingdo...,This limited authority aligns with the princip...,5.410718e-08,5.410718e-08,NO
381168,The legitimacy of a government derives from th...,The power of governance is derived from the na...,0.972468,0.8300657,YES
349537,Any authority a prince has is derived from the...,Laws are products of the people's consent and ...,0.9804142,0.7651983,YES
168934,A king who ignores the law,The belief in kings' power above human laws is...,0.9911806,0.9911806,YES


## Compute Transitivity Score

In [None]:
# Graph features for the candidate pairs, using the vectorized helper
# (described upstream as precomputing single-source BFS from every unique node
# and then doing dictionary lookups).
print(f"Computing graph features for df_candidates ({len(df_candidates):,} rows)...")
df_candidates = fea.add_graph_features_vectorized(
    df=df_candidates,
    entailment_df=df_obs_ent,
    id1_col="id1",
    id2_col="id2",
    verdict_col="verdict",
    positive_label="YES",
    max_hops=5,
    decay=0.9,
)

# Summary statistics of the two columns used as model features later on.
print("\nGraph Score Stats:")
print(df_candidates.loc[:, ['graph_entailment_score', 'graph_equivalence_score']].describe())

Building Directed Entailment Graph...
Computing graph features for 35870 pairs...

Graph Score Stats:
       graph_entailment_score  graph_equivalence_score
count            35870.000000                  35870.0
mean                 0.002306                      0.0
std                  0.043634                      0.0
min                  0.000000                      0.0
25%                  0.000000                      0.0
50%                  0.000000                      0.0
75%                  0.000000                      0.0
max                  0.900000                      0.0


  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,text1,text2,verdict,cosine_sim,new_cos_sim_score,nli_score,equivalents1,equivalents2,alpha,cos_sim_neighbor_score,nli_neighbor_score,graph_entailment_score,graph_equivalence_score
147326,B0269010p,S10891005p,The authority of kings is contingent upon thei...,Undermining the authority of the crown could d...,NO,0.667969,0.400879,2.079574e-08,"[B0082005p, B0194008p, B0268008p, B0318001p, B...",[],1.0,0.136711,7.207293e-16,0.0,0.0
313982,B0578011p,B0989004p,The king's power and influence over the kingdo...,This limited authority aligns with the princip...,NO,0.609375,0.328857,5.410718e-08,[],[],,0.328857,5.410718e-08,0.0,0.0
381168,B0247003p,B0710007p,The legitimacy of a government derives from th...,The power of governance is derived from the na...,YES,0.752441,0.968262,0.972468,"[B0256006p, B0530006p]","[B0040002p, B0256004p]",0.5,0.920511,0.8300657,0.0,0.0
349537,B0166005p,B0233005p,Any authority a prince has is derived from the...,Laws are products of the people's consent and ...,YES,0.6875,0.974609,0.9804142,"[B0076003p, B0312003p, B0350010p]","[B0186004p, B0311005p]",0.5,0.82312,0.7651983,0.0,0.0
168934,B0709001sc,B0757003sc,A king who ignores the law,The belief in kings' power above human laws is...,YES,0.708008,0.949707,0.9911806,[],[],,0.949707,0.9911806,0.0,0.0


In [None]:
# Graph features for the labeled pairs. The non-vectorized helper is used
# here; the labeled set is expected to be small.
df_labeled = fea.add_graph_features(
    df=df_labeled,
    entailment_df=df_obs_ent,
    id1_col="id1",
    id2_col="id2",
    verdict_col="verdict",
    positive_label="YES",
    max_hops=5,
    decay=0.9,
)

# Summary statistics of the two columns used as model features later on.
print("\nGraph Score Stats (df_labeled):")
print(df_labeled.loc[:, ['graph_entailment_score', 'graph_equivalence_score']].describe())

In [None]:
import gc

# Drop names that may still reference large intermediate objects:
#   - df_candidates_with_scores: set in the candidate cosine-similarity cell
#   - sigma_lookup: only exists when the large-scale branch ran
# Neither is used after this point; a missing name is simply ignored.
globals().pop("df_candidates_with_scores", None)
globals().pop("sigma_lookup", None)
gc.collect()

# dropna returns a copy, which briefly doubles memory for a very large frame,
# so it is only called on df_candidates when there is something to drop.
nan_count = int(df_candidates['cos_sim_neighbor_score'].isna().sum())
if nan_count:
    df_candidates = df_candidates.dropna(subset=['cos_sim_neighbor_score'])
    gc.collect()

# The labeled frame is filtered unconditionally.
df_labeled = df_labeled.dropna(subset=['cos_sim_neighbor_score'])

print(f"After dropna: {len(df_candidates):,} candidates (dropped {nan_count:,}), {len(df_labeled)} labeled")
if not len(df_labeled):
    print("WARNING: No labeled rows with valid scores — model training will be skipped upstream.")

# Predicting Entailment (the model pipeline can be swapped for something other than logistic regression)

In [None]:
# Feature Engineering & Model Training
# Columns fed to the classifiers. 'nli_neighbor_score' is currently disabled.
features = [
    'cos_sim_neighbor_score',
    #'nli_neighbor_score',
    'graph_entailment_score',
    'graph_equivalence_score'
]
target = 'verdict'
positive_label = 'YES'

print(f"Training dataset: {len(df_labeled)} pairs with features and verdicts")
print(f"Prediction dataset: {len(df_candidates)} pairs with features (no verdicts)")

# Conservative settings used whenever the hyperparameter search yields nothing.
_default_boosting_params = {
    'learning_rate': 0.05,
    'max_iter': 300,
    'enforce_monotonicity': True,
}

# 2. (Optional) Run Hyperparameter Optimization with Optuna
# The search can fail in two ways: by raising, or by returning an empty/None
# result (for example when it only prints a message and returns). Both cases
# must end up on the defaults; otherwise `**best_params` below either crashes
# on None or silently runs without the intended settings.
try:
    print("\n>>> Optimizing Boosting Hyperparameters with Optuna...")
    best_params = fea.optimize_boosting_hyperparameters(
        df=df_labeled,  # TRAIN ON LABELED DATA!
        feature_cols=features,
        target_col=target,
        positive_label=positive_label,
        n_trials=30
    )
except Exception as e:
    print(f"\nOptimization skipped or failed: {e}")
    best_params = None
else:
    if not best_params:
        print("\nOptimization returned no parameters.")

if best_params:
    # Copy before editing so the object returned by the helper is untouched.
    # The monotonic constraint is a modelling assumption added on top of the
    # tuned values (the search only tunes numeric settings).
    best_params = dict(best_params)
    best_params['enforce_monotonicity'] = True
else:
    print("Using conservative defaults.")
    best_params = dict(_default_boosting_params)


# 3. Run Comparative Analysis using helper function
# Returns a comparison table and the name of the model selected for the
# following cells.
comparison_df, best_model_name = fea.compare_entailment_models(
    df=df_labeled,  # TRAIN ON LABELED DATA!
    feature_cols=features,
    target_col=target,
    model_names=["logistic", "spline", "tree", "boosting"],
    positive_label=positive_label,
    **best_params # Unpack the best parameters here
)

# --- Display Results ---
print("\nComparison Results (Sorted by ROC-AUC):")
display(comparison_df)

print(f"\n>>> Selected '{best_model_name}' model for downstream processing.")

>>> Optimizing Boosting Hyperparameters with Optuna...
Optuna not installed. Please run: pip install optuna
Running comparative analysis on 35870 samples...
Features: ['cos_sim_neighbor_score', 'nli_neighbor_score', 'graph_entailment_score', 'graph_equivalence_score']

--- Training logistic ---
Training Logistic Regression...
Model (logistic) Train Accuracy: 0.9151
--- Training spline ---
Training Spline Logistic Regression...
Model (spline) Train Accuracy: 0.9101
--- Training tree ---
Training Decision Tree Classifier...
Model (tree) Train Accuracy: 0.9004
--- Training boosting ---
Training Histogram Gradient Boosting Classifier (lr=0.05, iter=200)...
Model (boosting) Train Accuracy: 0.9052

Comparison Results (Sorted by ROC-AUC):


Unnamed: 0_level_0,ROC-AUC (CV),Log Loss,Separation,Mean Prob (YES),Mean Prob (NO)
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
boosting,0.966241,0.236311,0.720296,0.857778,0.137482
tree,0.963677,0.251933,0.721029,0.858637,0.137607
spline,0.962304,0.245767,0.704592,0.85225,0.147657
logistic,0.958344,0.26024,0.684958,0.842499,0.157541



>>> Selected 'boosting' model for downstream processing.


In [None]:
print(f"Retraining '{best_model_name}' with optimized parameters on labeled data...")

# Fit the selected model type on the labeled frame (the one with verdicts).
best_pipeline = fea.train_entailment_model(
    df=df_labeled,
    feature_cols=features,
    target_col=target,
    positive_label=positive_label,
    method=best_model_name,
    **best_params
)

# Score every candidate pair with the fitted pipeline.
df_candidates = fea.predict_entailment_probabilities(
    df_candidates,
    model_pipeline=best_pipeline,
    feature_cols=features,
    new_col='entailment_probability',
)

# Min / max / mean of the predicted probabilities, one line each.
print("\nPrediction stats:")
for _stat_label, _stat_name in (("Min", "min"), ("Max", "max"), ("Mean", "mean")):
    print(f"  {_stat_label} probability: {getattr(df_candidates['entailment_probability'], _stat_name)():.4f}")

# NOTE(review): this previews df_labeled although the predictions above were
# written to df_candidates — confirm which frame was meant.
df_labeled.head()

Retraining 'boosting' with optimized parameters...
Training Histogram Gradient Boosting Classifier (lr=0.05, iter=200)...
Model (boosting) Train Accuracy: 0.9052


  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,text1,text2,verdict,cosine_sim,new_cos_sim_score,nli_score,equivalents1,equivalents2,alpha,cos_sim_neighbor_score,nli_neighbor_score,graph_entailment_score,graph_equivalence_score,entailment_probability
147326,B0269010p,S10891005p,The authority of kings is contingent upon thei...,Undermining the authority of the crown could d...,NO,0.667969,0.400879,2.079574e-08,"[B0082005p, B0194008p, B0268008p, B0318001p, B...",[],1.0,0.136711,7.207293e-16,0.0,0.0,0.001576
313982,B0578011p,B0989004p,The king's power and influence over the kingdo...,This limited authority aligns with the princip...,NO,0.609375,0.328857,5.410718e-08,[],[],,0.328857,5.410718e-08,0.0,0.0,0.014641
381168,B0247003p,B0710007p,The legitimacy of a government derives from th...,The power of governance is derived from the na...,YES,0.752441,0.968262,0.972468,"[B0256006p, B0530006p]","[B0040002p, B0256004p]",0.5,0.920511,0.8300657,0.0,0.0,0.990212
349537,B0166005p,B0233005p,Any authority a prince has is derived from the...,Laws are products of the people's consent and ...,YES,0.6875,0.974609,0.9804142,"[B0076003p, B0312003p, B0350010p]","[B0186004p, B0311005p]",0.5,0.82312,0.7651983,0.0,0.0,0.979295
168934,B0709001sc,B0757003sc,A king who ignores the law,The belief in kings' power above human laws is...,YES,0.708008,0.949707,0.9911806,[],[],,0.949707,0.9911806,0.0,0.0,0.995576


# Optimize threshold 

In [None]:
# Predict on labeled data to find optimal thresholds
# The labeled frame is scored with the same fitted pipeline so that
# thresholds can be chosen against known verdicts.
print("Predicting on labeled data for threshold optimization...")
df_labeled_with_features = fea.predict_entailment_probabilities(
    df_labeled,
    model_pipeline=best_pipeline,
    feature_cols=features,
    new_col='entailment_probability'
)

# We use the generic 'entailment_probability' column which now holds the best model's output
# The module is reloaded at this point, i.e. after the prediction above and
# before the threshold search below; keep this order when editing the cell.
importlib.reload(fea)

# `results` is a dict-like object; the cells below read keys such as
# "best_tau_accuracy", "best_tau_f1" and "best_taus_table" from it.
results = fea.find_best_thresholds(
    df=df_labeled_with_features,  # USE LABELED DATA FOR THRESHOLD TUNING!
    score_col="entailment_probability",
    verdict_col="verdict",
    positive_label="YES"
)

df_labeled_with_features.head()

In [35]:
# One line per optimisation criterion: the chosen threshold and the metric
# value reached at it. Each row is
# (criterion name, key of the threshold, metric label, key of the metric).
for _criterion, _tau_key, _metric_label, _metric_key in (
    ("accuracy", "best_tau_accuracy", "Accuracy", "best_accuracy"),
    ("F1", "best_tau_f1", "F1", "best_f1"),
    ("TP", "best_tau_tp", "TP", "max_true_positives"),
    ("precision", "best_tau_precision", "prec", "best_precision"),
    ("recall", "best_tau_recall", "rec", "best_recall"),
):
    print(f"Best tau ({_criterion}):", results[_tau_key],
          f"{_metric_label}:", results[_metric_key])

Best tau (accuracy): 0.7928189728049367 Accuracy: 0.9256760524114859
Best tau (F1): 0.6850135832877435 F1: 0.8252886617452205
Best tau (TP): 0.0015759870708909347 TP: 7677
Best tau (precision): 0.9944160464006974 prec: 0.9935622317596566
Best tau (recall): 0.0015759870708909347 rec: 1.0


In [36]:
# Show the table of selected thresholds stored in the threshold-search results.
results["best_taus_table"]

Unnamed: 0,tau,TP,TN,FP,FN,accuracy,precision,recall,f1
0,0.001576,7677,7303,20890,0,0.417619,0.268737,1.0,0.423629
1,0.685014,6540,26561,1632,1137,0.922805,0.800294,0.851895,0.825289
2,0.792819,6130,27074,1119,1547,0.925676,0.845634,0.798489,0.821386
3,0.994416,463,28190,3,7214,0.798801,0.993562,0.06031,0.113717


In [None]:
import importlib
import plotly.io as pio
import free_entailments_algorithm_utils as fea
importlib.reload(fea)

# Renderer used so Plotly figures display in a notebook / VS Code context.
pio.renderers.default = "notebook_connected"

# Run the threshold search again on the labeled frame (the one with verdicts).
# The result may also carry 'best_tau_low_send' and 'low_send_table'.
results = fea.find_best_thresholds(
    df=df_labeled_with_features,
    score_col="entailment_probability",
    verdict_col="verdict",
    positive_label="YES",
)

# 0.95 is the fallback when no low-send threshold is reported.
tau_low_send = results.get('best_tau_low_send', 0.95)

print("\nLow-Send Optimization (Candidate Selection):")
print(f"Selected Low-Send Threshold: {tau_low_send:.6f}")
if "low_send_table" in results:
    display(results["low_send_table"])

print("\n>>> Interactive Analysis: LLM Savings vs Threshold")

# Thresholds to highlight on the figure.
markers_to_show = {
    "Optimization (Top %)": tau_low_send,
    "Max Accuracy": results["best_tau_accuracy"],
    "Max F1": results["best_tau_f1"],
}

# Sent = Prob > Threshold
# Plotted from the labeled frame as well, so the curve reflects known verdicts.
fig = fea.plot_llm_savings_over_thresholds(
    df=df_labeled_with_features,
    prob_col="entailment_probability",
    verdict_col="verdict",
    positive_label="YES",
    markers=markers_to_show,
    step=0.01,
)
fig.show()


Low-Send Optimization (Candidate Selection):
Selected Low-Send Threshold: 0.990134


Unnamed: 0,target_percentile,tau,sent_rate,FN,TP,FP,TN
0,0.01,0.995576,0.0,7677,0,0,28193
1,0.02,0.994086,0.019905,6970,707,7,28186
2,0.03,0.992109,0.029551,6625,1052,8,28185
3,0.04,0.991372,0.039476,6277,1400,16,28177
4,0.05,0.990134,0.049902,5907,1770,20,28173



>>> Interactive Analysis: LLM Savings vs Threshold


ah


In [None]:
# Select the send/reject threshold, build the final frame for the LLM, then
# release the large frames. Statement order matters in this cell: the module
# is reloaded first, and the deletions must come after df_final exists.
import importlib
import gc
import free_entailments_algorithm_utils as fea
importlib.reload(fea)

print("--- Defining Threshold for LLM ---")

# Strategy: Send everything above a certain confidence threshold.
# We use the 'Minimize False Negatives' strategy (Cost Sensitive) to find a threshold
# effectively filtering out 'Definite Negatives' while keeping all potential Candidates.
# Cost Ratio 1:5 means we punish missing a Yes (FN) 5x more than sending a useless No (FP).

# NOTE(review): no data or labels are passed to this call, so as far as this
# cell shows the threshold depends only on the strategy and cost arguments —
# confirm that this is intended.
tau = fea.get_optimal_threshold_minimize_fn(strategy='cost', cost_fn=5.0)

print(f"Selected Threshold: {tau:.4f} (Send if Score > {tau:.4f})")
print(f"Logic: Minimize FN (Don't miss Entailments). Auto-Reject scores <= {tau:.4f}.")

# 3. Generate Final DataFrame for the LLM
# df_clause is handed over together with the names of its id and text columns.
print("\n--- Generating File ---")
df_final = fea.generate_final_df(
    df=df_candidates, 
    prob_col='entailment_probability', 
    threshold=tau,
    df_clause=df_clause,
    id_col='sentence_id',
    text_col='sentence'
)

# FREE df_candidates (75M rows) — no longer needed.
# df_final is the tiny filtered subset (~1k rows).
# This MUST happen before sb.glue / pickle output or we'll OOM during serialization.
del df_candidates
# The labeled frames are removed on a best-effort basis: a NameError from a
# missing name is ignored.
try:
    del df_labeled, df_labeled_with_features
except NameError:
    pass
gc.collect()
print(f"✓ Freed df_candidates — only df_final ({len(df_final)} rows) remains")

--- Defining Threshold for LLM ---
Selected Threshold: 0.1667 (Send if Score > 0.1667)
Logic: Minimize FN (Don't miss Entailments). Auto-Reject scores <= 0.1667.

--- Cost Analysis ---
--- Cost Estimation for 14,455 Pairs (P > 0.1667) ---
Model: deepseek-reasoner
Input Tokens:  4,384,683 ($2.4116)
Output Tokens: 11,564,000 ($25.3252)
Total Cost:    $27.7367 (Approx $1.92/1k pairs)

--- Generating File ---
--- Generating LLM Batch ---
Original Count: 35,870
Filtered Count: 14,455 (40.3%)
Condition:      P > 0.1667 (Send High Confidence Pairs)


In [None]:
import os, pickle

# Results are written as pickle files next to the input pickles rather than
# glued through scrapbook, which the original author avoided because of the
# memory cost of JSON serialization at large scale.
_out_dir = os.path.dirname(df_candidates_path)  # directory of the candidates input
_df_final_path = os.path.join(_out_dir, "df_final.pkl")
_fig_html_path = os.path.join(_out_dir, "fig_html.pkl")

df_final.to_pickle(_df_final_path)

# The figure is stored as a pickled HTML string (plotly.js referenced via CDN).
fig_html = fig.to_html(include_plotlyjs='cdn')
with open(_fig_html_path, 'wb') as f:
    pickle.dump(fig_html, f)

print(f"✓ Saved df_final ({len(df_final)} rows) and fig_html ({len(fig_html)} chars) to {_out_dir}")

# Only a small piece of metadata goes through scrapbook, so the calling
# papermill run can tell that the notebook reached the end.
import scrapbook as sb
sb.glue('df_final_rows', len(df_final))
print("✓ Outputs saved for papermill retrieval")