# Set up 

## Parameters and library imports

In [1]:
# Papermill parameters cell.
# ALTERED: Removed STANDALONE_TEST feature
# ALTERED: Removed total_cost parameter (cost calculated in FEA_Loop only)
# Parameters - papermill will inject these values
# Tag this cell with "parameters" in the notebook
#
# Every value below is a file path consumed by the data-loading cell.
# The empty strings are placeholders only; they are expected to be
# overridden before the notebook runs.

df_candidates_path = ""       # read with pd.read_pickle -> df_candidates
df_crossed_path = ""          # read with pd.read_pickle -> df_crossed
df_labeled_path = ""          # read with pd.read_pickle -> df_labeled
df_labeled_crossed_path = ""  # read with pd.read_pickle -> df_labeled_crossed
df_obs_ent_path = ""          # read with pd.read_pickle -> df_obs_ent
df_clause_path = ""           # read with pd.read_pickle -> df_clause
embedding_cache_path = ""     # read with pickle.load -> embedding_cache_finetuned

In [2]:
# Parameters
# NOTE(review): presumably the cell papermill injected for this run; these
# assignments replace the empty-string defaults declared in the cell above.
# All paths are relative, so they resolve against the kernel's working directory.
df_candidates_path = "fea_iterations/temp_data/df_candidates.pkl"
df_crossed_path = "fea_iterations/temp_data/df_crossed.pkl"
df_labeled_path = "fea_iterations/temp_data/df_labeled.pkl"
df_labeled_crossed_path = "fea_iterations/temp_data/df_labeled_crossed.pkl"
df_obs_ent_path = "fea_iterations/temp_data/df_obs_ent.pkl"
df_clause_path = "fea_iterations/temp_data/df_clause.pkl"
embedding_cache_path = "fea_iterations/temp_data/embedding_cache.pkl"


In [3]:

import pandas as pd
import pickle
import importlib
import numpy as np
import sys
import os

# Put the working directory on sys.path so the project-local utilities module
# imported below resolves regardless of how the notebook was launched.
current_dir = os.getcwd()
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)
    print(f"Added current directory to sys.path: {current_dir}")
import free_entailments_algorithm_utils as fea
import scrapbook as sb

# Candidate-row count above which the memory-efficient code paths are selected.
LARGE_SCALE_ROW_THRESHOLD = 5_000_000

# Fail fast when a path parameter was never injected. The parameter defaults
# are empty strings, which would otherwise surface as an opaque error from
# the pickle reader further down.
_input_paths = {
    "df_candidates_path": df_candidates_path,
    "df_crossed_path": df_crossed_path,
    "df_labeled_path": df_labeled_path,
    "df_labeled_crossed_path": df_labeled_crossed_path,
    "df_obs_ent_path": df_obs_ent_path,
    "df_clause_path": df_clause_path,
    "embedding_cache_path": embedding_cache_path,
}
_missing_params = [_name for _name, _path in _input_paths.items() if not _path]
if _missing_params:
    raise ValueError(
        "Missing input path parameter(s): "
        + ", ".join(_missing_params)
        + ". They must be set (e.g. injected by papermill) before this cell runs."
    )

print("Loading data from pickle files...")
for _path in _input_paths.values():
    print(f"  Loading from: {_path}")

# SECURITY: unpickling executes arbitrary code embedded in the file.
# Only point these parameters at files produced by a trusted pipeline.
df_candidates = pd.read_pickle(df_candidates_path)
df_crossed = pd.read_pickle(df_crossed_path)
df_labeled = pd.read_pickle(df_labeled_path)
df_labeled_crossed = pd.read_pickle(df_labeled_crossed_path)
df_obs_ent = pd.read_pickle(df_obs_ent_path)
df_clause = pd.read_pickle(df_clause_path)

with open(embedding_cache_path, 'rb') as f:
    embedding_cache_finetuned = pickle.load(f)

# Build equiv_map for memory-efficient neighbor score computation
# (avoids requiring 'equivalents1'/'equivalents2' list columns in df_candidates)
equiv_map = fea.build_equiv_map(df_obs_ent, id1_col="id1", id2_col="id2", include_self=False)

# Strictly greater-than: a frame with exactly the threshold row count still
# takes the standard (non-large-scale) path.
LARGE_SCALE = len(df_candidates) > LARGE_SCALE_ROW_THRESHOLD

print("✓ Successfully loaded all data:")
print(f"  - df_candidates: {len(df_candidates):,} rows {'(LARGE-SCALE MODE)' if LARGE_SCALE else ''}")
print(f"  - df_crossed: {len(df_crossed):,} rows")
print(f"  - df_labeled: {len(df_labeled)} rows")
print(f"  - df_labeled_crossed: {len(df_labeled_crossed)} rows")
print(f"  - df_obs_ent: {len(df_obs_ent)} rows")
print(f"  - df_clause: {len(df_clause)} rows")
print(f"  - embedding_cache: {len(embedding_cache_finetuned)} entries")
print(f"  - equiv_map: {len(equiv_map)} IDs with equivalents")

Added current directory to sys.path: c:\Users\aesteva\Dropbox\Culture\3_data_processing\10_Argumentation\Entailment\CODE\free_entailment_algorithm\fea_project


Loading data from pickle files...
  Loading from: fea_iterations/temp_data/df_candidates.pkl


✓ Successfully loaded all data:
  - df_candidates: 4,999,977 rows 
  - df_crossed: 262,376 rows
  - df_labeled: 1992 rows
  - df_labeled_crossed: 2056 rows
  - df_obs_ent: 600 rows
  - df_clause: 38635 rows
  - embedding_cache: 38635 entries
  - equiv_map: 507 IDs with equivalents


In [4]:
# Re-execute the utilities module so source edits made since the first import
# take effect; as the cell's last expression, the module repr is displayed.
importlib.reload(fea)

<module 'free_entailments_algorithm_utils' from 'c:\\Users\\aesteva\\Dropbox\\Culture\\3_data_processing\\10_Argumentation\\Entailment\\CODE\\free_entailment_algorithm\\fea_project\\free_entailments_algorithm_utils.py'>

# Calculate Similarity On LLM Results

In [5]:
# Score every candidate pair with the fine-tuned bi-encoder.
# An embedding cache keyed by the id columns is supplied, so the helper looks
# embeddings up instead of re-encoding the texts. Original author's timing
# note: re-encoding took ~3-5 minutes, the cache lookup < 1 second.
_candidate_score_kwargs = dict(
    text_col1='text1',
    text_col2='text2',
    model_path="./fine_tuned_bi_model",
    new_col="new_cos_sim_score",
    embedding_cache=embedding_cache_finetuned,
    id_col1='id1',
    id_col2='id2',
)
df_candidates_with_scores = fea.generate_new_bert_results(
    df_candidates, **_candidate_score_kwargs
)
df_candidates_with_scores.head()

Using pre-computed embeddings from cache...


  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,verdict,equivalents1,equivalents2,alpha,new_cos_sim_score
0,B0001001p,B0001007p,,[],[],,0.766113
1,B0001001p,B0005008p,,[],[],,0.409668
2,B0001001p,B0008009p,,[],[],,0.46875
3,B0001001p,B0012005p,,[],[],,0.113464
4,B0001001p,B0019001p,,[],[B0493002p],0.0,0.574219


In [6]:
# Same bi-encoder scoring as for the candidates, applied to the labeled pairs.
# The result replaces df_labeled and carries the new 'new_cos_sim_score' column.
_labeled_score_kwargs = dict(
    text_col1='text1',
    text_col2='text2',
    model_path="./fine_tuned_bi_model",
    new_col="new_cos_sim_score",
    embedding_cache=embedding_cache_finetuned,
    id_col1='id1',
    id_col2='id2',
)
df_labeled = fea.generate_new_bert_results(df_labeled, **_labeled_score_kwargs)

df_labeled.head()

Using pre-computed embeddings from cache...


  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,text1,text2,verdict,equivalents1,equivalents2,alpha,new_cos_sim_score
0,B0137002p,S0022948006p,Legitimate authority derives from the consent ...,The populace is united in their desire for a s...,NO,"[B0090003p, B0768002p]",[],1.0,0.842773
1,S0022948006p,B0137002p,The populace is united in their desire for a s...,Legitimate authority derives from the consent ...,NO,[],"[B0090003p, B0768002p]",0.0,0.842773
2,B0691012p,S0023235007p,Prioritizing the people's welfare is essential...,The Commons is tasked with protecting the righ...,NO,[B0205002p],[],1.0,0.748535
3,S0023235007p,B0691012p,The Commons is tasked with protecting the righ...,Prioritizing the people's welfare is essential...,NO,[],[B0205002p],0.0,0.748535
4,B0360002p,S0023525004p,"When those in power, such as kings and royal o...",I have fulfilled my duty to my nation by speak...,NO,[],[],,0.712402


In [7]:
# DISABLED: cross-encoder (NLI) scoring. Everything in this cell is commented
# out, so running it is a no-op.
# Original note: takes a few minutes depending on computing power.
# NOTE(review): `df_main` is not defined anywhere in the visible notebook;
# point this at the intended frame before re-enabling.

# df_main = fea.add_cross_encoder_score(
#     df_main,
#     text_col1='text1',
#     text_col2='text2',
#     model_name="./fine_tuned_nli_model",
#     new_col="nli_score",
#     batch_size=128              # Keep batch size lower for Cross-Encoders, go easy on your computer
# )

# df_main.head()
# df_main.shape

In [8]:
# Attach a 'cosine_sim' column to the crossed pairs, reusing the fine-tuned
# embedding cache. Per the original notes, model_name is ignored when a cache
# is provided and the progress bar is pointless for cache lookups.
_crossed_cosine_kwargs = dict(
    text_col1="text1",
    text_col2="text2",
    model_name="./fine_tuned_bi_model",
    batch_size=128,
    show_progress_bar=False,
    embedding_cache=embedding_cache_finetuned,
    id_col1='id1',
    id_col2='id2',
)
df_crossed = fea.add_cosine_similarity_from_text(df_crossed, **_crossed_cosine_kwargs)

df_crossed.head()

Using pre-computed embeddings from cache...


  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,text1,text2,verdict,cosine_sim
0,B0001001p,B0493002p,Proponents of divine right deny mankind's natu...,The essence of political power is rooted in th...,,0.513184
1,B0001001p,B0510005p,Proponents of divine right deny mankind's natu...,While the legislative is the supreme power dur...,,0.457031
2,B0001001p,B0749009p,Proponents of divine right deny mankind's natu...,A king's authority is not inherent but granted...,,0.538086
3,B0001001p,B0749004p,Proponents of divine right deny mankind's natu...,The people have the authority to choose their ...,,0.524414
4,B0001001p,B0752008p,Proponents of divine right deny mankind's natu...,Nations possess the autonomy to determine thei...,,0.533203


In [9]:
# Attach a 'cosine_sim' column to the labeled crossed pairs, again served from
# the fine-tuned embedding cache (model_name is ignored when a cache is
# provided, per the original note).
_labeled_crossed_cosine_kwargs = dict(
    text_col1="text1",
    text_col2="text2",
    model_name="./fine_tuned_bi_model",
    batch_size=128,
    show_progress_bar=False,
    embedding_cache=embedding_cache_finetuned,
    id_col1='id1',
    id_col2='id2',
)
df_labeled_crossed = fea.add_cosine_similarity_from_text(
    df_labeled_crossed, **_labeled_crossed_cosine_kwargs
)

df_labeled_crossed.head()

Using pre-computed embeddings from cache...


  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,text1,text2,verdict,cosine_sim
0,S0022948006p,B0090003p,The populace is united in their desire for a s...,Individuals are allowed the liberty to establi...,,0.806152
1,S0022948006p,B0768002p,The populace is united in their desire for a s...,Historical examples and philosophical reasonin...,,0.778809
2,S0023235007p,B0205002p,The Commons is tasked with protecting the righ...,The role of a king is to serve the public good,,0.720703
3,S0018125004p,B0714004p,Parliament is the only body capable of keeping...,The magistrate's power is contingent upon what...,,0.666504
4,S0018125004p,B0783004p,Parliament is the only body capable of keeping...,The King cannot deny the enactment of just laws,,0.760254


# Features

## Compute Cos Sim Neighborhood Score

In [10]:
# Column names shared by both neighbour-score implementations.
_pair_id_cols = {"id1_col": "id1", "id2_col": "id2"}

if not LARGE_SCALE:
    # Standard path: neighbours come from the 'equivalents1'/'equivalents2'
    # list columns of the candidate frame.
    df_candidates = fea.compute_neighbor_weighted_score(
        df5=df_crossed,
        df6=df_candidates_with_scores,
        cosim_df5_col="cosine_sim",
        cosim_df6_col="new_cos_sim_score",
        alpha_col="alpha",
        eq1_col="equivalents1",
        eq2_col="equivalents2",
        new_col="cos_sim_neighbor_score",
        **_pair_id_cols,
    )
else:
    # Memory-efficient path: neighbours come from equiv_map rather than list
    # columns. Original author's note: this short-circuits the ~95%+ of rows
    # that have no neighbours.
    sigma_lookup = fea._build_sigma_lookup_from_df5(
        df_crossed, cosim_col="cosine_sim", **_pair_id_cols
    )
    df_candidates = fea.compute_neighbor_score_efficient(
        sigma_lookup=sigma_lookup,
        df6=df_candidates_with_scores,
        equiv_map=equiv_map,
        cosim_col="new_cos_sim_score",
        alpha_col="alpha",
        new_col="cos_sim_neighbor_score",
        **_pair_id_cols,
    )
df_candidates.head()

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,verdict,equivalents1,equivalents2,alpha,new_cos_sim_score,cos_sim_neighbor_score
0,B0001001p,B0001007p,,[],[],,0.766113,0.766113
1,B0001001p,B0005008p,,[],[],,0.409668,0.409668
2,B0001001p,B0008009p,,[],[],,0.46875,0.46875
3,B0001001p,B0012005p,,[],[],,0.113464,0.113464
4,B0001001p,B0019001p,,[],[B0493002p],0.0,0.574219,0.29468


In [11]:
# Neighbour-weighted score for the labeled pairs. The labeled frame always
# uses the list-column implementation, whatever LARGE_SCALE is.
_labeled_neighbor_kwargs = dict(
    df5=df_labeled_crossed,
    df6=df_labeled,
    id1_col="id1",
    id2_col="id2",
    cosim_df5_col="cosine_sim",
    cosim_df6_col="new_cos_sim_score",
    alpha_col="alpha",
    eq1_col="equivalents1",
    eq2_col="equivalents2",
    new_col="cos_sim_neighbor_score",
)
df_labeled = fea.compute_neighbor_weighted_score(**_labeled_neighbor_kwargs)
df_labeled.head()

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,text1,text2,verdict,equivalents1,equivalents2,alpha,new_cos_sim_score,cos_sim_neighbor_score
0,B0137002p,S0022948006p,Legitimate authority derives from the consent ...,The populace is united in their desire for a s...,NO,"[B0090003p, B0768002p]",[],1.0,0.842773,0.667881
1,S0022948006p,B0137002p,The populace is united in their desire for a s...,Legitimate authority derives from the consent ...,NO,[],"[B0090003p, B0768002p]",0.0,0.842773,0.667881
2,B0691012p,S0023235007p,Prioritizing the people's welfare is essential...,The Commons is tasked with protecting the righ...,NO,[B0205002p],[],1.0,0.748535,0.539472
3,S0023235007p,B0691012p,The Commons is tasked with protecting the righ...,Prioritizing the people's welfare is essential...,NO,[],[B0205002p],0.0,0.748535,0.539472
4,B0360002p,S0023525004p,"When those in power, such as kings and royal o...",I have fulfilled my duty to my nation by speak...,NO,[],[],,0.712402,0.712402


In [12]:

# Sanity check: (rows, columns) of both frames after the neighbour score was added.
df_candidates.shape, df_labeled.shape

((4999977, 8), (1992, 10))

## Compute NLI Score 

In [13]:
# DISABLED: NLI-based neighbour score. Everything in this cell is commented
# out, so running it is a no-op and no 'nli_neighbor_score' column is produced.
# NOTE(review): before re-enabling, check the "Scores added" guard below. It
# tests df_candidates for the ABSENCE of 'nli_score' while the message talks
# about df_crossed, which looks inverted and aimed at the wrong frame.

# df_crossed = fea.add_cross_encoder_score(
#     df_crossed,
#     text_col1="text1",
#     text_col2="text2",

#     # FIX: Use the relative path with ./ just like before
#     model_name="./fine_tuned_nli_model",
#     batch_size=128,
#     new_col="nli_score"
# )

# # (add_cross_encoder_score adds the column in-place)
# if "nli_score" not in df_candidates.columns:
#     print("Scores added to df_crossed!")

# df_crossed.head()

# # We reuse the same function used for Cosine Similarity, but point to NLI columns.
# df_candidates = fea.compute_neighbor_weighted_score(
#     df5=df_crossed,
#     df6=df_candidates,
#     id1_col="id1",
#     id2_col="id2",
#     cosim_df5_col="nli_score",    # The 'sigma' lookup table uses NLI
#     cosim_df6_col="nli_score",    # The 'sigma_ij' value uses NLI
#     alpha_col="alpha",
#     eq1_col="equivalents1",
#     eq2_col="equivalents2",
#     new_col="nli_neighbor_score"
# )

# cols_to_show = ['text1', 'text2', 'nli_score', 'nli_neighbor_score', 'verdict']
# df_candidates[cols_to_show].head()

## Compute Transitivity Score

In [14]:
# Transitivity (graph) features for the candidate pairs.
# Per the original author's note, these use a per-row BFS that is infeasible
# at very large scale, and the model's feature list currently has them
# commented out. In large-scale mode the columns are therefore zero-filled,
# purely to keep the column structure intact.
_graph_cols = ['graph_entailment_score', 'graph_equivalence_score']

if not LARGE_SCALE:
    df_candidates = fea.add_graph_features(
        df=df_candidates,
        entailment_df=df_obs_ent,
        id1_col="id1",
        id2_col="id2",
        verdict_col="verdict",
        positive_label="YES",
        decay=0.9,
        max_hops=5
    )
else:
    print(f"LARGE-SCALE: Skipping BFS graph features for {len(df_candidates):,} rows (would take days)")
    for _graph_col in _graph_cols:
        df_candidates[_graph_col] = 0.0

print("\nGraph Score Stats:")
print(df_candidates[_graph_cols].describe())

Building Directed Entailment Graph...


Computing graph features for 4999977 pairs...



Graph Score Stats:


       graph_entailment_score  graph_equivalence_score
count            4.999977e+06             4.999977e+06
mean             1.458007e-07             1.458007e-07
std              3.260195e-04             3.260195e-04
min              0.000000e+00             0.000000e+00
25%              0.000000e+00             0.000000e+00
50%              0.000000e+00             0.000000e+00
75%              0.000000e+00             0.000000e+00
max              7.290000e-01             7.290000e-01


In [15]:
# Zero-fill the graph feature columns on the labeled frame so its columns
# line up with df_candidates. No BFS is run here because the graph features
# are commented out of the model's feature list.
# NOTE(review): outside large-scale mode df_candidates carries real graph
# scores while df_labeled holds zeros. If these features are ever re-enabled
# for training, compute them for df_labeled as well.
for _graph_col in ('graph_entailment_score', 'graph_equivalence_score'):
    df_labeled[_graph_col] = 0.0

# The summary printed below describes df_candidates, not df_labeled.
print("\nGraph Score Stats (df_candidates):")
print(df_candidates[['graph_entailment_score', 'graph_equivalence_score']].describe())


Graph Score Stats (df_candidates):


       graph_entailment_score  graph_equivalence_score
count            4.999977e+06             4.999977e+06
mean             1.458007e-07             1.458007e-07
std              3.260195e-04             3.260195e-04
min              0.000000e+00             0.000000e+00
25%              0.000000e+00             0.000000e+00
50%              0.000000e+00             0.000000e+00
75%              0.000000e+00             0.000000e+00
max              7.290000e-01             7.290000e-01


In [16]:
import gc

# Drop names that are no longer needed so they do not pin large objects:
#   - df_candidates_with_scores: produced by the candidate bi-encoder scoring cell
#   - sigma_lookup: only exists when the LARGE_SCALE neighbour-score branch ran
# A missing name is simply ignored.
for _stale_name in ("df_candidates_with_scores", "sigma_lookup"):
    globals().pop(_stale_name, None)
gc.collect()

# dropna returns a copy, which briefly doubles memory for a very large frame.
# Count the NaNs first and skip the copy entirely when there is nothing to drop.
_score_col = 'cos_sim_neighbor_score'
nan_count = int(df_candidates[_score_col].isna().sum())
if nan_count > 0:
    df_candidates = df_candidates.dropna(subset=[_score_col])
    gc.collect()

# The labeled frame is small, so it is filtered unconditionally.
df_labeled = df_labeled.dropna(subset=[_score_col])

print(f"After dropna: {len(df_candidates):,} candidates (dropped {nan_count:,}), {len(df_labeled)} labeled")
if not len(df_labeled):
    # This cell only warns; it does not stop execution.
    print("WARNING: No labeled rows with valid scores — model training will be skipped upstream.")

After dropna: 4,999,977 candidates (dropped 0), 1992 labeled


# Predicting Entailment (the model pipeline can be swapped for something other than logistic regression)

In [17]:
# Feature engineering & model training.
# Only the cosine-similarity neighbour score is active; the NLI and graph
# features stay commented out so they can be re-enabled later.
features = [
    'cos_sim_neighbor_score',
    #'nli_neighbor_score',
    #'graph_entailment_score',
    #'graph_equivalence_score'
]
target = 'verdict'
positive_label = 'YES'

# Conservative fallback used whenever hyperparameter tuning yields nothing usable.
_default_boosting_params = {
    'learning_rate': 0.05,
    'max_iter': 300,
    'enforce_monotonicity': True
}

print(f"Training dataset: {len(df_labeled)} pairs with features and verdicts")
print(f"Prediction dataset: {len(df_candidates)} pairs with features (no verdicts)")

# 1. (Optional) Tune the boosting hyperparameters with Optuna on the labeled data.
try:
    print("\n>>> Optimizing Boosting Hyperparameters with Optuna...")
    # This might take a minute but searches for the best-scoring parameters.
    best_params = fea.optimize_boosting_hyperparameters(
        df=df_labeled,  # TRAIN ON LABELED DATA!
        feature_cols=features,
        target_col=target,
        positive_label=positive_label,
        n_trials=30
    )
except Exception as e:
    # Deliberate best-effort: tuning is optional, so any failure (for example
    # Optuna not being installed) falls through to the defaults below.
    print(f"\nOptimization skipped or failed: {e}")
    best_params = None
else:
    if not best_params:
        print("\nOptimization returned no parameters.")

if best_params:
    # Copy so the optimizer's returned mapping is not mutated, then re-add the
    # monotonic-constraint assumption (Optuna tunes numbers, not this structure).
    best_params = dict(best_params)
    best_params['enforce_monotonicity'] = True
else:
    # Covers both a raised exception and an empty/None result. Without this,
    # the `**best_params` unpacking below would raise on None, and an empty
    # dict would silently drop the monotonic constraint.
    print("Using conservative defaults.")
    best_params = dict(_default_boosting_params)


# 2. Run the comparative analysis using the helper function.
# Per the original note, it uses cross-validation internally to guard against overfitting.
comparison_df, best_model_name = fea.compare_entailment_models(
    df=df_labeled,  # TRAIN ON LABELED DATA!
    feature_cols=features,
    target_col=target,
    model_names=["logistic", "spline", "tree", "boosting"],
    positive_label=positive_label,
    **best_params  # Unpack the tuned (or default) parameters here
)

# --- Display Results ---
print("\nComparison Results (Sorted by ROC-AUC):")
display(comparison_df)

print(f"\n>>> Selected '{best_model_name}' model for downstream processing.")

Training dataset: 1992 pairs with features and verdicts
Prediction dataset: 4999977 pairs with features (no verdicts)

>>> Optimizing Boosting Hyperparameters with Optuna...


Starting Optuna optimization with 30 trials...


Best ROC-AUC: 0.7939
Best Params: {'learning_rate': 0.011579523371542905, 'max_iter': 176, 'max_depth': 15, 'max_leaf_nodes': 63, 'min_samples_leaf': 74, 'l2_regularization': 0.00020938628288916486}
Running comparative analysis on 1992 samples...
Features: ['cos_sim_neighbor_score']

--- Training logistic ---
Training Logistic Regression...
Model (logistic) Train Accuracy: 0.7028
--- Training spline ---
Training Spline Logistic Regression...
Model (spline) Train Accuracy: 0.7761
--- Training tree ---
Training Decision Tree Classifier...
Model (tree) Train Accuracy: 0.9227
--- Training boosting ---
Enforcing monotonic constraints: [1]
Training Histogram Gradient Boosting Classifier (lr=0.011579523371542905, iter=176)...


Model (boosting) Train Accuracy: 0.7430



Comparison Results (Sorted by ROC-AUC):


Unnamed: 0_level_0,ROC-AUC (CV),Log Loss,Separation,Mean Prob (YES),Mean Prob (NO)
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
spline,0.744035,0.611927,0.314212,0.614675,0.300463
boosting,0.743284,0.578326,0.27473,0.62528,0.35055
logistic,0.730587,0.928007,0.099416,0.512945,0.413529
tree,0.685704,7.677284,0.323033,0.511402,0.188369



>>> Selected 'spline' model for downstream processing.


In [18]:
print(f"Retraining '{best_model_name}' with optimized parameters on labeled data...")

# Fit the selected model family on the labeled pairs (df_labeled holds the verdicts).
best_pipeline = fea.train_entailment_model(
    df=df_labeled,
    feature_cols=features,
    target_col=target,
    method=best_model_name,
    positive_label=positive_label,
    **best_params,
)

# Score every candidate pair with the fitted pipeline.
df_candidates = fea.predict_entailment_probabilities(
    df_candidates,
    model_pipeline=best_pipeline,
    feature_cols=features,
    new_col='entailment_probability',
)

# Quick distribution summary of the predicted probabilities.
_probabilities = df_candidates['entailment_probability']
print("\nPrediction stats:")
for _stat_label, _stat_value in (
    ("Min", _probabilities.min()),
    ("Max", _probabilities.max()),
    ("Mean", _probabilities.mean()),
):
    print(f"  {_stat_label} probability: {_stat_value:.4f}")

# Note: the preview shown is the labeled frame, not the candidates.
df_labeled.head()

Retraining 'spline' with optimized parameters on labeled data...
Training Spline Logistic Regression...
Model (spline) Train Accuracy: 0.7761



Prediction stats:
  Min probability: 0.0575
  Max probability: 0.9240
  Mean probability: 0.7729


  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,text1,text2,verdict,equivalents1,equivalents2,alpha,new_cos_sim_score,cos_sim_neighbor_score,graph_entailment_score,graph_equivalence_score
0,B0137002p,S0022948006p,Legitimate authority derives from the consent ...,The populace is united in their desire for a s...,NO,"[B0090003p, B0768002p]",[],1.0,0.842773,0.667881,0.0,0.0
1,S0022948006p,B0137002p,The populace is united in their desire for a s...,Legitimate authority derives from the consent ...,NO,[],"[B0090003p, B0768002p]",0.0,0.842773,0.667881,0.0,0.0
2,B0691012p,S0023235007p,Prioritizing the people's welfare is essential...,The Commons is tasked with protecting the righ...,NO,[B0205002p],[],1.0,0.748535,0.539472,0.0,0.0
3,S0023235007p,B0691012p,The Commons is tasked with protecting the righ...,Prioritizing the people's welfare is essential...,NO,[],[B0205002p],0.0,0.748535,0.539472,0.0,0.0
4,B0360002p,S0023525004p,"When those in power, such as kings and royal o...",I have fulfilled my duty to my nation by speak...,NO,[],[],,0.712402,0.712402,0.0,0.0


# Optimize threshold 

In [19]:
# Score the labeled pairs with the fitted pipeline; their known verdicts make
# it possible to search for good decision thresholds.
print("Predicting on labeled data for threshold optimization...")
df_labeled_with_features = fea.predict_entailment_probabilities(
    df_labeled,
    model_pipeline=best_pipeline,
    feature_cols=features,
    new_col='entailment_probability',
)

# Pick up any edits made to the utilities module before the threshold search.
importlib.reload(fea)

# 'entailment_probability' holds the selected model's output for each labeled pair.
_threshold_search_kwargs = dict(
    df=df_labeled_with_features,
    score_col="entailment_probability",
    verdict_col="verdict",
    positive_label="YES",
)
results = fea.find_best_thresholds(**_threshold_search_kwargs)

df_labeled_with_features.head()

Predicting on labeled data for threshold optimization...


  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id1,id2,text1,text2,verdict,equivalents1,equivalents2,alpha,new_cos_sim_score,cos_sim_neighbor_score,graph_entailment_score,graph_equivalence_score,entailment_probability
0,B0137002p,S0022948006p,Legitimate authority derives from the consent ...,The populace is united in their desire for a s...,NO,"[B0090003p, B0768002p]",[],1.0,0.842773,0.667881,0.0,0.0,0.058054
1,S0022948006p,B0137002p,The populace is united in their desire for a s...,Legitimate authority derives from the consent ...,NO,[],"[B0090003p, B0768002p]",0.0,0.842773,0.667881,0.0,0.0,0.058054
2,B0691012p,S0023235007p,Prioritizing the people's welfare is essential...,The Commons is tasked with protecting the righ...,NO,[B0205002p],[],1.0,0.748535,0.539472,0.0,0.0,0.178297
3,S0023235007p,B0691012p,The Commons is tasked with protecting the righ...,Prioritizing the people's welfare is essential...,NO,[],[B0205002p],0.0,0.748535,0.539472,0.0,0.0,0.178297
4,B0360002p,S0023525004p,"When those in power, such as kings and royal o...",I have fulfilled my duty to my nation by speak...,NO,[],[],,0.712402,0.712402,0.0,0.0,0.06252


In [20]:
# One line per optimisation criterion: the winning threshold, then the metric
# value reached at that threshold.
_threshold_report_rows = (
    ("accuracy", "best_tau_accuracy", "Accuracy:", "best_accuracy"),
    ("F1", "best_tau_f1", "F1:", "best_f1"),
    ("TP", "best_tau_tp", "TP:", "max_true_positives"),
    ("precision", "best_tau_precision", "prec:", "best_precision"),
    ("recall", "best_tau_recall", "rec:", "best_recall"),
)
for _criterion, _tau_key, _metric_label, _metric_key in _threshold_report_rows:
    print(f"Best tau ({_criterion}):", results[_tau_key],
          _metric_label, results[_metric_key])

Best tau (accuracy): 0.43807750663905065 Accuracy: 0.7791164658634538
Best tau (F1): 0.43807750663905065 F1: 0.7142857142857143
Best tau (TP): 0.05748090145007371 TP: 600
Best tau (precision): 0.8493363484263536 prec: 0.7676767676767676
Best tau (recall): 0.05748090145007371 rec: 1.0


In [21]:
# Display the per-threshold summary table stored under "best_taus_table" in
# the results returned by fea.find_best_thresholds.
results["best_taus_table"]

Unnamed: 0,tau,TP,TN,FP,FN,accuracy,precision,recall,f1
0,0.057481,600,2,1390,0,0.302209,0.301508,1.0,0.46332
1,0.438078,550,1002,390,50,0.779116,0.585106,0.916667,0.714286
2,0.849336,152,1346,46,448,0.752008,0.767677,0.253333,0.380952


In [22]:
import importlib
import plotly.io as pio
import free_entailments_algorithm_utils as fea
importlib.reload(fea)

# Choose a Plotly renderer suited to the notebook / VS Code context.
pio.renderers.default = "notebook_connected"

# Re-run the threshold search on the labeled data (the frame with verdicts).
# The results also provide 'best_tau_low_send' (top 1-5% of candidates).
results = fea.find_best_thresholds(
    df=df_labeled_with_features,
    score_col="entailment_probability",
    verdict_col="verdict",
    positive_label="YES",
)

# Fall back to 0.95 when the search did not report a low-send threshold.
tau_low_send = results.get('best_tau_low_send', 0.95)

print("\nLow-Send Optimization (Candidate Selection):")
print(f"Selected Low-Send Threshold: {tau_low_send:.6f}")
if "low_send_table" in results:
    display(results["low_send_table"])

print("\n>>> Interactive Analysis: LLM Savings vs Threshold")

# Reference thresholds handed to the plotting helper as markers.
markers_to_show = {"Optimization (Top %)": tau_low_send}
markers_to_show.update({
    "Max Accuracy": results["best_tau_accuracy"],
    "Max F1": results["best_tau_f1"],
})

# A pair counts as "sent" when its probability exceeds the threshold.
# The plot is drawn from the labeled data as well.
fig = fea.plot_llm_savings_over_thresholds(
    df=df_labeled_with_features,
    prob_col="entailment_probability",
    verdict_col="verdict",
    positive_label="YES",
    step=0.01,
    markers=markers_to_show,
)
fig.show()


Low-Send Optimization (Candidate Selection):
Selected Low-Send Threshold: 0.878303


Unnamed: 0,target_percentile,tau,sent_rate,FN,TP,FP,TN
0,0.01,0.908117,0.01004,588,12,8,1384
1,0.02,0.901335,0.02008,576,24,16,1376
2,0.03,0.894188,0.03012,562,38,22,1370
3,0.04,0.884899,0.040161,546,54,26,1366
4,0.05,0.878303,0.050201,530,70,30,1362



>>> Interactive Analysis: LLM Savings vs Threshold


In [23]:
import importlib
import gc
import free_entailments_algorithm_utils as fea
importlib.reload(fea)

print("--- Defining Threshold for LLM ---")

# Strategy: send every pair whose probability is above a cost-sensitive threshold.
# The 'Minimize False Negatives' strategy filters out definite negatives while
# keeping potential candidates. A 1:5 cost ratio means a missed YES (FN) is
# penalised 5x more than sending a useless NO (FP).
# NOTE(review): no data is passed to this call, so the threshold appears to be
# derived from the cost ratio alone — confirm that is intended.
tau = fea.get_optimal_threshold_minimize_fn(strategy='cost', cost_fn=5.0)

print(f"Selected Threshold: {tau:.4f} (Send if Score > {tau:.4f})")
print(f"Logic: Minimize FN (Don't miss Entailments). Auto-Reject scores <= {tau:.4f}.")

# Build the final DataFrame for the LLM from the candidates above the threshold.
print("\n--- Generating File ---")
df_final = fea.generate_final_df(
    df=df_candidates,
    prob_col='entailment_probability',
    threshold=tau,
    df_clause=df_clause,
    id_col='sentence_id',
    text_col='sentence'
)

# Release the frames that are no longer needed BEFORE the outputs are
# serialised, to keep peak memory down. Each name is removed independently, so
# one missing name cannot leave the others alive.
# df_final is not necessarily small: with a low threshold it can keep most of
# the candidate rows.
for _released_name in ("df_candidates", "df_labeled", "df_labeled_with_features"):
    globals().pop(_released_name, None)
gc.collect()
print(f"✓ Freed df_candidates — only df_final ({len(df_final)} rows) remains")

--- Defining Threshold for LLM ---
Selected Threshold: 0.1667 (Send if Score > 0.1667)
Logic: Minimize FN (Don't miss Entailments). Auto-Reject scores <= 0.1667.

--- Generating File ---


--- Generating LLM Batch ---
Original Count: 4,999,977
Filtered Count: 4,785,065 (95.7%)
Condition:      P > 0.1667 (Send High Confidence Pairs)


✓ Freed df_candidates — only df_final (4785065 rows) remains


In [24]:
import os, pickle

# Outputs are written as pickle files next to the inputs rather than passed
# through sb.glue. Per the original author's note, glue JSON-serialises through
# Jupyter's messaging layer and can run out of memory at very large scale.
_out_dir = os.path.dirname(df_candidates_path)  # same temp_dir used for inputs
_df_final_file = os.path.join(_out_dir, "df_final.pkl")
_fig_html_file = os.path.join(_out_dir, "fig_html.pkl")

df_final.to_pickle(_df_final_file)

# The figure is stored as a pickled HTML string; plotly.js is referenced from the CDN.
fig_html = fig.to_html(include_plotlyjs='cdn')
with open(_fig_html_file, 'wb') as f:
    pickle.dump(fig_html, f)

print(f"✓ Saved df_final ({len(df_final)} rows) and fig_html ({len(fig_html)} chars) to {_out_dir}")

# Glue only a small piece of metadata via scrapbook so papermill can tell the
# notebook ran to completion.
import scrapbook as sb
sb.glue('df_final_rows', len(df_final))
print("✓ Outputs saved for papermill retrieval")

✓ Saved df_final (4785065 rows) and fig_html (14638 chars) to fea_iterations/temp_data


✓ Outputs saved for papermill retrieval
