In [1]:
import pandas as pd
import sys
import pickle
import importlib
import os
import numpy as np
import scrapbook as sb
from sentence_transformers import SentenceTransformer

# papermill starts from a fresh kernel, so the working directory has to be made
# importable before the project-local utilities module is imported below.
current_dir = os.getcwd()
if sys.path.count(current_dir) == 0:
    sys.path[:0] = [current_dir]

import free_entailments_algorithm_utils as fea

KeyboardInterrupt: 

In [3]:
# Run parameters for this iteration (presumably the papermill `parameters` cell — confirm the cell tag).
iteration_number = 1  # forwarded to load_pipeline_data, run_fea_papermill and finalize_pipeline_iteration
input_csv_path = "labeled_pairs/Results_DS_BtoS_iteration_0.csv"  # labeled pairs, read with pd.read_csv
df_clause_path = None  # forwarded to fea.load_pipeline_data
embedding_cache_path = None  # forwarded to fea.load_pipeline_data
test = True  # selects the test-mode branches in later cells; also forwarded to the fea helpers
remaining_llm_calls_path = None  # forwarded to load_pipeline_data and finalize_pipeline_iteration
unlabeled_pairs_path = None  # forwarded to the fea helpers; also the pickle re-read before finalize when test is False
sent_frac = 0.5  # NOTE(review): not referenced by any cell visible in this notebook
budget = 0.00  # NOTE(review): not referenced by any cell visible in this notebook

In [None]:
# Fetch the shared pipeline inputs through fea.load_pipeline_data, then bind
# the four entries that the later cells work with.
pipeline_data = fea.load_pipeline_data(
    test=test,
    iteration_number=iteration_number,
    df_clause_path=df_clause_path,
    embedding_cache_path=embedding_cache_path,
    remaining_llm_calls_path=remaining_llm_calls_path,
    unlabeled_pairs_path=unlabeled_pairs_path,
)

(
    df_clause,
    embedding_cache_finetuned,
    remaining_llm_calls,
    unlabeled_pairs,
) = (
    pipeline_data[key]
    for key in ('df_clause', 'embedding_cache', 'remaining_llm_calls', 'unlabeled_pairs')
)

# Task 1: Setting up dataframes and Running FEA

In [None]:
df_llm_original = pd.read_csv(input_csv_path)

# Arguments shared by every fea.add_verdict call in this cell.
verdict_args = dict(
    id1_col='sentence_id_1',
    id2_col='sentence_id_2',
    conclusion_col='llm_conclusion_12',
    positive_label='YES',
)

# A populated 'verdict' column (e.g. written by process_llm_results_bidirectional)
# is used as-is: add_verdict only sees the one-way conclusion column and would
# overwrite correct bidirectional verdicts.
has_verdict = (
    'verdict' in df_llm_original.columns
    and df_llm_original['verdict'].notna().any()
)
if not has_verdict:
    df_llm = fea.add_verdict(df_llm_original, **verdict_args)
else:
    df_llm = df_llm_original
    print(f"Using existing 'verdict' column ({(df_llm['verdict']=='YES').sum()} YES, {(df_llm['verdict']=='NO').sum()} NO)")

# Test mode only: derive verdicts for the held-out pool as well.
if test:
    df_llm_remaining = fea.add_verdict(remaining_llm_calls, **verdict_args)

Total cost so far: $0.0000

VERDICT SUMMARY
Total pairs: 2000
Bidirectional entailment (YES): 241 (12.0%)
Not bidirectionally entailed (NO): 1759 (87.9%)


VERDICT SUMMARY
Total pairs: 8000
Bidirectional entailment (YES): 974 (12.2%)
Not bidirectionally entailed (NO): 7026 (87.8%)



In [45]:
# Join the labeled pairs with the sentence text of both members (looked up in
# df_clause), carrying the verdict column along.
df_labeled = fea.merge_pairwise_texts(
    df1=df_clause,
    df2=df_llm,
    df1_cols=['sentence_id', 'sentence'],
    df2_cols=['sentence_id_1', 'sentence_id_2', 'verdict'],
)
df_labeled.head()

Unnamed: 0,id1,id2,text1,text2,verdict
0,B0860002sc,S0010771002sc,The king's support must match his responsibili...,clear evidence of acting against the interests...,NO
1,B1170001sc,S0020225001sc,Active governance by the prince is essential f...,Maintaining respect for the monarchy is essential,NO
2,B0454001p,S0004868005p,Agrarian laws can effectively prevent the rise...,This situation highlights the tension between ...,NO
3,B0227001sc,S0000883002sc,Parliament should hold the power to correct le...,Parliament must uphold the rule of law,YES
4,B0580002sc,S0023399001sc,The king's presence is essential for validatin...,Parliament must assert authority,NO


In [None]:
import gc

# Build df_predict, the pool of pairs whose verdict is to be predicted.
if test:
    # Test mode: use the held-out pool and attach sentence texts from df_clause.
    df_predict = fea.merge_pairwise_texts(
        df1=df_clause,
        df2=df_llm_remaining,
        df1_cols=['sentence_id', 'sentence'],
        df2_cols=['sentence_id_1', 'sentence_id_2'],
    )
else:
    # At 20M+ pairs, we skip merge_pairwise_texts on the full pool
    # (pairs were already validated at generation, text not needed until
    #  the small df_final at the end). Just remove already-labeled pairs.
    df_predict = fea.setminus(
        df_big=unlabeled_pairs,
        df_small=df_labeled,
        id_cols=['id1', 'id2'],
    )
    df_predict['verdict'] = np.nan

    if unlabeled_pairs_path:
        # Free unlabeled_pairs immediately — it consumed ~3 GB and is no longer
        # needed (df_predict is the working copy).  It is reloaded from the
        # pickle at unlabeled_pairs_path before finalize_pipeline_iteration.
        del unlabeled_pairs
        gc.collect()
        print("✓ Freed unlabeled_pairs (will reload from pickle before finalize)")
    else:
        # Without a pickle path the final cell cannot reload the frame, and
        # deleting it here would make that cell fail with a NameError.
        print("⚠ Keeping unlabeled_pairs in memory (no unlabeled_pairs_path to reload from)")

df_predict.head()

Unnamed: 0,id1,id2,text1,text2,verdict
0,B0859002p,S5393003p,The authority of a king is divinely ordained a...,The assertion that the king's power is derived...,
1,B0672011p,S15260001p,Ensuring the stability and governance of the s...,Establishing a stable government requires adhe...,
2,B0589007p,S0000863004p,The rebellion was provoked by the subjects' at...,King Charles I believed that asserting his aut...,
3,B0382001p,S8507005p,The authority of a king is not absolute; it is...,The supremacy of royal authority in governance...,
4,B1114001sc,S0000941002sc,The connection between protection and obedienc...,to advocate for peace,


## Embedding All Sentences

In [None]:
# Workaround for an error hit later in the notebook (the closing message calls
# it a 404): swap transformers' `list_repo_templates` for a stub.
import transformers.utils.hub
import transformers.tokenization_utils_base


def _safe_list_templates(*args, **kwargs):
    """Stand-in for `list_repo_templates`: accept any arguments, return an empty list."""
    return list()


# tokenization_utils_base holds its own reference to the function (it had
# already imported it), so both modules need the replacement.
for _patched_module, _message in (
    (transformers.utils.hub, " - Patched transformers.utils.hub"),
    (transformers.tokenization_utils_base, " - Patched transformers.tokenization_utils_base"),
):
    setattr(_patched_module, "list_repo_templates", _safe_list_templates)
    print(_message)

print("\nSUCCESS: The 404 error is now blocked.")

 - Patched transformers.utils.hub
 - Patched transformers.tokenization_utils_base

SUCCESS: The 404 error is now blocked.


## Test and Validation Subsamples

In [48]:
# Observed entailments: the labeled pairs whose verdict is 'YES'.
df_obs_ent = df_labeled[df_labeled['verdict'].eq('YES')]
df_obs_ent.head()

Unnamed: 0,id1,id2,text1,text2,verdict
3,B0227001sc,S0000883002sc,Parliament should hold the power to correct le...,Parliament must uphold the rule of law,YES
18,B0134001sc,S0004953001sc,Governance derives its legitimacy from the peo...,Governance legitimacy should come from the wil...,YES
20,B0794007p,S0000823011p,King Charles's actions demonstrate a tyrannica...,King Charles I's disregard for the people's vo...,YES
22,B0161002p,S15310007p,Such actions threaten the liberties and well-b...,Such actions endanger the rights of individual...,YES
26,B0252006p,S0000715007p,"The authority of governing bodies, like Parlia...",Parliament serves as a check on the power of t...,YES


In [None]:
# --- Candidate preparation: the code path depends on the pool size ---
# Above LARGE_THRESHOLD rows, the vectorised alpha + equiv_map helper replaces
# per-row Python list objects (the original note reports ~8 GB saved at 75M rows).
LARGE_THRESHOLD = 5_000_000

if len(df_predict) <= LARGE_THRESHOLD:
    # Original path (fine for small DataFrames): add the per-row equivalents
    # lists first, then derive the alpha column from them.
    df_candidates = fea.add_alpha_weight_column(
        df=fea.add_equivalents_from_pairs(
            df3=df_obs_ent,
            df4=df_predict,
            df3_cols=["id1", "id2"],
            df4_cols=["id1", "id2"],
            new_cols=("equivalents1", "equivalents2"),
            include_self=False,
        ),
        list_col1='equivalents1',
        list_col2='equivalents2',
        new_col="alpha",
    )
    # None marks that df_crossed has not been built yet; a later cell checks this.
    equiv_map = None
else:
    print(f"Using memory-efficient path ({len(df_predict):,} candidate rows)")
    df_candidates, df_crossed, equiv_map = fea.prepare_candidates_efficient(
        df_obs_ent=df_obs_ent,
        df_predict=df_predict,
        df_clause=df_clause,
    )

In [50]:
# Apply the same equivalents + alpha enrichment to the labeled pairs themselves.
df_labeled = fea.add_alpha_weight_column(
    df=fea.add_equivalents_from_pairs(
        df3=df_obs_ent,
        df4=df_labeled,
        df3_cols=["id1", "id2"],
        df4_cols=["id1", "id2"],
        new_cols=("equivalents1", "equivalents2"),
        # Presumably leaves each ID out of its own equivalents list —
        # TODO confirm that excluding it is the intended setting here.
        include_self=False,
    ),
    list_col1='equivalents1',
    list_col2='equivalents2',
    new_col="alpha",
)

## Equivalence Classes

In [None]:
# The memory-efficient path has already returned df_crossed. Only the original
# (small) path — recognisable by equiv_map being None — still has to build it:
# cross the equivalents, then attach the sentence texts from df_clause.
if equiv_map is None:
    df_crossed = fea.merge_pairwise_texts(
        df1=df_clause,
        df2=fea.build_equiv_pair_candidates(
            df=df_candidates,
            id1_col="id1",
            id2_col="id2",
            equiv1_col="equivalents1",
            equiv2_col="equivalents2",
        ),
        df1_cols=['sentence_id', 'sentence'],
        df2_cols=['id1', 'id2'],
    )

df_crossed.head()

Filtered 1228 pairs (kept 710).


Unnamed: 0,id1,id2,text1,text2,verdict
0,B1015002sc,B1009004sc,A stable society enables individual flourishing,Foster societal cohesion,
1,B0312002p,B0659002p,The concept of a free monarchy fundamentally c...,The legitimacy of royal power is rooted in the...,
2,B0781006p,B0223012p,The King's duty to uphold justice is emphasized,Limiting a King's authority to the consent of ...,
3,B0278001sc,B0795002sc,The King's power should be limited to promote ...,The King does not govern for the people's benefit,
4,B0190002p,B0223001p,The legitimacy of royal power is contingent up...,The authority of a King should indeed be limit...,


In [54]:
# Cross the equivalents of the labeled pairs, then retrieve the clause
# sentences for both members of every crossed pair.
df_labeled_crossed = fea.merge_pairwise_texts(
    df1=df_clause,
    df2=fea.build_equiv_pair_candidates(
        df=df_labeled,
        id1_col="id1",
        id2_col="id2",
        equiv1_col="equivalents1",
        equiv2_col="equivalents2",
    ),
    df1_cols=['sentence_id', 'sentence'],
    df2_cols=['id1', 'id2'],
)

df_labeled_crossed.head()

Filtered 510 pairs (kept 456).


Unnamed: 0,id1,id2,text1,text2,verdict
0,B0227001sc,B0227001sc,Parliament should hold the power to correct le...,Parliament should hold the power to correct le...,
1,B0244002sc,B0311001sc,Parliaments in England can create laws indepen...,The king requires parliamentary approval to im...,
2,B0089006p,B0800005p,The rights and liberties of the people depend ...,The authority of a king or any governing body ...,
3,B0134001sc,B0134001sc,Governance derives its legitimacy from the peo...,Governance derives its legitimacy from the peo...,
4,B0794007p,B0794007p,King Charles's actions demonstrate a tyrannica...,King Charles's actions demonstrate a tyrannica...,


## Pre-compute cosine similarities & run FEA

In [None]:
# All cosine similarities are computed in this (Pipeline) notebook so that
# FreeEntailmentAlgorithm neither reloads the embedding cache nor repeats the work.
# Per the original note this saves ~88 MB of cache pickling/unpickling and one
# full pass over 5M+ rows.

print("Pre-computing cosine similarities in Pipeline...")

# Shared arguments for the two fea.generate_new_bert_results calls.
bert_score_args = dict(
    text_col1='text1',
    text_col2='text2',
    model_path="./fine_tuned_bi_model",
    new_col="new_cos_sim_score",
    embedding_cache=embedding_cache_finetuned,
    id_col1='id1',
    id_col2='id2',
)

# Shared arguments for the two fea.add_cosine_similarity_from_text calls.
cosine_sim_args = dict(
    text_col1="text1",
    text_col2="text2",
    model_name="./fine_tuned_bi_model",
    batch_size=128,
    show_progress_bar=False,
    embedding_cache=embedding_cache_finetuned,
    id_col1='id1',
    id_col2='id2',
)

# 1. df_candidates → new_cos_sim_score
df_candidates = fea.generate_new_bert_results(df_candidates, **bert_score_args)
print(f"  ✓ df_candidates: new_cos_sim_score added ({len(df_candidates):,} rows)")

# 2. df_labeled → new_cos_sim_score
df_labeled = fea.generate_new_bert_results(df_labeled, **bert_score_args)
print(f"  ✓ df_labeled: new_cos_sim_score added ({len(df_labeled)} rows)")

# 3. df_crossed → cosine_sim
df_crossed = fea.add_cosine_similarity_from_text(df_crossed, **cosine_sim_args)
print(f"  ✓ df_crossed: cosine_sim added ({len(df_crossed):,} rows)")

# 4. df_labeled_crossed → cosine_sim
df_labeled_crossed = fea.add_cosine_similarity_from_text(df_labeled_crossed, **cosine_sim_args)
print(f"  ✓ df_labeled_crossed: cosine_sim added ({len(df_labeled_crossed)} rows)")

print("✓ All cosine similarities pre-computed in Pipeline")

In [None]:
# Hand the prepared DataFrames to the FreeEntailmentAlgorithm notebook via
# pickles in temp_dir, free the in-memory copies, then run it through
# fea.run_fea_papermill. Produces df_final and fig_html.
# NOTE(review): `pickle` is imported but this cell only uses DataFrame.to_pickle.
import gc, os, pickle

temp_dir = "fea_iterations/temp_data"
os.makedirs(temp_dir, exist_ok=True)

# NOTE(review): the else branch below deletes df_candidates and df_obs_ent, so
# re-running this cell afterwards fails here with a NameError.
n_cand = len(df_candidates)
n_obs  = len(df_obs_ent)

if n_cand == 0 or n_obs == 0:
    # Nothing to run: emit an empty result with the columns listed below and a
    # placeholder figure, so the cells after this one still find both names.
    print(f"⚠ Skipping FreeEntailmentAlgorithm (empty data: "
          f"{n_cand} candidates, {n_obs} entailed pairs)")
    df_final = pd.DataFrame(
        columns=['id1', 'id2', 'text1', 'text2', 'entailment_probability'])
    fig_html = "<p>No data for this iteration</p>"
else:
    # 1. Pickle all DataFrames to disk for FreeEntailmentAlgorithm
    #    Cosine similarities are already computed — no need to pickle embedding_cache.
    df_candidates.to_pickle(f"{temp_dir}/df_candidates.pkl")
    print(f"  Pickled df_candidates: {n_cand:,} rows, cols={list(df_candidates.columns)}")
    df_crossed.to_pickle(f"{temp_dir}/df_crossed.pkl")
    df_labeled.to_pickle(f"{temp_dir}/df_labeled.pkl")
    df_labeled_crossed.to_pickle(f"{temp_dir}/df_labeled_crossed.pkl")
    df_obs_ent.to_pickle(f"{temp_dir}/df_obs_ent.pkl")
    df_clause.to_pickle(f"{temp_dir}/df_clause.pkl")

    # 2. FREE all large DataFrames BEFORE spawning FreeEntailmentAlgorithm.
    #    papermill runs a NEW kernel process — if we keep these in memory,
    #    we'd have two copies of the 75M-row DataFrame across two processes.
    #    Each del is guarded because a name may already be gone.
    #    df_labeled and df_clause are pickled above but not deleted here;
    #    df_clause is read again later by fea.format_df_to_llm.
    try: del df_candidates
    except NameError: pass
    try: del df_predict
    except NameError: pass
    try: del df_crossed
    except NameError: pass
    try: del df_labeled_crossed
    except NameError: pass
    try: del df_obs_ent
    except NameError: pass
    try: del equiv_map
    except NameError: pass
    gc.collect()
    print("  ✓ Freed large DataFrames before FreeEntailmentAlgorithm subprocess")

    # 3. Execute FreeEntailmentAlgorithm (data already on disk)
    df_final, fig_html = fea.run_fea_papermill(
        iteration_number=iteration_number,
        temp_dir=temp_dir,
        data_on_disk=True,
    )

print(f"✓ df_final: {len(df_final)} rows")

Executing FreeEntailmentAlgorithm.ipynb for iteration 1...


Executing:   0%|          | 0/34 [00:00<?, ?cell/s]

✓ Retrieved outputs:
  - df_final: 3748 rows
  - fig_html: HTML plot (14714 chars)
  - estimated_cost_all_pairs: $7.1918


In [90]:
# Preview the first rows of df_final as produced by the previous cell.
df_final.head()

Unnamed: 0,id1,id2,text1,text2,entailment_probability
2,B0589007p,S0000863004p,The rebellion was provoked by the subjects' at...,King Charles I believed that asserting his aut...,0.637147
4,B1114001sc,S0000941002sc,The connection between protection and obedienc...,to advocate for peace,0.706039
5,B0244004p,S0024289007p,Parliaments possess the power to create and ab...,It is crucial to maintain a clear separation b...,0.708335
14,B0351002sc,S0003513002sc,The assembly of estates has the authority to p...,The House of Commons must protect the relation...,0.720604
18,B0403006p,S0020750006p,The power to grant pardons distinguishes the s...,The implications of allowing these pardons ext...,0.26041


# Task 2: Cleaning LLM Calls

In [None]:
# Upper bound on the number of pairs sent to the LLM in one iteration.
# NOTE(review): when df_final holds more rows than the cap, the pairs are drawn
# by a seeded random sample (random_state=42) rather than sent in full —
# confirm that both the sampling rule and the value 1000 are intended.
MAX_LLM_PAIRS = 1000

df_final = df_final.reset_index(drop=True)

if len(df_final) <= MAX_LLM_PAIRS:
    # Within the cap: forward everything, on an independent copy.
    df_to_llm = df_final.copy()
    print(f"Sending all {len(df_to_llm):,} pairs above threshold to LLM")
else:
    df_to_llm = df_final.sample(n=MAX_LLM_PAIRS, random_state=42)
    print(f"Capped df_to_llm at {MAX_LLM_PAIRS:,} (from {len(df_final):,} above threshold)")

3.5958936666666665

In [None]:
# Convert the selected pairs with fea.format_df_to_llm, which looks sentences
# up in df_clause by 'sentence_id'.
df_to_llm = fea.format_df_to_llm(
    df_to_llm,
    df_clause=df_clause,
    id_col='sentence_id',
    text_col='sentence',
)
df_to_llm.head()

Unnamed: 0,sentence_id_2,sentence_id_1,sentence_text_2,argument_id_2,sentence_text_1,argument_id_1,score
3039,S0005432003p,B1157006p,Upholding parliamentary authority is essential...,S00054,The governance ensured by the elected leader i...,B1157,0.72072
152,S0018405001p,B0273002p,The necessity for immediate and decisive advic...,S00184,The foundation of a king's authority is rooted...,B0273,0.614942
681,S0003019001sc,B0223001sc,The proposed Paper Address to the king require...,S00030,Limiting a King's authority to the consent of ...,B0223,0.687784
5947,S0051611003p,B1140007p,The potential abuse of power by the monarchy n...,S00516,"By distancing blood-relations from power, the ...",B1140,0.386518
6813,S0020972001p,B0778006p,The necessity of immediate action by Parliamen...,S00209,The King must act in accordance with the legal...,B0778,0.66668


In [95]:
# Row/column count of the batch that will be sent to the LLM.
# NOTE(review): a stored output with more rows than MAX_LLM_PAIRS likely
# predates the current cap — re-run to refresh.
df_to_llm.shape

(1874, 7)

# Next loop:

In [None]:
# Production mode only: bring unlabeled_pairs back from its pickle. It was
# freed earlier to save memory while FreeEntailmentAlgorithm ran.
# NOTE(review): when no reload happens, the call below relies on
# `unlabeled_pairs` still being defined by an earlier cell.
if unlabeled_pairs_path and not test:
    import gc
    # pd.read_pickle executes pickle data; only point this at trusted files.
    unlabeled_pairs = pd.read_pickle(unlabeled_pairs_path)
    print(f"Reloaded unlabeled_pairs: {len(unlabeled_pairs):,} rows")
    gc.collect()

# Close the iteration and pick up the updated pools for the next loop.
result = fea.finalize_pipeline_iteration(
    test=test,
    iteration_number=iteration_number,
    df_to_llm=df_to_llm,
    remaining_llm_calls=remaining_llm_calls,
    remaining_llm_calls_path=remaining_llm_calls_path,
    unlabeled_pairs=unlabeled_pairs,
    unlabeled_pairs_path=unlabeled_pairs_path,
)

remaining_llm_calls, unlabeled_pairs = (
    result[key] for key in ('remaining_llm_calls', 'unlabeled_pairs')
)


TEST MODE: Mocking LLM responses
✓ Matched 1874/1874 pairs with mock LLM results
✓ Removed 1874 pairs from remaining LLM calls
✓ Remaining pairs for future iterations: 6126
✓ Saved 1874 pairs with LLM results to fea_iterations/llm_batch_iter_1.csv

Iteration 1 complete
Total accumulated cost: $0.0000
