In [1]:
import pandas as pd
import sys
import pickle
import importlib
import os
import numpy as np
import scrapbook as sb
from sentence_transformers import SentenceTransformer

# Papermill starts from a fresh kernel, so the working directory (the project
# root) is pushed to the front of sys.path by hand; the project-local import
# that follows relies on it.
current_dir = os.getcwd()
if sys.path.count(current_dir) == 0:
    sys.path.insert(0, current_dir)

import free_entailments_algorithm_utils as fea

In [2]:
# Default parameter values for this notebook. A papermill run overrides them
# through the injected "# Parameters" cell that follows, so the values here
# only apply when the notebook is executed by hand.
iteration_number = 1  # passed to the fea load / run / finalize helpers
input_csv_path = "labeled_pairs/Results_DS_BtoS_iteration_0.csv"  # labeled pairs, read with pd.read_csv
df_clause_path = None  # forwarded to fea.load_pipeline_data
embedding_cache_path = None  # forwarded to fea.load_pipeline_data
test = True  # selects the test-mode branches in later cells
remaining_llm_calls_path = None  # forwarded to fea.load_pipeline_data and fea.finalize_pipeline_iteration
unlabeled_pairs_path = None  # pickle path; also used to reload unlabeled_pairs before finalizing
sent_frac = 0.5  # NOTE(review): not referenced in the cells visible here — confirm it is still needed
budget = 0.00  # NOTE(review): not referenced in the cells visible here — confirm it is still needed

In [3]:
# Parameters
# Values for this run; they replace the defaults assigned in the previous cell.
# NOTE(review): this cell is presumably generated by papermill, so hand edits
# here would be overwritten on the next run — change the caller instead.
# NOTE(review): the pickle paths mix a backslash with forward slashes, which
# only resolves as a directory separator on Windows — confirm the caller
# builds them portably.
iteration_number = 0
input_csv_path = "labeled_pairs/Results_DS_BtoS_iteration_0.csv"
df_clause_path = "fea_iterations\\loop_data/df_clause.pkl"
embedding_cache_path = "fea_iterations\\loop_data/embedding_cache.pkl"
test = False
remaining_llm_calls_path = None
unlabeled_pairs_path = "fea_iterations\\loop_data/unlabeled_pairs.pkl"
sent_frac = 0.5
budget = 5.0


In [4]:
# Load every input of this iteration in a single helper call. The helper
# returns a mapping that is unpacked into the working variables below.
pipeline_data = fea.load_pipeline_data(
    df_clause_path=df_clause_path,
    embedding_cache_path=embedding_cache_path,
    test=test,
    remaining_llm_calls_path=remaining_llm_calls_path,
    unlabeled_pairs_path=unlabeled_pairs_path,
    iteration_number=iteration_number,
)

(
    df_clause,
    embedding_cache_finetuned,
    remaining_llm_calls,
    unlabeled_pairs,
) = (
    pipeline_data['df_clause'],
    pipeline_data['embedding_cache'],
    pipeline_data['remaining_llm_calls'],
    pipeline_data['unlabeled_pairs'],
)


PARAMETER VALUES AFTER PAPERMILL INJECTION:
iteration_number = 0
test = False
remaining_llm_calls_path = None
df_clause_path = fea_iterations\loop_data/df_clause.pkl

✓ Loaded df_clause: 38635 rows


✓ Loaded embedding cache: 38635 embeddings
✓ Loaded unlabeled_pairs: 4999977 rows
✓ All data loaded from pickle files


# Task 1: Setting up DataFrames and Running FEA

In [5]:
# Labeled pairs from the previous round.
df_llm_original = pd.read_csv(input_csv_path)

# Keyword arguments shared by every fea.add_verdict call in this cell.
_verdict_kwargs = dict(
    id1_col='sentence_id_1',
    id2_col='sentence_id_2',
    conclusion_col='llm_conclusion_12',
    positive_label='YES',
)

# A 'verdict' column that already holds values (e.g. written by
# process_llm_results_bidirectional) is reused untouched: add_verdict only
# sees the one-way conclusion and would overwrite bidirectional verdicts.
_has_verdict = 'verdict' in df_llm_original.columns and df_llm_original['verdict'].notna().any()

if not _has_verdict:
    df_llm = fea.add_verdict(df_llm_original, **_verdict_kwargs)
else:
    df_llm = df_llm_original
    print(f"Using existing 'verdict' column ({(df_llm['verdict']=='YES').sum()} YES, {(df_llm['verdict']=='NO').sum()} NO)")

# Test mode also derives verdicts for the held-back LLM calls.
if test:
    df_llm_remaining = fea.add_verdict(remaining_llm_calls, **_verdict_kwargs)

Using existing 'verdict' column (600 YES, 1398 NO)


In [6]:
# Join each labeled pair with its clause text from df_clause; the helper
# reports how many pairs it filtered out along the way.
df_labeled = fea.merge_pairwise_texts(
    df1=df_clause,
    df2=df_llm,
    df1_cols=['sentence_id', 'sentence'],
    df2_cols=['sentence_id_1', 'sentence_id_2', 'verdict'],
)
df_labeled.head()

Filtered 6 pairs (kept 1992).


Unnamed: 0,id1,id2,text1,text2,verdict
0,B0137002p,S0022948006p,Legitimate authority derives from the consent ...,The populace is united in their desire for a s...,NO
1,S0022948006p,B0137002p,The populace is united in their desire for a s...,Legitimate authority derives from the consent ...,NO
2,B0691012p,S0023235007p,Prioritizing the people's welfare is essential...,The Commons is tasked with protecting the righ...,NO
3,S0023235007p,B0691012p,The Commons is tasked with protecting the righ...,Prioritizing the people's welfare is essential...,NO
4,B0360002p,S0023525004p,"When those in power, such as kings and royal o...",I have fulfilled my duty to my nation by speak...,NO


In [7]:
import gc

if test:
    # Test mode: the prediction pool is the held-back LLM calls, joined with
    # their clause text.
    df_predict = fea.merge_pairwise_texts(
        df1=df_clause,
        df2=df_llm_remaining,
        df1_cols=['sentence_id', 'sentence'],
        df2_cols=['sentence_id_1', 'sentence_id_2'],
    )
else:
    # At 20M+ pairs, we skip merge_pairwise_texts on the full pool
    # (pairs were already validated at generation, text not needed until
    #  the small df_final at the end). Just remove already-labeled pairs.
    df_predict = fea.setminus(
        df_big=unlabeled_pairs,
        df_small=df_labeled,
        id_cols=['id1', 'id2'],
    )
    df_predict['verdict'] = np.nan

    # Free unlabeled_pairs (it consumed ~3 GB) only when it can be brought
    # back: the finalize cell reloads it from unlabeled_pairs_path, and only
    # when that path is set. Deleting it without a path would leave the
    # finalize call with an undefined name.
    if unlabeled_pairs_path:
        del unlabeled_pairs
        gc.collect()
        print("✓ Freed unlabeled_pairs (will reload from pickle before finalize)")
    else:
        print("⚠ unlabeled_pairs_path is not set; keeping unlabeled_pairs in memory for finalize")

df_predict.head()

Set difference: 4,999,977 - 1,992 = 4,999,977 rows
✓ Freed unlabeled_pairs (will reload from pickle before finalize)


Unnamed: 0,id1,id2,verdict
0,B0001001p,B0001007p,
1,B0001001p,B0005008p,
2,B0001001p,B0008009p,
3,B0001001p,B0012005p,
4,B0001001p,B0019001p,


## Embedding All Sentences

In [8]:
## Works around a later kwargs-related error by stubbing out list_repo_templates
import transformers.utils.hub
import transformers.tokenization_utils_base


def _safe_list_templates(*args, **kwargs):
    """Accept any arguments and report that there are no templates."""
    return []


# tokenization_utils_base is patched too: it had already imported the broken
# function, so replacing it on the hub module alone would not reach it.
for _module, _label in (
    (transformers.utils.hub, "transformers.utils.hub"),
    (transformers.tokenization_utils_base, "transformers.tokenization_utils_base"),
):
    setattr(_module, "list_repo_templates", _safe_list_templates)
    print(f" - Patched {_label}")

print("\nSUCCESS: The 404 error is now blocked.")

 - Patched transformers.utils.hub
 - Patched transformers.tokenization_utils_base

SUCCESS: The 404 error is now blocked.


## Test and Validation Subsamples

In [9]:
# Observed entailments: the labeled pairs whose verdict is 'YES'.
df_obs_ent = df_labeled[df_labeled['verdict'].eq('YES')]
df_obs_ent.head()

Unnamed: 0,id1,id2,text1,text2,verdict
1016,B0228001sc,S0000726003sc,True liberty requires a ruler bound by laws,to uphold accountability in governance,YES
1017,S0000726003sc,B0228001sc,to uphold accountability in governance,True liberty requires a ruler bound by laws,YES
1018,B0781002p,S0016904011p,The King's authority is limited by the laws an...,Parliamentary involvement ensures that governa...,YES
1019,S0016904011p,B0781002p,Parliamentary involvement ensures that governa...,The King's authority is limited by the laws an...,YES
1020,B0217001p,S0022814001p,The right to rule is fundamentally based on co...,The establishment of a constitutional monarchy...,YES


In [10]:
# --- Candidate preparation, with a memory-efficient route for large pools ---
# Pools with more than LARGE_THRESHOLD rows use vectorised alpha + equiv_map
# lookups instead of storing Python list objects in every row
# (saves ~8 GB at 75M rows).
LARGE_THRESHOLD = 5_000_000

if len(df_predict) <= LARGE_THRESHOLD:
    # Original path (fine for small DataFrames): per-row equivalents lists,
    # then the alpha weight derived from those lists.
    df_candidates = fea.add_equivalents_from_pairs(
        df3=df_obs_ent,
        df4=df_predict,
        df3_cols=["id1", "id2"],
        df4_cols=["id1", "id2"],
        new_cols=("equivalents1", "equivalents2"),
        include_self=False,
    )
    df_candidates = fea.add_alpha_weight_column(
        df=df_candidates,
        list_col1='equivalents1',
        list_col2='equivalents2',
        new_col="alpha",
    )
    # None tells the crossed-pairs cell that df_crossed still has to be built.
    equiv_map = None
else:
    print(f"Using memory-efficient path ({len(df_predict):,} candidate rows)")
    df_candidates, df_crossed, equiv_map = fea.prepare_candidates_efficient(
        df_obs_ent=df_obs_ent,
        df_predict=df_predict,
        df_clause=df_clause,
    )

In [11]:
# Same two enrichment steps as for the candidates, applied to the labeled
# pairs: equivalents lists first, then the alpha weight built from them.
df_labeled = fea.add_equivalents_from_pairs(
    df3=df_obs_ent,
    df4=df_labeled,
    df3_cols=["id1", "id2"],
    df4_cols=["id1", "id2"],
    new_cols=("equivalents1", "equivalents2"),
    # NOTE(review): an earlier comment here read "keep the ID itself in the
    # list", which contradicts the flag; False presumably leaves each ID out
    # of its own list — confirm against fea.add_equivalents_from_pairs.
    include_self=False,
)

df_labeled = fea.add_alpha_weight_column(
    df=df_labeled,
    list_col1='equivalents1',
    list_col2='equivalents2',
    new_col="alpha",
)

## Equivalence Classes

In [12]:
# equiv_map is None exactly when the original (small) path ran above; the
# efficient path has already produced df_crossed, so nothing is rebuilt then.
if equiv_map is None:
    # Expand the equivalents lists into crossed id pairs ...
    df_crossed = fea.build_equiv_pair_candidates(
        df=df_candidates,
        id1_col="id1",
        id2_col="id2",
        equiv1_col="equivalents1",
        equiv2_col="equivalents2",
    )
    # ... then attach the clause text for both sides of each pair.
    df_crossed = fea.merge_pairwise_texts(
        df1=df_clause,
        df2=df_crossed,
        df1_cols=['sentence_id', 'sentence'],
        df2_cols=['id1', 'id2'],
    )

df_crossed.head()

Filtered 13228 pairs (kept 262376).


Unnamed: 0,id1,id2,text1,text2,verdict
0,B0001001p,B0493002p,Proponents of divine right deny mankind's natu...,The essence of political power is rooted in th...,
1,B0001001p,B0510005p,Proponents of divine right deny mankind's natu...,While the legislative is the supreme power dur...,
2,B0001001p,B0749009p,Proponents of divine right deny mankind's natu...,A king's authority is not inherent but granted...,
3,B0001001p,B0749004p,Proponents of divine right deny mankind's natu...,The people have the authority to choose their ...,
4,B0001001p,B0752008p,Proponents of divine right deny mankind's natu...,Nations possess the autonomy to determine thei...,


In [13]:
# Crossed id pairs derived from the equivalents lists of the labeled pairs.
df_labeled_crossed = fea.build_equiv_pair_candidates(
    df=df_labeled,
    id1_col="id1",
    id2_col="id2",
    equiv1_col="equivalents1",
    equiv2_col="equivalents2",
)

# Attach the clause sentences for both ids of every crossed pair.
df_labeled_crossed = fea.merge_pairwise_texts(
    df1=df_clause,
    df2=df_labeled_crossed,
    df1_cols=['sentence_id', 'sentence'],
    df2_cols=['id1', 'id2'],
)

df_labeled_crossed.head()

Filtered 128 pairs (kept 2056).


Unnamed: 0,id1,id2,text1,text2,verdict
0,S0022948006p,B0090003p,The populace is united in their desire for a s...,Individuals are allowed the liberty to establi...,
1,S0022948006p,B0768002p,The populace is united in their desire for a s...,Historical examples and philosophical reasonin...,
2,S0023235007p,B0205002p,The Commons is tasked with protecting the righ...,The role of a king is to serve the public good,
3,S0018125004p,B0714004p,Parliament is the only body capable of keeping...,The magistrate's power is contingent upon what...,
4,S0018125004p,B0783004p,Parliament is the only body capable of keeping...,The King cannot deny the enactment of just laws,


## Running FEA

In [14]:
import gc, os, pickle

# Scratch directory through which the frames are handed to the
# FreeEntailmentAlgorithm run.
temp_dir = "fea_iterations/temp_data"
os.makedirs(temp_dir, exist_ok=True)

n_cand, n_obs = len(df_candidates), len(df_obs_ent)

if n_cand != 0 and n_obs != 0:
    # 1. Write every input of FreeEntailmentAlgorithm to disk.
    df_candidates.to_pickle(f"{temp_dir}/df_candidates.pkl")
    print(f"  Pickled df_candidates: {n_cand:,} rows, cols={list(df_candidates.columns)}")
    df_crossed.to_pickle(f"{temp_dir}/df_crossed.pkl")
    df_labeled.to_pickle(f"{temp_dir}/df_labeled.pkl")
    df_labeled_crossed.to_pickle(f"{temp_dir}/df_labeled_crossed.pkl")
    df_obs_ent.to_pickle(f"{temp_dir}/df_obs_ent.pkl")
    df_clause.to_pickle(f"{temp_dir}/df_clause.pkl")
    with open(f"{temp_dir}/embedding_cache.pkl", 'wb') as f:
        pickle.dump(embedding_cache_finetuned, f)

    # 2. Drop the large frames BEFORE the run starts. papermill executes in a
    #    NEW kernel process, so anything kept here would exist twice in memory.
    #    Names that were never bound (or already removed) are skipped quietly.
    for _name in (
        "df_candidates",
        "df_predict",
        "df_crossed",
        "df_labeled_crossed",
        "df_obs_ent",
        "equiv_map",
    ):
        globals().pop(_name, None)
    gc.collect()
    print("  ✓ Freed large DataFrames before FreeEntailmentAlgorithm subprocess")

    # 3. Run FreeEntailmentAlgorithm against the data already on disk.
    df_final, fig_html = fea.run_fea_papermill(
        iteration_number=iteration_number,
        temp_dir=temp_dir,
        data_on_disk=True,
    )
else:
    # Nothing to score: produce an empty result with the expected columns.
    print(f"⚠ Skipping FreeEntailmentAlgorithm (empty data: "
          f"{n_cand} candidates, {n_obs} entailed pairs)")
    df_final = pd.DataFrame(
        columns=['id1', 'id2', 'text1', 'text2', 'entailment_probability'])
    fig_html = "<p>No data for this iteration</p>"

print(f"✓ df_final: {len(df_final)} rows")

  Pickled df_candidates: 4,999,977 rows, cols=['id1', 'id2', 'verdict', 'equivalents1', 'equivalents2', 'alpha']


  ✓ Freed large DataFrames before FreeEntailmentAlgorithm subprocess
Executing FreeEntailmentAlgorithm.ipynb for iteration 0...


Executing:   0%|          | 0/33 [00:00<?, ?cell/s]

✓ Retrieved outputs:
  - df_final: 4785065 rows
  - fig_html: HTML plot (14638 chars)
✓ df_final: 4785065 rows


In [15]:
# Preview the scored pairs from the FEA run (id/text columns plus entailment_probability).
df_final.head()

Unnamed: 0,id1,id2,text1,text2,entailment_probability
1,B0001001p,B0005008p,Proponents of divine right deny mankind's natu...,The collapse of the distinction between lawful...,0.706927
2,B0001001p,B0008009p,Proponents of divine right deny mankind's natu...,This dissolves the bonds of government and obe...,0.437574
3,B0001001p,B0012005p,Proponents of divine right deny mankind's natu...,Not all excesses warrant such correction; mino...,0.669992
4,B0001001p,B0019001p,Proponents of divine right deny mankind's natu...,"Men are naturally free, equal, and independent...",0.917073
5,B0001001p,B0020002p,Proponents of divine right deny mankind's natu...,The very foundation of political society is bu...,0.918669


# Task 2: Cleaning LLM Calls

In [16]:
# Upper bound on the number of pairs handed to the LLM in one iteration.
# When df_final holds more rows than this, a reproducible random sample
# (random_state=42) of exactly MAX_LLM_PAIRS rows is drawn; otherwise every
# row is sent.
MAX_LLM_PAIRS = 1000

df_final = df_final.reset_index(drop=True)

if len(df_final) <= MAX_LLM_PAIRS:
    df_to_llm = df_final.copy()
    print(f"Sending all {len(df_to_llm):,} pairs above threshold to LLM")
else:
    df_to_llm = df_final.sample(n=MAX_LLM_PAIRS, random_state=42)
    print(f"Capped df_to_llm at {MAX_LLM_PAIRS:,} (from {len(df_final):,} above threshold)")

Capped df_to_llm at 1,000 (from 4,785,065 above threshold)


In [17]:
# Convert the selected pairs into the layout of the LLM batch, using df_clause
# as the lookup table for sentence ids and text.
df_to_llm = fea.format_df_to_llm(
    df_to_llm,
    df_clause=df_clause,
    id_col='sentence_id',
    text_col='sentence',
)
df_to_llm.head()

Unnamed: 0,sentence_id_2,sentence_id_1,sentence_text_2,argument_id_2,sentence_text_1,argument_id_1,score
3162774,S0022931007p,B1004002p,A long-term commitment to the Revenue would de...,S0022931,This form of governance is rooted in the struc...,B1004,0.908391
1473841,S0022862004p,B0331004p,It is essential to recognize that the laws ena...,S0022862,Certain subsidies may be reasonable if agreed ...,B0331,0.923941
3075621,S0020662001p,B0967004p,The necessity for the House of Lords to align ...,S0020662,The Council's authority was deemed 'sacred' an...,B0967,0.916521
3630189,S0010739001p,B1196004p,It is unacceptable for Mr. Brunkard to be in c...,S0010739,The exhortation to execute judgment and righte...,B1196,0.919153
4416797,S0023550001sc,B0736002sc,Parliament must introduce a bill to address in...,S0023550,The commonwealth's integrity is restored in re...,B0736,0.557881


In [18]:
# Sanity check: (rows, columns) of the batch that is handed to the finalize step.
df_to_llm.shape

(1000, 7)

# Next loop:

In [19]:
# Production mode dropped unlabeled_pairs from memory while
# FreeEntailmentAlgorithm ran; read it back from its pickle before finalizing.
if not test and unlabeled_pairs_path:
    import gc
    unlabeled_pairs = pd.read_pickle(unlabeled_pairs_path)
    print(f"Reloaded unlabeled_pairs: {len(unlabeled_pairs):,} rows")
    gc.collect()

# Hand the LLM batch and the remaining pools to the finalize helper.
result = fea.finalize_pipeline_iteration(
    test=test,
    df_to_llm=df_to_llm,
    iteration_number=iteration_number,
    remaining_llm_calls=remaining_llm_calls,
    remaining_llm_calls_path=remaining_llm_calls_path,
    unlabeled_pairs=unlabeled_pairs,
    unlabeled_pairs_path=unlabeled_pairs_path,
)

# Keep the updated pools returned by the helper.
remaining_llm_calls, unlabeled_pairs = (
    result['remaining_llm_calls'],
    result['unlabeled_pairs'],
)

Reloaded unlabeled_pairs: 4,999,977 rows


✓ Saved 1000 pairs to fea_iterations/llm_batch_iter_0.csv for LLM processing
✓ LLM labeled pairs updated: 2995 total in labeled_pairs/llm_labeled_pairs.csv


✓ Removed 1000 pairs from unlabeled_pairs
✓ Remaining pairs for future iterations: 4998977


✓ Saved updated unlabeled_pairs to fea_iterations\loop_data/unlabeled_pairs.pkl


✓ Glued df_to_llm to scrapbook for FEA_Loop retrieval

Iteration 0 complete
