In [1]:
import pandas as pd
import sys
import transformers.utils.hub
import transformers.tokenization_utils_base
import pickle
import importlib
import os
import papermill as pm
import scrapbook as sb
import numpy as np

from sentence_transformers import SentenceTransformer
from collections import defaultdict
from typing import List, Tuple, Any, Mapping, Iterable, Dict, Literal

import free_entailments_algorithm_utils as fea



In [2]:
# Parameters cell - tagged for papermill injection
# The values below are the defaults used when the notebook is run by hand;
# a papermill run overrides them in the cell that follows.
STANDALONE_TEST = True  # True: load inputs from the default local files; False: load from the *_path parameters
iteration_number = 1  # used to name this iteration's executed notebook and LLM batch CSV
input_csv_path = "Results_DS_BtoS.csv"  # CSV of already-labeled pairs (must contain an 'llm_conclusion_12' column)
df_clause_path = None  # pickle of the clause table; required when STANDALONE_TEST is False
embedding_cache_path = None  # pickle of the embedding cache; required when STANDALONE_TEST is False
total_cost = 0.0  # cost accumulated before this iteration; this notebook adds to it
test = True  # True: LLM answers are looked up in remaining_llm_calls instead of being requested
remaining_llm_calls_path = None  # pickle of held-out labeled pairs (read when test is True, then overwritten)
df_pairs_path = None  # pickle of candidate pairs (read when test is False)
sent_frac = 0.5  # fraction of the pairs returned by the FEA notebook that is sent on for labeling
budget = 0.0  # NOTE(review): not referenced in any visible cell — confirm it is still needed

In [3]:
# Parameters
# Values for this particular run; they override the defaults assigned in the
# parameters cell above. Presumably generated by papermill — edit the caller's
# parameters rather than this cell.
STANDALONE_TEST = False
iteration_number = 5
input_csv_path = "fea_iterations/loop_data/accumulated_labeled_iter_4.csv"
df_clause_path = "fea_iterations/loop_data/df_clause.pkl"
embedding_cache_path = "fea_iterations/loop_data/embedding_cache.pkl"
total_cost = 41.120598333333334
test = True
remaining_llm_calls_path = "fea_iterations/loop_data/remaining_llm_calls.pkl"
df_pairs_path = None
sent_frac = 0.5
budget = 0.0


In [4]:
# Loading logic (runs after papermill injection).
# Echo the effective parameters first so every executed copy of the notebook
# records the values it actually ran with.
rule = '=' * 80
print(f"\n{rule}")
print("PARAMETER VALUES AFTER PAPERMILL INJECTION:")
print(rule)
for param_name, param_value in (
    ("STANDALONE_TEST", STANDALONE_TEST),
    ("iteration_number", iteration_number),
    ("test", test),
    ("remaining_llm_calls_path", remaining_llm_calls_path),
    ("df_clause_path", df_clause_path),
):
    print(f"{param_name} = {param_value}")
print(f"{rule}\n")

if not STANDALONE_TEST:
    # Papermill mode: every input arrives as a file path set in the parameters cells.
    print("✓ Running in PAPERMILL mode")
    print(f"  - iteration_number: {iteration_number}")
    print(f"  - test: {test}")
    print(f"  - remaining_llm_calls_path: {remaining_llm_calls_path}")

    # Clause table (mandatory).
    if not df_clause_path:
        raise ValueError("df_clause_path is None - papermill didn't inject parameters correctly")
    df_clause = pd.read_pickle(df_clause_path)
    print(f"✓ Loaded df_clause: {len(df_clause)} rows")

    # Embedding cache (mandatory).
    # NOTE: unpickling runs arbitrary code from the file; only load caches
    # written by this pipeline.
    if not embedding_cache_path:
        raise ValueError("embedding_cache_path is None - papermill didn't inject parameters correctly")
    with open(embedding_cache_path, 'rb') as cache_file:
        embedding_cache_finetuned = pickle.load(cache_file)
    print(f"✓ Loaded embedding cache: {len(embedding_cache_finetuned)} embeddings")

    # Exactly one of remaining_llm_calls / df_pairs is loaded, depending on `test`.
    if test and remaining_llm_calls_path:
        remaining_llm_calls = pd.read_pickle(remaining_llm_calls_path)
        df_pairs = None
        print(f"✓ Loaded remaining_llm_calls: {len(remaining_llm_calls)} rows")
    elif not test and df_pairs_path:
        df_pairs = pd.read_pickle(df_pairs_path)
        remaining_llm_calls = None
        print(f"✓ Loaded df_pairs: {len(df_pairs)} rows")
    else:
        # The path matching the current mode is missing: warn and continue
        # with both inputs unset.
        print("⚠ WARNING: No data loaded!")
        print(f"  test={test}, remaining_llm_calls_path={remaining_llm_calls_path}, df_pairs_path={df_pairs_path}")
        remaining_llm_calls = None
        df_pairs = None

    print("✓ All data loaded from pickle files")
else:
    # Standalone mode: everything comes from fixed files in the working directory.
    print("✓ Running in STANDALONE mode")

    # Clause table = the two clause-level sheets stacked, one row per sentence_id.
    df_p = pd.read_excel("ClauseLevel_df_p.xlsx")
    df_sc = pd.read_excel("ClauseLevel_df_sc.xlsx")
    df_clause = pd.concat([df_p, df_sc]).drop_duplicates(subset='sentence_id')

    # NOTE: unpickling runs arbitrary code from the file; only load caches
    # written by this pipeline.
    with open("embedding_cache_finetuned.pkl", 'rb') as cache_file:
        embedding_cache_finetuned = pickle.load(cache_file)
    print(f"✓ Loaded embedding cache: {len(embedding_cache_finetuned)} embeddings")

    if test:
        # Split the labeled CSV 80/20. From here on input_csv_path holds the
        # 80% DataFrame (not a path); the next cell checks for that.
        df_input = pd.read_csv(input_csv_path)
        input_csv_path, remaining_llm_calls = fea.two_random_subsamples(
            df=df_input,
            frac1=0.8,
            frac2=0.2,
            random_state=42
        )
        df_pairs = None
    else:
        remaining_llm_calls = None
        df_pairs = fea.generate_valid_pairs(
            df_p,
            df_sc,
            'sentence_id',
            'sentence',
            max_pairs=500000,
        )

    labeled_summary = len(input_csv_path) if isinstance(input_csv_path, pd.DataFrame) else 'CSV path'
    unlabeled_count = 0 if remaining_llm_calls is None else len(remaining_llm_calls)
    print("✓ Standalone test mode: Parameters loaded")
    print(f"  - Initial labeled: {labeled_summary}")
    print(f"  - Initial unlabeled: {unlabeled_count}")


PARAMETER VALUES AFTER PAPERMILL INJECTION:
STANDALONE_TEST = False
iteration_number = 5
test = True
remaining_llm_calls_path = fea_iterations/loop_data/remaining_llm_calls.pkl
df_clause_path = fea_iterations/loop_data/df_clause.pkl

✓ Running in PAPERMILL mode
  - iteration_number: 5
  - test: True
  - remaining_llm_calls_path: fea_iterations/loop_data/remaining_llm_calls.pkl
✓ Loaded df_clause: 63909 rows


✓ Loaded embedding cache: 58538 embeddings
✓ Loaded remaining_llm_calls: 3056 rows
✓ All data loaded from pickle files


# Task 1: Setting up dataframes and Running FEA

In [5]:
# The labeled input is either a CSV path (always the case under papermill) or,
# in standalone test mode, the DataFrame that the loading cell stored in
# input_csv_path after splitting.
df_llm_original = (
    input_csv_path
    if isinstance(input_csv_path, pd.DataFrame)
    else pd.read_csv(input_csv_path)
)

print(f"Total cost so far: ${total_cost:.4f}")

# Same column configuration for the labeled set and the held-out set.
verdict_options = dict(
    id1_col='sentence_id_1',
    id2_col='sentence_id_2',
    conclusion_col='llm_conclusion_12',
    positive_label='YES',
)

df_llm = fea.add_verdict(df_llm_original, **verdict_options)

# The held-out pairs only exist in test mode.
if test:
    df_llm_remaining = fea.add_verdict(remaining_llm_calls, **verdict_options)

Total cost so far: $41.1206



VERDICT SUMMARY
Total pairs: 6944
Bidirectional entailment (YES): 691 (10.0%)
Not bidirectionally entailed (NO): 6253 (90.0%)




VERDICT SUMMARY
Total pairs: 3056
Bidirectional entailment (YES): 524 (17.1%)
Not bidirectionally entailed (NO): 2532 (82.9%)



In [6]:
# Join the labeled pairs with the clause table so each pair carries its text.
# The displayed result has columns id1, id2, text1, text2, verdict.
df_labeled = fea.merge_pairwise_texts(
    df1 = df_clause,
    df2 = df_llm,
    df1_cols = ['sentence_id', 'sentence'],
    df2_cols = ['sentence_id_1', 'sentence_id_2', 'verdict']
)
df_labeled.head()

Unnamed: 0,id1,id2,text1,text2,verdict
0,B0860002sc,S0010771002sc,The king's support must match his responsibili...,clear evidence of acting against the interests...,NO
1,B1170001sc,S0020225001sc,Active governance by the prince is essential f...,Maintaining respect for the monarchy is essential,NO
2,B0454001p,S0004868005p,Agrarian laws can effectively prevent the rise...,This situation highlights the tension between ...,NO
3,B0227001sc,S0000883002sc,Parliament should hold the power to correct le...,Parliament must uphold the rule of law,YES
4,B0580002sc,S0023399001sc,The king's presence is essential for validatin...,Parliament must assert authority,NO


In [7]:
# Build the set of pairs still to be predicted.
if test:
    # Test mode: the held-out pairs, joined with their clause text. No verdict
    # column is requested here, so the displayed verdict values are empty.
    df_predict = fea.merge_pairwise_texts(
        df1 = df_clause,
        df2 = df_llm_remaining,
        df1_cols = ['sentence_id', 'sentence'],
        df2_cols = ['sentence_id_1', 'sentence_id_2']
    )
else:
    # Production mode: presumably df_pairs minus the pairs already in
    # df_labeled, matched on (id1, id2) — confirm against fea.setminus.
    df_predict = fea.setminus(
        df_big= df_pairs,
        df_small= df_labeled,
        id_cols = ['id1', 'id2']
    )

df_predict.head()

Unnamed: 0,id1,id2,text1,text2,verdict
0,B0783006p,S0019961006p,The Parliament holds the power to regulate the...,"By promptly seeking the King's guidance, Parli...",
1,B0287002sc,S0019015004sc,A government must maintain societal integrity,Accountability ensures government integrity,
2,B0336005p,S0016856004p,Such focused discussions would enable the coun...,Established procedures should guide discussion...,
3,B0791001sc,S0019314003sc,The House of Commons fulfilled its duty to saf...,Members of Parliament must protect civil liber...,
4,B0789001sc,S0000754001sc,Protecting the people's rights takes precedenc...,King Charles I believes that balancing royal p...,


## Embedding All Sentences

In [8]:
# Workaround: replace transformers' list_repo_templates with a no-op so a later
# call no longer fails (the original note mentions a kwargs error; the message
# printed below refers to a 404).

def _safe_list_templates(*args, **kwargs):
    """Stand-in for ``list_repo_templates``: accept any arguments, return an empty list."""
    return []

# Both modules need the patch: tokenization_utils_base had already imported the
# function, so replacing it in utils.hub alone would leave that reference in place.
for patched_module, module_label in (
    (transformers.utils.hub, "transformers.utils.hub"),
    (transformers.tokenization_utils_base, "transformers.tokenization_utils_base"),
):
    patched_module.list_repo_templates = _safe_list_templates
    print(f" - Patched {module_label}")

print("\nSUCCESS: The 404 error is now blocked.")

 - Patched transformers.utils.hub
 - Patched transformers.tokenization_utils_base

SUCCESS: The 404 error is now blocked.


## Test and Validation Subsamples

In [9]:
# Restrict the labeled pairs to those whose verdict is 'YES' (observed entailments).
is_entailed = df_labeled['verdict'].eq('YES')
df_obs_ent = df_labeled.loc[is_entailed]
df_obs_ent.head()

Unnamed: 0,id1,id2,text1,text2,verdict
3,B0227001sc,S0000883002sc,Parliament should hold the power to correct le...,Parliament must uphold the rule of law,YES
18,B0134001sc,S0004953001sc,Governance derives its legitimacy from the peo...,Governance legitimacy should come from the wil...,YES
20,B0794007p,S0000823011p,King Charles's actions demonstrate a tyrannica...,King Charles I's disregard for the people's vo...,YES
22,B0161002p,S15310007p,Such actions threaten the liberties and well-b...,Such actions endanger the rights of individual...,YES
26,B0252006p,S0000715007p,"The authority of governing bodies, like Parlia...",Parliament serves as a check on the power of t...,YES


In [10]:
# For every pair in df_predict, attach (as equivalents1 / equivalents2) the
# clauses observed to be entailed with each side, taken from df_obs_ent.
df_candidates = fea.add_equivalents_from_pairs(
    df3=df_obs_ent,
    df4=df_predict,
    df3_cols=["id1", "id2"],
    df4_cols=["id1", "id2"],
    new_cols=("equivalents1", "equivalents2"),
    include_self=False,  # NOTE(review): the old comment said "keep the ID itself in the list", which contradicts False — confirm the intended setting
)

# Add the "alpha" weight column computed from the two equivalents lists.
df_candidates = fea.add_alpha_weight_column(
    df = df_candidates,
    list_col1 = 'equivalents1',
    list_col2 = 'equivalents2',
    new_col = "alpha"
)

In [11]:
# Same enrichment as for df_candidates, applied to the labeled pairs.
# Note: this rebinds df_labeled, so re-running the cell operates on the
# already-enriched frame.
df_labeled = fea.add_equivalents_from_pairs(
    df3=df_obs_ent,
    df4=df_labeled,
    df3_cols=["id1", "id2"],
    df4_cols=["id1", "id2"],
    new_cols=("equivalents1", "equivalents2"),
    include_self=False,  # NOTE(review): the old comment said "keep the ID itself in the list", which contradicts False — confirm the intended setting
)

# Add the "alpha" weight column computed from the two equivalents lists.
df_labeled = fea.add_alpha_weight_column(
    df = df_labeled,
    list_col1 = 'equivalents1',
    list_col2 = 'equivalents2',
    new_col = "alpha"
)

## Equivalence Classes

In [12]:
# Step 1: for the candidate pairs, produce all pairs of clauses i/j with k in
# the class of j/i (ids only).
crossed_candidate_ids = fea.build_equiv_pair_candidates(
    df=df_candidates,
    id1_col="id1",
    id2_col="id2",
    equiv1_col="equivalents1",
    equiv2_col="equivalents2",
)

# Step 2: attach the clause sentences to those id pairs.
df_crossed = fea.merge_pairwise_texts(
    df1=df_clause,
    df2=crossed_candidate_ids,
    df1_cols=['sentence_id', 'sentence'],
    df2_cols=['id1', 'id2'],
)

df_crossed.head()

Filtered 2259 pairs (kept 625).


Unnamed: 0,id1,id2,text1,text2,verdict
0,B0711002sc,B0204002sc,Laws establish a structured relationship with ...,Laws protect their rights in the political str...,
1,B0711002sc,B0289001sc,Laws establish a structured relationship with ...,The legal system ensures justice through a col...,
2,B0711002sc,B1203002sc,Laws establish a structured relationship with ...,to maintain justice,
3,B0770001sc,B1133003sc,A monarchy can coexist with popular authority,Citizens must maintain stability in the monarchy,
4,B0134004p,B0124002p,True governance requires an active and deliber...,The essence of governance is rooted in the con...,


In [13]:
# Step 1: build the crossed id pairs for the labeled set.
crossed_labeled_ids = fea.build_equiv_pair_candidates(
    df=df_labeled,
    id1_col="id1",
    id2_col="id2",
    equiv1_col="equivalents1",
    equiv2_col="equivalents2",
)

# Step 2: attach the clause sentences to those id pairs.
df_labeled_crossed = fea.merge_pairwise_texts(
    df1=df_clause,
    df2=crossed_labeled_ids,
    df1_cols=['sentence_id', 'sentence'],
    df2_cols=['id1', 'id2'],
)

df_labeled_crossed.head()

Filtered 3360 pairs (kept 2880).


Unnamed: 0,id1,id2,text1,text2,verdict
0,B0454001p,B0203001p,Agrarian laws can effectively prevent the rise...,Laws are necessary to limit the power of kings,
1,B0454001p,B0278002p,Agrarian laws can effectively prevent the rise...,Allowing a King to have absolute power undermi...,
2,B0454001p,B0314009p,Agrarian laws can effectively prevent the rise...,The historical context shows that the struggle...,
3,B0454001p,B0545004p,Agrarian laws can effectively prevent the rise...,When rulers act contrary to the laws establish...,
4,B0227001sc,B0227001sc,Parliament should hold the power to correct le...,Parliament should hold the power to correct le...,


## Running FEA

In [14]:
# Hand-off to FreeEntailmentAlgorithm.ipynb: DataFrames cannot be passed as
# papermill parameters, so each one is pickled (which preserves dtypes) and
# only its path is passed.
temp_dir = "fea_iterations/temp_data"
os.makedirs(temp_dir, exist_ok=True)

# (parameter name, pickle path, frame) for every DataFrame the child notebook reads.
frame_exports = [
    ("df_candidates_path", f"{temp_dir}/df_candidates.pkl", df_candidates),
    ("df_crossed_path", f"{temp_dir}/df_crossed.pkl", df_crossed),
    ("df_labeled_path", f"{temp_dir}/df_labeled.pkl", df_labeled),
    ("df_labeled_crossed_path", f"{temp_dir}/df_labeled_crossed.pkl", df_labeled_crossed),
    ("df_obs_ent_path", f"{temp_dir}/df_obs_ent.pkl", df_obs_ent),
    ("df_clause_path", f"{temp_dir}/df_clause.pkl", df_clause),
]
for _, pickle_path, frame in frame_exports:
    frame.to_pickle(pickle_path)

# The embedding cache is not a DataFrame, so it is dumped with pickle directly.
embedding_cache_out = f"{temp_dir}/embedding_cache.pkl"
with open(embedding_cache_out, 'wb') as cache_file:
    pickle.dump(embedding_cache_finetuned, cache_file)

# Parameters for the child notebook: file paths plus the running cost.
parameters = {
    "STANDALONE_TEST": False,
    **{param_name: pickle_path for param_name, pickle_path, _ in frame_exports},
    "embedding_cache_path": embedding_cache_out,
    "total_cost": total_cost,
}

# The executed copy of the child notebook is kept per iteration.
output_notebook_path = f"fea_iterations/FEA_iter_{iteration_number}.ipynb"
os.makedirs("fea_iterations", exist_ok=True)

print(f"Executing FreeEntailmentAlgorithm.ipynb for iteration {iteration_number}...")
pm.execute_notebook(
    'FreeEntailmentAlgorithm.ipynb',
    output_notebook_path,
    parameters=parameters
)

# Collect the values the child notebook recorded as scraps.
scraps = sb.read_notebook(output_notebook_path).scraps
df_final = scraps['df_final'].data
fig_html = scraps['fig_html'].data
cost = scraps['cost'].data  # Full cost for all pairs above threshold

print("✓ Retrieved outputs:")
print(f"  - df_final: {len(df_final)} rows")
print(f"  - fig_html: HTML plot ({len(fig_html)} chars)")
print(f"  - estimated_cost_all_pairs: ${cost:.4f}")

Executing FreeEntailmentAlgorithm.ipynb for iteration 5...


Executing:   0%|          | 0/33 [00:00<?, ?cell/s]

✓ Retrieved outputs:
  - df_final: 65 rows
  - fig_html: HTML plot (14554 chars)
  - estimated_cost_all_pairs: $0.1247


In [15]:
# Preview the pairs returned by the FEA notebook, with their entailment_probability.
df_final.head()

Unnamed: 0,id1,id2,text1,text2,entailment_probability
64,B0222005p,S0018950005p,"In a monarchy, the king must be above the law ...","In this tumultuous period, where the balance o...",0.679869
116,B1210005p,S0017289004p,Subjects are still expected to comply with the...,The King’s actions suggest a willingness to en...,0.680346
174,B0311001p,S0023803002p,A king cannot unilaterally impose laws without...,The argument against the idea that the King po...,0.727208
237,B0684006p,S0022948006p,The fleeting nature of human glory can comprom...,The King need not doubt the affections of his ...,0.652096
266,B0547003p,S0020364008p,A king provides the necessary unity and direct...,The King’s prerogative should not shield those...,0.737322


# Task 2: Cleaning LLM Calls

In [16]:
# Reset index to make it writable for permutation
df_final = df_final.reset_index(drop=True)
# Split df_final into the share that goes to the LLM (sent_frac) and the rest.
# The trailing 42 is positional; the standalone loader passes random_state=42
# to the same helper, so this is presumably the seed — confirm.
df_to_llm, rest_above_tau = fea.two_random_subsamples(df_final, sent_frac, 1 - sent_frac, 42)

# Only the sent share of the estimated cost is charged.
# CAUTION: this accumulates in place — re-running the cell adds the cost again.
total_cost += cost * sent_frac
total_cost

41.18296041666667

In [17]:
# Old name -> name expected in the LLM batch format. The mapping also defines
# which columns are kept, and in what order.
column_renames = {
    'id1': 'sentence_id_1',
    'id2': 'sentence_id_2',
    'text1': 'sentence_text_1',
    'text2': 'sentence_text_2',
    'entailment_probability': 'score',
}

df_to_llm = df_to_llm[list(column_renames)].copy().rename(columns=column_renames)

# Extract argument IDs
def extract_argument_id(sentence_id):
    """Return the argument-ID prefix of a sentence ID, or None.

    The prefix length depends on the leading letter:
      * 'S' (speech): the letter plus 5 characters, e.g. S11150001sc -> S11150
      * 'B' (book):   the letter plus 4 characters, e.g. B0249004sc -> B0249

    IDs shorter than the prefix are returned unchanged. Non-string input and
    any other leading character yield None.
    """
    prefix_widths = {'S': 6, 'B': 5}
    if isinstance(sentence_id, str):
        width = prefix_widths.get(sentence_id[:1])
        if width is not None:
            # Slicing past the end simply returns the whole string.
            return sentence_id[:width]
    return None

# Derive the argument ID of each side from its sentence ID.
for argument_col, sentence_col in (
    ('argument_id_1', 'sentence_id_1'),
    ('argument_id_2', 'sentence_id_2'),
):
    df_to_llm[argument_col] = df_to_llm[sentence_col].apply(extract_argument_id)

# Final column layout: side 2 is listed before side 1, score last.
batch_columns = [
    'sentence_id_2', 'sentence_id_1',
    'sentence_text_2', 'argument_id_2',
    'sentence_text_1', 'argument_id_1',
    'score',
]
df_to_llm = df_to_llm[batch_columns]

df_to_llm.head()

Unnamed: 0,sentence_id_2,sentence_id_1,sentence_text_2,argument_id_2,sentence_text_1,argument_id_1,score
29,S0017094004p,B0956004p,Failure to honor the King could lead to dire c...,S00170,Historical examples illustrate that even when ...,B0956,0.5543
42,S0023729009p,B0080002p,A constitutional monarchy should reflect the w...,S00237,The acceptance of absolute monarchy relies on ...,B0080,0.492206
18,S0010886005p,B0904004p,The collective power of the 41 individuals may...,S00108,"When a single Monarch governs, the path to ach...",B0904,0.733856
24,S0022793004p,B0346005p,"If William is declared King de facto, it may e...",S00227,"Without the support and consent of the people,...",B0346,0.639189
7,S0024472004p,B1154002p,A constitutional monarchy ensures a more stabl...,S00244,Elected monarchs govern with laws that are mor...,B1154,0.401941


In [18]:
# Sanity check: (number of pairs in this batch, number of columns).
df_to_llm.shape

(32, 7)

# Next loop:

In [19]:
# Write this iteration's batch and report the accumulated cost.
#   * test mode:       the "LLM answers" are looked up in remaining_llm_calls,
#                      the answered pairs are removed from that pool, and the
#                      pool is written back for the next iteration.
#   * production mode: the batch is written as-is for real LLM processing.
# Both modes write to the same per-iteration CSV.
output_csv = f"fea_iterations/llm_batch_iter_{iteration_number}.csv"

if test:
    print(f"\n{'='*60}")
    print("TEST MODE: Mocking LLM responses")
    print(f"{'='*60}")

    pair_keys = ['sentence_id_1', 'sentence_id_2']
    llm_result_cols = ['answers_12', 'reasonings_12', 'comment_12',
                       'llm_confidence_12', 'llm_conclusion_12']

    # Left join: every batch pair is kept; pairs missing from the pool get NaN
    # in the LLM columns.
    df_to_llm_with_results = df_to_llm.merge(
        remaining_llm_calls[pair_keys + llm_result_cols],
        on=pair_keys,
        how='left'
    )

    # Check how many got matched
    matched = df_to_llm_with_results['llm_conclusion_12'].notna().sum()
    print(f"✓ Matched {matched}/{len(df_to_llm_with_results)} pairs with mock LLM results")

    if matched < len(df_to_llm_with_results):
        print(f"⚠ Warning: {len(df_to_llm_with_results) - matched} pairs have no mock LLM data")

    # Remove the sent pairs from the pool for the next iteration. The mask is
    # built in a single pass over the two ID columns (same tuple-in-set test
    # as before, without a row-wise DataFrame.apply).
    sent_pairs = set(zip(df_to_llm['sentence_id_1'], df_to_llm['sentence_id_2']))
    pool_size_before = len(remaining_llm_calls)
    keep_mask = np.fromiter(
        (
            pair not in sent_pairs
            for pair in zip(remaining_llm_calls['sentence_id_1'],
                            remaining_llm_calls['sentence_id_2'])
        ),
        dtype=bool,
        count=pool_size_before,
    )
    remaining_llm_calls = remaining_llm_calls.loc[keep_mask].copy()

    # Report the rows actually removed; this differs from len(sent_pairs) when
    # a sent pair was not in the pool or appeared in it more than once.
    pairs_removed = pool_size_before - len(remaining_llm_calls)
    print(f"✓ Removed {pairs_removed} pairs from remaining LLM calls")
    print(f"✓ Remaining pairs for future iterations: {len(remaining_llm_calls)}")

    # Save updated remaining_llm_calls back to pickle for next iteration
    if remaining_llm_calls_path:
        remaining_llm_calls.to_pickle(remaining_llm_calls_path)
        print(f"✓ Saved updated remaining_llm_calls to {remaining_llm_calls_path}")

    # Save the enriched results (with LLM conclusions) for next iteration input
    df_to_llm_with_results.to_csv(output_csv, index=False)
    print(f"✓ Saved {len(df_to_llm_with_results)} pairs with LLM results to {output_csv}")
else:
    # Production mode: Save df_to_llm for actual LLM processing
    df_to_llm.to_csv(output_csv, index=False)
    print(f"✓ Saved {len(df_to_llm)} pairs to {output_csv} for LLM processing")

print(f"\nIteration {iteration_number} complete")
print(f"Total accumulated cost: ${total_cost:.4f}")

# Record the accumulated cost as a scrap named 'cost'.
sb.glue('cost', total_cost)


TEST MODE: Mocking LLM responses
✓ Matched 32/32 pairs with mock LLM results
✓ Removed 32 pairs from remaining LLM calls
✓ Remaining pairs for future iterations: 3024
✓ Saved updated remaining_llm_calls to fea_iterations/loop_data/remaining_llm_calls.pkl
✓ Saved 32 pairs with LLM results to fea_iterations/llm_batch_iter_5.csv

Iteration 5 complete
Total accumulated cost: $41.1830
