In [97]:
import pandas as pd
import sys
import transformers.utils.hub
import transformers.tokenization_utils_base
import pickle
import importlib
import os
import papermill as pm
import scrapbook as sb
import numpy as np

from sentence_transformers import SentenceTransformer
from collections import defaultdict
from typing import List, Tuple, Any, Mapping, Iterable, Dict, Literal

import free_entailments_algorithm_utils as fea



In [None]:
# Run configuration for one pipeline iteration.
# NOTE(review): presumably the papermill "parameters" cell (papermill is imported
# above) — confirm the cell tag before relying on parameter injection.
iteration_number = 1  # forwarded to the fea load / run / finalize helpers below
input_csv_path = "labeled_pairs/Results_DS_BtoS_iteration_0.csv"  # labeled pairs, read with pd.read_csv
df_clause_path = None  # forwarded to fea.load_pipeline_data
embedding_cache_path = None  # forwarded to fea.load_pipeline_data
test = True  # True: pairs to predict come from remaining_llm_calls rather than unlabeled_pairs
remaining_llm_calls_path = None  # forwarded to fea.load_pipeline_data and fea.finalize_pipeline_iteration
unlabeled_pairs_path = None  # forwarded to fea.load_pipeline_data and fea.finalize_pipeline_iteration
sent_frac = 0.5  # first fraction given to fea.two_random_subsamples; the second is 1 - sent_frac
budget = 0.0  # NOTE(review): not referenced anywhere else in the visible notebook

✓ Loaded embedding cache: 58538 embeddings
✓ Standalone test mode: Parameters loaded


In [None]:
# Load every input of this iteration in a single call, then pull the four
# working objects out of the returned mapping by key.
pipeline_data = fea.load_pipeline_data(
    df_clause_path=df_clause_path,
    embedding_cache_path=embedding_cache_path,
    test=test,
    remaining_llm_calls_path=remaining_llm_calls_path,
    unlabeled_pairs_path=unlabeled_pairs_path,
    iteration_number=iteration_number,
)

(
    df_clause,
    embedding_cache_finetuned,
    remaining_llm_calls,
    unlabeled_pairs,
) = (
    pipeline_data[key]
    for key in ('df_clause', 'embedding_cache', 'remaining_llm_calls', 'unlabeled_pairs')
)

# Task 1: Setting up dataframes and Running FEA

In [None]:
# Column layout shared by every frame handed to fea.add_verdict in this cell.
_verdict_kwargs = dict(
    id1_col='sentence_id_1',
    id2_col='sentence_id_2',
    conclusion_col='llm_conclusion_12',
    positive_label='YES',
)

# Pairs labeled in the previous iteration.
df_llm_original = pd.read_csv(input_csv_path)
df_llm = fea.add_verdict(df_llm_original, **_verdict_kwargs)

# In test mode the remaining LLM calls get a verdict column as well.
if test:
    df_llm_remaining = fea.add_verdict(remaining_llm_calls, **_verdict_kwargs)

Total cost so far: $0.0000

VERDICT SUMMARY
Total pairs: 2000
Bidirectional entailment (YES): 241 (12.0%)
Not bidirectionally entailed (NO): 1759 (87.9%)


VERDICT SUMMARY
Total pairs: 8000
Bidirectional entailment (YES): 974 (12.2%)
Not bidirectionally entailed (NO): 7026 (87.8%)



In [45]:
# Join the labeled pairs (df_llm) with the clause table (df_clause) so that each
# pair carries the text of both sentences next to its verdict.
df_labeled = fea.merge_pairwise_texts(
    df1=df_clause,
    df2=df_llm,
    df1_cols=['sentence_id', 'sentence'],
    df2_cols=['sentence_id_1', 'sentence_id_2', 'verdict'],
)

df_labeled.head()

Unnamed: 0,id1,id2,text1,text2,verdict
0,B0860002sc,S0010771002sc,The king's support must match his responsibili...,clear evidence of acting against the interests...,NO
1,B1170001sc,S0020225001sc,Active governance by the prince is essential f...,Maintaining respect for the monarchy is essential,NO
2,B0454001p,S0004868005p,Agrarian laws can effectively prevent the rise...,This situation highlights the tension between ...,NO
3,B0227001sc,S0000883002sc,Parliament should hold the power to correct le...,Parliament must uphold the rule of law,YES
4,B0580002sc,S0023399001sc,The king's presence is essential for validatin...,Parliament must assert authority,NO


In [None]:
# Choose the pairs to predict on and the names of their two id columns:
#  - test mode: the remaining LLM calls (ids in sentence_id_1 / sentence_id_2);
#  - otherwise: the unlabeled pairs minus the ones already labeled (ids in id1 / id2).
if test:
    _pairs_to_predict = df_llm_remaining
    _pair_id_cols = ['sentence_id_1', 'sentence_id_2']
else:
    _pairs_to_predict = fea.setminus(
        df_big=unlabeled_pairs,
        df_small=df_labeled,
        id_cols=['id1', 'id2'],
    )
    _pair_id_cols = ['id1', 'id2']

# Both branches end with the same join against the clause table.
df_predict = fea.merge_pairwise_texts(
    df1=df_clause,
    df2=_pairs_to_predict,
    df1_cols=['sentence_id', 'sentence'],
    df2_cols=_pair_id_cols,
)

df_predict.head()


Unnamed: 0,id1,id2,text1,text2,verdict
0,B0859002p,S5393003p,The authority of a king is divinely ordained a...,The assertion that the king's power is derived...,
1,B0672011p,S15260001p,Ensuring the stability and governance of the s...,Establishing a stable government requires adhe...,
2,B0589007p,S0000863004p,The rebellion was provoked by the subjects' at...,King Charles I believed that asserting his aut...,
3,B0382001p,S8507005p,The authority of a king is not absolute; it is...,The supremacy of royal authority in governance...,
4,B1114001sc,S0000941002sc,The connection between protection and obedienc...,to advocate for peace,


## Embedding All Sentences

In [47]:
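# Work around an error raised later on (reported as a 404 in the message printed below; the original note mentions kwargs) by replacing transformers' list_repo_templates with a stub.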

def _safe_list_templates(*args, **kwargs):
    return []

# Swap the stub in where the function is defined ...
setattr(transformers.utils.hub, "list_repo_templates", _safe_list_templates)
print(" - Patched transformers.utils.hub")

# ... and in tokenization_utils_base, which (per the original note) had already
# imported the function and therefore holds its own reference to it.
setattr(transformers.tokenization_utils_base, "list_repo_templates", _safe_list_templates)
print(" - Patched transformers.tokenization_utils_base")

print("\nSUCCESS: The 404 error is now blocked.")

 - Patched transformers.utils.hub
 - Patched transformers.tokenization_utils_base

SUCCESS: The 404 error is now blocked.


## Test and Validation Subsamples

In [48]:
# Observed entailments: the labeled pairs whose verdict is 'YES'.
df_obs_ent = df_labeled[df_labeled['verdict'].eq('YES')]
df_obs_ent.head()

Unnamed: 0,id1,id2,text1,text2,verdict
3,B0227001sc,S0000883002sc,Parliament should hold the power to correct le...,Parliament must uphold the rule of law,YES
18,B0134001sc,S0004953001sc,Governance derives its legitimacy from the peo...,Governance legitimacy should come from the wil...,YES
20,B0794007p,S0000823011p,King Charles's actions demonstrate a tyrannica...,King Charles I's disregard for the people's vo...,YES
22,B0161002p,S15310007p,Such actions threaten the liberties and well-b...,Such actions endanger the rights of individual...,YES
26,B0252006p,S0000715007p,"The authority of governing bodies, like Parlia...",Parliament serves as a check on the power of t...,YES


In [None]:
# Two steps in one expression: first annotate the pairs to predict with the
# "equivalents1" / "equivalents2" columns built from the observed entailed pairs,
# then add the "alpha" column computed from those two columns.
df_candidates = fea.add_alpha_weight_column(
    df=fea.add_equivalents_from_pairs(
        df3=df_obs_ent,
        df4=df_predict,
        df3_cols=["id1", "id2"],
        df4_cols=["id1", "id2"],
        new_cols=("equivalents1", "equivalents2"),
        include_self=False,
    ),
    list_col1='equivalents1',
    list_col2='equivalents2',
    new_col="alpha",
)

In [50]:
# Same two-step annotation as for the candidate pairs, applied to the labeled
# pairs: add the "equivalents1" / "equivalents2" columns from the observed entailed
# pairs, then the "alpha" column computed from them.
# NOTE(review): df_labeled is rebound from itself here, so re-running this cell feeds
# the already-annotated frame back into the helpers.
df_labeled = fea.add_equivalents_from_pairs(
    df3=df_obs_ent,
    df4=df_labeled,
    df3_cols=["id1", "id2"],
    df4_cols=["id1", "id2"],
    new_cols=("equivalents1", "equivalents2"),
    include_self=False,  # NOTE(review): this comment used to read "keep the ID itself in the list", which contradicts include_self=False — confirm which is intended
)

df_labeled = fea.add_alpha_weight_column(
    df = df_labeled,
    list_col1 = 'equivalents1',
    list_col2 = 'equivalents2',
    new_col = "alpha"
)

## Equivalence Classes

In [51]:
# Candidate pairs obtained by crossing each clause i/j with every k in the
# equivalence class of j/i, built from the candidate frame.
df_crossed = fea.build_equiv_pair_candidates(
    df=df_candidates,
    id1_col="id1",
    id2_col="id2",
    equiv1_col="equivalents1",
    equiv2_col="equivalents2",
)

# Look up the sentence text of both clauses of every crossed pair.
df_crossed = fea.merge_pairwise_texts(
    df1=df_clause,
    df2=df_crossed,
    df1_cols=['sentence_id', 'sentence'],
    df2_cols=['id1', 'id2'],
)

df_crossed.head()

Filtered 1228 pairs (kept 710).


Unnamed: 0,id1,id2,text1,text2,verdict
0,B1015002sc,B1009004sc,A stable society enables individual flourishing,Foster societal cohesion,
1,B0312002p,B0659002p,The concept of a free monarchy fundamentally c...,The legitimacy of royal power is rooted in the...,
2,B0781006p,B0223012p,The King's duty to uphold justice is emphasized,Limiting a King's authority to the consent of ...,
3,B0278001sc,B0795002sc,The King's power should be limited to promote ...,The King does not govern for the people's benefit,
4,B0190002p,B0223001p,The legitimacy of royal power is contingent up...,The authority of a King should indeed be limit...,


In [54]:
# Same crossing as for the candidates, this time built from the labeled frame.
df_labeled_crossed = fea.build_equiv_pair_candidates(
    df=df_labeled,
    id1_col="id1",
    id2_col="id2",
    equiv1_col="equivalents1",
    equiv2_col="equivalents2",
)

# Look up the sentence text of both clauses of every crossed pair.
df_labeled_crossed = fea.merge_pairwise_texts(
    df1=df_clause,
    df2=df_labeled_crossed,
    df1_cols=['sentence_id', 'sentence'],
    df2_cols=['id1', 'id2'],
)

df_labeled_crossed.head()

Filtered 510 pairs (kept 456).


Unnamed: 0,id1,id2,text1,text2,verdict
0,B0227001sc,B0227001sc,Parliament should hold the power to correct le...,Parliament should hold the power to correct le...,
1,B0244002sc,B0311001sc,Parliaments in England can create laws indepen...,The king requires parliamentary approval to im...,
2,B0089006p,B0800005p,The rights and liberties of the people depend ...,The authority of a king or any governing body ...,
3,B0134001sc,B0134001sc,Governance derives its legitimacy from the peo...,Governance derives its legitimacy from the peo...,
4,B0794007p,B0794007p,King Charles's actions demonstrate a tyrannica...,King Charles's actions demonstrate a tyrannica...,


## Running FEA

In [None]:
# Hand every frame prepared above, plus the embedding cache, to the FEA runner.
_fea_outputs = fea.run_fea_papermill(
    iteration_number=iteration_number,
    df_candidates=df_candidates,
    df_crossed=df_crossed,
    df_labeled=df_labeled,
    df_labeled_crossed=df_labeled_crossed,
    df_obs_ent=df_obs_ent,
    df_clause=df_clause,
    embedding_cache=embedding_cache_finetuned,
)

# The runner's result unpacks into exactly two values: the scored pairs and the plot HTML.
df_final, fig_html = _fea_outputs

Executing FreeEntailmentAlgorithm.ipynb for iteration 1...


Executing:   0%|          | 0/34 [00:00<?, ?cell/s]

✓ Retrieved outputs:
  - df_final: 3748 rows
  - fig_html: HTML plot (14714 chars)
  - estimated_cost_all_pairs: $7.1918


In [90]:
# Preview the first five scored pairs.
df_final.head(n=5)

Unnamed: 0,id1,id2,text1,text2,entailment_probability
2,B0589007p,S0000863004p,The rebellion was provoked by the subjects' at...,King Charles I believed that asserting his aut...,0.637147
4,B1114001sc,S0000941002sc,The connection between protection and obedienc...,to advocate for peace,0.706039
5,B0244004p,S0024289007p,Parliaments possess the power to create and ab...,It is crucial to maintain a clear separation b...,0.708335
14,B0351002sc,S0003513002sc,The assembly of estates has the authority to p...,The House of Commons must protect the relation...,0.720604
18,B0403006p,S0020750006p,The power to grant pardons distinguishes the s...,The implications of allowing these pardons ext...,0.26041


# Task 2: Cleaning LLM Calls

In [None]:
# Renumber the rows from 0 before sampling.
df_final = df_final.reset_index(drop=True)

# Draw two subsamples sized sent_frac and 1 - sent_frac; only df_to_llm is used
# in the cells that follow.
df_to_llm, rest_above_tau = fea.two_random_subsamples(
    df_final,
    sent_frac,
    1 - sent_frac,
    42,  # presumably the random seed — confirm against fea.two_random_subsamples
)

3.5958936666666665

In [None]:
# Reformat the sampled pairs with fea.format_df_to_llm and preview the result.
# NOTE(review): df_to_llm is rebound from itself, so re-running this cell passes the
# already-formatted frame back into the helper.
df_to_llm = fea.format_df_to_llm(df_to_llm)
df_to_llm.head()

Unnamed: 0,sentence_id_2,sentence_id_1,sentence_text_2,argument_id_2,sentence_text_1,argument_id_1,score
3039,S0005432003p,B1157006p,Upholding parliamentary authority is essential...,S00054,The governance ensured by the elected leader i...,B1157,0.72072
152,S0018405001p,B0273002p,The necessity for immediate and decisive advic...,S00184,The foundation of a king's authority is rooted...,B0273,0.614942
681,S0003019001sc,B0223001sc,The proposed Paper Address to the king require...,S00030,Limiting a King's authority to the consent of ...,B0223,0.687784
5947,S0051611003p,B1140007p,The potential abuse of power by the monarchy n...,S00516,"By distancing blood-relations from power, the ...",B1140,0.386518
6813,S0020972001p,B0778006p,The necessity of immediate action by Parliamen...,S00209,The King must act in accordance with the legal...,B0778,0.66668


In [95]:
# (rows, columns) of the formatted batch.
df_to_llm.shape

(1874, 7)

# Next loop:

In [None]:
# Close the iteration: pass the batch and the current bookkeeping frames (with
# their paths) to the finalizer.
result = fea.finalize_pipeline_iteration(
    test=test,
    df_to_llm=df_to_llm,
    iteration_number=iteration_number,
    remaining_llm_calls=remaining_llm_calls,
    remaining_llm_calls_path=remaining_llm_calls_path,
    unlabeled_pairs=unlabeled_pairs,
    unlabeled_pairs_path=unlabeled_pairs_path,
)

# Carry the updated bookkeeping frames forward for the next loop.
remaining_llm_calls, unlabeled_pairs = (
    result['remaining_llm_calls'],
    result['unlabeled_pairs'],
)


TEST MODE: Mocking LLM responses
✓ Matched 1874/1874 pairs with mock LLM results
✓ Removed 1874 pairs from remaining LLM calls
✓ Remaining pairs for future iterations: 6126
✓ Saved 1874 pairs with LLM results to fea_iterations/llm_batch_iter_1.csv

Iteration 1 complete
Total accumulated cost: $0.0000
