In [1]:
import pandas as pd
import sys
import transformers.utils.hub
import transformers.tokenization_utils_base
import pickle
import importlib
import os
import papermill as pm
import scrapbook as sb
import numpy as np

from sentence_transformers import SentenceTransformer
from collections import defaultdict
from typing import List, Tuple, Any, Mapping, Iterable, Dict, Literal

import free_entailments_algorithm_utils as fea



In [2]:
# Default values for the notebook parameters. When the notebook is executed
# through papermill, an injected "# Parameters" cell placed after this one
# reassigns these names, so the values here only apply to a plain run.
iteration_number = 1  # forwarded to the fea load / run / finalize helpers
input_csv_path = "labeled_pairs/Results_DS_BtoS_iteration_0.csv"  # read with pd.read_csv into df_llm_original
df_clause_path = None  # forwarded to fea.load_pipeline_data
embedding_cache_path = None  # forwarded to fea.load_pipeline_data
test = True  # selects the `if test:` branches below and is forwarded to the fea helpers
remaining_llm_calls_path = None  # forwarded to fea.load_pipeline_data and fea.finalize_pipeline_iteration
unlabeled_pairs_path = None  # forwarded to fea.load_pipeline_data and fea.finalize_pipeline_iteration
sent_frac = 0.5  # first fraction given to fea.two_random_subsamples; the second is 1 - sent_frac
budget = 0.0  # NOTE(review): not referenced by any cell visible here; confirm it is still needed

In [3]:
# Parameters
# Values injected by papermill for this run; they reassign the defaults set in
# the preceding parameters cell.
iteration_number = 0
input_csv_path = "fea_iterations/loop_data/input_iter_0.csv"
df_clause_path = "fea_iterations/loop_data/df_clause.pkl"
embedding_cache_path = "fea_iterations/loop_data/embedding_cache.pkl"
test = True
remaining_llm_calls_path = "fea_iterations/loop_data/remaining_llm_calls.pkl"
unlabeled_pairs_path = None
sent_frac = 0.5
budget = 0.0


In [4]:
# Load every input of this iteration in one call, then bind each entry of the
# returned mapping to the notebook-level name used by the later cells.
pipeline_data = fea.load_pipeline_data(
    iteration_number=iteration_number,
    test=test,
    df_clause_path=df_clause_path,
    embedding_cache_path=embedding_cache_path,
    remaining_llm_calls_path=remaining_llm_calls_path,
    unlabeled_pairs_path=unlabeled_pairs_path,
)

df_clause, embedding_cache_finetuned, remaining_llm_calls, unlabeled_pairs = (
    pipeline_data['df_clause'],
    pipeline_data['embedding_cache'],
    pipeline_data['remaining_llm_calls'],
    pipeline_data['unlabeled_pairs'],
)


PARAMETER VALUES AFTER PAPERMILL INJECTION:
iteration_number = 0
test = True
remaining_llm_calls_path = fea_iterations/loop_data/remaining_llm_calls.pkl
df_clause_path = fea_iterations/loop_data/df_clause.pkl

✓ Loaded df_clause: 63909 rows


✓ Loaded embedding cache: 63909 embeddings
✓ Loaded remaining_llm_calls: 5000 rows
✓ All data loaded from pickle files


# Task 1: Setting up dataframes and Running FEA

In [5]:
# Both add_verdict calls read the same id / conclusion columns, so the column
# configuration is written once and shared.
verdict_columns = dict(
    id1_col='sentence_id_1',
    id2_col='sentence_id_2',
    conclusion_col='llm_conclusion_12',
    positive_label='YES',
)

df_llm_original = pd.read_csv(input_csv_path)
df_llm = fea.add_verdict(df_llm_original, **verdict_columns)

# In test mode the remaining LLM calls are given a verdict as well.
if test:
    df_llm_remaining = fea.add_verdict(remaining_llm_calls, **verdict_columns)


VERDICT SUMMARY
Total pairs: 5000
Bidirectional entailment (YES): 588 (11.8%)
Not bidirectionally entailed (NO): 4412 (88.2%)




VERDICT SUMMARY
Total pairs: 5000
Bidirectional entailment (YES): 627 (12.5%)
Not bidirectionally entailed (NO): 4373 (87.5%)



In [6]:
# Join the verdict-annotated pairs with the clause table, selecting the
# id/sentence columns from df_clause and the pair ids plus verdict from df_llm.
clause_text_cols = ['sentence_id', 'sentence']
labeled_pair_cols = ['sentence_id_1', 'sentence_id_2', 'verdict']

df_labeled = fea.merge_pairwise_texts(
    df1=df_clause,
    df2=df_llm,
    df1_cols=clause_text_cols,
    df2_cols=labeled_pair_cols,
)
df_labeled.head()

Unnamed: 0,id1,id2,text1,text2,verdict
0,B0860002sc,S0010771002sc,The king's support must match his responsibili...,clear evidence of acting against the interests...,NO
1,B1170001sc,S0020225001sc,Active governance by the prince is essential f...,Maintaining respect for the monarchy is essential,NO
2,B0454001p,S0004868005p,Agrarian laws can effectively prevent the rise...,This situation highlights the tension between ...,NO
3,B0227001sc,S0000883002sc,Parliament should hold the power to correct le...,Parliament must uphold the rule of law,YES
4,B0580002sc,S0023399001sc,The king's presence is essential for validatin...,Parliament must assert authority,NO


In [7]:
# Pick the pairs to predict on and the names of their id columns; the merge
# with the clause table is then the same call in both modes.
if test:
    pairs_to_predict = df_llm_remaining
    pair_id_cols = ['sentence_id_1', 'sentence_id_2']
else:
    # Unlabeled pairs minus the pairs already present in df_labeled.
    pairs_to_predict = fea.setminus(
        df_big=unlabeled_pairs,
        df_small=df_labeled,
        id_cols=['id1', 'id2'],
    )
    pair_id_cols = ['id1', 'id2']

df_predict = fea.merge_pairwise_texts(
    df1=df_clause,
    df2=pairs_to_predict,
    df1_cols=['sentence_id', 'sentence'],
    df2_cols=pair_id_cols,
)

df_predict.head()


Unnamed: 0,id1,id2,text1,text2,verdict
0,B0783006p,S0019961006p,The Parliament holds the power to regulate the...,"By promptly seeking the King's guidance, Parli...",
1,B1129001sc,S0003329002sc,A monarchical dominion needs a clear hierarchy,to enhance the monarchy-Parliament relationship,
2,B0287002sc,S0019015004sc,A government must maintain societal integrity,Accountability ensures government integrity,
3,B1086002sc,S0023795002sc,threatens the stability of a Commonwealth,Preserving Parliament's authority is essential...,
4,B0336005p,S0016856004p,Such focused discussions would enable the coun...,Established procedures should guide discussion...,


## Embedding All Sentences

In [8]:
##Patches an error later on with kwargs

def _safe_list_templates(*args, **kwargs):
    return []

# `list_repo_templates` is rebound in both modules, in this order:
# transformers.utils.hub first, then transformers.tokenization_utils_base,
# which had already imported the broken function and so needs its own update.
patch_targets = (
    (transformers.utils.hub, " - Patched transformers.utils.hub"),
    (transformers.tokenization_utils_base, " - Patched transformers.tokenization_utils_base"),
)
for target_module, patched_message in patch_targets:
    target_module.list_repo_templates = _safe_list_templates
    print(patched_message)

print("\nSUCCESS: The 404 error is now blocked.")

 - Patched transformers.utils.hub
 - Patched transformers.tokenization_utils_base

SUCCESS: The 404 error is now blocked.


## Test and Validation Subsamples

In [9]:
# Keep only the labeled pairs whose verdict is 'YES' (the entailed pairs).
is_entailed = df_labeled['verdict'] == 'YES'
df_obs_ent = df_labeled.loc[is_entailed]
df_obs_ent.head()

Unnamed: 0,id1,id2,text1,text2,verdict
3,B0227001sc,S0000883002sc,Parliament should hold the power to correct le...,Parliament must uphold the rule of law,YES
18,B0134001sc,S0004953001sc,Governance derives its legitimacy from the peo...,Governance legitimacy should come from the wil...,YES
20,B0794007p,S0000823011p,King Charles's actions demonstrate a tyrannica...,King Charles I's disregard for the people's vo...,YES
22,B0161002p,S15310007p,Such actions threaten the liberties and well-b...,Such actions endanger the rights of individual...,YES
26,B0252006p,S0000715007p,"The authority of governing bodies, like Parlia...",Parliament serves as a check on the power of t...,YES


In [10]:
# Step 1: add the "equivalents1" / "equivalents2" columns to the pairs to
# predict, built from the observed entailed pairs.
candidates_with_equivalents = fea.add_equivalents_from_pairs(
    df3=df_obs_ent,
    df4=df_predict,
    df3_cols=["id1", "id2"],
    df4_cols=["id1", "id2"],
    new_cols=("equivalents1", "equivalents2"),
    include_self=False,
)

# Step 2: add the "alpha" column computed from the two equivalents columns.
df_candidates = fea.add_alpha_weight_column(
    df=candidates_with_equivalents,
    list_col1='equivalents1',
    list_col2='equivalents2',
    new_col="alpha",
)

In [11]:
# Step 1: add the "equivalents1" / "equivalents2" columns to the labeled
# pairs, built from the observed entailed pairs. include_self is False here,
# the same setting used for the candidate pairs.
labeled_with_equivalents = fea.add_equivalents_from_pairs(
    df3=df_obs_ent,
    df4=df_labeled,
    df3_cols=["id1", "id2"],
    df4_cols=["id1", "id2"],
    new_cols=("equivalents1", "equivalents2"),
    include_self=False,
)

# Step 2: add the "alpha" column computed from the two equivalents columns.
df_labeled = fea.add_alpha_weight_column(
    df=labeled_with_equivalents,
    list_col1='equivalents1',
    list_col2='equivalents2',
    new_col="alpha",
)

## Equivalence Classes

In [12]:
# Build the set of all clause pairs i/j with k in the class of j/i, starting
# from the candidate pairs and their equivalents columns.
crossed_candidate_ids = fea.build_equiv_pair_candidates(
    df=df_candidates,
    id1_col="id1",
    id2_col="id2",
    equiv1_col="equivalents1",
    equiv2_col="equivalents2",
)

# Look up the sentence of each clause in the crossed pairs.
df_crossed = fea.merge_pairwise_texts(
    df1=df_clause,
    df2=crossed_candidate_ids,
    df1_cols=['sentence_id', 'sentence'],
    df2_cols=['id1', 'id2'],
)

df_crossed.head()

Filtered 2097 pairs (kept 1018).


Unnamed: 0,id1,id2,text1,text2,verdict
0,B0278001sc,B0311001sc,The King's power should be limited to promote ...,The king requires parliamentary approval to im...,
1,B0835004p,B0857006p,"The people create the king, suggesting that th...",Undermining the king's authority is tantamount...,
2,B0663002p,B0382007p,A king's power and authority are best confirme...,The mutual obligation between the king and the...,
3,B0383006p,B0087006p,"If a ruler acts as an enemy to their subjects,...",The overarching control of a single ruler can ...,
4,B0711002sc,B0204002sc,Laws establish a structured relationship with ...,Laws protect their rights in the political str...,


In [13]:
# Same crossing as for the candidate pairs, applied to the labeled pairs and
# their equivalents columns.
crossed_labeled_ids = fea.build_equiv_pair_candidates(
    df=df_labeled,
    id1_col="id1",
    id2_col="id2",
    equiv1_col="equivalents1",
    equiv2_col="equivalents2",
)

# Look up the sentence of each clause in the crossed pairs.
df_labeled_crossed = fea.merge_pairwise_texts(
    df1=df_clause,
    df2=crossed_labeled_ids,
    df1_cols=['sentence_id', 'sentence'],
    df2_cols=['id1', 'id2'],
)

df_labeled_crossed.head()

Filtered 2641 pairs (kept 1658).


Unnamed: 0,id1,id2,text1,text2,verdict
0,B0454001p,B0203001p,Agrarian laws can effectively prevent the rise...,Laws are necessary to limit the power of kings,
1,B0227001sc,B0227001sc,Parliament should hold the power to correct le...,Parliament should hold the power to correct le...,
2,B0244002sc,B0311001sc,Parliaments in England can create laws indepen...,The king requires parliamentary approval to im...,
3,B0089006p,B0800005p,The rights and liberties of the people depend ...,The authority of a king or any governing body ...,
4,B0134001sc,B0134001sc,Governance derives its legitimacy from the peo...,Governance derives its legitimacy from the peo...,


## Running FEA

In [14]:
# Hand every frame prepared above, plus the clause table and embedding cache,
# to the FEA runner; it returns a pair that is unpacked into the result frame
# and the HTML figure.
fea_outputs = fea.run_fea_papermill(
    iteration_number=iteration_number,
    df_clause=df_clause,
    embedding_cache=embedding_cache_finetuned,
    df_obs_ent=df_obs_ent,
    df_candidates=df_candidates,
    df_crossed=df_crossed,
    df_labeled=df_labeled,
    df_labeled_crossed=df_labeled_crossed,
)
df_final, fig_html = fea_outputs

Executing FreeEntailmentAlgorithm.ipynb for iteration 0...


Executing:   0%|          | 0/33 [00:00<?, ?cell/s]

✓ Retrieved outputs:
  - df_final: 207 rows
  - fig_html: HTML plot (14552 chars)


In [15]:
# Preview the first five rows of the FEA output.
df_final.head(n=5)

Unnamed: 0,id1,id2,text1,text2,entailment_probability
8,B0278001sc,S0017330003sc,The King's power should be limited to promote ...,The monarchy must maintain governance legitima...,0.76849
19,B0835004p,S0023245007p,"The people create the king, suggesting that th...",The exploitation of the King's name could erod...,0.761231
50,B0223012p,S0020207005p,Limiting a King's authority to the consent of ...,The argument that the King's formal approval i...,0.802363
164,B0989004p,S0000835006p,This limited authority aligns with the princip...,It is important to uphold parliamentary privil...,0.569985
182,B0194012p,S0023803002p,There are mechanisms in place to address any t...,The argument against the idea that the King po...,0.761231


# Task 2: Cleaning LLM Calls

In [16]:
# Replace the existing index with a fresh default one, discarding the old labels.
df_final = df_final.reset_index(drop=True)

# Split into two random subsamples using fractions sent_frac and its complement.
# NOTE(review): 42 is presumably the random seed; confirm against
# fea.two_random_subsamples.
rest_frac = 1 - sent_frac
df_to_llm, rest_above_tau = fea.two_random_subsamples(df_final, sent_frac, rest_frac, 42)

In [17]:
# Rebind df_to_llm to its formatted version (the later cells use this name),
# then preview the first five rows.
df_to_llm = fea.format_df_to_llm(df_to_llm)
df_to_llm.head(n=5)

Unnamed: 0,sentence_id_2,sentence_id_1,sentence_text_2,argument_id_2,sentence_text_1,argument_id_1,score
154,S0016873008p,B0842005p,Any significant alteration to the King's role ...,S00168,The multiplicity of kings and their varying po...,B0842,0.848554
26,S0022934009p,B0314009p,There is a necessity for a constitutional fram...,S00229,The historical context shows that the struggle...,B0314,0.802363
101,S0003662006p,B0314009p,This situation reflects broader concerns about...,S00036,The historical context shows that the struggle...,B0314,0.474935
46,S0001471001p,B0780004p,The authority of Parliament must be respected ...,S00014,The Parliament holds the power to regulate the...,B0780,0.871072
38,S0021163002sc,B0506001sc,to uphold justice,S00211,Established laws ensure justice,B0506,0.976345


In [18]:
# Show the (rows, columns) size of the batch about to be handed to the finalize step.
df_to_llm.shape

(103, 7)

# Next loop:

In [19]:
# Close out this iteration, then rebind the two tracking frames to the
# versions returned in the result mapping.
result = fea.finalize_pipeline_iteration(
    iteration_number=iteration_number,
    test=test,
    df_to_llm=df_to_llm,
    remaining_llm_calls=remaining_llm_calls,
    remaining_llm_calls_path=remaining_llm_calls_path,
    unlabeled_pairs=unlabeled_pairs,
    unlabeled_pairs_path=unlabeled_pairs_path,
)

remaining_llm_calls, unlabeled_pairs = (
    result['remaining_llm_calls'],
    result['unlabeled_pairs'],
)


TEST MODE: Mocking LLM responses
✓ Matched 103/103 pairs with mock LLM results
✓ Removed 103 pairs from remaining LLM calls
✓ Remaining pairs for future iterations: 4897
✓ Saved updated remaining_llm_calls to fea_iterations/loop_data/remaining_llm_calls.pkl
✓ Saved 103 pairs with LLM results to fea_iterations/llm_batch_iter_0.csv

Iteration 0 complete
