In [1]:
import pandas as pd
import sys
import transformers.utils.hub
import transformers.tokenization_utils_base
import pickle
import importlib
import os
import papermill as pm
import scrapbook as sb
import numpy as np

from sentence_transformers import SentenceTransformer
from collections import defaultdict
from typing import List, Tuple, Any, Mapping, Iterable, Dict, Literal

import free_entailments_algorithm_utils as fea



In [2]:
# Default parameter values for this notebook. When the notebook is executed
# through papermill, the injected "# Parameters" cell that follows overrides them.
iteration_number = 1  # index of the current pipeline iteration
input_csv_path = "labeled_pairs/Results_DS_BtoS_iteration_0.csv"  # CSV of already-labeled pairs, read with pd.read_csv
df_clause_path = None  # forwarded to fea.load_pipeline_data
embedding_cache_path = None  # forwarded to fea.load_pipeline_data
test = True  # True: pairs to predict come from remaining_llm_calls; False: from unlabeled_pairs
remaining_llm_calls_path = None  # forwarded to fea.load_pipeline_data and fea.finalize_pipeline_iteration
unlabeled_pairs_path = None  # forwarded to fea.load_pipeline_data and fea.finalize_pipeline_iteration
sent_frac = 0.5  # first fraction given to fea.two_random_subsamples (the batch that becomes df_to_llm)
budget = 0.0  # NOTE(review): not referenced in any visible cell — confirm it is still needed

In [3]:
# Parameters
# Values injected for this run; they override the defaults assigned in the previous cell.
iteration_number = 3
input_csv_path = "fea_iterations/loop_data/accumulated_labeled_iter_2.csv"
df_clause_path = "fea_iterations/loop_data/df_clause.pkl"
embedding_cache_path = "fea_iterations/loop_data/embedding_cache.pkl"
test = True
remaining_llm_calls_path = "fea_iterations/loop_data/remaining_llm_calls.pkl"
unlabeled_pairs_path = None
sent_frac = 0.5
budget = 0.0


In [4]:
# Load every input of this iteration in one call; the helper returns a mapping
# from which the four working objects are pulled below.
pipeline_data = fea.load_pipeline_data(
    df_clause_path=df_clause_path,
    embedding_cache_path=embedding_cache_path,
    test=test,
    remaining_llm_calls_path=remaining_llm_calls_path,
    unlabeled_pairs_path=unlabeled_pairs_path,
    iteration_number=iteration_number,
)

# Unpack the loaded objects in a fixed key order.
df_clause, embedding_cache_finetuned, remaining_llm_calls, unlabeled_pairs = (
    pipeline_data[key]
    for key in ('df_clause', 'embedding_cache', 'remaining_llm_calls', 'unlabeled_pairs')
)


PARAMETER VALUES AFTER PAPERMILL INJECTION:
iteration_number = 3
test = True
remaining_llm_calls_path = fea_iterations/loop_data/remaining_llm_calls.pkl
df_clause_path = fea_iterations/loop_data/df_clause.pkl

✓ Loaded df_clause: 63909 rows


✓ Loaded embedding cache: 63909 embeddings
✓ Loaded remaining_llm_calls: 4818 rows
✓ All data loaded from pickle files


# Task 1: Setting up dataframes and Running FEA

In [5]:
# Column mapping shared by every add_verdict call in this cell.
verdict_options = dict(
    id1_col='sentence_id_1',
    id2_col='sentence_id_2',
    conclusion_col='llm_conclusion_12',
    positive_label='YES',
)

# Pairs labeled in previous iterations, with a verdict column added.
df_llm_original = pd.read_csv(input_csv_path)
df_llm = fea.add_verdict(df_llm_original, **verdict_options)

# Only in test mode: add a verdict to the pairs still held back in
# remaining_llm_calls. df_llm_remaining is not defined when test is False.
if test:
    df_llm_remaining = fea.add_verdict(remaining_llm_calls, **verdict_options)


VERDICT SUMMARY
Total pairs: 5182
Bidirectional entailment (YES): 657 (12.7%)
Not bidirectionally entailed (NO): 4525 (87.3%)




VERDICT SUMMARY
Total pairs: 4818
Bidirectional entailment (YES): 558 (11.6%)
Not bidirectionally entailed (NO): 4260 (88.4%)



In [6]:
# Attach the sentence text of both clauses to each labeled pair.
clause_cols = ['sentence_id', 'sentence']
labeled_pair_cols = ['sentence_id_1', 'sentence_id_2', 'verdict']

df_labeled = fea.merge_pairwise_texts(
    df1=df_clause,
    df2=df_llm,
    df1_cols=clause_cols,
    df2_cols=labeled_pair_cols,
)
df_labeled.head()

Unnamed: 0,id1,id2,text1,text2,verdict
0,B0860002sc,S0010771002sc,The king's support must match his responsibili...,clear evidence of acting against the interests...,NO
1,B1170001sc,S0020225001sc,Active governance by the prince is essential f...,Maintaining respect for the monarchy is essential,NO
2,B0454001p,S0004868005p,Agrarian laws can effectively prevent the rise...,This situation highlights the tension between ...,NO
3,B0227001sc,S0000883002sc,Parliament should hold the power to correct le...,Parliament must uphold the rule of law,YES
4,B0580002sc,S0023399001sc,The king's presence is essential for validatin...,Parliament must assert authority,NO


In [7]:
# Choose the pairs to predict on and the names of their two ID columns:
#   - test mode: the held-back pairs in df_llm_remaining;
#   - otherwise: the unlabeled pairs that are not already in df_labeled.
if test:
    pairs_to_predict = df_llm_remaining
    pair_id_cols = ['sentence_id_1', 'sentence_id_2']
else:
    pairs_to_predict = fea.setminus(
        df_big=unlabeled_pairs,
        df_small=df_labeled,
        id_cols=['id1', 'id2'],
    )
    pair_id_cols = ['id1', 'id2']

# Attach the sentence text of both clauses to each pair.
df_predict = fea.merge_pairwise_texts(
    df1=df_clause,
    df2=pairs_to_predict,
    df1_cols=['sentence_id', 'sentence'],
    df2_cols=pair_id_cols,
)

df_predict.head()


Unnamed: 0,id1,id2,text1,text2,verdict
0,B0783006p,S0019961006p,The Parliament holds the power to regulate the...,"By promptly seeking the King's guidance, Parli...",
1,B1129001sc,S0003329002sc,A monarchical dominion needs a clear hierarchy,to enhance the monarchy-Parliament relationship,
2,B0287002sc,S0019015004sc,A government must maintain societal integrity,Accountability ensures government integrity,
3,B1086002sc,S0023795002sc,threatens the stability of a Commonwealth,Preserving Parliament's authority is essential...,
4,B0336005p,S0016856004p,Such focused discussions would enable the coun...,Established procedures should guide discussion...,


## Embedding All Sentences

In [8]:
##Patches an error later on with kwargs

def _safe_list_templates(*args, **kwargs):
    return []

# Both modules need the replacement: transformers.tokenization_utils_base had
# already imported the broken function from transformers.utils.hub, so
# patching the hub module alone would leave a stale reference behind.
patch_targets = (
    (transformers.utils.hub, " - Patched transformers.utils.hub"),
    (transformers.tokenization_utils_base, " - Patched transformers.tokenization_utils_base"),
)
for target_module, patch_message in patch_targets:
    target_module.list_repo_templates = _safe_list_templates
    print(patch_message)

print("\nSUCCESS: The 404 error is now blocked.")

 - Patched transformers.utils.hub
 - Patched transformers.tokenization_utils_base

SUCCESS: The 404 error is now blocked.


## Test and Validation Subsamples

In [9]:
# Keep only the labeled pairs whose verdict is 'YES' (the observed entailments).
is_entailed = df_labeled['verdict'].eq('YES')
df_obs_ent = df_labeled[is_entailed]
df_obs_ent.head()

Unnamed: 0,id1,id2,text1,text2,verdict
3,B0227001sc,S0000883002sc,Parliament should hold the power to correct le...,Parliament must uphold the rule of law,YES
18,B0134001sc,S0004953001sc,Governance derives its legitimacy from the peo...,Governance legitimacy should come from the wil...,YES
20,B0794007p,S0000823011p,King Charles's actions demonstrate a tyrannica...,King Charles I's disregard for the people's vo...,YES
22,B0161002p,S15310007p,Such actions threaten the liberties and well-b...,Such actions endanger the rights of individual...,YES
26,B0252006p,S0000715007p,"The authority of governing bodies, like Parlia...",Parliament serves as a check on the power of t...,YES


In [10]:
# Step 1: using the observed entailed pairs (df_obs_ent), add the columns
# equivalents1 / equivalents2 to every pair in df_predict.
df_candidates = fea.add_equivalents_from_pairs(
    df3=df_obs_ent, df4=df_predict,
    df3_cols=["id1", "id2"], df4_cols=["id1", "id2"],
    new_cols=("equivalents1", "equivalents2"),
    include_self=False,
)

# Step 2: derive the "alpha" column from the two equivalents columns.
df_candidates = fea.add_alpha_weight_column(
    df=df_candidates,
    list_col1='equivalents1', list_col2='equivalents2',
    new_col="alpha",
)

In [11]:
# Same two steps as for the candidate pairs, applied to the labeled pairs.
# df_labeled is overwritten with its augmented version, so re-running this
# cell on its own feeds the already-augmented frame back into the helper.
# NOTE(review): the earlier inline comment on include_self read "keep the ID
# itself in the list", which contradicts the value False — confirm which is intended.
df_labeled = fea.add_equivalents_from_pairs(
    df3=df_obs_ent, df4=df_labeled,
    df3_cols=["id1", "id2"], df4_cols=["id1", "id2"],
    new_cols=("equivalents1", "equivalents2"),
    include_self=False,
)

# Derive the "alpha" column from the two equivalents columns.
df_labeled = fea.add_alpha_weight_column(
    df=df_labeled,
    list_col1='equivalents1', list_col2='equivalents2',
    new_col="alpha",
)

## Equivalence Classes

In [12]:
# Produce set of all pairs of clauses i/j with k in the class of j/i
df_crossed = fea.build_equiv_pair_candidates(
    df = df_candidates,
    id1_col = "id1",
    id2_col = "id2",
    equiv1_col = "equivalents1",
    equiv2_col = "equivalents2",
)

# Retrieve clause sentences
df_crossed = fea.merge_pairwise_texts(
    df1 = df_clause,
    df2 = df_crossed,
    df1_cols = ['sentence_id', 'sentence'],
    df2_cols = ['id1', 'id2']
)

df_crossed.head()

Filtered 2195 pairs (kept 879).


Unnamed: 0,id1,id2,text1,text2,verdict
0,B0663002p,B0382007p,A king's power and authority are best confirme...,The mutual obligation between the king and the...,
1,B0383006p,B0087006p,"If a ruler acts as an enemy to their subjects,...",The overarching control of a single ruler can ...,
2,B0383006p,B0260004p,"If a ruler acts as an enemy to their subjects,...",This demonstrates that authority is not an inh...,
3,B0711002sc,B0204002sc,Laws establish a structured relationship with ...,Laws protect their rights in the political str...,
4,B0711002sc,B0289001sc,Laws establish a structured relationship with ...,The legal system ensures justice through a col...,


In [13]:
# Column names consumed by build_equiv_pair_candidates.
labeled_crossed_kwargs = dict(
    id1_col="id1",
    id2_col="id2",
    equiv1_col="equivalents1",
    equiv2_col="equivalents2",
)

# Same crossing as for the candidate pairs, this time starting from the labeled pairs.
df_labeled_crossed = fea.build_equiv_pair_candidates(df=df_labeled, **labeled_crossed_kwargs)

# Look up the sentence text of both clauses of each crossed pair.
df_labeled_crossed = fea.merge_pairwise_texts(
    df1=df_clause,
    df2=df_labeled_crossed,
    df1_cols=['sentence_id', 'sentence'],
    df2_cols=['id1', 'id2'],
)

df_labeled_crossed.head()

Filtered 3264 pairs (kept 2427).


Unnamed: 0,id1,id2,text1,text2,verdict
0,B0454001p,B0203001p,Agrarian laws can effectively prevent the rise...,Laws are necessary to limit the power of kings,
1,B0454001p,B0278002p,Agrarian laws can effectively prevent the rise...,Allowing a King to have absolute power undermi...,
2,B0454001p,B0314009p,Agrarian laws can effectively prevent the rise...,The historical context shows that the struggle...,
3,B0227001sc,B0227001sc,Parliament should hold the power to correct le...,Parliament should hold the power to correct le...,
4,B0244002sc,B0311001sc,Parliaments in England can create laws indepen...,The king requires parliamentary approval to im...,


## Running FEA

In [14]:
# Hand every frame prepared above, plus the embedding cache, to the FEA run.
# Per the run log this executes FreeEntailmentAlgorithm.ipynb for the current
# iteration and returns its outputs: a frame of pairs and an HTML plot.
# NOTE(review): fig_html is not displayed or saved in any visible cell — confirm whether it should be.
df_final, fig_html = fea.run_fea_papermill(
    iteration_number=iteration_number,
    df_candidates=df_candidates,
    df_crossed=df_crossed,
    df_labeled=df_labeled,
    df_labeled_crossed=df_labeled_crossed,
    df_obs_ent=df_obs_ent,
    df_clause=df_clause,
    embedding_cache=embedding_cache_finetuned,
)

Executing FreeEntailmentAlgorithm.ipynb for iteration 3...


Executing:   0%|          | 0/33 [00:00<?, ?cell/s]

✓ Retrieved outputs:
  - df_final: 30 rows
  - fig_html: HTML plot (14844 chars)


In [15]:
# Preview the pairs returned by the FEA run together with their entailment_probability.
df_final.head()

Unnamed: 0,id1,id2,text1,text2,entailment_probability
274,B0316001sc,S0017330003sc,An absolute monarchy without the consent of th...,The monarchy must maintain governance legitima...,0.433528
365,B0566003p,S0022913006p,The essence of kingship lies in the notion tha...,The people should not grant authority to a rul...,0.420576
416,B0268007p,S0017381010p,The Parliament acts on behalf of the People,Parliament must uphold the rights of its const...,0.953736
431,B0789007p,S0023251001p,If the king's rule leads to oppression or unde...,"The prerogative of the King must be upheld, as...",0.883361
556,B0380003p,S0024281008p,The king is entrusted with governance for the ...,"By engaging with Parliament, the King can ensu...",0.984057


# Task 2: Cleaning LLM Calls

In [16]:
# The FEA output keeps its original index labels; renumber the rows from 0.
df_final = df_final.reset_index(drop=True)

# Split df_final into two random subsamples: a share of sent_frac that goes to
# the LLM, and the complementary share that is held back.
llm_share = sent_frac
held_back_share = 1 - sent_frac
split_seed = 42  # fixed seed so the split is reproducible across runs
df_to_llm, rest_above_tau = fea.two_random_subsamples(
    df_final, llm_share, held_back_share, split_seed
)

In [17]:
# Convert the sampled pairs to the column layout shown in the output below
# (sentence_id_*, sentence_text_*, argument_id_*, score).
# NOTE(review): df_to_llm is overwritten with its formatted version, so re-running
# this cell alone passes an already-formatted frame back into format_df_to_llm.
df_to_llm = fea.format_df_to_llm(df_to_llm)
df_to_llm.head()

Unnamed: 0,sentence_id_2,sentence_id_1,sentence_text_2,argument_id_2,sentence_text_1,argument_id_1,score
29,S0000962003sc,B1131001sc,Parliament must ensure the kingdom's stability,S00009,Governance should balance the king's orders wi...,B1131,0.534539
16,S10938004p,B0395004p,The King does not have any additional authorit...,S10938,This distinction is crucial because it recogni...,B0395,0.420576
7,S0018785001sc,B0311001sc,A careful balance of power between the monarch...,S00187,The king requires parliamentary approval to im...,B0311,0.534539
25,S0023870002sc,B0770001sc,Accountability in politics is essential for ma...,S00238,A monarchy can coexist with popular authority,B0770,0.821134
24,S10938004p,B0370005p,The King does not have any additional authorit...,S10938,Rulers should only wield authority against tho...,B0370,0.256359


In [18]:
# (rows, columns) of the batch that is handed to the finalize step below.
df_to_llm.shape

(15, 7)

# Next loop:

In [19]:
# Close the iteration: pass the selected batch and the pools of outstanding
# pairs to the finalize helper, then keep its updated pools.
# NOTE(review): per the run log, in test mode this removes the batch from
# remaining_llm_calls and saves the result back to remaining_llm_calls_path,
# and also writes the batch to a CSV. Re-running the notebook therefore changes
# its own input on disk — confirm that this is intended.
result = fea.finalize_pipeline_iteration(
    test=test,
    df_to_llm=df_to_llm,
    iteration_number=iteration_number,
    remaining_llm_calls=remaining_llm_calls,
    remaining_llm_calls_path=remaining_llm_calls_path,
    unlabeled_pairs=unlabeled_pairs,
    unlabeled_pairs_path=unlabeled_pairs_path,
)

# Replace the in-memory pools with the versions returned by the helper.
remaining_llm_calls = result['remaining_llm_calls']
unlabeled_pairs = result['unlabeled_pairs']


TEST MODE: Mocking LLM responses
✓ Matched 15/15 pairs with mock LLM results
✓ Removed 15 pairs from remaining LLM calls
✓ Remaining pairs for future iterations: 4803
✓ Saved updated remaining_llm_calls to fea_iterations/loop_data/remaining_llm_calls.pkl
✓ Saved 15 pairs with LLM results to fea_iterations/llm_batch_iter_3.csv

Iteration 3 complete
