In [18]:
import os
import json
import tqdm
import threading
from concurrent.futures import ThreadPoolExecutor
import os
import contextlib

In [19]:
from modules.prompts import COT, ZERO_SHOT_PROMPT, FEW_SHOT_PROMPT
from modules import utils
from modules.models import Model, GeminiModel, SelfVerificationModel
from modules.dataset import Dataset, MiniEvalDataset
from modules import explanation_match as em
from modules import evaluate as eval

### Set up the mini-eval directory with the 'answers' (LLM-based ground truth) and 'documents' (perturbed documents without tags).


In [20]:
base_dir = 'perturbed_legal_documents'
PERTURBATION_TYPES = ['ambiguity', 'inconsistencies', 'misaligned_terminalogy', 'omission', 'structural_flaws']
CATEGORIES = ['inText', 'legal']

In [21]:
API_KEYS = [
    "AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k", # Aditya
    "AIzaSyDgafwAgDi2Zjvu6jdt_SIZ60VgK1Na32E", # Aditya
    "AIzaSyCWI7QJXWYBGGWGdL37W8ll0sDIwz0zqlo", # Aditya
    "AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE", # Foo
    "AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k", # Foo
    "AIzaSyAjby-dj9aBsolOdTDpvU7_x5uje8l4yiQ", # Foo
    "AIzaSyCN-EJ7s6CIeEybjT3tM_zN0-4xx4Rcqqw", # Foo
    "AIzaSyCKWwXUILaUvHkyppqY87-cqBad16vZb00", # Foo
    "AIzaSyCfYpaD89nvVJ6GIitszeWI0KXdlgEAv-Q", # Foo
    "AIzaSyCsA0PVE_BygEVMdrGs7Upyo4nBk2FTbhM", # Foo
    "AIzaSyAcqO6uxgeIP5qyxcDZLAY2TC9xyTlBmC0", # Foo
    "AIzaSyC_86XS-IZzhdfmhBSThwQoYMoQuFeY4mQ", # Foo
    "AIzaSyAH4zpotMPNF-GlGYmMMAi6ZoCte5b95Hk", # Ezra
    "AIzaSyDSG4tUWCN6oA7b2XMS8zLOfXG7R987D2Y", # Ezra
    "AIzaSyDwBOvWeSweppAjbU3fwWqBm0a_M7JGOWw", # Ezra
    "AIzaSyCqqBjoa2M6HF7aEagzJn_2ckEYrW1s7wY", # Ezra
    "AIzaSyAGHtD2RAI1geToBsVjk-mIzVeuhlZQtA4", # Noel
    "AIzaSyBTYgTD42xCABfJy1jsHchkZEhFaw8X1_c", # Mannan
]

You retrieve elements in each dataset like this:

In [22]:
dataset = MiniEvalDataset()
display(dataset[0]["answers"], dataset[0]["documents"])


[{'file_name': 'ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt',
  'perturbation': [{'type': 'Ambiguities - In Text Contradiction',
    'original_text': 'A. CONSULTANT\'S "ENDORSEMENT" means the right to use the CONSULTANT\'S name, fame, nickname, autograph, voice, facsimile, signature, photograph, likeness, and image in connection with the marketing, advertising, promotion and sale of ADAMS GOLF\'S PRODUCT.',
    'changed_text': 'A. CONSULTANT\'S "ENDORSEMENT" means the right to use the CONSULTANT\'S name solely for marketing materials directly created by ADAMS GOLF. This excludes use of likeness or image for promotional events unless specifically agreed upon in writing.',
    'explanation': 'The original definition of "ENDORSEMENT" is broad, including name, likeness, and image. The modified definition restricts the endorsement to the use of name only for marketing materials, contradicting the broad definition of endorsement in the original clause. This introduces ambiguity

'REDACTED COPY CONFIDENTIAL TREATMENT REQUESTED CONFIDENTIAL PORTIONS OF THIS DOCUMENT HAVE BEEN REDACTED AND HAVE BEEN SEPARATELY FILED WITH THE COMMISSION 1 ENDORSEMENT AGREEMENT This Agreement is entered into on January 13, 2005 between professional golfer, TOM WATSON, (hereinafter referred to as "CONSULTANT") and ADAMS GOLF, LTD. (hereinafter referred to as "ADAMS GOLF"). WITNESSETH WHEREAS, ADAMS GOLF desires to obtain the right to use the name, likeness and ENDORSEMENT of CONSULTANT in connection with the advertisement and promotion of ADAMS GOLF\'S PRODUCT; NOW THEREFORE, in consideration of the mutual covenants contained herein and other good and valuable consideration, the receipt and sufficiency of which is hereby acknowledged, the parties agree as follows: CONTRACT PERIOD 1. TERM OF CONTRACT The Term of this Agreement shall be for a period of [* ****] years and [*****] months commencing the 1st day of September 2004 and terminating the [*****] day of [*****]. 2. DEFINITIONS 

**You check the length like this:**

In [23]:
# len(dataset)
# print(dataset[5]["file_name"])

**To maintain the base file name, removing `modified_` or `perturbed_`**

In [24]:
# dataset = MiniEvalDataset()
# dataset.clean_filenames()

### Implementation of `generate_responses`

In [25]:
def generate_responses(model, dataset, prompt: str, output_dir, num_responses: int = 1):
    try:
        for sample in tqdm.tqdm(dataset, desc="Processing samples"):
            # print(sample)
            # Prepare base directory and document text
            base_name = sample["file_name"]
            document_with_tags = sample["documents"]
            document_with_tags_removed = sample["documents"].replace("<*$p$*>", "") 
            ground_truth = sample["answers"][0]["perturbation"]

            for i in range(num_responses):
                # Construct output path: outputs/self_consistency/<subdir>/<filename>_i.json
                subdir = os.path.join(output_dir, "self_consistency", os.path.dirname(base_name))
                os.makedirs(subdir, exist_ok=True)
                output_path = os.path.join(subdir, os.path.basename(base_name) + f"_{i}.json")

                # Skip if file already exists
                if os.path.exists(output_path):
                    continue

                # Generate model response
                model_response = model.generate(
                    # prompt.replace("[DOCUMENT]", document_with_tags_removed)
                    prompt.replace("[DOCUMENT]", document_with_tags)
                )
                parsed_response = utils.clean_and_parse_model_response(model_response)

                if parsed_response:
                    updated_predictions = utils.add_section_identified_flag(parsed_response, ground_truth)
                    with open(output_path, "w", encoding="utf-8") as f:
                        json.dump(updated_predictions, f, indent=4)
    except Exception as e:
        print(f"❌ Error in generate_responses: {e}")

In [26]:
def run(
    model: Model,
    dataset: Dataset,
    prompt: str,
    responses_dir: str,
    num_responses: int,
    evaluation_model: Model = None
):
    """
    Runs the evaluation process.
    :param model: The model to generate responses.
    :param dataset: The dataset to evaluate.
    :param prompt: The prompt to use for generating responses.
    :param responses_dir: Directory to save the responses.
    :param num_responses: The number of responses to collect per document (for self-consistency)
    :param evaluation_model: Model for evaluating model responses.
    """
    # generate_responses(model, dataset, prompt, responses_dir, num_responses)
    # explanation_match(evaluation_model, dataset, responses_dir)
    print("Running explanation_match...")
    em.explanation_match_sbert(dataset, responses_dir)
    return eval.evaluate_scoring(responses_dir)

In [27]:
runs = [
    {
        "name": "zero-shot",
        "model": GeminiModel(API_KEYS),
        "dataset": MiniEvalDataset(),
        "prompt": ZERO_SHOT_PROMPT,
        "responses_dir": utils.correct_path_name("mini-eval/responses_v2/zero-shot/"),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "name": "zero-shot-cot",
        "model": GeminiModel(API_KEYS),
        "dataset": MiniEvalDataset(),
        "prompt": ZERO_SHOT_PROMPT + COT,
        "responses_dir": utils.correct_path_name("mini-eval/responses_v2/zero-shot-cot/"),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "name": "zero-shot-self-verification",
        "model": SelfVerificationModel(GeminiModel(API_KEYS)),
        "dataset": MiniEvalDataset(),
        "prompt": ZERO_SHOT_PROMPT,
        "responses_dir": utils.correct_path_name(
            "mini-eval/responses_v2/zero-shot-self-verification/"
        ),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "name": "zero-shot-self-verification-cot",
        "model": SelfVerificationModel(GeminiModel(API_KEYS)),
        "dataset": MiniEvalDataset(),
        "prompt": ZERO_SHOT_PROMPT + COT,
        "responses_dir": utils.correct_path_name(
            "mini-eval/responses_v2/zero-shot-self-verification-cot/"
        ),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "name": "few-shot",
        "model": GeminiModel(API_KEYS),
        "dataset": MiniEvalDataset(),
        "prompt": FEW_SHOT_PROMPT,
        "responses_dir": utils.correct_path_name("mini-eval/responses_v2/few-shot/"),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "name": "few-shot-cot",
        "model": GeminiModel(API_KEYS),
        "dataset": MiniEvalDataset(),
        "prompt": FEW_SHOT_PROMPT + COT,
        "responses_dir": utils.correct_path_name("mini-eval/responses_v2/few-shot-cot/"),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "name": "few-shot-self-verification",
        "model": SelfVerificationModel(GeminiModel(API_KEYS)),
        "dataset": MiniEvalDataset(),
        "prompt": FEW_SHOT_PROMPT,
        "responses_dir": utils.correct_path_name(
            "mini-eval/responses_v2/few-shot-self-verification/"
        ),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "name": "few-shot-self-verification-cot",
        "model": SelfVerificationModel(GeminiModel(API_KEYS)),
        "dataset": MiniEvalDataset(),
        "prompt": FEW_SHOT_PROMPT + COT,
        "responses_dir": utils.correct_path_name(
            "mini-eval/responses_v2/few-shot-self-verification-cot/"
        ),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
]

In [28]:
@contextlib.contextmanager
def suppress_output():
    with open(os.devnull, "w") as fnull:
        with contextlib.redirect_stdout(fnull), contextlib.redirect_stderr(fnull):
            yield

# Semaphore to limit the number of concurrent threads to the number of API keys
api_key_semaphore = threading.Semaphore(len(API_KEYS))

run_results = {}

def run_with_semaphore(run_config):
    """
    Wrapper function to run a task while respecting the semaphore.
    """
    with api_key_semaphore:
        run_results[run_config["name"]] = run(
                model=run_config["model"],
                dataset=run_config["dataset"],
                prompt=run_config["prompt"],
                responses_dir=run_config["responses_dir"],
                num_responses=run_config["num_responses"],
                evaluation_model=run_config["evaluation_model"],
            )

with ThreadPoolExecutor(max_workers=len(API_KEYS)) as executor:
    for run_config in runs:
        executor.submit(run_with_semaphore, run_config)

print("✅ DONE")

Running explanation_match...
Running explanation_match...
Running explanation_match...
Running explanation_match...
Running explanation_match...
Running explanation_match...
Running explanation_match...
Running explanation_match...
✅ Model loaded successfully!


Evaluating explanations (SBERT):   4%|▍         | 1/25 [00:00<00:04,  5.22it/s]

❌ Failed to load SBERT model 'all-MiniLM-L6-v2': Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device.

📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\few-shot-cot\self_consistency\ambiguity_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
GT (top sim): The original definition of "ENDORSEMENT" is broad, including name, likeness, and image. The modified definition restricts the endorsement to the use of name only for marketing materials, contradicting the broad definition of endorsement in the original clause. This introduces ambiguity about whether Adams Golf can use Watson's likeness in promotional events without additional written agreements, creating potential legal disputes about the scope of the endorsement rights originally granted. This is contradicting with section 3.
Model: The definition of



❌ Failed to load SBERT model 'all-MiniLM-L6-v2': Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device.

📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\few-shot-cot\self_consistency\ambiguity_inText\BORROWMONEYCOM,INC_06_11_2020-EX-10.1-JOINTVENTUREAGREEMENT.txt_0.json
GT (top sim): This change introduces ambiguity by stating the term can be ended by "mutual agreement of the Managers." This contradicts section 36a which states the Venture will be dissolved and its assets liquidated in the event of the Term expires and is not extended, creating an in-text contradiction. Now the Term can either expire or it can be terminated earlier by Managers
Model: The term is defined as starting on March 1, 2020 and ending on February 28, 2025. This creates an in-text contradiction because, in most years, February has 28 days, but the agr

Evaluating explanations (SBERT):   8%|▊         | 2/25 [00:00<00:03,  6.05it/s]


📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\few-shot-cot\self_consistency\ambiguity_inText\BORROWMONEYCOM,INC_06_11_2020-EX-10.1-JOINTVENTUREAGREEMENT.txt_0.json
GT (top sim): This change introduces ambiguity and in-text contradiction by stating that if the Managers are unable to reach an agreement on major issues, a majority vote of the Managers will be required. This contradicts section 28 which states Any vote required by the Members will be determined such that each Member receives one vote carrying equal weight. Now it is not clear whether a majority vote by Managers or Members will be required.
Model: This section creates a structural flaw in the sense that, the decision of the managers does not account for the contribution of the individual members, which is a structural flaw because it does not acknowledge how the members will react to the contributions and the potential monetary disagreements that might arise.
Score:

Evaluating explanations (SBERT):  12%|█▏        | 3/25 [00:00<00:03,  6.04it/s]



📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\few-shot-cot\self_consistency\ambiguity_inText\CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEBSITEHOSTINGAGREEMENT.txt_0.json
GT (top sim): The original text states that i-on will use best efforts to schedule maintenance between 8pm and 8am EST on weekdays or weekends, implying a commitment to minimize disruption. The changed text removes this commitment and allows i-on to perform maintenance at any time without notice. This contradicts the earlier statement that i-on will maintain the Hosted Site continuously, creating ambiguity regarding the guaranteed uptime and the extent of permissible interruptions. The contradiction is within the 'SERVICES PROVIDED TO THE CUSTOMER' section, where continuous operation is initially promised, but later undermined by unrestricted maintenance scheduling.
Model: The statement promises 24/7 continuous operation but immediately introduces an exceptio

Evaluating explanations (SBERT):   8%|▊         | 2/25 [00:00<00:03,  6.74it/s][A

❌ Failed to load SBERT model 'all-MiniLM-L6-v2': Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device.

📁 Directory: ambiguity_inText
Text Match (any): 15 / 15
  ├─ v1 (changed_text): 15 / 15
  └─ v2 (contradicted_text): 6 / 15
Explanation Match: 3 / 15
Text + Explanation Match: 3 / 15

📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\zero-shot-self-verification\self_consistency\ambiguity_inText\CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEBSITEHOSTINGAGREEMENT.txt_0.json
GT (top sim): The original text states that i-on will use best efforts to schedule maintenance between 8pm and 8am EST on weekdays or weekends, implying a commitment to minimize disruption. The changed text removes this commitment and allows i-on to perform maintenance at any time without notice. This contradicts the earlier statement that i-on will mainta

Evaluating explanations (SBERT):  16%|█▌        | 4/25 [00:00<00:03,  5.76it/s]


📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\zero-shot-self-verification\self_consistency\ambiguity_inText\CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEBSITEHOSTINGAGREEMENT.txt_0.json
GT (top sim): The original text specifies that the customer must pay by the 5th of each month. The modified version changes this to 'within 30 days of receiving the invoice,' creating a contradiction about the exact payment deadline. Additionally, changing 'will' to 'may' regarding service interruption introduces uncertainty about the consequences of late payment. This contradiction affects the enforceability of the payment terms, as the contract now contains conflicting deadlines. The contradiction is rooted in the 'RESPONSIBILITIES OF THE CUSTOMER' section, where the payment terms are explicitly defined.
Model: The contract states that the customer is responsible for paying the recurring monthly fee, but does not state when the invoice will b




📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\few-shot-cot\self_consistency\ambiguity_inText\Freecook_20180605_S-1_EX-10.3_11233807_EX-10.3_HostingAgreement.txt_0.json
GT (top sim): The original text specifies a fixed amount of $5,000 as the Contract Price, while the modified text changes this to a 'reasonable sum'. Additionally, the original text specified 'in accordance with', while the modified text specified 'in general accordance with'. This introduces uncertainty and ambiguity in Section 1. Website Design and Development of the text, as it's unclear how a 'reasonable sum' will be determined and leaves open to interpretation and debate what constitutes 'general accordance' with the Scope of Work, potentially conflicting with Section 2. Payment Terms where a specific amount ($5,000) and payment schedule are outlined.
Model: The clause uses the term "reasonable sum" but doesn't define it. However, Section 2 specifies the Con

Evaluating explanations (SBERT):  20%|██        | 5/25 [00:00<00:03,  5.82it/s]


📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\zero-shot-self-verification\self_consistency\ambiguity_inText\Freecook_20180605_S-1_EX-10.3_11233807_EX-10.3_HostingAgreement.txt_0.json
GT (top sim): The original text specifies a fixed amount of $5,000 as the Contract Price, while the modified text changes this to a 'reasonable sum'. Additionally, the original text specified 'in accordance with', while the modified text specified 'in general accordance with'. This introduces uncertainty and ambiguity in Section 1. Website Design and Development of the text, as it's unclear how a 'reasonable sum' will be determined and leaves open to interpretation and debate what constitutes 'general accordance' with the Scope of Work, potentially conflicting with Section 2. Payment Terms where a specific amount ($5,000) and payment schedule are outlined.
Model: The phrase "reasonable sum" is ambiguous. A legally sound contract should specify a de

Evaluating explanations (SBERT):  24%|██▍       | 6/25 [00:01<00:03,  5.97it/s]


📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\few-shot-cot\self_consistency\inconsistencies_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
GT (top sim): The extension of the cure period from fifteen (15) to thirty (30) days in the 'Termination for Cause' clause (Section 23) introduces an in-text contradiction, creating confusion about the timeline for addressing and resolving breaches of contract. This contradiction creates ambiguity regarding the steps and timeframe for enforcing the termination clause, creating legal uncertainty and potential disputes.
Model: Paragraph 23 outlines the conditions for termination for cause, but it conflicts with the broader statement in the first line that this paragraph applies, 'Notwithstanding any other paragraph of this Agreement'. Paragraph 8 (Minimum Number of Tournaments and Potential Repayment of Base Compensation) already specifies conditions under which ADAMS 

Evaluating explanations (SBERT):  28%|██▊       | 7/25 [00:01<00:03,  5.98it/s]


📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\few-shot-cot\self_consistency\inconsistencies_inText\BORROWMONEYCOM,INC_06_11_2020-EX-10.1-JOINTVENTUREAGREEMENT.txt_0.json
GT (top sim): The change mandates that JVLS, LLC's $60,000 contribution be made in cash. This contradicts the original text, which states the contribution comes "From Monthly Government , City And State, And Or Private Awarded Contracts", implying it's derived from contracts and not necessarily a direct cash injection. The phrase "in cash" conflicts with the understanding of how the capital is being contributed, creating an in-text contradiction and legal uncertainty.
Model: JVLS, LLC dba Vaccines 2Go's stated contribution is inconsistent and creates a contradiction. They are contributing '$60,000.00 USD in cash. Plus (10%) Of Any Generated Gross Revenue, In Add i t i on to The To ta l Contributions.', which appears to be assigned an 'Agreed Value' of $3,500,00

Evaluating explanations (SBERT):  32%|███▏      | 8/25 [00:01<00:02,  6.45it/s]


📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\zero-shot-self-verification\self_consistency\inconsistencies_inText\BORROWMONEYCOM,INC_06_11_2020-EX-10.1-JOINTVENTUREAGREEMENT.txt_0.json
GT (top sim): The change alters the valuation method of the Venture, replacing the fair market value appraisal of assets with the total revenue generated over the last 3 months. This contradicts the sentence that states: 'In the absence of a written agreement setting a value, the value of the Venture will be determined based on the fair market value appraisal of all Venture assets (less liabilities) in accordance with generally accepted accounting principles (GAAP) by an independent accounting firm agreed to by all Members.' This introduces ambiguity, as both valuation methods are presented, and the section now lacks the previous detail of being determined in accordance to GAAP by an independent accounting firm, which contradicts the binding natu




📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\zero-shot-self-verification\self_consistency\inconsistencies_inText\BORROWMONEYCOM,INC_06_11_2020-EX-10.1-JOINTVENTUREAGREEMENT.txt_0.json
GT (top sim): The change alters the valuation method of the Venture, replacing the fair market value appraisal of assets with the total revenue generated over the last 3 months. This contradicts the sentence that states: 'In the absence of a written agreement setting a value, the value of the Venture will be determined based on the fair market value appraisal of all Venture assets (less liabilities) in accordance with generally accepted accounting principles (GAAP) by an independent accounting firm agreed to by all Members.' This introduces ambiguity, as both valuation methods are presented, and the section now lacks the previous detail of being determined in accordance to GAAP by an independent accounting firm, which contradicts the binding natu

Evaluating explanations (SBERT):  36%|███▌      | 9/25 [00:01<00:02,  6.12it/s]


📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\few-shot-cot\self_consistency\inconsistencies_inText\DOMINIADVISORTRUST_02_18_2005-EX-99.(H)(2)-SPONSORSHIPAGREEMENT.txt_0.json
GT (top sim): This change introduces an in-text contradiction by stating that a penalty of $10,000 is required for termination. This directly conflicts with the initial statement in the same section that termination can occur 'without the payment of any penalty,' causing uncertainty and potential disputes about termination penalties.
Model: The text suggests that termination can occur 'at any time' but also requires 60 days' written notice. The 'at any time' aspect conflicts with the notice period, creating uncertainty whether a party can immediately terminate with the payment of a penalty or if the 60 days notice period still applies.
Score: 0.5526 → ❌ No Match

📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\resp

Evaluating explanations (SBERT):  40%|████      | 10/25 [00:01<00:02,  6.34it/s]


📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\zero-shot-self-verification\self_consistency\inconsistencies_inText\DOMINIADVISORTRUST_02_18_2005-EX-99.(H)(2)-SPONSORSHIPAGREEMENT.txt_0.json
GT (top sim): This change introduces an in-text contradiction. The original clause states that Domini is fully responsible for its subcontractors' actions unless the Trust agrees otherwise in writing. The added sentence negates that responsibility entirely, creating ambiguity and conflicting obligations regarding liability for subcontractors' actions. This is contradictory and has legal and financial ramifications.
Model: This section presents a legal contradiction. While subcontracting is permissible, stating that Domini is not responsible for the acts or omissions of the subcontractor could be problematic under agency law. Generally, a principal (Domini) is liable for the actions of its agent (the subcontractor) acting within the scope of t

Evaluating explanations (SBERT):  44%|████▍     | 11/25 [00:01<00:02,  6.26it/s]


📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\zero-shot-self-verification\self_consistency\inconsistencies_inText\Freecook_20180605_S-1_EX-10.3_11233807_EX-10.3_HostingAgreement.txt_0.json
GT (top sim): The original text clearly defines the surcharge calculation. The modified text introduces an alternative, potentially conflicting, calculation method (flat fee vs. hourly rate + percentage). This in-text contradiction makes it ambiguous which surcharge rate applies and under what circumstances, creating enforcement uncertainty.
Model: The calculation of the rush work surcharge is unclear and contradictory. The text specifies that the surcharge will be 'either the standard hourly rate of $40 plus twenty-five percent (25%) or a flat fee of $100 per hour, whichever is lower.'  $40 plus 25% is $50. The lower amount should be $50 per hour instead of $100 per hour. This leads to a misaligned amount.
Score: 0.7422 → ❌ No Match
✅ Update

Evaluating explanations (SBERT):  48%|████▊     | 12/25 [00:01<00:02,  6.21it/s]


📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\zero-shot-self-verification\self_consistency\misaligned_terminalogy_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
GT (top sim): The original text defines 'ENDORSEMENT' broadly for marketing and sales. The changed text restricts 'ENDORSEMENT' to only internal documentation. This contradicts later sections (e.g., Section 3) that rely on the broader definition for promotional activities.
Model: The definition of "ENDORSEMENT" is limited to ADAMS GOLF'S internal documentation. This is contradictory. An endorsement typically involves public-facing activities, not just internal documentation. This severely restricts the value of the endorsement to ADAMS GOLF and contradicts the stated purpose of the agreement.
Score: 0.6702 → ❌ No Match

📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\few-shot-cot\self_consi

Evaluating explanations (SBERT):  60%|██████    | 15/25 [00:02<00:00, 10.12it/s]


📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\zero-shot-self-verification\self_consistency\misaligned_terminalogy_inText\BORROWMONEYCOM,INC_06_11_2020-EX-10.1-JOINTVENTUREAGREEMENT.txt_0.json
GT (top sim): The addition of 'unless otherwise decided by BorrowMoney.com inc.' gives BorrowMoney.com, inc unilateral power over management voting. This creates contradiction with Section 8 as now Managers cannot be appointed, replaced, or removed upon unanimous consent of the Members.
Model: This clause initially states that each manager has equal voting weight, but then introduces an exception allowing BorrowMoney.com inc. to unilaterally alter this arrangement. This creates an imbalance of power and uncertainty regarding the decision-making process within the joint venture. The caveat essentially undermines the initial statement of equal voting rights, as one member can override the established voting structure.
Score: 0.7509 → ❌ No Ma




📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\zero-shot-self-verification\self_consistency\misaligned_terminalogy_inText\CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEBSITEHOSTINGAGREEMENT.txt_0.json
GT (top sim): The term "Customer" is replaced with "subscriber" for one instance. This introduces ambiguity as to who is responsible for payments, creating an in-text contradiction with other locations where only 'Customer' is mentioned. The contradiction arises because it's unclear if 'Customer' and 'Subscriber' refer to the same entity.
Model: The first sentence refers to 'Customer', while the second sentence refers to 'subscriber' to define who is responsible for paying the recurring monthly fee. It is misaligned since the document does not define if they mean the same thing. This could be confusing and lead to disputes.
Score: 0.7401 → ❌ No Match

📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\

Evaluating explanations (SBERT):  68%|██████▊   | 17/25 [00:02<00:01,  7.83it/s]


📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\zero-shot-self-verification\self_consistency\misaligned_terminalogy_inText\DOMINIADVISORTRUST_02_18_2005-EX-99.(H)(2)-SPONSORSHIPAGREEMENT.txt_0.json
GT (top sim): By changing 'shall not be responsible' to 'shall also be fully responsible,' there's now a direct contradiction concerning the Sponsor's responsibility for managing the Trust's assets. The original text explicitly states the Sponsor is *not* responsible, whereas the changed text makes them *fully* responsible, creating a conflicting obligation within the same paragraph. Specifically, the introductory phrase is now in direct conflict with the 'nor shall the Sponsor be deemed to have assumed' phrase.
Model: The definition of 'Sponsor' includes Domini's affiliates and their personnel, which broadens the scope of liability limitations. While common, the term 'reckless disregard' is vague. This creates ambiguity and could lead

Evaluating explanations (SBERT):  72%|███████▏  | 18/25 [00:02<00:01,  6.96it/s]


📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\few-shot-cot\self_consistency\omissions_inText\CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEBSITEHOSTINGAGREEMENT.txt_0.json
GT (top sim): By removing the mention of the specific Microsoft SQL Server 6.5 or higher the contract, now only generally defines database server software, within allocated computer storage, according to clause number 5. This creates an in-text contradiction because the previously defined specifications are now missing, potentially leading to disputes over the type of database software to be used.
Model: While database server software is mentioned, there is a lack of specific definitions of what software is provided or the configuration and functionality. Additionally, point 11 mentions access to the Hosted Site, but no definition or restriction on the access.
Score: 0.4602 → ❌ No Match

📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark



❌ No response files found for: omissions_inText\DOMINIADVISORTRUST_02_18_2005-EX-99.(H)(2)-SPONSORSHIPAGREEMENT.txt
❌ No response files found for: omissions_inText\Freecook_20180605_S-1_EX-10.3_11233807_EX-10.3_HostingAgreement.txt
❌ No response files found for: structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt
❌ No response files found for: structural_flaws_inText\BORROWMONEYCOM,INC_06_11_2020-EX-10.1-JOINTVENTUREAGREEMENT.txt
❌ No response files found for: structural_flaws_inText\CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEBSITEHOSTINGAGREEMENT.txt

📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\zero-shot-self-verification\self_consistency\omissions_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
GT (top sim): The phrase '(It is expressly understood by the parties that CONSULTANT may play [* ****] clubs in the bag other than ADAMS GOLF clubs including, but not limited to

Evaluating explanations (SBERT): 100%|██████████| 25/25 [00:02<00:00, 15.32it/s]


📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\few-shot-cot\self_consistency\structural_flaws_inText\Freecook_20180605_S-1_EX-10.3_11233807_EX-10.3_HostingAgreement.txt_0.json
GT (top sim): This change introduces a contradiction in Section 2 regarding payment terms. The original states the remaining balance is due upon completion. The added sentence allows the client to pay anytime before completion, including in installments. This creates uncertainty around the payment schedule. This contradicts the statement that the remaining amount of $3,100 needs to be paid on completion.
Model: The original text mandates the client to pay the remaining $3,100 upon completion of the scope of work. However, the added sentence says that the client can pay the remaining at any time or in multiple installments before the completion of the scope of work. This creates contradicting requirements of when payments are due.
Score: 0.8545 → ✅ Match

📄

Evaluating explanations (SBERT): 100%|██████████| 25/25 [00:02<00:00,  8.69it/s]



📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\zero-shot-self-verification\self_consistency\omissions_inText\CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEBSITEHOSTINGAGREEMENT.txt_0.json
GT (top sim): By replacing the specific T1 line and UUNET details with a vague reference to a 'separate service level agreement (SLA),' this change creates an in-text contradiction if no SLA is actually provided or if other sections of the agreement still refer to the original connectivity specifications. This omission introduces uncertainty as the connectivity type is no longer defined in this agreement. The services provided are no longer clearly defined, which directly impacts the enforcement of the contract.
Model: The main agreement states that the specifics for connectivity are outlined in the SLA, however, the SLA is a seperate agreement and is not present within this contract. This means that there is a missing piece that is not present




📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\zero-shot-self-verification\self_consistency\omissions_inText\DOMINIADVISORTRUST_02_18_2005-EX-99.(H)(2)-SPONSORSHIPAGREEMENT.txt_0.json
GT (top sim): By removing the explicit list of duties, and specifically sections (d) overseeing regulatory compliance, (h) providing reports on compliance, and (k) developing a budget, the modified text contradicts the implication in Section 2 that the Trust has defined responsibilities regarding expenses like SEC fees and reporting costs. The omission creates uncertainty as to who is responsible for these crucial tasks. While it states that it includes 'maintaining office facilities and supervising the overall administration of the Trust.' this is ambigious in terms of responsibilites and accountabilities.
Model: While it states the Sponsor maintains facilities and supervises administration, the contract explicitly states in the next paragraph tha




📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\zero-shot-self-verification\self_consistency\omissions_inText\Freecook_20180605_S-1_EX-10.3_11233807_EX-10.3_HostingAgreement.txt_0.json
GT (top sim): By removing the standard hourly rate definition ($40), the changed text creates an in-text contradiction, due to the Hourly Rate not being defined. This will affect payment disputes, as it creates uncertainty.
Model: The paragraph does not provide a clear process for acceptance of change orders by the Company. It states that changes must be submitted and accepted, but it does not specify the criteria or time frame for the Company's acceptance. This ambiguity can lead to disputes about whether a change order has been properly accepted and whether the Company is obligated to perform the additional work.
Score: 0.2864 → ❌ No Match

📁 Directory: ambiguity_inText
Text Match (any): 15 / 15
  ├─ v1 (changed_text): 15 / 15
  └─ v2 (contradict




📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\zero-shot-self-verification\self_consistency\structural_flaws_inText\BORROWMONEYCOM,INC_06_11_2020-EX-10.1-JOINTVENTUREAGREEMENT.txt_0.json
GT (top sim): This change introduces a contradiction because Section 3 defines the exclusive purpose of the Venture as IT Development, internet Back office Maintenance And Deployment of medical Service, while the added sentence allows BorrowMoney.com, inc. to unilaterally change the purpose at any time. This contradicts the original, fixed purpose and creates uncertainty. By adding "This purpose may be changed at any time at the discretion of BorrowMoney.com, inc." a contradiction in terms appears, because first it is said that the purpose is exclusivly something, but with the change it is no longer exclusive, creating incertainty.
Model: The initial sentence defines the purpose of the joint venture as "IT Development. internet Back office Maint




📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\zero-shot-self-verification\self_consistency\structural_flaws_inText\DOMINIADVISORTRUST_02_18_2005-EX-99.(H)(2)-SPONSORSHIPAGREEMENT.txt_0.json
GT (top sim): By changing 'shall be prorated' to 'will be subject to renegotiation', the contract now introduces uncertainty about how the Sponsor's compensation is determined if they serve for less than a full year. Previously, it was clearly defined as prorated. Now, the parties must renegotiate, which could lead to a different outcome than a simple proration. The contradiction is in the determination of the fees to be paid.
Model: The first sentence establishes a clear fee structure: 0.50% of average daily net assets, accrued daily, and payable monthly. The second sentence introduces uncertainty by stating that if Domini serves for less than a full period, the compensation is 'subject to renegotiation.' This creates ambiguity. It's unclea

Evaluating explanations (SBERT): 100%|██████████| 25/25 [00:03<00:00,  7.36it/s]


📄 Evaluated: \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses_v2\zero-shot-self-verification\self_consistency\structural_flaws_inText\Freecook_20180605_S-1_EX-10.3_11233807_EX-10.3_HostingAgreement.txt_0.json
GT (top sim): This perturbation introduces a contradiction by stating that due dates are 'guaranteed' regardless of client delays, directly contradicting the original text that allowed for delays if the client did not provide materials promptly. This creates confusion and uncertainty. Section 1(b) originally states that project timelines are subject to client dependencies. The modified text now states timelines are guaranteed irrespective of client action, contradicting the original clause in the same section.
Model: This creates an inconsistency. The contract states a project timeline with specific start and end dates. However, it also asserts that all due dates are guaranteed by the Company regardless of delays caused by the client. This i





📁 Directory: ambiguity_inText
Text Match (any): 15 / 15
  ├─ v1 (changed_text): 15 / 15
  └─ v2 (contradicted_text): 7 / 15
Explanation Match: 5 / 15
Text + Explanation Match: 5 / 15

📁 Directory: inconsistencies_inText
Text Match (any): 19 / 19
  ├─ v1 (changed_text): 19 / 19
  └─ v2 (contradicted_text): 18 / 19
Explanation Match: 2 / 19
Text + Explanation Match: 2 / 19

📁 Directory: misaligned_terminalogy_inText
Text Match (any): 11 / 15
  ├─ v1 (changed_text): 11 / 15
  └─ v2 (contradicted_text): 2 / 15
Explanation Match: 1 / 15
Text + Explanation Match: 1 / 15

📁 Directory: omissions_inText
Text Match (any): 19 / 19
  ├─ v1 (changed_text): 19 / 19
  └─ v2 (contradicted_text): 7 / 19
Explanation Match: 1 / 19
Text + Explanation Match: 1 / 19

📁 Directory: structural_flaws_inText
Text Match (any): 15 / 15
  ├─ v1 (changed_text): 15 / 15
  └─ v2 (contradicted_text): 12 / 15
Explanation Match: 8 / 15
Text + Explanation Match: 8 / 15
✅ DONE


### **Analysis**

In [29]:
import pandas as pd

df = pd.DataFrame.from_dict(run_results, orient="index")
df

Unnamed: 0,ambiguity_inText,inconsistencies_inText,misaligned_terminalogy_inText,omissions_inText,structural_flaws_inText
zero-shot-cot,"{'text_matches': 15, 'text_match_v1': 15, 'tex...",,,,
few-shot-cot,"{'text_matches': 15, 'text_match_v1': 15, 'tex...","{'text_matches': 12, 'text_match_v1': 12, 'tex...","{'text_matches': 8, 'text_match_v1': 8, 'text_...","{'text_matches': 9, 'text_match_v1': 9, 'text_...","{'text_matches': 3, 'text_match_v1': 3, 'text_..."
zero-shot-self-verification,"{'text_matches': 15, 'text_match_v1': 15, 'tex...","{'text_matches': 19, 'text_match_v1': 19, 'tex...","{'text_matches': 11, 'text_match_v1': 11, 'tex...","{'text_matches': 19, 'text_match_v1': 19, 'tex...","{'text_matches': 15, 'text_match_v1': 15, 'tex..."


In [30]:
text_match_df = df.copy()
for column in text_match_df.columns:
    text_match_df[column] = text_match_df[column].apply(
        lambda x: x["text_matches"] / x["total"] if x["total"] > 0 else 0
    )
text_match_df

TypeError: 'float' object is not subscriptable

In [None]:
text_match_df = df.copy()
for column in text_match_df.columns:
    text_match_df[column] = text_match_df[column].apply(
        lambda x: x["correct"] / x["total"] if x["total"] > 0 else 0
    )
text_match_df

Unnamed: 0,ambiguity_inText
zero-shot-cot,0.2


In [None]:
def aggregate_correct_score(row):
    total = 0
    correct = 0
    for col in row.index:
        total += row[col]["total"]
        correct += row[col]["correct"]
    return correct / total if total > 0 else 0
        
# Text Match
total_score = df.copy()
total_score.apply(aggregate_correct_score, axis=1)

zero-shot-cot    0.2
dtype: float64

In [None]:
def aggregate_correct_score(row):
    total = 0
    correct = 0
    for col in row.index:
        total += row[col]["total"]
        correct += row[col]["text_matches"]
    return correct / total if total > 0 else 0
        
# Text Match
total_score = df.copy()
total_score.apply(aggregate_correct_score, axis=1)

zero-shot-cot    1.0
dtype: float64

#### Few-shot variations

## TODO 
---
- Z ✅
- Z + COT ✅
- Z + SV ✅
- Z + COT + SV ✅
- Z + SC ✅
- Z + COT + SC ✅
---
- FS ✅⚠️
- FS + COT ✅⚠️
- FS + SV ✅⚠️
- FS + COT + SV ✅⚠️
- FS + SC ✅⚠️
- FS + COT + SC ✅⚠️
---
- Z + SV + SC (SKIP THIS FOR NOW) ✅
- Z + COT + SV + SC (SKIP THIS FOR NOW) ✅
- FS + SV + SC (SKIP THIS FOR NOW) ✅⚠️
- FS + COT + SV + SC (SKIP THIS FOR NOW) ✅⚠️
---
- **Output into a .csv**❌
- **Eventually need to repeat with different LLMs**❌

# Metrics
1) `text match` but `explanation !match` = -1
2) `text match` and `explanation match` = +1
3) `text !match` and `explanation match` = -1
4) `text !match` and `explanation !match` = -1