### ⚠️ **TODO: Add structural_flaws back**

In [46]:
import os
import json
import shutil
import google.generativeai as genai
import tqdm
from collections import defaultdict
from google.api_core.exceptions import ResourceExhausted
import glob
import time

### Set up the mini-eval directory with the 'answers' (LLM-based ground truth) and 'documents' (perturbed documents without tags).


In [47]:
base_dir = 'perturbed_legal_documents'
PERTURBATION_TYPES = ['ambiguity', 'inconsistencies', 'misaligned_terminalogy', 'omission', 'structural_flaws']
CATEGORIES = ['inText', 'legal']

In [48]:
# for pt in PERTURBATION_TYPES:
#     for ct in CATEGORIES:
#         print(f"\nProcessing: {pt}_{ct}_contradiction")

#         input_dir = f'{base_dir}/{pt}_{ct}_contradiction/'
#         doc_dir = os.path.join(input_dir, 'modified_files_no_tags')

#         if not os.path.exists(input_dir):
#             print(f"Input dir not found: {input_dir}")
#             continue
#         if not os.path.exists(doc_dir):
#             print(f"Document dir not found: {doc_dir}")
#             continue

#         output_answers = f'mini-eval/answers/{pt}_{ct}_contradiction/'
#         output_documents = f'mini-eval/documents/{pt}_{ct}_contradiction/'

#         # Check if outputs already exist and contain at least 5 files
#         if (os.path.exists(output_answers) and len(os.listdir(output_answers)) >= 5 and
#             os.path.exists(output_documents) and len(os.listdir(output_documents)) >= 5):
#             print(f"Skipping {pt}_{ct}_contradiction — already processed.")
#             continue

#         os.makedirs(output_answers, exist_ok=True)
#         os.makedirs(output_documents, exist_ok=True)

#         # Collect all valid json->txt pairs
#         json_files = sorted([f for f in os.listdir(input_dir) if f.endswith('.json')])
#         print(f"🔎 Found {len(json_files)} JSON files")

#         valid_pairs = []

#         for json_file in json_files:
#             if not json_file.startswith("perturbed_") or not json_file.endswith(".pdf.json"):
#                 print(f"  ⚠️ Skipping incorrectly named file: {json_file}")
#                 continue

#             base_name = json_file[len("perturbed_"):-len(".pdf.json")]
#             txt_file = f"modified_{base_name}.pdf.txt"
#             txt_path = os.path.join(doc_dir, txt_file)

#             if os.path.exists(txt_path):
#                 valid_pairs.append((json_file, txt_file))
#                 print(f"  ✅ Matched: {json_file} <-> {txt_file}")
#             else:
#                 print(f"  ❌ Missing TXT: {txt_file}")

#             if len(valid_pairs) == 5:
#                 break

#         if not valid_pairs:
#             print("Can't find corresponding files????")
#             continue

#         # Copy matched pairs
#         for json_file, txt_file in valid_pairs:
#             src_json = os.path.join(input_dir, json_file)
#             dst_json = os.path.join(output_answers, json_file)

#             src_txt = os.path.join(doc_dir, txt_file)
#             dst_txt = os.path.join(output_documents, txt_file)

#             shutil.copy(src_json, dst_json)
#             shutil.copy(src_txt, dst_txt)
#             print(f"  📁 Copied: {json_file} and {txt_file}")

In [49]:
API_KEYS = [
    "AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k", # Aditya
    "AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE", # Foo
    "AIzaSyAH4zpotMPNF-GlGYmMMAi6ZoCte5b95Hk", # Ezra
    "AIzaSyAGHtD2RAI1geToBsVjk-mIzVeuhlZQtA4", # Noel
    "AIzaSyBTYgTD42xCABfJy1jsHchkZEhFaw8X1_c", # Mannan
]

In [50]:
# os.environ["GOOGLE_API_KEY"] = "AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k"
# API_KEY = os.getenv("GOOGLE_API_KEY")
# genai.configure(api_key=API_KEY)

## Datasets

In [51]:
from abc import ABC, abstractmethod


class Dataset(ABC):
    @abstractmethod
    def __len__(self):
        pass

    @abstractmethod
    def __getitem__(self, idx):
        pass


class MiniEvalDataset(Dataset):

    def __init__(self):

        self.mini_eval_dir = "mini-eval"

        self.mini_eval_answers_dir = os.path.join(self.mini_eval_dir, "answers")

        self.mini_eval_documents_dir = os.path.join(self.mini_eval_dir, "documents")

        self.files = [

            os.path.relpath(
                os.path.join(root, file), self.mini_eval_answers_dir
            ).replace(".json", "")

            for root, _, files in os.walk(self.mini_eval_answers_dir)
            for file in files

        ]
        self.files.sort()


    def __len__(self):

        return len(self.files)


    def __getitem__(self, idx):

        with open(
            os.path.join(self.mini_eval_answers_dir, self.files[idx] + ".json"),
            "r",
            encoding="utf-8",
        ) as f:

            answers = "\n".join(f.readlines())

            answers = self.__remove_non_ascii(answers)

            answers = json.loads(answers)


        with open(
            os.path.join(self.mini_eval_documents_dir, self.files[idx] + ".txt"),
            "r",
            encoding="utf-8",
        ) as f:

            documents = "\n".join(f.readlines())

            documents = self.__remove_non_ascii(documents)


        return {

            "file_name": self.files[idx],

            "answers": answers,

            "documents": documents,

        }


    def __remove_non_ascii(self, s):

        return "".join(filter(lambda x: ord(x) < 128, s))

## Model

In [52]:
from abc import ABC, abstractmethod


class Model(ABC):
    @abstractmethod
    def generate(self, prompt):
        pass


# class GeminiModel(Model):
#     def __init__(self):
#         self.model = genai.GenerativeModel("gemini-2.0-flash")

#     def generate(self, prompt):
#         response = self.model.generate_content(prompt)
#         return response.to_dict()["candidates"][0]["content"]["parts"][0]["text"]
    
# New version with API key cycling
class GeminiModel(Model):
    def __init__(self, api_keys):
        self.api_keys = api_keys
        self.key_index = 0
        self._set_key(self.api_keys[self.key_index])
    
    def _set_key(self, key):
        os.environ["GOOGLE_API_KEY"] = key
        genai.configure(api_key=key)
        self.model = genai.GenerativeModel("gemini-2.0-flash")

    def generate(self, prompt, max_retries=5):
        for attempt in range(max_retries):
            try:
                response = self.model.generate_content(prompt)
                return response.to_dict()["candidates"][0]["content"]["parts"][0]["text"]
            except ResourceExhausted:
                print(f"⚠️ API key {self.api_keys[self.key_index]} exhausted. Switching...")
                self.key_index = (self.key_index + 1) % len(self.api_keys)
                self._set_key(self.api_keys[self.key_index])
        print("❌ All keys exhausted or failed.")
        return ""

## Prompting Methods
These ones take in a base model and does some prompting stuff with it.

In [53]:
class SelfVerificationModel(Model):
    def __init__(self, model: Model):
        self.model = model

    def generate(self, prompt):

        failed = True

        while failed:
            print("💡 Asking questions")
            response = self.model.generate(prompt)
            is_model_sure_response = self.model.generate(
                f"You are a grader. Verify if the following response to the question is correct. If the answer is correct, say yes. Otherwise, say no.\nQuestion: {prompt}\nAnswer: {response}"
            )

            print("🤖 Model response:", response)
            print("🤓 Model sure response:", is_model_sure_response)

            if "yes" in is_model_sure_response.lower():
                print("✅ Model is sure about the answer.")
                failed = False
            else:
                print("❌ Model is not sure. Retrying...")


        return response

You retrieve elements in each dataset like this:

In [54]:
dataset = MiniEvalDataset()
display(dataset[0]["answers"], dataset[0]["documents"])


[{'file_name': '2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt',
  'perturbation': [{'type': 'Ambiguities - In Text Contradiction',
    'original_text': '(c) "CUSTOMERS" means all users who access Co-Branded Site.',
    'changed_text': '(c) "CUSTOMERS" means all users who access Co-Branded Site and complete at least one transaction per month.',
    'explanation': "The original definition of 'Customers' is broad, encompassing all users of the Co-Branded Site. The modified definition adds a requirement of completing at least one transaction per month, creating a narrower and conflicting definition. This ambiguity could lead to disputes regarding who qualifies as a 'Customer' for purposes of marketing reports, promotional discounts, or other benefits.",
    'location': '1(c)'},
   {'type': 'Ambiguities - In Text Contradiction',
    'original_text': '8.1 TERM.  The term of this Agreement shall continue for one (1) year following the Launch

'CO-BRANDING AND ADVERTISING AGREEMENT THIS CO-BRANDING AND ADVERTISING AGREEMENT (the "Agreement") is made as of June 21, 1999 (the "Effective Date") by and between I-ESCROW, INC., with its principal place of business at 1730 S. Amphlett Blvd., Suite 233, San Mateo, California 94402 ("i-Escrow"), and 2THEMART.COM, INC. having its principal place of business at 18301 Von Karman Avenue, 7th Floor, Irvine, California 92612 ("2TheMart"). 1. DEFINITIONS. (a) "CONTENT" means all content or information, in any medium, provided by a party to the other party for use in conjunction with the performance of its obligations hereunder, including without limitation any text, music, sound, photographs, video, graphics, data or software. Content provided by 2TheMart is referred to herein as "2TheMart Content" and Content provided by i-Escrow is referred to herein as "i-Escrow Content." (b) "CO-BRANDED SITE" means the web-site accessible through Domain Name, for the Services implemented by i-Escrow. Th

**You check the length like this:**

In [55]:
len(dataset)
print(dataset[5]["file_name"])

inconsistencies_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt


**Helper functions:**

In [56]:
def clean_and_parse_model_response(raw_response):
    raw_response = raw_response.strip().strip("`")
    if raw_response.startswith("json"):
        raw_response = raw_response[4:].strip()

    try:
        parsed = json.loads(raw_response)
    except json.JSONDecodeError as e:
        print("Failed to parse JSON:", e)
        return None

    return parsed


def add_section_identified_flag(predictions, ground_truth_perturbations):
    gt_locations = {p["location"].strip() for p in ground_truth_perturbations}
    gt_changed_texts = [p["changed_text"] for p in ground_truth_perturbations]

    for pred in predictions:
        # LOCATION MATCH
        pred_loc = pred.get("location", "").strip()
        pred["location_match"] = pred_loc in gt_locations

        # TEXT MATCH (check if model's reponse for 'section' matches what was perturbed)
        pred_section = pred.get("section", "").strip()
        pred["text_match"] = any(pred_section in gt_text or gt_text in pred_section for gt_text in gt_changed_texts)

    return predictions

### Implementation of `generate_responses`

In [57]:
def generate_responses(model, dataset, prompt: str, output_dir, num_responses: int = 1):
    for sample in tqdm.tqdm(dataset, desc="Processing samples"):
        # Prepare base directory and document text
        base_name = sample["file_name"]
        document_with_tags_removed = sample["documents"].replace("<*$p$*>", "") 
        ground_truth = sample["answers"][0]["perturbation"]

        for i in range(num_responses):
            # Construct output path: outputs/self_consistency/<subdir>/<filename>_i.json
            subdir = os.path.join(output_dir, "self_consistency", os.path.dirname(base_name))
            os.makedirs(subdir, exist_ok=True)
            output_path = os.path.join(subdir, os.path.basename(base_name) + f"_{i}.json")

            # Skip if file already exists
            if os.path.exists(output_path):
                continue

            # Generate model response
            model_response = model.generate(
                prompt.replace("[DOCUMENT]", document_with_tags_removed)
            )
            parsed_response = clean_and_parse_model_response(model_response)

            if parsed_response:
                updated_predictions = add_section_identified_flag(parsed_response, ground_truth)
                with open(output_path, "w", encoding="utf-8") as f:
                    json.dump(updated_predictions, f, indent=4)


### Implementation of `explanation_match`

In [58]:
def explanation_match(evaluation_model: Model, dataset, responses_dir):
    for sample in tqdm.tqdm(dataset, desc="Evaluating explanations"):
        file_name = sample["file_name"]
        
        # Normalize and split into subdir + base filename (fixes Windows paths)
        normalized_path = os.path.normpath(file_name)
        subdir = os.path.dirname(normalized_path).replace("\\", "/")
        base_filename = os.path.basename(normalized_path).replace(".json", "")

        # Match all _i.json variant files for this sample
        pattern = os.path.join(responses_dir, "self_consistency", subdir, f"{base_filename}_*.json")
        response_paths = sorted(glob.glob(pattern))

        if not response_paths:
            print(f"❌ No response files found for: {file_name}")
            continue

        # Extract GT explanations
        gt_explanations = [
            p["explanation"].strip()
            for p in sample["answers"][0]["perturbation"]
            if "explanation" in p
        ]

        for response_path in response_paths:
            with open(response_path, "r", encoding="utf-8") as f:
                try:
                    model_preds = json.load(f)
                except json.JSONDecodeError as e:
                    print(f"❌ JSON decode error in {response_path}: {e}")
                    continue

            updated = False
            for pred in model_preds:
                if "explanation_match" in pred:
                    continue

                model_exp = pred.get("explanation", "").strip()
                if not model_exp:
                    pred["explanation_match"] = False
                    updated = True
                    continue

                match_found = False
                for gt_exp in gt_explanations:
                    prompt = f"""
You are evaluating whether the following model explanation captures the **same core reasoning** as the human (ground truth) explanation.

Ground Truth Explanation:
"{gt_exp}"

Model Explanation:
"{model_exp}"

Does the model explanation capture the same core reasoning as the ground truth explanation, even if phrased differently?

Answer "yes" or "no" only.
                    """.strip()

                    print(f"\n📄 Evaluating: {response_path}")
                    print(f"GT: {gt_exp}")
                    print(f"Model: {model_exp}")

                    try:
                        response = evaluation_model.generate(prompt)
                        result_text = response.strip().lower()
                        print(f"LLM response: {result_text}")

                        if "yes" in result_text:
                            match_found = True
                            break

                    except ResourceExhausted as e:
                        print(f"⚠️ Rate limit hit: {e}")
                        print("⏳ Sleeping for 40 seconds...")
                        time.sleep(40)
                        continue

                    except Exception as e:
                        print(f"⚠️ Unexpected error: {e}")
                        break

                    time.sleep(1.5)

                pred["explanation_match"] = match_found
                updated = True

            if updated:
                with open(response_path, "w", encoding="utf-8") as f:
                    json.dump(model_preds, f, indent=4)
                print(f"✅ Updated explanation_match in: {response_path}")
            else:
                print(f"⚠️ Skipped (no update needed): {response_path}")

## `evaluate_scoring`

In [59]:
def evaluate_scoring(responses_dir):
    scores = defaultdict(lambda: {
        "total": 0,
        "correct": 0,
        "text_matches": 0,
        "explanation_matches": 0
    })

    for root, _, files in os.walk(responses_dir):
        if not files:
            continue

        # Get the name of the subdirectory (e.g., "ambiguity")
        subdir = os.path.basename(root)

        for file in files:
            if not file.endswith(".json"):
                continue

            file_path = os.path.join(root, file)
            # print(file_path)
            with open(file_path, "r", encoding="utf-8") as f:
                try:
                    predictions = json.load(f)
                except json.JSONDecodeError:
                    print(f"❌ Skipping malformed JSON: {file_path}")
                    continue

            for pred in predictions:
                if not isinstance(pred, dict):
                    continue
                if "text_match" in pred and "explanation_match" in pred:
                    scores[subdir]["total"] += 1
                    if pred["text_match"] and pred["explanation_match"]:
                        scores[subdir]["correct"] += 1
                    if pred["text_match"]:
                        scores[subdir]["text_matches"] += 1
                    if pred["explanation_match"]:
                        scores[subdir]["explanation_matches"] += 1

    for subdir, stats in scores.items():
        total = stats["total"]
        if total == 0:
            continue
        print(f"\n📁 Directory: {subdir}")
        print(f"Text Match: {stats['text_matches']} / {total}")
        print(f"Explanation Match: {stats['explanation_matches']} / {total}")
        print(f"Text + Explanation Match: {stats['correct']} / {total}")

In [60]:
def run(
    model: Model,
    dataset: Dataset,
    prompt: str,
    responses_dir: str,
    num_responses: int,
    evaluation_model: Model = None
):
    """
    Runs the evaluation process.
    :param model: The model to generate responses.
    :param dataset: The dataset to evaluate.
    :param prompt: The prompt to use for generating responses.
    :param responses_dir: Directory to save the responses.
    :param num_responses: The number of responses to collect per document (for self-consistency)
    :param evaluation_model: Model for evaluating model responses.
    """
    generate_responses(model, dataset, prompt, responses_dir, num_responses)
    explanation_match(evaluation_model, dataset, responses_dir)
    evaluate_scoring(responses_dir)

#### Base Instruction

In [61]:
INSTRUCTIONS = """You are a legal contract expert and know how to check legal documents properly and find any discrepancies or contradictions within a file. You are also aware of all state and national laws when it comes to legal docuements.
The file is a legal document and you are to check for any discrepancies or contradictions within the file.
There are 10 categories when it comes to discrepancies or contradictions:
1. Ambiguity in text - Ambiguities in text occur when key terms are **inconsistently defined within the document itself**, creating internal contradictions. This type of **in-text contradiction** confuses contract enforcement by allowing multiple interpretations of the same term in different sections, leading to potential legal disputes over meaning.
2. Ambiguity in legal terms - Ambiguities in legal terms occur when a legal statement is vague, leading to multiple interpretations. A **legal contradiction** under this category happens when an obligation is introduced ambiguously, making it difficult to enforce under state or national law. This can result in non-compliance with regulatory requirements, leaving legal obligations open to dispute.
3. Inconsistencies in text - Inconsistencies in text also lead to **in-text contradictions** when **different sections of a contract provide conflicting deadlines, obligations, or penalties**. This creates ambiguity regarding which terms should be enforced, leading to disputes over contractual obligations.
4. Inconsistencies in legal terms - Inconsistencies in legal terms arise when **time-sensitive obligations** in a contract do not align with legal requirements. A **legal contradiction** in this category happens when a contract sets **a deadline or requirement that violates federal or state law**, making the contractual terms unenforceable or illegal.
5. Misaligned in text - Misaligned terminology leads to **in-text contradictions** when the contract **uses multiple terms interchangeably without defining them**, leading to conflicting obligations.
6. Misaligned in legal terms - Inconsistencies arise when **time-sensitive obligations** in a contract do not align with legal requirements. A **legal contradiction** in this category happens when a contract sets **a deadline or requirement that violates federal or state law**, making the contractual terms unenforceable or illegal.
7. Omission in text - Omissions also cause **in-text contradictions** when a **key contractual clause is removed**, but **other sections still reference it**, creating an internal contradiction.
8. Omission in legal terms - Omissions occur when a contract **removes essential information**, creating legal loopholes. A **legal contradiction** in this category happens when a contract omits **a legally mandated consumer protection**, making it non-compliant.
9. Structural Flaws in text - this means that the text is not structured properly and does not make sense.
10. Structural Flaws in legal terms - this means that the legal terms used in the text are not structured properly and do not make sense.

Instructions:
1. Read the file and check for any discrepancies or contradictions within the file.
2. Provide a detailed explanation of why this is a discrepancy or contradiction.
3. Provide the section where the discrepancy or contradiction exists.
4. Provide the section location. Like Section 5.4.                                    
5. Categorize the discrepancy or contradiction into one of the 10 categories above (return the number of the category).
There are 2-3 contradictions in each text.

Return the results in json format. Example:
[{
    "section": "Sponsor shall pay Club the Annual Fee for each Contract Year of this Agreement in six (6) equal installments, each\ndue on or prior to the 1st of each month between June and November of the applicable Contract Year."
    "explanation": "This change introduces a contradiction regarding the payment deadline. Section 3(a) states that all installments are due by November 1st, but the added sentence allows the final payment to be made as late as December 15th without penalty. This creates ambiguity as to the actual deadline for the final installment and whether late fees would apply between November 2nd and December 15th."
    "location": "Section 5.2"
    "category": 3
}]
"""

#### Chain-of-thought Prompt

In [62]:
COT = "Make your explanations as detailed as possible and show your reasoning."

### **Zero-shot prompt**

In [63]:
zero_shot_prompt = f"""{INSTRUCTIONS}
This is the document:
[DOCUMENT]
"""

### **Few-shot prompt**

#### ⚠️ **TODO: Describe the few-shot**
- Feed 1 entire document with tokens to the LLM for the few-shot. Describe that what's enclosed in the token is perturbed text.
- Show the correct un-perturbed text. Not the doc.
- Keep in mind each perturbed doc has 3 tokened parts.
- Keep it category specific.

### Prompt design:
- Keep the `INSTRUCTION` variable
- Feed the whole [Perturbed Document] Tell the LLM the text enclosed in the <*$p$*><*$p$*> tokens are the perturbed parts
- [The the line Original Unperturbed Document] - Take it from the .json
- Then feed it the test document `[DOCUMENT]` and tell it to answer
- Keep it category specific

In [64]:
few_shot_prompt = INSTRUCTIONS + """

Question:
Section 3.1 Payment Terms: The Client agrees to pay the Consultant a flat fee of $5,000 for services rendered, payable in full within 30 days of the invoice date. Late payments will incur a penalty of 1.5% per month, starting 15 days after the due date.
Section 5.2 Termination: Either party may terminate this Agreement upon 30 days’ written notice. If the Agreement is terminated, the Consultant is entitled to payment for all services performed up to the termination date, provided that payment is made within 15 days of termination.
Section 7.4 Governing Law: This Agreement shall be governed by and construed in accordance with the laws of the State of New York, excluding its conflict of laws principles.
Section 8.3 Confidentiality: All confidential information shared under this Agreement shall remain confidential for a period of one year from the date of termination. However, this obligation shall remain in effect indefinitely with respect to trade secrets as defined by federal law.

Answer:
[
  {
    "section": "The Client agrees to pay the Consultant a flat fee of $5,000 for services rendered, payable in full within 30 days of the invoice date. Late payments will incur a penalty of 1.5% per month, starting 15 days after the due date.",
    "explanation": "There is a contradiction in payment terms. The text says payment is due in 30 days, but the penalty begins only after 15 days beyond the due date. This creates ambiguity about when the penalty period starts and could confuse enforcement.",
    "location": "Section 3.1",
    "category": 3
  },
  {
    "section": "If the Agreement is terminated, the Consultant is entitled to payment for all services performed up to the termination date, provided that payment is made within 15 days of termination.",
    "explanation": "This clause contradicts Section 3.1, which states payment is due within 30 days of the invoice. It's unclear whether termination changes the timeline from 30 to 15 days, leading to inconsistency in payment obligations.",
    "location": "Section 5.2",
    "category": 3
  },
  {
    "section": "This Agreement shall be governed by and construed in accordance with the laws of the State of New York, excluding its conflict of laws principles.",
    "explanation": "The phrase 'excluding its conflict of laws principles' is ambiguous. It is unclear whether this excludes only procedural conflict of law doctrines or also affects substantive rights, which could cause interpretation issues.",
    "location": "Section 7.4",
    "category": 2
  }
]

Question:
Section 2.1 Services Provided: The Contractor shall provide web development services including front-end and back-end development. All services shall be completed no later than September 30, 2024.
Section 4.3 Payment Schedule: The Client agrees to pay the Contractor $10,000 in two equal installments. The first installment is due upon signing, and the second installment is due upon completion of services.
Section 6.2 Indemnification: The Contractor agrees to indemnify and hold harmless the Client from any and all claims, liabilities, damages, and expenses arising from the Contractor’s breach of this Agreement, except in cases where such claims result from the Client’s gross negligence or willful misconduct.
Section 9.1 Force Majeure: Neither party shall be liable for delays or failures in performance due to causes beyond their control, including natural disasters, war, or internet outages.

Answer:
[
  {
    "section": "All services shall be completed no later than September 30, 2024.",
    "explanation": "This deadline is rigid, but Section 9.1 introduces a Force Majeure clause that excuses delays due to events beyond control, including internet outages. This creates ambiguity around whether a delay past September 30 is acceptable under Force Majeure.",
    "location": "Section 2.1",
    "category": 1
  },
  {
    "section": "The Client agrees to pay the Contractor $10,000 in two equal installments. The first installment is due upon signing, and the second installment is due upon completion of services.",
    "explanation": "There is an omission in the event of early termination. It does not address whether any part of the second installment is owed if the agreement is terminated early, which is critical given Section 2.1's strict deadline.",
    "location": "Section 4.3",
    "category": 7
  },
  {
    "section": "The Contractor agrees to indemnify and hold harmless the Client from any and all claims, liabilities, damages, and expenses arising from the Contractor’s breach of this Agreement, except in cases where such claims result from the Client’s gross negligence or willful misconduct.",
    "explanation": "The phrase 'any and all claims' is overly broad and may conflict with state limitations on indemnification. In some jurisdictions, such blanket indemnification may be unenforceable without clearer limits.",
    "location": "Section 6.2",
    "category": 6
  }
]

Question: [DOCUMENT]
Answer:
"""

#### Zero-shot variations

In [65]:
# import os
# import threading



# Corrects path name such that it ignores path length limit and formats based on your OS definition
def correct_path_name(path):
    return r"\\?\{}".format(os.path.abspath(path))



# # Z | Z + SC
# run(

#     model=GeminiModel(API_KEYS),

#     dataset=MiniEvalDataset(),

#     prompt=zero_shot_prompt,

#     responses_dir=correct_path_name("mini-eval/responses/zero-shot/"),

#     num_responses=1,

#     evaluation_model=GeminiModel(API_KEYS),
# )


# # Z + COT | Z + COT + SC

# run(

#     model=GeminiModel(API_KEYS),

#     dataset=MiniEvalDataset(),

#     prompt=zero_shot_prompt + COT,

#     responses_dir=correct_path_name("mini-eval/responses/zero-shot-cot/"),

#     num_responses=1,

#     evaluation_model=GeminiModel(API_KEYS),
# )


# # Z + SV | Z + SV + SC

# run(

#     model=SelfVerificationModel(GeminiModel(API_KEYS)),

#     dataset=MiniEvalDataset(),

#     prompt=zero_shot_prompt,

#     responses_dir=correct_path_name("mini-eval/responses/zero-shot-self-verification/"),

#     num_responses=1,

#     evaluation_model=GeminiModel(API_KEYS),
# )


# # Z + COT + SV | Z + COT + SV + SC

# run(

#     model=SelfVerificationModel(GeminiModel(API_KEYS)),

#     dataset=MiniEvalDataset(),

#     prompt=zero_shot_prompt + COT,
#     responses_dir=correct_path_name(
#         "mini-eval/responses/zero-shot-self-verification-cot/"
#     ),

#     num_responses=1,

#     evaluation_model=GeminiModel(API_KEYS),
# )

In [66]:
runs = [
    {
        "model": GeminiModel(API_KEYS),
        "dataset": MiniEvalDataset(),
        "prompt": zero_shot_prompt,
        "responses_dir": correct_path_name("mini-eval/responses/zero-shot/"),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "model": GeminiModel(API_KEYS),
        "dataset": MiniEvalDataset(),
        "prompt": zero_shot_prompt + COT,
        "responses_dir": correct_path_name("mini-eval/responses/zero-shot-cot/"),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "model": SelfVerificationModel(GeminiModel(API_KEYS)),
        "dataset": MiniEvalDataset(),
        "prompt": zero_shot_prompt,
        "responses_dir": correct_path_name("mini-eval/responses/zero-shot-self-verification/"),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "model": SelfVerificationModel(GeminiModel(API_KEYS)),
        "dataset": MiniEvalDataset(),
        "prompt": zero_shot_prompt + COT,
        "responses_dir": correct_path_name("mini-eval/responses/zero-shot-self-verification-cot/"),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "model": GeminiModel(API_KEYS),
        "dataset": MiniEvalDataset(),
        "prompt": few_shot_prompt,
        "responses_dir": correct_path_name("mini-eval/responses/few-shot/"),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "model": GeminiModel(API_KEYS),
        "dataset": MiniEvalDataset(),
        "prompt": few_shot_prompt + COT,
        "responses_dir": correct_path_name("mini-eval/responses/few-shot-cot/"),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "model": SelfVerificationModel(GeminiModel(API_KEYS)),
        "dataset": MiniEvalDataset(),
        "prompt": few_shot_prompt,
        "responses_dir": correct_path_name("mini-eval/responses/few-shot-self-verification/"),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "model": SelfVerificationModel(GeminiModel(API_KEYS)),
        "dataset": MiniEvalDataset(),
        "prompt": few_shot_prompt + COT,
        "responses_dir": correct_path_name("mini-eval/responses/few-shot-self-verification-cot/"),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
]

import threading
from concurrent.futures import ThreadPoolExecutor

# Silence stdout and stderr
import sys
import os
import contextlib
import io

@contextlib.contextmanager
def suppress_output():
    with open(os.devnull, 'w') as fnull:
        with contextlib.redirect_stdout(fnull), contextlib.redirect_stderr(fnull):
            yield

# Semaphore to limit the number of concurrent threads to the number of API keys
api_key_semaphore = threading.Semaphore(len(API_KEYS))

def run_with_semaphore(run_config):
    """
    Wrapper function to run a task while respecting the semaphore.
    """
    with api_key_semaphore:
        run(
            model=run_config["model"],
            dataset=run_config["dataset"],
            prompt=run_config["prompt"],
            responses_dir=run_config["responses_dir"],
            num_responses=run_config["num_responses"],
            evaluation_model=run_config["evaluation_model"],
        )

with ThreadPoolExecutor(max_workers=len(API_KEYS)) as executor:
    for run_config in runs:
        executor.submit(run_with_semaphore, run_config)

print("✅ DONE")

Processing samples:   0%|          | 0/25 [00:00<?, ?it/s]
[A


[A[A[A

[A[A
Processing samples:  12%|█▏        | 3/25 [00:00<00:00, 25.03it/s]


[A[A[A

Processing samples:  24%|██▍       | 6/25 [00:00<00:01, 15.67it/s]


[A[A[A
[A

[A[A
Processing samples:  32%|███▏      | 8/25 [00:00<00:01, 14.37it/s]


[A[A[A

[A[A


Processing samples:  40%|████      | 10/25 [00:00<00:01, 13.60it/s]
[A

[A[A
Processing samples:  48%|████▊     | 12/25 [00:00<00:00, 13.93it/s]


[A[A[A

[A[A
Processing samples:  56%|█████▌    | 14/25 [00:01<00:00, 13.48it/s]


[A[A[A

[A[A
Processing samples:  64%|██████▍   | 16/25 [00:01<00:00, 13.33it/s]


[A[A[A

[A[A


Processing samples:  72%|███████▏  | 18/25 [00:01<00:00, 14.72it/s]
[A

[A[A


[A[A[A
Processing samples:  80%|████████  | 20/25 [00:01<00:00, 12.15it/s]

💡 Asking questions💡 Asking questions

🤖 Model response: ```json
[
  {
    "section": "5.1 ADVERTISING FEES. After the Launch Date, i-Escrow shall pay 2TheMart advertising fees based on the number of Transaction Inquiries. These advertising fees shall consist of a per Transaction Inquiry amount calculated at i-Escrow's sole discretion. The formula for arriving at the per Transaction Inquiry amount may be revised from time to time during the term of this Agreement to reflect present market conditions at i-Escrow's discretion. i-Escrow may adjust the rate as it sees fit without needing consent.",
    "explanation": "This section allows i-Escrow to unilaterally determine and revise the advertising fees paid to 2TheMart, effectively removing any negotiating power from 2TheMart. The statement 'i-Escrow may adjust the rate as it sees fit without needing consent' makes the agreement one-sided and potentially unfair, as 2TheMart has no control over its compensation. This could be seen as a stru



[A[A

🤖 Model response: ```json
[
  {
    "section": "5.1 ADVERTISING FEES. After the Launch Date, i-Escrow shall pay 2TheMart advertising fees based on the number of Transaction Inquiries. These advertising fees shall consist of a per Transaction Inquiry amount calculated at i-Escrow's sole discretion. The formula for arriving at the per Transaction Inquiry amount may be revised from time to time during the term of this Agreement to reflect present market conditions at i-Escrow's discretion. i-Escrow may adjust the rate as it sees fit without needing consent.",
    "explanation": "This section gives i-Escrow the sole discretion to determine and revise the advertising fees, potentially to the detriment of 2TheMart. It states that i-Escrow can adjust the rate \"as it sees fit without needing consent\". This creates an imbalance in the agreement, as 2TheMart is obligated to promote the services but has no control over the compensation they receive, which can be changed unilaterally by i-Escrow




Processing samples:  88%|████████▊ | 22/25 [00:08<00:03,  1.08s/it]
[A

🤖 Model response: ```json
[
  {
    "section": "For such time as any employees of Provider are providing the Services to Recipient under this Agreement, (a) such employees will remain employees of Provider unless otherwise agreed upon in writing, and may under certain circumstances, be deemed employees of Recipient for specific purposes as determined by Recipient, and (b) while Provider generally remains responsible for the payment of wages, Recipient may, at its discretion, contribute to or assume responsibility for certain wages, bonuses, commissions, or employee benefits, with terms to be negotiated separately for each individual case. The responsibility for withholding and payment of applicable taxes will be determined on a case-by-case basis.",
    "explanation": "This section creates ambiguity regarding the employment status of the Provider's employees. It states they remain employees of the Provider, but also indicates they 'may... be deemed employees of Recipient for specific p



⚠️ API key AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k exhausted. Switching...





Processing samples:  92%|█████████▏| 23/25 [00:11<00:02,  1.46s/it]

⚠️ API key AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k exhausted. Switching...
🤖 Model response: ```json
[
  {
    "section": "3.2 Each of the Recipient and the Provider may, in their sole discretion, terminate this Agreement in whole or in part, at any time without cause. However, termination by either party is subject to a termination fee of [ * * * ], and liability may arise based on a review of outstanding commitments. To do so, providing at least 90 (ninety) days' prior written notice to the other party (such date, the \"Services Termination Date\").",
    "explanation": "This section presents an inconsistency. It states that either party can terminate the agreement at any time without cause, but then immediately adds a requirement of providing at least 90 days' prior written notice. The phrase \"at any time\" implies immediate termination, which contradicts the 90-day notice period. This creates ambiguity about the actual conditions required for termination. While a termination fee i



[A[A

💡 Asking questions
⚠️ API key AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k exhausted. Switching...
⚠️ API key AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k exhausted. Switching...
⚠️ API key AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k exhausted. Switching...



[A

🤖 Model response: ```json
[
  {
    "section": "The Joint Venturers shall be required to make capital contributions to the Joint Venture as determined necessary by either Joint Venturer, regardless of mutual consent.",
    "explanation": "This statement implies that one Joint Venturer can unilaterally demand capital contributions from the other, irrespective of whether the other agrees. This contradicts the principle of mutual agreement typically required in joint ventures for significant financial decisions. It creates ambiguity about how capital contributions are determined and enforced, especially if one party disagrees with the necessity or amount.",
    "location": "Section 3",
    "category": 1
  },
  {
    "section": "The Joint Venturers shall be required to make capital contributions to the Joint Venture as determined necessary by either Joint Venturer, regardless of mutual consent.",
    "explanation": "The requirement for capital contributions determined necessary by *either*




Processing samples:  96%|█████████▌| 24/25 [00:15<00:01,  1.98s/it]

[A[A

🤖 Model response: ```json
[
  {
    "section": "The Joint Venturers shall be required to make capital contributions to the Joint Venture as determined necessary by either Joint Venturer, regardless of mutual consent.",
    "explanation": "This clause states that either joint venturer can unilaterally determine the necessity and amount of capital contributions from both parties, regardless of the other's consent. This contradicts the fundamental principle of a joint venture, which typically requires mutual agreement on significant financial matters. It gives one party unchecked power over the other's financial obligations to the venture.",
    "location": "Section 3",
    "category": 3
  },
  {
    "section": "During the term of the Joint Venture, no interest shall be allowed to any Joint Venturer upon the amount of his contribution. A Joint Venturer may unilaterally withdraw funds or property from the Joint Venture at any time, regardless of the other Joint Venturer's consent. However,




Processing samples: 100%|██████████| 25/25 [00:17<00:00,  1.42it/s]



[A[A[A


[A[A[A

⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot\self_consistency\ambiguity_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot\self_consistency\ambiguity_inText\ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINTVENTUREAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot\self_consistency\a




[A[A[A

⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot\self_consistency\misaligned_terminalogy_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot\self_consistency\misaligned_terminalogy_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot\self_consistency\omissions_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot\self_consis

Processing samples: 100%|██████████| 25/25 [00:18<00:00,  1.35it/s]
Evaluating explanations:  64%|██████▍   | 16/25 [00:00<00:00, 70.44it/s]

⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_1.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_2.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy



[A[A


📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot\self_consistency\structural_flaws_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
GT: The original text mandates that Confidential Information is marked as such, either in writing or orally. The modified text removes this requirement, making all information confidential by default. Furthermore, original text prevents parties from disclosing such Information to third parties. The modified text now states that the parties are able to disclose such information creating uncertainty and contradiction.
Model: This statement contradicts the general principle of good faith and fair dealing expected in contractual relationships. Allowing i-Escrow to unilaterally change the per-transaction inquiry rate without 2TheMart's consent creates an imbalance of power and potentially undermines the purpo


[A

🤖 Model response: ```json
[
  {
    "section": "1.1. \"Accuray Regions\" means Accuray's sales regions (as of the Effective Date) of the Americas (North America and South America), APAC (Asia Pacific, including Australia and other than India and Japan), EIMEA (Europe, India, Middle East, and Africa), and Japan. For the avoidance of doubt, 'Accuray Regions' shall only refer to North America.",
    "explanation": "The definition of \"Accuray Regions\" initially includes multiple regions (Americas, APAC, EIMEA, and Japan), but then it's immediately restricted to only North America. This creates an internal contradiction, as the initial definition is negated by the subsequent clarification.",
    "location": "Section 1.1",
    "category": 1
  },
  {
    "section": "2.3.3. Purchase. To purchase Products or Services based on a Quote provided by Accuray, Distributor will issue a purchase order, which shall include specific references to the quote number of such Quote (the \"Purchase Order\").

Processing samples: 100%|██████████| 25/25 [00:23<00:00,  1.04it/s]


⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_1.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_2.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document



Processing samples: 100%|██████████| 25/25 [00:24<00:00,  3.05s/it]

⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\misaligned_terminalogy_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\misaligned_terminalogy_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_1.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\misaligned_terminalogy_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_2.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\misal

Processing samples: 100%|██████████| 25/25 [00:24<00:00,  1.03it/s]


🤖 Model response: ```json
[
  {
    "section": "TERM OF CONTRACT The Term of this Agreement shall be for a period of [* ****] years and [*****] months commencing the 1st day of September 2004 and terminating the [*****] day of [*****].",
    "explanation": "The contract states that the term begins on September 1, 2004, but the specific end date is redacted. This omission makes it impossible to determine the exact duration of the agreement, creating uncertainty about the parties' obligations and the overall term of the contract. This lack of a defined end date means that the duration of the contract is vague.",
    "location": "Section 1",
    "category": 7
  },
  {
    "section": "MANDATORY PRODUCTS\" shall mean the following ADAMS GOLF PRODUCTS that CONSULTANT must exclusively play/use in all Champions/Senior Professional Golf Association (SPGA) and Professional Golf Association (PGA) events at all times. However, ADAMS GOLF acknowledges that from time to time, CONSULTANT may, at his 


[A

Processing samples: 100%|██████████| 25/25 [00:24<00:00,  1.03it/s]


[A[A
[A

[A[A

⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification-cot\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\ambiguity_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document


[A

[A[A

[A[A
[A

⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification-cot\self_consistency\inconsistencies_inText\ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINTVENTUREAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\inconsistencies_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification-cot\self_consistency\inconsistencies_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\



[A[A
[A

⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\omissions_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification-cot\self_consistency\omissions_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\omissions_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document



[A[A

⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\omissions_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification-cot\self_consistency\omissions_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\structural_flaws_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
GT: The original text states that the contract auto-renews if no notice is given. The modified version states that the agreement terminates 



LLM response: yes
✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\structural_flaws_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
LLM response: no

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\structural_flaws_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json
GT: The original text clearly stated Provider is responsible for all wages and that Provider's employees would not be considered Recipient's employees. The modified version introduces uncertainty by suggesting that Provider's employees can be deemed Recipient's employees and Recipient might contribute to payments. This introduces ambiguity and contradicts the original intent, creating potential disputes ove

Evaluating explanations:  64%|██████▍   | 16/25 [00:16<00:00, 70.44it/s]
[A

[A[A


[A[A[A


📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\structural_flaws_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
GT: The original clause requires mutual consent to adjust advertising fees, but the modified clause gives i-Escrow sole discretion to adjust the fees. This contradicts the original requirement of mutual agreement.
Model: This section contains a direct contradiction. It first states that each party shall hold the other's Confidential Information in confidence. However, it immediately contradicts this by stating that either party may disclose the other party's confidential information to any third party at any time. This completely negates the confidentiality clause and makes it meaningless.
⚠️ API key AIzaSyAH4zpotMPNF-GlGYmMMAi6ZoCte5b95Hk exhausted. Switching...
❌ All keys exhausted o



⚠️ API key AIzaSyBTYgTD42xCABfJy1jsHchkZEhFaw8X1_c exhausted. Switching...
⚠️ API key AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k exhausted. Switching...

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\structural_flaws_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json
GT: The original clause states that either party can terminate the agreement 'without liability.' However, the modified clause introduces a 'termination fee' and potential liability based on 'outstanding commitments', which directly contradicts the original statement. This creates confusion and potential legal disputes about the financial consequences of termination.
Model: The different communication methods have inconsistencies in text. (a) and (b) are when they are received. (d) is when they are sent. (c) is only valid if recipient confirms receipt within 72 hours. This means there are conflicting 


[A

⚠️ API key AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE exhausted. Switching...
❌ All keys exhausted or failed.
LLM response: 
⚠️ API key AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE exhausted. Switching...
✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\structural_flaws_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\structural_flaws_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json
GT: The original text clearly stated Provider is responsible for all wages and that Provider's employees would not be considered Recipient's employees. The modified version introduces uncertainty by suggesting that Provi



[A[A

⚠️ API key AIzaSyBTYgTD42xCABfJy1jsHchkZEhFaw8X1_c exhausted. Switching...
✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification-cot\self_consistency\structural_flaws_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification-cot\self_consistency\structural_flaws_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json
GT: The original text clearly stated Provider is responsible for all wages and that Provider's employees would not be considered Recipient's employees. The modified version introduces uncertainty by suggesting that Provider's employees can be deemed Recipient's employees and Recipient might contribute to payments. This introduces am

Evaluating explanations:  88%|████████▊ | 22/25 [00:33<00:06,  2.14s/it]

LLM response: no
⚠️ API key AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE exhausted. Switching...
✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\structural_flaws_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\structural_flaws_inText\ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINTVENTUREAGREEMENT.txt_0.json
GT: This change directly contradicts the original clause. Initially, capital contributions required mutual consent, offering predictability. The modification allows either Joint Venturer to unilaterally demand capital contributions, creating uncertainty and potential disputes over funding obligations.
Model: This statement contradicts general principles of joint ventures, whi




📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot\self_consistency\structural_flaws_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json
GT: The original clause dictates that notices must be in writing and establishes specific methods for determining when a notice is deemed to have been given. The modified version introduces the possibility of oral notices, which directly contradicts the requirement for written notices. Furthermore, it adds a condition that email/fax communications are invalid unless confirmed within 72 hours and that mail is deemed delivered upon dispatch, regardless of receipt. This creates ambiguity about valid communication methods and timelines, potentially leading to disputes over whether proper notice was given.
Model: This section discusses what is accepted when it comes to communications. Contradiction can exist when it comes to receipt. Section (c) states t




[A[A[A

LLM response: yes
✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot\self_consistency\structural_flaws_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot\self_consistency\structural_flaws_inText\ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINTVENTUREAGREEMENT.txt_0.json
GT: This change directly contradicts the original clause. Initially, capital contributions required mutual consent, offering predictability. The modification allows either Joint Venturer to unilaterally demand capital contributions, creating uncertainty and potential disputes over funding obligations.
Model: This clause grants unilateral authority to either Joint Venturer to demand capital contributions, which is inconsistent with the general princip

Evaluating explanations:  92%|█████████▏| 23/25 [00:45<00:06,  3.01s/it]

✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\structural_flaws_inText\ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINTVENTUREAGREEMENT.txt_0.json

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\structural_flaws_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_0.json
GT: This change introduces a contradiction by first defining 'Accuray Regions' to include multiple geographic areas and then immediately restricting the definition to only North America. This creates ambiguity about the scope of the regions, impacting obligations tied to specific regions within the contract.
Model: The definition of 'Accuray Regions' initially includes multiple regions but then is immediately limited to only North America. This creates an intern



[A[A

⚠️ API key AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE exhausted. Switching...

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\structural_flaws_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json
GT: The original text clearly stated Provider is responsible for all wages and that Provider's employees would not be considered Recipient's employees. The modified version introduces uncertainty by suggesting that Provider's employees can be deemed Recipient's employees and Recipient might contribute to payments. This introduces ambiguity and contradicts the original intent, creating potential disputes over employment status and financial responsibility.
Model: The conditions outlined for when notices are 'deemed given' are structurally flawed and can lead to disputes. Especially (c) and (d). (c) requiring a recipient to confirm receipt within 72 hours is 





LLM response: yes

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification-cot\self_consistency\structural_flaws_inText\ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINTVENTUREAGREEMENT.txt_0.json
GT: This change directly contradicts the original clause. Initially, capital contributions required mutual consent, offering predictability. The modification allows either Joint Venturer to unilaterally demand capital contributions, creating uncertainty and potential disputes over funding obligations.
Model: This section presents conflicting statements regarding distributions. Initially, it allows a joint venturer to unilaterally withdraw funds or property, with a limit of 10% of their capital contribution per quarter. However, it then states that 'All other distributions will adhere to the original terms.' The 'original terms' are not defined, creating ambiguity. This presents an

Evaluating explanations:  88%|████████▊ | 22/25 [00:47<00:13,  4.56s/it][A[A


📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\structural_flaws_inText\ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINTVENTUREAGREEMENT.txt_0.json
GT: This change directly contradicts the original clause. Initially, capital contributions required mutual consent, offering predictability. The modification allows either Joint Venturer to unilaterally demand capital contributions, creating uncertainty and potential disputes over funding obligations.
Model: This statement implies that one Joint Venturer can unilaterally demand capital contributions from the other, irrespective of whether the other agrees. This contradicts the principle of mutual agreement typically required in joint ventures for significant financial decisions. It creates ambiguity about how capital contributions are determined and enforced, especially if one party disagrees with the 



✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\structural_flaws_inText\ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINTVENTUREAGREEMENT.txt_0.json

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\structural_flaws_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_0.json
GT: This change introduces a contradiction by first defining 'Accuray Regions' to include multiple geographic areas and then immediately restricting the definition to only North America. This creates ambiguity about the scope of the regions, impacting obligations tied to specific regions within the contract.
Model: The definition of "Accuray Regions" initially includes several regions but then explicitly states that it shall only refer to North America. 




[A[A[A


📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\structural_flaws_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_0.json
GT: This perturbation introduces a contradiction in the warranty terms. It starts by defining a specific warranty period (1 year following installation, up to 18 months after shipment), and then adds a statement guaranteeing the system will be free from defects for the product's 'expected lifetime.' This contradicts the defined warranty period, creating confusion about the duration of the warranty coverage.
Model: This section presents conflicting information regarding the acceptance of a Purchase Order. It states that failure to respond within two weeks constitutes disapproval, but it also states that acceptance occurs upon execution of the Purchase Order by an authorized representative. This creates ambiguity as to whether a Purchase Order is



[A[A


📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\structural_flaws_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_0.json
GT: This perturbation creates a contradiction by stating the distributor is a 'non-exclusive, worldwide distributor' and then adding that Accuray can appoint exclusive distributors that may overlap with the distributor's territory. This introduces uncertainty regarding the distributor's rights and market access.
Model: This section includes contradictory statements regarding the warranty period. It initially defines the warranty period as one year following installation (up to 18 months after shipment), but then states that Accuray warrants the system to be free from defects for the expected lifetime of the product. This creates ambiguity and uncertainty regarding the actual duration and scope of the warranty.
✅ Updated explanation_match in: \\



LLM response: yes
✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\structural_flaws_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_0.json

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
GT: The original text defines 'MANDATORY PRODUCTS' as those CONSULTANT *must* exclusively use. The added sentence introduces an exception allowing CONSULTANT to use competitor products under certain circumstances, directly contradicting the definition of 'mandatory' and creating uncertainty about the exclusivity requirement.
Model: The use of the word "intends" weakens the obligation of ADAMS GOLF to pay the base compensation. Furthermore, making the payment 

Evaluating explanations:  96%|█████████▌| 24/25 [01:09<00:05,  5.19s/it]

⚠️ API key AIzaSyAGHtD2RAI1geToBsVjk-mIzVeuhlZQtA4 exhausted. Switching...
LLM response: no
LLM response: yes
✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\structural_flaws_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_0.json

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
GT: The original text defines 'MANDATORY PRODUCTS' as those CONSULTANT *must* exclusively use. The added sentence introduces an exception allowing CONSULTANT to use competitor products under certain circumstances, directly contradicting the definition of 'mandatory' and creating uncertainty about the exclusivity requirement.
Model: The contract term's end date is left undefine





⚠️ API key AIzaSyAGHtD2RAI1geToBsVjk-mIzVeuhlZQtA4 exhausted. Switching...
⚠️ API key AIzaSyBTYgTD42xCABfJy1jsHchkZEhFaw8X1_c exhausted. Switching...
✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\structural_flaws_inText\ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINTVENTUREAGREEMENT.txt_0.json


Evaluating explanations:  92%|█████████▏| 23/25 [01:03<00:12,  6.10s/it][A[A


📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\structural_flaws_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_0.json
GT: This change introduces a contradiction by first defining 'Accuray Regions' to include multiple geographic areas and then immediately restricting the definition to only North America. This creates ambiguity about the scope of the regions, impacting obligations tied to specific regions within the contract.
Model: The definition of "Accuray Regions" initially includes multiple regions (Americas, APAC, EIMEA, and Japan), but then it's immediately restricted to only North America. This creates an internal contradiction, as the initial definition is negated by the subsequent clarification.

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot




Evaluating explanations: 100%|██████████| 25/25 [01:16<00:00,  3.08s/it]


✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot\self_consistency\structural_flaws_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_0.json
✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
GT: The original text defines 'MANDATORY PRODUCTS' as those CONSULTANT *must* exclusively use. The added sentence introduces an exception allowing CONSULTANT to use competitor products under certain circumstances, directly



⚠️ API key AIzaSyBTYgTD42xCABfJy1jsHchkZEhFaw8X1_c exhausted. Switching...




⚠️ API key AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k exhausted. Switching...




⚠️ API key AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k exhausted. Switching...
⚠️ API key AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE exhausted. Switching...
⚠️ API key AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k exhausted. Switching...
⚠️ API key AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE exhausted. Switching...
⚠️ API key AIzaSyAH4zpotMPNF-GlGYmMMAi6ZoCte5b95Hk exhausted. Switching...

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\structural_flaws_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_0.json
GT: This perturbation introduces a contradiction in the warranty terms. It starts by defining a specific warranty period (1 year following installation, up to 18 months after shipment), and then adds a statement guaranteeing the system will be free from defects for the product's 'expected lifetime.' This contradicts the defined warranty period, creating confu


[A

LLM response: yes
✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\structural_flaws_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_0.json
⚠️ API key AIzaSyAGHtD2RAI1geToBsVjk-mIzVeuhlZQtA4 exhausted. Switching...
❌ All keys exhausted or failed.
LLM response: 

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
GT: The original text defines 'MANDATORY PRODUCTS' as those CONSULTANT *must* exclusively use. The added sentence introduces an exception allowing CONSULTANT to use competitor products under certain circumstances, directly contradicting the definition of 'mandatory' and creating uncertainty about the exclusivity



[A[A

✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification-cot\self_consistency\structural_flaws_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_0.json

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification-cot\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
GT: The original text defines 'MANDATORY PRODUCTS' as those CONSULTANT *must* exclusively use. The added sentence introduces an exception allowing CONSULTANT to use competitor products under certain circumstances, directly contradicting the definition of 'mandatory' and creating uncertainty about the exclusivity requirement.
Model: The contract states that the term begins on September 1, 2004, but the specific end date is redacted. This omis

Evaluating explanations: 100%|██████████| 25/25 [01:34<00:00,  3.79s/it]



📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
GT: The original clause states that the CONSULTANT *shall* achieve a minimum number of tournaments and *shall* repay ADAMS GOLF if the minimum is not met. The modified version weakens this by saying CONSULTANT *shall endeavor* to achieve minimum and *may* be required to repay *at ADAMS GOLF's sole discretion.* Also it changes the payment period to a notification basis, implying the repayment isn't guaranteed. This weakens the enforceability of the repayment obligation.
Model: The language 'Adams Golf intends to pay' and 'subject to Adams Golf's financial performance and continued evaluation of Consultant's brand value' introduces significant ambiguity and contradictions. It is not a firm commitment to pay the base compensation, making the obli

Processing samples:   0%|          | 0/25 [00:00<?, ?it/s]

LLM response: no


Processing samples:  28%|██▊       | 7/25 [00:00<00:00, 66.75it/s]

⚠️ API key AIzaSyBTYgTD42xCABfJy1jsHchkZEhFaw8X1_c exhausted. Switching...
⚠️ API key AIzaSyAH4zpotMPNF-GlGYmMMAi6ZoCte5b95Hk exhausted. Switching...


Processing samples:  60%|██████    | 15/25 [00:00<00:00, 71.05it/s]

⚠️ API key AIzaSyAGHtD2RAI1geToBsVjk-mIzVeuhlZQtA4 exhausted. Switching...
❌ All keys exhausted or failed.
LLM response: 
💡 Asking questions
LLM response: no
⚠️ API key AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k exhausted. Switching...
⚠️ API key AIzaSyAGHtD2RAI1geToBsVjk-mIzVeuhlZQtA4 exhausted. Switching...
❌ All keys exhausted or failed.
Failed to parse JSON: Expecting value: line 1 column 1 (char 0)
⚠️ API key AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE exhausted. Switching...
⚠️ API key AIzaSyBTYgTD42xCABfJy1jsHchkZEhFaw8X1_c exhausted. Switching...
⚠️ API key AIzaSyAH4zpotMPNF-GlGYmMMAi6ZoCte5b95Hk exhausted. Switching...

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification-cot\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
GT: The original text defines 'MANDATORY PRODUCTS' as those CONSULTANT *must* exclusively use. The ad



⚠️ API key AIzaSyBTYgTD42xCABfJy1jsHchkZEhFaw8X1_c exhausted. Switching...
⚠️ API key AIzaSyAGHtD2RAI1geToBsVjk-mIzVeuhlZQtA4 exhausted. Switching...
LLM response: yes

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification-cot\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
GT: The original text defines 'MANDATORY PRODUCTS' as those CONSULTANT *must* exclusively use. The added sentence introduces an exception allowing CONSULTANT to use competitor products under certain circumstances, directly contradicting the definition of 'mandatory' and creating uncertainty about the exclusivity requirement.
Model: The use of the word 'intends' to pay creates a discrepancy. In contract law, 'intends' does not create an obligation. Furthermore, stating that the base compensation is 'subject to ADAMS GOLF's financial performance and continue

Processing samples: 100%|██████████| 25/25 [00:17<00:00,  1.40it/s]


⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-cot\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-cot\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_1.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-cot\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_2.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Di



⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-cot\self_consistency\omissions_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-cot\self_consistency\omissions_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-cot\self_consistency\structural_flaws_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
GT: The original text states that the contract auto-renews if no notice is given. The modified version states that the agreement terminates if no notice is given. This creates uncertainty in 



Evaluating explanations: 100%|██████████| 25/25 [01:35<00:00,  3.81s/it]


LLM response: yes
✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification-cot\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
LLM response: no

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
GT: The original clause states that ADAMS GOLF *shall* pay the base compensation. The modified version changes this to ADAMS GOLF *intends* to pay, and adds conditions for payment (financial performance, continued evaluation of brand value). This introduces uncertainty as to whether the base compensation will actually be paid, despite the initial agreement, which makes it difficult to predict future payments.
Model: This create



[A[A

[A[A

⚠️ API key AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k exhausted. Switching...
💡 Asking questions
⚠️ API key AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE exhausted. Switching...
⚠️ API key AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE exhausted. Switching...
⚠️ API key AIzaSyAH4zpotMPNF-GlGYmMMAi6ZoCte5b95Hk exhausted. Switching...
⚠️ API key AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k exhausted. Switching...
⚠️ API key AIzaSyAH4zpotMPNF-GlGYmMMAi6ZoCte5b95Hk exhausted. Switching...
⚠️ API key AIzaSyAGHtD2RAI1geToBsVjk-mIzVeuhlZQtA4 exhausted. Switching...
⚠️ API key AIzaSyAGHtD2RAI1geToBsVjk-mIzVeuhlZQtA4 exhausted. Switching...
⚠️ API key AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE exhausted. Switching...

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-cot\self_consistency\structural_flaws_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
GT: The origina




Evaluating explanations: 100%|██████████| 25/25 [01:45<00:00,  4.21s/it]

✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json

📁 Directory: ambiguity_inText
Text Match: 1 / 15
Explanation Match: 1 / 15
Text + Explanation Match: 0 / 15

📁 Directory: inconsistencies_inText
Text Match: 4 / 15
Explanation Match: 1 / 15
Text + Explanation Match: 1 / 15

📁 Directory: misaligned_terminalogy_inText
Text Match: 0 / 15
Explanation Match: 1 / 15
Text + Explanation Match: 0 / 15

📁 Directory: omissions_inText
Text Match: 1 / 15
Explanation Match: 1 / 15
Text + Explanation Match: 1 / 15

📁 Directory: structural_flaws_inText
Text Match: 9 / 15
Explanation Match: 5 / 15
Text + Explanation Match: 4 / 15





⚠️ API key AIzaSyAH4zpotMPNF-GlGYmMMAi6ZoCte5b95Hk exhausted. Switching...
⚠️ API key AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE exhausted. Switching...
⚠️ API key AIzaSyAGHtD2RAI1geToBsVjk-mIzVeuhlZQtA4 exhausted. Switching...
⚠️ API key AIzaSyBTYgTD42xCABfJy1jsHchkZEhFaw8X1_c exhausted. Switching...
❌ All keys exhausted or failed.
⚠️ API key AIzaSyAGHtD2RAI1geToBsVjk-mIzVeuhlZQtA4 exhausted. Switching...
LLM response: no
⚠️ API key AIzaSyBTYgTD42xCABfJy1jsHchkZEhFaw8X1_c exhausted. Switching...
❌ All keys exhausted or failed.
LLM response: 


Processing samples:  60%|██████    | 15/25 [00:12<00:00, 71.05it/s]


📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-cot\self_consistency\structural_flaws_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
GT: The original clause requires mutual consent to adjust advertising fees, but the modified clause gives i-Escrow sole discretion to adjust the fees. This contradicts the original requirement of mutual agreement.
Model: This section is contradictory because it states that advertising fees are based on Transaction Inquiries, but then it gives i-Escrow sole discretion to determine the per Transaction Inquiry amount and to revise the formula at any time without 2TheMart's consent. This effectively gives i-Escrow complete control over the advertising fees, potentially rendering the 'based on Transaction Inquiries' aspect meaningless. It creates a situation where 2TheMart has no ability to predict or neg


Evaluating explanations: 100%|██████████| 25/25 [01:42<00:00,  4.08s/it]

LLM response: yes
✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
⚠️ API key AIzaSyAH4zpotMPNF-GlGYmMMAi6ZoCte5b95Hk exhausted. Switching...

📁 Directory: ambiguity_inText
Text Match: 2 / 15
Explanation Match: 1 / 15
Text + Explanation Match: 0 / 15

📁 Directory: inconsistencies_inText
Text Match: 5 / 15
Explanation Match: 4 / 15
Text + Explanation Match: 3 / 15

📁 Directory: misaligned_terminalogy_inText
Text Match: 4 / 15
Explanation Match: 1 / 15
Text + Explanation Match: 1 / 15

📁 Directory: omissions_inText
Text Match: 2 / 15
Explanation Match: 2 / 15
Text + Explanation Match: 2 / 15

📁 Directory: structural_flaws_inText
Text Match: 11 / 15
Explanation Match: 6 / 15
Text + Explanation Match: 4 / 15





⚠️ API key AIzaSyAH4zpotMPNF-GlGYmMMAi6ZoCte5b95Hk exhausted. Switching...
LLM response: yes

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-cot\self_consistency\structural_flaws_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
GT: The original text states that the contract auto-renews if no notice is given. The modified version states that the agreement terminates if no notice is given. This creates uncertainty in enforcement of the renewal terms.
Model: This section presents a clear contradiction. It initially states that each party must hold the other's Confidential Information in confidence, which implies a duty of non-disclosure. However, it immediately contradicts this by stating that either party may disclose the other party's confidential information to any third party at any time. This completely nullifies the confidentiali



[A[A

⚠️ API key AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE exhausted. Switching...
❌ All keys exhausted or failed.
⚠️ API key AIzaSyAH4zpotMPNF-GlGYmMMAi6ZoCte5b95Hk exhausted. Switching...
⚠️ API key AIzaSyAGHtD2RAI1geToBsVjk-mIzVeuhlZQtA4 exhausted. Switching...

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-cot\self_consistency\structural_flaws_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
GT: The original text states that the contract auto-renews if no notice is given. The modified version states that the agreement terminates if no notice is given. This creates uncertainty in enforcement of the renewal terms.
Model: This section contains an inconsistency. It states that a force majeure event that lasts 60 days or more allows either party to terminate the agreement without penalty. However, it then clarifies that the occurrence of a 

Processing samples:  84%|████████▍ | 21/25 [00:27<00:07,  1.83s/it]

LLM response: no
🤖 Model response: ```json
[
  {
    "section": "i-Escrow may place banner advertising on the Co-Branded Site upon prior written approval of 2TheMart, which shall be at the discretion of 2TheMart.",
    "explanation": "The phrase 'which shall be at the discretion of 2TheMart' grants 2TheMart unlimited power to deny i-Escrow's advertising, contradicting the general principle of good faith expected in contractual relationships.",
    "location": "Section 2.4",
    "category": 1
  },
  {
    "section": "Each party shall hold the other party's Confidential Information in confidence, except that either party may disclose the other party's confidential information to any third party at any time.",
    "explanation": "This clause is contradictory. It states that each party must keep the other's information confidential, but immediately allows either party to disclose the other's confidential information to any third party, which nullifies the confidentiality obligation.",
    



✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-cot\self_consistency\structural_flaws_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
❌ No response files found for: structural_flaws_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt
❌ No response files found for: structural_flaws_inText\ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINTVENTUREAGREEMENT.txt
❌ No response files found for: structural_flaws_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-cot\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
GT: The original text defines 'MANDATORY PRODUCTS' as those CONSULTANT *must* exclusive



[A[A

🤖 Model response: ```json
[
  {
    "section": "In the event that the parties are unable to agree to an SOW within 2 months following the Effective Date, either party may, in its sole discretion, terminate this Agreement by providing written notice. Once approved, the parties shall use commercially reasonable efforts to diligently implement their respective obligations under the SOW. Upon completion of its duties under the SOW, a party shall notify the other party and provide the other party with the opportunity to test and evaluate its work.",
    "explanation": "This section contains ambiguity. While it provides a mechanism for termination if an SOW isn't agreed upon within 2 months, it doesn't specify the implications or obligations if the parties fail to agree within that timeframe but *continue* to negotiate. Does the termination right expire after 2 months, or does it continue indefinitely? Also, the phrase 'commercially reasonable efforts' is a legal term that is vague and is su

Processing samples:  88%|████████▊ | 22/25 [00:31<00:06,  2.02s/it]

LLM response: no
🤖 Model response: ```json
[
  {
    "section": "This Agreement be deemed effective as of the Effective Date, Agreement and shall terminate on December 31, 2020, unless terminated earlier in accordance with Section 3.2.",
    "explanation": "The sentence structure in this section is flawed and unclear. The phrase 'This Agreement be deemed effective as of the Effective Date, Agreement' does not logically connect, making it difficult to understand when the agreement is considered effective.",
    "location": "Section 3.1",
    "category": 9
  },
  {
    "section": "Each of the Recipient and the Provider may, in their sole discretion, terminate this Agreement in whole or in part, at any time without cause. However, termination by either party is subject to a termination fee of [ * * * ], and liability may arise based on a review of outstanding commitments. To do so, providing at least 90 (ninety) days' prior written notice to the other party (such date, the \"Services Term



[A[A

🤖 Model response: ```json
[
  {
    "section": "5.1 ADVERTISING FEES. After the Launch Date, i-Escrow shall pay 2TheMart advertising fees based on the number of Transaction Inquiries. These advertising fees shall consist of a per Transaction Inquiry amount calculated at i-Escrow's sole discretion. The formula for arriving at the per Transaction Inquiry amount may be revised from time to time during the term of this Agreement to reflect present market conditions at i-Escrow's discretion. i-Escrow may adjust the rate as it sees fit without needing consent.",
    "explanation": "This section gives i-Escrow sole discretion to set and revise the per Transaction Inquiry advertising fees. This creates a structural flaw because it renders the payment obligation of i-Escrow vague and potentially unenforceable, as 2TheMart has no control or input over the fee amount, which lacks sufficient clarity regarding the actual compensation mechanism.",
    "location": "Section 5.1",
    "category": 10
  

Processing samples:  88%|████████▊ | 22/25 [00:42<00:06,  2.02s/it]


📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-cot\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
GT: The original clause states that ADAMS GOLF *shall* pay the base compensation. The modified version changes this to ADAMS GOLF *intends* to pay, and adds conditions for payment (financial performance, continued evaluation of brand value). This introduces uncertainty as to whether the base compensation will actually be paid, despite the initial agreement, which makes it difficult to predict future payments.
Model: This clause requires repayment by the Consultant's estate if he dies, but it fails to consider that he may have already fully performed some of the requirements of the agreement, such as promotional appearances. Requiring a repayment without any consideration for the services he has already rendered contradicts that the CONSULT

Processing samples: 100%|██████████| 25/25 [00:43<00:00,  1.75s/it]


LLM response: no
🤖 Model response: ```json
[
  {
    "section": "TERM OF CONTRACT The Term of this Agreement shall be for a period of [* ****] years and [*****] months commencing the 1st day of September 2004 and terminating the [*****] day of [*****].",
    "explanation": "The contract term is not clearly defined due to redactions. This makes it impossible to determine the actual duration of the agreement, which creates ambiguity regarding the obligations and rights of both parties over time.",
    "location": "Section 1",
    "category": 1
  },
  {
    "section": "During the term of this Agreement, from September 1, 2004 through [*****] ADAMS GOLF *intends* to pay CONSULTANT a base compensation of [*****] dollars. Payment of the base compensation will be subject to ADAMS GOLF's financial performance and continued evaluation of CONSULTANT's brand value.",
    "explanation": "The word 'intends' weakens the obligation to pay the base compensation, making it conditional on ADAMS GOLF's f

Evaluating explanations:  32%|███▏      | 8/25 [00:00<00:00, 77.46it/s]

⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification\self_consistency\ambiguity_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification\self_consistency\ambiguity_inText\ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINTVENTUREAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-D

Evaluating explanations:  64%|██████▍   | 16/25 [00:00<00:00, 76.36it/s]

⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification\self_consistency\misaligned_terminalogy_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification\self_consistency\misaligned_terminalogy_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification\self_consistency\omissions_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benc

Evaluating explanations: 100%|██████████| 25/25 [00:39<00:00,  1.59s/it]

✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-cot\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json

📁 Directory: ambiguity_inText
Text Match: 9 / 36
Explanation Match: 11 / 36
Text + Explanation Match: 7 / 36

📁 Directory: inconsistencies_inText
Text Match: 2 / 15
Explanation Match: 2 / 15
Text + Explanation Match: 1 / 15

📁 Directory: misaligned_terminalogy_inText
Text Match: 3 / 15
Explanation Match: 2 / 15
Text + Explanation Match: 1 / 15

📁 Directory: omissions_inText
Text Match: 1 / 15
Explanation Match: 0 / 15
Text + Explanation Match: 0 / 15

📁 Directory: structural_flaws_inText
Text Match: 3 / 6
Explanation Match: 2 / 6
Text + Explanation Match: 2 / 6
LLM response: no





⚠️ API key AIzaSyAH4zpotMPNF-GlGYmMMAi6ZoCte5b95Hk exhausted. Switching...

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification\self_consistency\structural_flaws_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
GT: The original text mandates that Confidential Information is marked as such, either in writing or orally. The modified text removes this requirement, making all information confidential by default. Furthermore, original text prevents parties from disclosing such Information to third parties. The modified text now states that the parties are able to disclose such information creating uncertainty and contradiction.
Model: The phrase 'which shall be at the discretion of 2TheMart' grants 2TheMart unlimited power to deny i-Escrow's advertising, contradicting the general principle of good faith expected in contractu

Processing samples: 100%|██████████| 25/25 [00:44<00:00,  1.77s/it]


🤖 Model response: ```json
[
  {
    "section": "TERM OF CONTRACT The Term of this Agreement shall be for a period of [* ****] years and [*****] months commencing the 1st day of September 2004 and terminating the [*****] day of [*****].",
    "explanation": "The contract period is not clearly defined. While the start date is specified, the end date is redacted. Without a defined end date, the duration of the agreement is unknown, creating ambiguity in determining the obligations and rights of both parties. This lack of clarity can lead to disputes regarding the agreement's validity and enforceability over time.",
    "location": "Section 1",
    "category": 1
  },
  {
    "section": "Payment of the base compensation will be subject to ADAMS GOLF's financial performance and continued evaluation of CONSULTANT's brand value.",
    "explanation": "This clause introduces significant ambiguity regarding the base compensation. The phrase 'subject to ADAMS GOLF's financial performance' makes th



⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\ambiguity_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\ambiguity_inText\ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINTVENTUREAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy

Evaluating explanations:  64%|██████▍   | 16/25 [00:18<00:00, 76.36it/s]

LLM response: no

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification\self_consistency\structural_flaws_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json
GT: The original clause dictates that notices must be in writing and establishes specific methods for determining when a notice is deemed to have been given. The modified version introduces the possibility of oral notices, which directly contradicts the requirement for written notices. Furthermore, it adds a condition that email/fax communications are invalid unless confirmed within 72 hours and that mail is deemed delivered upon dispatch, regardless of receipt. This creates ambiguity about valid communication methods and timelines, potentially leading to disputes over whether proper notice was given.
Model: The sentence structure in this section is flawed and unclear. The phrase 'This Agreement be deemed effective



✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\omissions_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\omissions_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\omissions_inText\ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINTVENTUREAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepanc

Evaluating explanations:  88%|████████▊ | 22/25 [00:28<00:05,  1.86s/it]

✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification\self_consistency\structural_flaws_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification\self_consistency\structural_flaws_inText\ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINTVENTUREAGREEMENT.txt_0.json
GT: This change directly contradicts the original clause. Initially, capital contributions required mutual consent, offering predictability. The modification allows either Joint Venturer to unilaterally demand capital contributions, creating uncertainty and potential disputes over funding obligations.
Model: This clause allows either joint venturer to unilaterally require capital contributions, which contradicts the general principl

Evaluating explanations:  92%|█████████▏| 23/25 [00:34<00:04,  2.14s/it]

LLM response: yes

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\structural_flaws_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
GT: The original text states that the contract auto-renews if no notice is given. The modified version states that the agreement terminates if no notice is given. This creates uncertainty in enforcement of the renewal terms.
Model: Section 11 states that a party can release confidential information to a third party whenever they want, but then says there will be legal ramifications if that section is breached, but it can't be breached if the section says that they can release the information.
LLM response: yes
✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot



LLM response: yes
✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\structural_flaws_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\structural_flaws_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json
GT: The original text clearly stated Provider is responsible for all wages and that Provider's employees would not be considered Recipient's employees. The modified version introduces uncertainty by suggesting that Provider's employees can be deemed Recipient's employees and Recipient might contribute to payments. This introduces ambiguity and contradicts the original intent, creating poten

Evaluating explanations:  92%|█████████▏| 23/25 [00:48<00:04,  2.14s/it]


📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\structural_flaws_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json
GT: The original text clearly stated Provider is responsible for all wages and that Provider's employees would not be considered Recipient's employees. The modified version introduces uncertainty by suggesting that Provider's employees can be deemed Recipient's employees and Recipient might contribute to payments. This introduces ambiguity and contradicts the original intent, creating potential disputes over employment status and financial responsibility.
Model: This section describes the conditions under which communication is considered given. Clause (c) states that communications via email or fax are valid only if the recipient confirms receipt within 72 hours. Conversely, clause (d) says notices sent via mail are deemed rec



LLM response: yes
✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\structural_flaws_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\structural_flaws_inText\ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINTVENTUREAGREEMENT.txt_0.json
GT: This change directly contradicts the original clause. Initially, capital contributions required mutual consent, offering predictability. The modification allows either Joint Venturer to unilaterally demand capital contributions, creating uncertainty and potential disputes over funding obligations.
Model: This section is problematic because it allows either joint venturer to unilaterally determine that c

Evaluating explanations: 100%|██████████| 25/25 [00:57<00:00,  2.29s/it]

✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
LLM response: yes

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\structural_flaws_inText\ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINTVENTUREAGREEMENT.txt_0.json
GT: This change directly contradicts the original clause. Initially, capital contributions required mutual consent, offering predictability. The modification allows either Joint Venturer to unilaterally demand capital contributions, creating uncertainty and potential disputes over funding obligations.
Model: While attempting to avoid partnership classification, this section emphasizes cooperation and cont




LLM response: no

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\structural_flaws_inText\ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINTVENTUREAGREEMENT.txt_0.json
GT: The original text specifies a fixed term with a renewal clause based on mutual written agreement. The changed text allows unilateral termination by either party with 30 days' notice, contradicting the initial fixed term and the requirement for mutual agreement to extend the venture. This introduces uncertainty about the venture's lifespan and the parties' commitment.
Model: While attempting to avoid partnership classification, this section emphasizes cooperation and contribution of capital, labor, and services. Many aspects mirror partnership functions. Attempting to opt out of Subchapter K of the Internal Revenue Code carries certain implications and requirements. This complexi



✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\structural_flaws_inText\ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINTVENTUREAGREEMENT.txt_0.json

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\structural_flaws_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_0.json
GT: This change introduces a contradiction by first defining 'Accuray Regions' to include multiple geographic areas and then immediately restricting the definition to only North America. This creates ambiguity about the scope of the regions, impacting obligations tied to specific regions within the contract.
Model: This definition provides a list of Accuray Regions but then specifically states that it shall only refer to



LLM response: yes
✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\structural_flaws_inText\ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTORAGREEMENT.txt_0.json

📄 Evaluating: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
GT: The original text defines 'MANDATORY PRODUCTS' as those CONSULTANT *must* exclusively use. The added sentence introduces an exception allowing CONSULTANT to use competitor products under certain circumstances, directly contradicting the definition of 'mandatory' and creating uncertainty about the exclusivity requirement.
Model: The contract period is not clearly defined. While the start date is specified, the end date is 

Evaluating explanations: 100%|██████████| 25/25 [01:22<00:00,  3.30s/it]

✅ Updated explanation_match in: \\?\c:\Users\Noel\Documents\Projects\School\CSE 576\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\few-shot-self-verification-cot\self_consistency\structural_flaws_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json

📁 Directory: ambiguity_inText
Text Match: 2 / 15
Explanation Match: 4 / 15
Text + Explanation Match: 2 / 15

📁 Directory: inconsistencies_inText
Text Match: 3 / 15
Explanation Match: 4 / 15
Text + Explanation Match: 3 / 15

📁 Directory: misaligned_terminalogy_inText
Text Match: 1 / 15
Explanation Match: 1 / 15
Text + Explanation Match: 0 / 15

📁 Directory: omissions_inText
Text Match: 1 / 15
Explanation Match: 1 / 15
Text + Explanation Match: 1 / 15

📁 Directory: structural_flaws_inText
Text Match: 7 / 15
Explanation Match: 8 / 15
Text + Explanation Match: 5 / 15
✅ DONE





#### Few-shot variations

In [67]:
# # FS | FS + SC
# run(
#     model=GeminiModel(API_KEYS),
#     dataset=MiniEvalDataset(),
#     prompt=few_shot_prompt,
#     responses_dir="mini-eval/responses/few-shot/",
#     num_responses=1,    # SC
#     evaluation_model=GeminiModel(API_KEYS)
# )

# # FS + COT | FS + COT + SC
# run(
#     model=GeminiModel(API_KEYS),
#     dataset=MiniEvalDataset(),
#     prompt=few_shot_prompt + COT,
#     responses_dir="mini-eval/responses/few-shot-cot/",
#     num_responses=1,    # SC
#     evaluation_model=GeminiModel(API_KEYS)
# )

# # FS + SV | FS + SV + SC
# run(
#     model=SelfVerificationModel(GeminiModel(API_KEYS)),
#     dataset=MiniEvalDataset(),
#     prompt=few_shot_prompt,
#     responses_dir="mini-eval/responses/few-shot-self-verification/",
#     num_responses=1,    # SC
#     evaluation_model=GeminiModel(API_KEYS)
# )

# # FS + COT + SV | FS + COT + SV + SC
# run(
#     model=SelfVerificationModel(GeminiModel(API_KEYS)),
#     dataset=MiniEvalDataset(),
#     prompt=few_shot_prompt + COT,
#     responses_dir="mini-eval/responses/self-verification/",
#     num_responses=1,    # SC
#     evaluation_model=GeminiModel(API_KEYS)
# )

## TODO 
---
- Z ✅
- Z + COT ✅
- Z + SV ✅
- Z + COT + SV ✅
- Z + SC ✅
- Z + COT + SC ✅
---
- FS ✅⚠️
- FS + COT ✅⚠️
- FS + SV ✅⚠️
- FS + COT + SV ✅⚠️
- FS + SC ✅⚠️
- FS + COT + SC ✅⚠️
---
- Z + SV + SC (SKIP THIS FOR NOW) ✅
- Z + COT + SV + SC (SKIP THIS FOR NOW) ✅
- FS + SV + SC (SKIP THIS FOR NOW) ✅⚠️
- FS + COT + SV + SC (SKIP THIS FOR NOW) ✅⚠️
---
- **Output into a .csv**❌
- **Eventually need to repeat with different LLMs**❌

# Metrics
1) `text match` but `explanation !match` = -1
2) `text match` and `explanation match` = +1
3) `text !match` and `explanation match` = -1
4) `text !match` and `explanation !match` = -1