In [1]:
import os
import json
import shutil
import google.generativeai as genai
import tqdm
from collections import defaultdict
from google.api_core.exceptions import ResourceExhausted
import glob
import time
import few_shot_prompts as fs_prompts

  from .autonotebook import tqdm as notebook_tqdm


### Set up the mini-eval directory with the 'answers' (LLM-based ground truth) and 'documents' (perturbed documents without tags).


In [2]:
base_dir = 'perturbed_legal_documents'
PERTURBATION_TYPES = ['ambiguity', 'inconsistencies', 'misaligned_terminalogy', 'omission', 'structural_flaws']
CATEGORIES = ['inText', 'legal']

In [3]:
# for pt in PERTURBATION_TYPES:
#     for ct in CATEGORIES:
#         print(f"\nProcessing: {pt}_{ct}_contradiction")

#         input_dir = f'{base_dir}/{pt}_{ct}_contradiction/'
#         doc_dir = os.path.join(input_dir, 'modified_files_no_tags')

#         if not os.path.exists(input_dir):
#             print(f"Input dir not found: {input_dir}")
#             continue
#         if not os.path.exists(doc_dir):
#             print(f"Document dir not found: {doc_dir}")
#             continue

#         output_answers = f'mini-eval/answers/{pt}_{ct}_contradiction/'
#         output_documents = f'mini-eval/documents/{pt}_{ct}_contradiction/'

#         # Check if outputs already exist and contain at least 5 files
#         if (os.path.exists(output_answers) and len(os.listdir(output_answers)) >= 5 and
#             os.path.exists(output_documents) and len(os.listdir(output_documents)) >= 5):
#             print(f"Skipping {pt}_{ct}_contradiction — already processed.")
#             continue

#         os.makedirs(output_answers, exist_ok=True)
#         os.makedirs(output_documents, exist_ok=True)

#         # Collect all valid json->txt pairs
#         json_files = sorted([f for f in os.listdir(input_dir) if f.endswith('.json')])
#         print(f"🔎 Found {len(json_files)} JSON files")

#         valid_pairs = []

#         for json_file in json_files:
#             if not json_file.startswith("perturbed_") or not json_file.endswith(".pdf.json"):
#                 print(f"  ⚠️ Skipping incorrectly named file: {json_file}")
#                 continue

#             base_name = json_file[len("perturbed_"):-len(".pdf.json")]
#             txt_file = f"modified_{base_name}.pdf.txt"
#             txt_path = os.path.join(doc_dir, txt_file)

#             if os.path.exists(txt_path):
#                 valid_pairs.append((json_file, txt_file))
#                 print(f"  ✅ Matched: {json_file} <-> {txt_file}")
#             else:
#                 print(f"  ❌ Missing TXT: {txt_file}")

#             if len(valid_pairs) == 5:
#                 break

#         if not valid_pairs:
#             print("Can't find corresponding files????")
#             continue

#         # Copy matched pairs
#         for json_file, txt_file in valid_pairs:
#             src_json = os.path.join(input_dir, json_file)
#             dst_json = os.path.join(output_answers, json_file)

#             src_txt = os.path.join(doc_dir, txt_file)
#             dst_txt = os.path.join(output_documents, txt_file)

#             shutil.copy(src_json, dst_json)
#             shutil.copy(src_txt, dst_txt)
#             print(f"  📁 Copied: {json_file} and {txt_file}")

In [4]:
API_KEYS = [
    "AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k", # Aditya
    "AIzaSyDgafwAgDi2Zjvu6jdt_SIZ60VgK1Na32E", # Aditya
    "AIzaSyCWI7QJXWYBGGWGdL37W8ll0sDIwz0zqlo", # Aditya
    "AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE", # Foo
    "AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k", # Foo
    "AIzaSyAjby-dj9aBsolOdTDpvU7_x5uje8l4yiQ", # Foo
    "AIzaSyCN-EJ7s6CIeEybjT3tM_zN0-4xx4Rcqqw", # Foo
    "AIzaSyCKWwXUILaUvHkyppqY87-cqBad16vZb00", # Foo
    "AIzaSyCfYpaD89nvVJ6GIitszeWI0KXdlgEAv-Q", # Foo
    "AIzaSyCsA0PVE_BygEVMdrGs7Upyo4nBk2FTbhM", # Foo
    "AIzaSyAcqO6uxgeIP5qyxcDZLAY2TC9xyTlBmC0", # Foo
    "AIzaSyC_86XS-IZzhdfmhBSThwQoYMoQuFeY4mQ", # Foo
    "AIzaSyAH4zpotMPNF-GlGYmMMAi6ZoCte5b95Hk", # Ezra
    "AIzaSyDSG4tUWCN6oA7b2XMS8zLOfXG7R987D2Y", # Ezra
    "AIzaSyDwBOvWeSweppAjbU3fwWqBm0a_M7JGOWw", # Ezra
    "AIzaSyCqqBjoa2M6HF7aEagzJn_2ckEYrW1s7wY", # Ezra
    "AIzaSyAGHtD2RAI1geToBsVjk-mIzVeuhlZQtA4", # Noel
    "AIzaSyBTYgTD42xCABfJy1jsHchkZEhFaw8X1_c", # Mannan
]

In [5]:
# os.environ["GOOGLE_API_KEY"] = "AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k"
# API_KEY = os.getenv("GOOGLE_API_KEY")
# genai.configure(api_key=API_KEY)

## Datasets

In [6]:
from abc import ABC, abstractmethod


class Dataset(ABC):
    @abstractmethod
    def __len__(self):
        pass

    @abstractmethod
    def __getitem__(self, idx):
        pass

class MiniEvalDataset(Dataset):

    def __init__(self):

        self.mini_eval_dir = "mini-eval"
        self.mini_eval_answers_dir = os.path.join(self.mini_eval_dir, "answers")
        self.mini_eval_documents_dir = os.path.join(self.mini_eval_dir, "documents")
        self.files = [
            os.path.relpath(
                os.path.join(root, file), self.mini_eval_answers_dir
            ).replace(".json", "")

            for root, _, files in os.walk(self.mini_eval_answers_dir)
            for file in files

        ]
        self.files.sort()


    def __len__(self):
        return len(self.files)


    def __getitem__(self, idx):

        file_base = self.files[idx]

        with open(
            os.path.join(self.mini_eval_answers_dir, self.files[idx] + ".json"),
            "r",
            encoding="utf-8",
        ) as f:
            answers = "\n".join(f.readlines())
            answers = self.__remove_non_ascii(answers)
            answers = json.loads(answers)


        with open(
            os.path.join(self.mini_eval_documents_dir, self.files[idx] + ".txt"),
            "r",
            encoding="utf-8",
        ) as f:
            documents = "\n".join(f.readlines())
            documents = self.__remove_non_ascii(documents)

        # Extract category as the top-level folder (e.g., ambiguity_inText)
        category_full = file_base.split("\\")[0]  # or "\\" if running on Windows paths
        # category = category_full.split("_")[0]


        return {
            "file_name": file_base,
            "category": category_full,
            "answers": answers,
            "documents": documents,
        }


    def __remove_non_ascii(self, s):
        return "".join(filter(lambda x: ord(x) < 128, s))

## Model

In [7]:
from abc import ABC, abstractmethod


class Model(ABC):
    @abstractmethod
    def generate(self, prompt):
        pass
    
# New version with API key cycling
class GeminiModel(Model):
    def __init__(self, api_keys):
        self.api_keys = api_keys
        self.key_index = 0
        self._set_key(self.api_keys[self.key_index])
    
    def _set_key(self, key):
        os.environ["GOOGLE_API_KEY"] = key
        genai.configure(api_key=key)
        self.model = genai.GenerativeModel("gemini-2.0-flash")

    def generate(self, prompt, max_retries=5):
        for attempt in range(max_retries):
            try:
                response = self.model.generate_content(prompt)
                return response.to_dict()["candidates"][0]["content"]["parts"][0]["text"]
            except ResourceExhausted:
                print(f"⚠️ API key {self.api_keys[self.key_index]} exhausted. Switching...")
                self.key_index = (self.key_index + 1) % len(self.api_keys)
                self._set_key(self.api_keys[self.key_index])
        print("❌ All keys exhausted or failed.")
        return ""

## Prompting Methods
These ones take in a base model and does some prompting stuff with it.

In [8]:
class SelfVerificationModel(Model):
    def __init__(self, model: Model):
        self.model = model

    def generate(self, prompt):

        failed = True

        while failed:
            print("💡 Asking questions")
            response = self.model.generate(prompt)
            is_model_sure_response = self.model.generate(
                f"You are a grader. Verify if the following response to the question is correct. If the answer is correct, say yes. Otherwise, say no.\nQuestion: {prompt}\nAnswer: {response}"
            )

            print("🤖 Model response:", response)
            print("🤓 Model sure response:", is_model_sure_response)

            if "yes" in is_model_sure_response.lower():
                print("✅ Model is sure about the answer.")
                failed = False
            else:
                print("❌ Model is not sure. Retrying...")


        return response

You retrieve elements in each dataset like this:

In [9]:
dataset = MiniEvalDataset()
display(dataset[0]["answers"], dataset[0]["documents"], dataset[10]["category"])


[{'file_name': '2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt',
  'perturbation': [{'type': 'Ambiguities - In Text Contradiction',
    'original_text': '(c) "CUSTOMERS" means all users who access Co-Branded Site.',
    'changed_text': '(c) "CUSTOMERS" means all users who access Co-Branded Site and complete at least one transaction per month.',
    'explanation': "The original definition of 'Customers' is broad, encompassing all users of the Co-Branded Site. The modified definition adds a requirement of completing at least one transaction per month, creating a narrower and conflicting definition. This ambiguity could lead to disputes regarding who qualifies as a 'Customer' for purposes of marketing reports, promotional discounts, or other benefits.",
    'location': '1(c)'},
   {'type': 'Ambiguities - In Text Contradiction',
    'original_text': '8.1 TERM.  The term of this Agreement shall continue for one (1) year following the Launch

'CO-BRANDING AND ADVERTISING AGREEMENT THIS CO-BRANDING AND ADVERTISING AGREEMENT (the "Agreement") is made as of June 21, 1999 (the "Effective Date") by and between I-ESCROW, INC., with its principal place of business at 1730 S. Amphlett Blvd., Suite 233, San Mateo, California 94402 ("i-Escrow"), and 2THEMART.COM, INC. having its principal place of business at 18301 Von Karman Avenue, 7th Floor, Irvine, California 92612 ("2TheMart"). 1. DEFINITIONS. (a) "CONTENT" means all content or information, in any medium, provided by a party to the other party for use in conjunction with the performance of its obligations hereunder, including without limitation any text, music, sound, photographs, video, graphics, data or software. Content provided by 2TheMart is referred to herein as "2TheMart Content" and Content provided by i-Escrow is referred to herein as "i-Escrow Content." (b) "CO-BRANDED SITE" means the web-site accessible through Domain Name, for the Services implemented by i-Escrow. Th

'misaligned_terminalogy_inText'

**You check the length like this:**

In [10]:
len(dataset)
print(dataset[5]["file_name"])

inconsistencies_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt


**Helper functions:**

In [11]:
def clean_and_parse_model_response(raw_response):
    raw_response = raw_response.strip().strip("`")
    if raw_response.startswith("json"):
        raw_response = raw_response[4:].strip()

    try:
        parsed = json.loads(raw_response)
    except json.JSONDecodeError as e:
        print("Failed to parse JSON:", e)
        return None

    return parsed


def add_section_identified_flag(predictions, ground_truth_perturbations):
    gt_locations = {p["location"].strip() for p in ground_truth_perturbations}
    gt_changed_texts = [p["changed_text"] for p in ground_truth_perturbations]

    for pred in predictions:
        # LOCATION MATCH
        pred_loc = pred.get("location", "").strip()
        pred["location_match"] = pred_loc in gt_locations

        # TEXT MATCH (check if model's reponse for 'section' matches what was perturbed)
        pred_section = pred.get("section", "").strip()
        pred["text_match"] = any(pred_section in gt_text or gt_text in pred_section for gt_text in gt_changed_texts)

    return predictions

def few_shot_samples(category, num_examples=1):
    answers_dir = os.path.join("mini-eval", "training-answers", category)
    documents_dir = os.path.join("mini-eval", "training-documents", category)

    # Match all training answers
    json_files = sorted(glob.glob(os.path.join(answers_dir, "perturbed_*.txt.json")))
    if len(json_files) < num_examples:
        raise ValueError(f"Requested {num_examples} examples, but found only {len(json_files)} in {answers_dir}")

    prompts = []

    for i in range(num_examples):
        json_path = json_files[i]

        # Strip "perturbed_" prefix and ".txt.json" suffix
        full_filename = os.path.basename(json_path)
        base_name = full_filename.replace("perturbed_", "").replace(".txt.json", "")

        # Reconstruct document path
        txt_path = os.path.join(documents_dir, f"modified_{base_name}.txt.txt")

        if not os.path.exists(txt_path):
            raise FileNotFoundError(f"Missing matching document file: {txt_path}")

        # Load perturbed document text
        with open(txt_path, "r", encoding="utf-8") as f:
            perturbed_text = f.read().strip()

        # Load perturbed answer JSON
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Get first 3 original_texts
        originals = []
        for entry in data:
            for pert in entry.get("perturbation", []):
                originals.append(pert["original_text"])
                if len(originals) >= 3:
                    break
            if len(originals) >= 3:
                break

        while len(originals) < 3:
            originals.append("[NO EXAMPLE]")

        # Format few-shot example
        example_prompt = f"""
This is a perturbed document:
{perturbed_text}

The text enclosed within the <*$p$*><*$p$*> tags are modified pieces of text.
Here are the original texts:
1. {originals[0]}
2. {originals[1]}
3. {originals[2]}
        """.strip()

        prompts.append(example_prompt)

    return "\n\n---\n\n".join(prompts)

### Implementation of `generate_responses`

In [12]:
def generate_responses(model, dataset, prompt: str, output_dir, num_responses: int = 1):
    for sample in tqdm.tqdm(dataset, desc="Processing samples"):
        # Prepare base directory and document text
        base_name = sample["file_name"]
        category = sample["category"]
        # print(category)
        document_with_tags_removed = sample["documents"].replace("<*$p$*>", "") 
        ground_truth = sample["answers"][0]["perturbation"]

        for i in range(num_responses):
            # Construct output path: outputs/self_consistency/<subdir>/<filename>_i.json
            subdir = os.path.join(output_dir, "self_consistency", os.path.dirname(base_name))
            os.makedirs(subdir, exist_ok=True)
            output_path = os.path.join(subdir, os.path.basename(base_name) + f"_{i}.json")

            # Skip if file already exists
            if os.path.exists(output_path):
                continue

            # Generate model response
            model_response = model.generate(
                prompt.replace("[DOCUMENT]", document_with_tags_removed)
            )

            # Fill placeholders
            filled_prompt = prompt.replace("[DOCUMENT]", document_with_tags_removed)
            if "[FEW SHOT PLACEHOLDER]" in filled_prompt:
                few_shot = few_shot_samples(category)
                filled_prompt = filled_prompt.replace("[FEW SHOT PLACEHOLDER]", few_shot)

            model_response = model.generate(filled_prompt)

            parsed_response = clean_and_parse_model_response(model_response)

            if parsed_response:
                updated_predictions = add_section_identified_flag(parsed_response, ground_truth)
                with open(output_path, "w", encoding="utf-8") as f:
                    json.dump(updated_predictions, f, indent=4)

### Implementation of `explanation_match`

In [13]:
def explanation_match(evaluation_model: Model, dataset, responses_dir):
    for sample in tqdm.tqdm(dataset, desc="Evaluating explanations"):
        file_name = sample["file_name"]
        
        # Normalize and split into subdir + base filename (fixes Windows paths)
        normalized_path = os.path.normpath(file_name)
        subdir = os.path.dirname(normalized_path).replace("\\", "/")
        base_filename = os.path.basename(normalized_path).replace(".json", "")

        # Match all _i.json variant files for this sample
        pattern = os.path.join(responses_dir, "self_consistency", subdir, f"{base_filename}_*.json")
        response_paths = sorted(glob.glob(pattern))

        if not response_paths:
            print(f"❌ No response files found for: {file_name}")
            continue

        # Extract GT explanations
        gt_explanations = [
            p["explanation"].strip()
            for p in sample["answers"][0]["perturbation"]
            if "explanation" in p
        ]

        for response_path in response_paths:
            with open(response_path, "r", encoding="utf-8") as f:
                try:
                    model_preds = json.load(f)
                except json.JSONDecodeError as e:
                    print(f"❌ JSON decode error in {response_path}: {e}")
                    continue

            updated = False
            for pred in model_preds:
                if "explanation_match" in pred:
                    continue

                model_exp = pred.get("explanation", "").strip()
                if not model_exp:
                    pred["explanation_match"] = False
                    updated = True
                    continue

                match_found = False
                for gt_exp in gt_explanations:
                    prompt = f"""
You are evaluating whether the following model explanation captures the **same core reasoning** as the human (ground truth) explanation.

Ground Truth Explanation:
"{gt_exp}"

Model Explanation:
"{model_exp}"

Does the model explanation capture the same core reasoning as the ground truth explanation, even if phrased differently?

Answer "yes" or "no" only.
                    """.strip()

                    print(f"\n📄 Evaluating: {response_path}")
                    print(f"GT: {gt_exp}")
                    print(f"Model: {model_exp}")

                    try:
                        response = evaluation_model.generate(prompt)
                        result_text = response.strip().lower()
                        print(f"LLM response: {result_text}")

                        if "yes" in result_text:
                            match_found = True
                            break

                    except ResourceExhausted as e:
                        print(f"⚠️ Rate limit hit: {e}")
                        print("⏳ Sleeping for 40 seconds...")
                        time.sleep(40)
                        continue

                    except Exception as e:
                        print(f"⚠️ Unexpected error: {e}")
                        break

                    time.sleep(1.5)

                pred["explanation_match"] = match_found
                updated = True

            if updated:
                with open(response_path, "w", encoding="utf-8") as f:
                    json.dump(model_preds, f, indent=4)
                print(f"✅ Updated explanation_match in: {response_path}")
            else:
                print(f"⚠️ Skipped (no update needed): {response_path}")

## `evaluate_scoring`

In [14]:
def evaluate_scoring(responses_dir):
    scores = defaultdict(lambda: {
        "total": 0,
        "correct": 0,
        "text_matches": 0,
        "explanation_matches": 0
    })

    for root, _, files in os.walk(responses_dir):
        if not files:
            continue

        # Get the name of the subdirectory (e.g., "ambiguity")
        subdir = os.path.basename(root)

        for file in files:
            if not file.endswith(".json"):
                continue

            file_path = os.path.join(root, file)
            # print(file_path)
            with open(file_path, "r", encoding="utf-8") as f:
                try:
                    predictions = json.load(f)
                except json.JSONDecodeError:
                    print(f"❌ Skipping malformed JSON: {file_path}")
                    continue

            for pred in predictions:
                if not isinstance(pred, dict):
                    continue
                if "text_match" in pred and "explanation_match" in pred:
                    scores[subdir]["total"] += 1
                    if pred["text_match"] and pred["explanation_match"]:
                        scores[subdir]["correct"] += 1
                    if pred["text_match"]:
                        scores[subdir]["text_matches"] += 1
                    if pred["explanation_match"]:
                        scores[subdir]["explanation_matches"] += 1

    for subdir, stats in scores.items():
        total = stats["total"]
        if total == 0:
            continue
        print(f"\n📁 Directory: {subdir}")
        print(f"Text Match: {stats['text_matches']} / {total}")
        print(f"Explanation Match: {stats['explanation_matches']} / {total}")
        print(f"Text + Explanation Match: {stats['correct']} / {total}")

    return {
        subdir: {
            "text_matches": stats["text_matches"],
            "explanation_matches": stats["explanation_matches"],
            "correct": stats["correct"],
            "total": stats["total"]
        }
        for subdir, stats in scores.items()
    }

In [15]:
def run(
    model: Model,
    dataset: Dataset,
    prompt: str,
    responses_dir: str,
    num_responses: int,
    evaluation_model: Model = None
):
    """
    Runs the evaluation process.
    :param model: The model to generate responses.
    :param dataset: The dataset to evaluate.
    :param prompt: The prompt to use for generating responses.
    :param responses_dir: Directory to save the responses.
    :param num_responses: The number of responses to collect per document (for self-consistency)
    :param evaluation_model: Model for evaluating model responses.
    """
    generate_responses(model, dataset, prompt, responses_dir, num_responses)
    explanation_match(evaluation_model, dataset, responses_dir)
    return evaluate_scoring(responses_dir)

#### Base Instruction

In [16]:
INSTRUCTIONS = """You are a legal contract expert and know how to check legal documents properly and find any discrepancies or contradictions within a file. You are also aware of all state and national laws when it comes to legal docuements.
The file is a legal document and you are to check for any discrepancies or contradictions within the file.
There are 10 categories when it comes to discrepancies or contradictions:
1. Ambiguity in text - Ambiguities in text occur when key terms are **inconsistently defined within the document itself**, creating internal contradictions. This type of **in-text contradiction** confuses contract enforcement by allowing multiple interpretations of the same term in different sections, leading to potential legal disputes over meaning.
2. Ambiguity in legal terms - Ambiguities in legal terms occur when a legal statement is vague, leading to multiple interpretations. A **legal contradiction** under this category happens when an obligation is introduced ambiguously, making it difficult to enforce under state or national law. This can result in non-compliance with regulatory requirements, leaving legal obligations open to dispute.
3. Inconsistencies in text - Inconsistencies in text also lead to **in-text contradictions** when **different sections of a contract provide conflicting deadlines, obligations, or penalties**. This creates ambiguity regarding which terms should be enforced, leading to disputes over contractual obligations.
4. Inconsistencies in legal terms - Inconsistencies in legal terms arise when **time-sensitive obligations** in a contract do not align with legal requirements. A **legal contradiction** in this category happens when a contract sets **a deadline or requirement that violates federal or state law**, making the contractual terms unenforceable or illegal.
5. Misaligned in text - Misaligned terminology leads to **in-text contradictions** when the contract **uses multiple terms interchangeably without defining them**, leading to conflicting obligations.
6. Misaligned in legal terms - Inconsistencies arise when **time-sensitive obligations** in a contract do not align with legal requirements. A **legal contradiction** in this category happens when a contract sets **a deadline or requirement that violates federal or state law**, making the contractual terms unenforceable or illegal.
7. Omission in text - Omissions also cause **in-text contradictions** when a **key contractual clause is removed**, but **other sections still reference it**, creating an internal contradiction.
8. Omission in legal terms - Omissions occur when a contract **removes essential information**, creating legal loopholes. A **legal contradiction** in this category happens when a contract omits **a legally mandated consumer protection**, making it non-compliant.
9. Structural Flaws in text - this means that the text is not structured properly and does not make sense.
10. Structural Flaws in legal terms - this means that the legal terms used in the text are not structured properly and do not make sense.

Instructions:
1. Read the file and look for the text enclosed between the tags "<*$p$*>" within the file.
2. Provide a detailed explanation of why this is a discrepancy or contradiction.
3. Provide the section where the discrepancy or contradiction exists.
4. Provide the section location. Example: Section 5.4.                                    
5. Categorize the discrepancy or contradiction into one of the 10 categories above (return the number of the category).
There are 2-3 contradictions in each text.

Return the results in json format. Example:
[{
    "section": "Sponsor shall pay Club the Annual Fee for each Contract Year of this Agreement in six (6) equal installments, each\ndue on or prior to the 1st of each month between June and November of the applicable Contract Year."
    "explanation": "This change introduces a contradiction regarding the payment deadline. Section 3(a) states that all installments are due by November 1st, but the added sentence allows the final payment to be made as late as December 15th without penalty. This creates ambiguity as to the actual deadline for the final installment and whether late fees would apply between November 2nd and December 15th."
    "location": "Section 5.2"
    "category": 3
}]
"""

### **Chain-of-thought Prompt**

In [17]:
COT = "Make your explanations as detailed as possible and show your reasoning."

### **Zero-shot prompt**

In [18]:
zero_shot_prompt = f"""{INSTRUCTIONS}
This is the document:
[DOCUMENT]
"""

### **Few-shot prompt**

#### ⚠️ **TODO: Describe the few-shot**
- Feed 1 entire document with tokens to the LLM for the few-shot. Describe that what's enclosed in the token is perturbed text.
- Show the correct un-perturbed text. Not the doc.
- Keep in mind each perturbed doc has 3 tokened parts.
- Keep it category specific.

### Prompt design:
- Keep the `INSTRUCTION` variable
- Feed the whole [Perturbed Document] Tell the LLM the text enclosed in the <*$p$*><*$p$*> tokens are the perturbed parts
- [The the line Original Unperturbed Document] - Take it from the .json
- Then feed it the test document `[DOCUMENT]` and tell it to answer
- Keep it category specific

In [19]:
def few_shot_placeholder(cot=True):
    if cot:
        return f"[FEW SHOT PLACEHOLDER]\n{INSTRUCTIONS}\Here's a document I want you to evaluate as per the instructions: [DOCUMENT]\nAnswer:\n{COT}"
    return f"[FEW SHOT PLACEHOLDER]\n{INSTRUCTIONS}\Here's a document I want you to evaluate as per the instructions: [DOCUMENT]\nAnswer:"

# print(few_shot_placeholder(True))

In [20]:
fs = few_shot_samples('structural_flaws_inText')
print(fs)

This is a perturbed document:
Exhibit 10.11 ***Certain portions of this exhibit have been omitted based on a request for confidential treatment pursuant to Rule 24b-2 under the Securities Exchange Act of 1934, as amended. The omitted portions have been filed separately with the Securities and Exchange Commission. STRATEGIC ALLIANCE AGREEMENT This Strategic Collaboration Agreement ("Agreement"), effective as of the 23rd day of September, 2016 ("Effective Date"), is entered into by and between The University of Texas M. D. Anderson Cancer Center, with a place of business located at 1515 Holcombe Blvd., Houston, TX 77030, USA ("MD Anderson"), a member institution of The University of Texas System ("System") and Adaptimmune LLC, with a place of business located at 2001 Market Street, Philadelphia, PA 1903, USA ("Adaptimmune"); and Adaptimmune Limited, with a place of business at 101 Milton Park, Abingdon, Oxfordshire, OX14 4RY ("Adaptimmune Limited") (MD Anderson and Adaptimmune each a "Pa

In [21]:
few_shot_prompt = INSTRUCTIONS + """

Question:
Section 1.1 Ambiguities - In Text Contradiction: Adaptimmune shall have responsibility for IND filing and monitoring unless otherwise agreed by JSC.
Section 2.9 Ambiguities - In Text Contradiction: MD Anderson and Adaptimmune will promptly notify each other upon identifying any aspect of a Protocol, including information discovered during site monitoring visits, or Study results that may adversely affect the safety, well-being, or medical care of the Study subjects, or that may affect the willingness of Study subjects to continue participation in a Study, influence the conduct of the Study, or that may alter the IRB's approval to continue the Study.
Section 8.3 Ambiguities - In Text Contradiction: The Parties agree that any termination of a Study Order shall allow for: (i) the wind down of the Study to ensure the safety of Study subjects; and (ii) Adaptimmune's final reconciliation of Data related to the Study in addition to Adaptimmune's final monitoring visit.

Answer:
[
  {
    "section": "Adaptimmune shall have responsibility for IND filing and monitoring unless otherwise agreed by JSC.",
    "explanation": "This change introduces a contradiction regarding the responsibility for IND monitoring. The original text assigns it to Adaptimmune unless the JSC decides otherwise. The modified version definitively assigns monitoring to MD Anderson, creating a conflict if the JSC makes a different decision later, or if other sections assume Adaptimmune's monitoring role.",
    "location": "1.1",
    "category": 1
  },
  {
    "section": "MD Anderson and Adaptimmune will promptly notify each other upon identifying any aspect of a Protocol, including information discovered during site monitoring visits, or Study results that may adversely affect the safety, well-being, or medical care of the Study subjects, or that may affect the willingness of Study subjects to continue participation in a Study, influence the conduct of the Study, or that may alter the IRB's approval to continue the Study.",
    "explanation": "This edit creates conflicting requirements for reporting adverse findings. Previously, both parties were responsible for mutual notification. Now, Adaptimmune's notification to MD Anderson is limited to data results from *Adaptimmune's* monitoring. If MD Anderson discovers issues through their own oversight, it's unclear if Adaptimmune should be notified, creating uncertainty in communication and potential safety oversight.",
    "location": "2.9",
    "category": 1
  },
  {s
    "section": "The Parties agree that any termination of a Study Order shall allow for: (i) the wind down of the Study to ensure the safety of Study subjects; and (ii) Adaptimmune's final reconciliation of Data related to the Study in addition to Adaptimmune's final monitoring visit.",
    "explanation": "Conflicting responsibilities are defined, at termination. Data reconcilation responsibilities is given to both parites.",
    "location": "8.3",
    "category": 1
  }
]

Question:
Section 5.1 Inconsistencies - In Text Contradiction: The term of this Agreement shall commence on the Effective Date and continue for six (6) months.
Section 3.2 Inconsistencies - In Text Contradiction: Detto shall pay to PivX for each unit of Qwik-Fix Pro distributed hereunder per copy fees (the "Per Copy Fees") as governed by the terms set forth in Exhibit A. In the event that PivX changes the Third Party prices, Per Copy Fees based on such changed prices shall apply to any order for Qwik-Fix Pro received by PivX after the effective date of the increase. PivX shall provide Detto with at least forty-five (45) days written notice of any increase in the Per Copy Fees.
Section 1.3 Inconsistencies - In Text Contradiction: PivX grants Detto access to PivX ChannelNet as governed by the terms set forth in Exhibit A.

Answer:
[
  {
    "section": "The term of this Agreement shall commence on the Effective Date and continue for six (6) months.",
    "explanation": "The original text specifies a fixed six-month term. The modified text introduces an automatic renewal clause, conflicting with the initial fixed term. This creates uncertainty about the agreement's duration and renewal conditions.",
    "location": "5.1",
    "category": 3
  },
  {
    "section": "Detto shall pay to PivX for each unit of Qwik-Fix Pro distributed hereunder per copy fees (the \"Per Copy Fees\") as governed by the terms set forth in Exhibit A. In the event that PivX changes the Third Party prices, Per Copy Fees based on such changed prices shall apply to any order for Qwik-Fix Pro received by PivX after the effective date of the increase. PivX shall provide Detto with at least forty-five (45) days written notice of any increase in the Per Copy Fees.",
    "explanation": "The original clause stipulates a 45-day notice period for Per Copy Fee increases. The modified text allows immediate price changes, creating a direct contradiction. This introduces uncertainty for Detto regarding pricing stability and predictability, potentially leading to disputes.",
    "location": "3.2",
    "category": 3
  },
  {
    "section": "PivX grants Detto access to PivX ChannelNet as governed by the terms set forth in Exhibit A.",
    "explanation": "Originally, access to PivX ChannelNet is presented as governed by Exhibit A, implying some defined terms and conditions. The change introduces an arbitrary right for PivX to terminate access at any time, contradicting the idea of governance by Exhibit A and creating uncertainty regarding the reliability and scope of Detto's access.",
    "location": "1.3",
    "category": 3
  }
]

Question:
Section 1.3 Misaligned Terminology - In Text Contradiction: Adaptimmune agrees to commit funding in an amount of at least nineteen million six hundred and forty four thousand Dollars US ($19,644,000) for the performance of the Studies as set out in Exhibit I during the term ("Alliance Funding"). The JSC may allocate and/or re-allocate funds to Studies as necessary and agreed by JSC.
Section 8.3 Misaligned Terminology - In Text Contradiction: The Parties agree that any termination of a Study Order shall allow for: (i) the wind down of the Study to ensure the safety of Study subjects; and (ii) Adaptimmune's final reconciliation of Data related to the Study in addition to Adaptimmune's final monitoring visit. All reasonable fees associated with the wind-down activities and final monitoring visit shall be paid by Adaptimmune, to the extent not covered by Alliance Funding.
Section 10.1 Misaligned Terminology - In Text Contradiction: Adaptimmune shall assume responsibility for reasonable medical expenses incurred by a Study subject for reasonable and necessary treatment if the Study subject experiences an illness, adverse event or injury that is a result of the Study Drug or any procedure required by the Protocol that the subject would not have undergone were it not for such Study subject's participation in the Study. Adaptimmune shall not be responsible for expenses to the extent that they are due to pre-existing medical conditions, underlying disease, or the negligence or intentional misconduct or due to breach of this Agreement by MD Anderson or Principal Investigator.

Answer:
[
  {
    "section": "Adaptimmune agrees to commit funding in an amount of at least nineteen million six hundred and forty four thousand Dollars US ($19,644,000) for the performance of the Studies as set out in Exhibit I during the term (\"Alliance Funding\"). The JSC may allocate and/or re-allocate funds to Studies as necessary and agreed by JSC.",
    "explanation": "This introduces ambiguity and contradiction by changing 'allocate and/or re-allocate funds to Studies as necessary and agreed by JSC' to 'adjust funds to Studies as necessary', granting the JSC unilateral control without requiring agreement from both parties. This creates uncertainty about whether Adaptimmune has a say in how the funds are spent.",
    "location": "1.3",
    "category": 5
  },
  {
    "section": "The Parties agree that any termination of a Study Order shall allow for: (i) the wind down of the Study to ensure the safety of Study subjects; and (ii) Adaptimmune's final reconciliation of Data related to the Study in addition to Adaptimmune's final monitoring visit. All reasonable fees associated with the wind-down activities and final monitoring visit shall be paid by Adaptimmune, to the extent not covered by Alliance Funding.",
    "explanation": "This change creates a contradiction regarding who is responsible for the fees associated with winding down a study. The original text states that Adaptimmune pays 'all reasonable fees...to the extent not covered by Alliance Funding', whereas the modified text states that all fees are split equally by Adaptimmune and MD Anderson 'regardless of Alliance Funding.' This uncertainty could lead to disputes.",
    "location": "8.3",
    "category": 5
  },
  {
    "section": "Adaptimmune shall assume responsibility for reasonable medical expenses incurred by a Study subject for reasonable and necessary treatment if the Study subject experiences an illness, adverse event or injury that is a result of the Study Drug or any procedure required by the Protocol that the subject would not have undergone were it not for such Study subject's participation in the Study. Adaptimmune shall not be responsible for expenses to the extent that they are due to pre-existing medical conditions, underlying disease, or the negligence or intentional misconduct or due to breach of this Agreement by MD Anderson or Principal Investigator.",
    "explanation": "This modifies 'shall assume responsibility' to 'may, at its sole discretion, cover', which introduces ambiguity regarding Adaptimmune's obligation to cover medical expenses. In original text, it's a requirement but in the changed one, it is up to Adaptimmune's decision. This contradiction creates uncertainty about patient care costs.",
    "location": "10.1",
    "category": 5
  }
]

Question:
Section 7.2 Omissions - In Text Contradiction: Patient records, research notebooks, all original source documents, Protected Health Information (as such term is defined by HIPAA), MD Anderson's business records, regulatory and compliance documents, original medical records or any information required to be maintained by MD Anderson in accordance with Applicable Laws, that is generated in the conduct of the Studies (collectively, "MD Anderson Records") will be owned by MD Anderson. All results, data and work product (excluding MD Anderson Records) generated in the conduct of the Studies ("Data") shall be owned by Adaptimmune Limited. MD Anderson shall maintain all such Data as confidential, subject to the publication rights granted in Section 12 below. Data will be promptly disclosed by MD Anderson to Adaptimmune in the form of a Study report or as otherwise reasonably requested by Adaptimmune. Notwithstanding any other provision of this Agreement, MD Anderson shall have the right to use results and Data of the Study for its internal research, academic, and patient care purposes and for publication in accordance with Section 12 below, save that no right or license is granted to MD Anderson under any of Adaptimmune's Background IP. Adaptimmune shall promptly disclose any Data it generates to MD Anderson.
Section 8.3 Omissions - In Text Contradiction: A Party may terminate a Study Order: (a) if the other Party commits a material breach of this Agreement or the Study Order and fails to cure such breach within thirty (30) days of receiving notice from the non-breaching Party of such breach; or (b) in the case of any Clinical Studies, due to health and safety concerns related to the Study Drug or procedures in the Study (including regulatory holds due to the health and safety of the Study Subjects); or (c) in the case of MD Anderson and in relation to any Clinical Studies, where IRB requests termination of any Study; or (d) in the case of Adaptimmune, *** set out in Section 1.2 above. The Parties agree that any termination of a Study Order shall allow for: (i) the wind down of the Study to ensure the safety of Study subjects; and (ii) Adaptimmune's final reconciliation of Data related to the Study in addition to Adaptimmune's final monitoring visit. All reasonable fees associated with the wind-down activities and final monitoring visit shall be paid by Adaptimmune, to the extent not covered by Alliance Funding. Termination of one or more Study Orders will not automatically result in the termination of this Agreement or termination of any other Study Orders. Upon termination of a Study Order, MD Anderson will immediately return (at Adaptimmune's cost) any Study Drugs provided by Adaptimmune for such Study as directed by Adaptimmune.
Section 12.2 Omissions - In Text Contradiction: Clinical Studies: In relation to any Clinical Study, Adaptimmune shall have the *** right to publish or publicly disclose any Data or results arising from such Clinical Study including where such publication arises from the submission of data and/or results to the regulatory authorities. Such right to publish shall not include any MD Anderson Records or any public health information protected by HIPAA or where any publication would be in breach of the Consent and/or Authorization. MD Anderson and/or Principal Investigator shall have the right to independently publish or publicly disclose, either in writing or orally, the Data and results of the Clinical Study/ies after the earlier of the (i) first publication (including any multi-site publication) of such Data and/or results; (ii) twelve (12) months after completion of any multi-site study encompassing any Study or if none, six (6) months after completion of Study. MD Anderson shall, at least thirty (30) days ahead of any proposed date for submission, furnish Adaptimmune with a written copy of the proposed publication or public disclosure. Within such thirty (30) day period, Adaptimmune shall review such proposed publication for any Confidential Information of Adaptimmune provided hereunder or patentable Data. Adaptimmune may also comment on such proposed publication and MD Anderson shall consider such comments in good faith during the aforementioned thirty (30) day period. MD Anderson and/or Principal Investigator shall remove Confidential Information of Adaptimmune provided hereunder that has been so identified (other than Data or Study results), provided that Adaptimmune agrees to act in good faith when requiring the deletion of Adaptimmune Confidential Information. In addition Adaptimmune may request delay of publication for a period not to exceed *** (***) days from the date of receipt of request by MD Anderson, to permit Adaptimmune or Adaptimmune Limited or any Joint Research Partner to file patent applications or to otherwise seek to protect any intellectual property rights contained in such publication or disclosure. Upon such request, MD Anderson shall delay such publication until the relevant protection is filed up to a maximum of *** (***) days from date of receipt of request for delay by MD Anderson.

Answer:
[
  {
    "section": "Patient records, research notebooks, all original source documents, Protected Health Information (as such term is defined by HIPAA), MD Anderson's business records, regulatory and compliance documents, original medical records or any information required to be maintained by MD Anderson in accordance with Applicable Laws, that is generated in the conduct of the Studies (collectively, \"MD Anderson Records\") will be owned by MD Anderson. All results, data and work product (excluding MD Anderson Records) generated in the conduct of the Studies (\"Data\") shall be owned by Adaptimmune Limited. MD Anderson shall maintain all such Data as confidential, subject to the publication rights granted in Section 12 below. Data will be promptly disclosed by MD Anderson to Adaptimmune in the form of a Study report or as otherwise reasonably requested by Adaptimmune. Notwithstanding any other provision of this Agreement, MD Anderson shall have the right to use results and Data of the Study for its internal research, academic, and patient care purposes and for publication in accordance with Section 12 below, save that no right or license is granted to MD Anderson under any of Adaptimmune's Background IP. Adaptimmune shall promptly disclose any Data it generates to MD Anderson.",
    "explanation": "By removing the section 'Protected Health Information (as such term is defined by HIPAA)', this creates an uncertainty on how to treat this information as it could now be considered 'Data', which is owned by Adaptimmune Limited. This would make the treatment of PHI ambiguous and could lead to breaches of HIPAA if Adaptimmune treats it as regular data.",
    "location": "7.2",
    "category": 7
  },
  {
    "section": "A Party may terminate a Study Order: (a) if the other Party commits a material breach of this Agreement or the Study Order and fails to cure such breach within thirty (30) days of receiving notice from the non-breaching Party of such breach; or (b) in the case of any Clinical Studies, due to health and safety concerns related to the Study Drug or procedures in the Study (including regulatory holds due to the health and safety of the Study Subjects); or (c) in the case of MD Anderson and in relation to any Clinical Studies, where IRB requests termination of any Study; or (d) in the case of Adaptimmune, *** set out in Section 1.2 above. The Parties agree that any termination of a Study Order shall allow for: (i) the wind down of the Study to ensure the safety of Study subjects; and (ii) Adaptimmune's final reconciliation of Data related to the Study in addition to Adaptimmune's final monitoring visit. All reasonable fees associated with the wind-down activities and final monitoring visit shall be paid by Adaptimmune, to the extent not covered by Alliance Funding. Termination of one or more Study Orders will not automatically result in the termination of this Agreement or termination of any other Study Orders. Upon termination of a Study Order, MD Anderson will immediately return (at Adaptimmune's cost) any Study Drugs provided by Adaptimmune for such Study as directed by Adaptimmune.",
    "explanation": "The section '(d) in the case of Adaptimmune, *** set out in Section 1.2 above' contains an omission already in the original contract. By removing the asterisks, it suggests that Adaptimmune does have a unilateral right to terminate, but the grounds are unknown because the text is still omitted as the text references Section 1.2 above. Thus Section 1.2 is an essential condition for Adaptimmune to terminate, but the details are omitted.",
    "location": "8.3",
    "category": 7
  },
  {
    "section": "Clinical Studies: In relation to any Clinical Study, Adaptimmune shall have the *** right to publish or publicly disclose any Data or results arising from such Clinical Study including where such publication arises from the submission of data and/or results to the regulatory authorities. Such right to publish shall not include any MD Anderson Records or any public health information protected by HIPAA or where any publication would be in breach of the Consent and/or Authorization. MD Anderson and/or Principal Investigator shall have the right to independently publish or publicly disclose, either in writing or orally, the Data and results of the Clinical Study/ies after the earlier of the (i) first publication (including any multi-site publication) of such Data and/or results; (ii) twelve (12) months after completion of any multi-site study encompassing any Study or if none, six (6) months after completion of Study. MD Anderson shall, at least thirty (30) days ahead of any proposed date for submission, furnish Adaptimmune with a written copy of the proposed publication or public disclosure. Within such thirty (30) day period, Adaptimmune shall review such proposed publication for any Confidential Information of Adaptimmune provided hereunder or patentable Data. Adaptimmune may also comment on such proposed publication and MD Anderson shall consider such comments in good faith during the aforementioned thirty (30) day period. MD Anderson and/or Principal Investigator shall remove Confidential Information of Adaptimmune provided hereunder that has been so identified (other than Data or Study results), provided that Adaptimmune agrees to act in good faith when requiring the deletion of Adaptimmune Confidential Information. In addition Adaptimmune may request delay of publication for a period not to exceed *** (***) days from the date of receipt of request by MD Anderson, to permit Adaptimmune or Adaptimmune Limited or any Joint Research Partner to file patent applications or to otherwise seek to protect any intellectual property rights contained in such publication or disclosure. Upon such request, MD Anderson shall delay such publication until the relevant protection is filed up to a maximum of *** (***) days from date of receipt of request for delay by MD Anderson.",
    "explanation": "In the original text, there are asterisks indicating that some wording is omitted, which means that there is a term that is not defined. In the changed text, the asterisks have been removed from 'Adaptimmune shall have the *** right to publish'. By removing this asterisks, it implies that Adaptimmune can publish or publicly disclose any data, giving them full rights. However, the type or extent of these rights is not defined, creating uncertainty.",
    "location": "12.2",
    "category": 7
  }
]

Question: [DOCUMENT]
Answer:
"""

#### Zero-shot variations

In [22]:
# import os
# import threading



# Corrects path name such that it ignores path length limit and formats based on your OS definition
def correct_path_name(path):
    return r"\\?\{}".format(os.path.abspath(path))



# # Z | Z + SC
# run(

#     model=GeminiModel(API_KEYS),

#     dataset=MiniEvalDataset(),

#     prompt=zero_shot_prompt,

#     responses_dir=correct_path_name("mini-eval/responses/zero-shot/"),

#     num_responses=1,

#     evaluation_model=GeminiModel(API_KEYS),
# )


# # Z + COT | Z + COT + SC

# run(

#     model=GeminiModel(API_KEYS),

#     dataset=MiniEvalDataset(),

#     prompt=zero_shot_prompt + COT,

#     responses_dir=correct_path_name("mini-eval/responses/zero-shot-cot/"),

#     num_responses=1,

#     evaluation_model=GeminiModel(API_KEYS),
# )


# # Z + SV | Z + SV + SC

# run(

#     model=SelfVerificationModel(GeminiModel(API_KEYS)),

#     dataset=MiniEvalDataset(),

#     prompt=zero_shot_prompt,

#     responses_dir=correct_path_name("mini-eval/responses/zero-shot-self-verification/"),

#     num_responses=1,

#     evaluation_model=GeminiModel(API_KEYS),
# )


# # Z + COT + SV | Z + COT + SV + SC

# run(

#     model=SelfVerificationModel(GeminiModel(API_KEYS)),

#     dataset=MiniEvalDataset(),

#     prompt=zero_shot_prompt + COT,
#     responses_dir=correct_path_name(
#         "mini-eval/responses/zero-shot-self-verification-cot/"
#     ),

#     num_responses=1,

#     evaluation_model=GeminiModel(API_KEYS),
# )

In [None]:
runs = [
    {
        "name": "zero-shot",
        "model": GeminiModel(API_KEYS),
        "dataset": MiniEvalDataset(),
        "prompt": zero_shot_prompt,
        "responses_dir": correct_path_name("mini-eval/responses/zero-shot/"),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "name": "zero-shot-cot",
        "model": GeminiModel(API_KEYS),
        "dataset": MiniEvalDataset(),
        "prompt": zero_shot_prompt + COT,
        "responses_dir": correct_path_name("mini-eval/responses/zero-shot-cot/"),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "name": "zero-shot-self-verification",
        "model": SelfVerificationModel(GeminiModel(API_KEYS)),
        "dataset": MiniEvalDataset(),
        "prompt": zero_shot_prompt,
        "responses_dir": correct_path_name(
            "mini-eval/responses/zero-shot-self-verification/"
        ),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "name": "zero-shot-self-verification-cot",
        "model": SelfVerificationModel(GeminiModel(API_KEYS)),
        "dataset": MiniEvalDataset(),
        "prompt": zero_shot_prompt + COT,
        "responses_dir": correct_path_name(
            "mini-eval/responses/zero-shot-self-verification-cot/"
        ),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "name": "few-shot",
        "model": GeminiModel(API_KEYS),
        "dataset": MiniEvalDataset(),
        "prompt": few_shot_placeholder(cot=False),
        "responses_dir": correct_path_name("mini-eval/responses/few-shot/"),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "name": "few-shot-cot",
        "model": GeminiModel(API_KEYS),
        "dataset": MiniEvalDataset(),
        "prompt": few_shot_placeholder(cot=True),
        "responses_dir": correct_path_name("mini-eval/responses/few-shot-cot/"),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "name": "few-shot-self-verification",
        "model": SelfVerificationModel(GeminiModel(API_KEYS)),
        "dataset": MiniEvalDataset(),
        "prompt": few_shot_placeholder(cot=False),
        "responses_dir": correct_path_name(
            "mini-eval/responses/few-shot-self-verification/"
        ),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
    {
        "name": "few-shot-self-verification-cot",
        "model": SelfVerificationModel(GeminiModel(API_KEYS)),
        "dataset": MiniEvalDataset(),
        "prompt": few_shot_placeholder(cot=True),
        "responses_dir": correct_path_name(
            "mini-eval/responses/few-shot-self-verification-cot/"
        ),
        "num_responses": 1,
        "evaluation_model": GeminiModel(API_KEYS),
    },
]

import threading
from concurrent.futures import ThreadPoolExecutor

# Silence stdout and stderr
import sys
import os
import contextlib
import io


@contextlib.contextmanager
def suppress_output():
    with open(os.devnull, "w") as fnull:
        with contextlib.redirect_stdout(fnull), contextlib.redirect_stderr(fnull):
            yield


# Semaphore to limit the number of concurrent threads to the number of API keys
api_key_semaphore = threading.Semaphore(len(API_KEYS))

run_results = {}


def run_with_semaphore(run_config):
    """
    Wrapper function to run a task while respecting the semaphore.
    """
    with api_key_semaphore:
        run_results[run_config["name"]] = run(
                model=run_config["model"],
                dataset=run_config["dataset"],
                prompt=run_config["prompt"],
                responses_dir=run_config["responses_dir"],
                num_responses=run_config["num_responses"],
                evaluation_model=run_config["evaluation_model"],
            )


with ThreadPoolExecutor(max_workers=len(API_KEYS)) as executor:
    for run_config in runs:
        executor.submit(run_with_semaphore, run_config)

print("✅ DONE")

Processing samples:   0%|          | 0/25 [00:00<?, ?it/s]


[A[A[A

[A[A




[A[A[A[A[A



[A[A[A[A
[A





[A[A[A[A[A[A

💡 Asking questions
💡 Asking questions


Processing samples:  28%|██▊       | 7/25 [00:00<00:00, 69.47it/s]




[A[A[A[A[A





Processing samples:  60%|██████    | 15/25 [00:00<00:00, 69.34it/s]




[A[A[A[A[A





Processing samples:  92%|█████████▏| 23/25 [00:00<00:00, 72.84it/s]




Processing samples: 100%|██████████| 25/25 [00:00<00:00, 70.37it/s]
Processing samples: 100%|██████████| 25/25 [00:00<00:00, 69.81it/s]
Processing samples: 100%|██████████| 25/25 [00:00<00:00, 68.12it/s]






[A[A[A[A[A[A

⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_1.json
⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_2.json
⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX






Processing samples: 100%|██████████| 25/25 [00:00<00:00, 75.59it/s]


⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\ambiguity_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\ambiguity_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_1.json
⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification\self_consistency\ambiguity_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\ambiguity_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_2.json








Evaluating explanations:  20%|██        | 5/25 [00:00<00:00, 38.84it/s]




[A[A[A[A[A





[A[A[A[A[A[A

⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\ambiguity_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_3.json
⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\ambiguity_inText\ABILITYINC_06_15_2020-EX-4.25-SERVICESAGREEMENT.txt_4.json
⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\ambiguity_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_1.json
⚠️ Skipped (

Evaluating explanations:  40%|████      | 10/25 [00:00<00:00, 40.05it/s]




[A[A[A[A[A





Evaluating explanations: 100%|██████████| 25/25 [00:00<00:00, 77.27it/s]
Evaluating explanations: 100%|██████████| 25/25 [00:00<00:00, 80.11it/s]





[A[A[A[A[A

⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\inconsistencies_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_1.json
⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\inconsistencies_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_2.json
⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-self-verification-cot\self_consistency\misaligned_terminalogy_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\inconsistencies_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_3.json
⚠️ Skipped (no update needed): \\?\

Evaluating explanations: 100%|██████████| 25/25 [00:00<00:00, 55.55it/s]
Evaluating explanations: 100%|██████████| 25/25 [00:00<00:00, 60.49it/s]


⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot\self_consistency\structural_flaws_inText\2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-BrandingAgreement_AgencyAgreement.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\omissions_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_0.json
⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\omissions_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_1.json
⚠️ Skipped (no update needed): \\?\c:\Users\manim\ASU\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\responses\zero-shot-cot\self_consistency\omissions_inText\ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt_2.json
⚠️ Skipped (no update needed)


[A

🤖 Model response: ```json
[
  {
    "section": "8.1 TERM. The term of this Agreement shall continue for one (1) year following the Launch Date, unless earlier terminated as provided herein. This Agreement may be renewed for any number of successive one (1) year terms by mutual written agreement of the parties prior to the conclusion of the term of this Agreement. A party wishing to renew this Agreement shall give the other party notice thereof no less than thirty (30) days before the expiration of the term then in effect. A party wishing to renew this Agreement shall give the other party notice thereof no less than thirty (30) days before the expiration of the term then in effect. In the event that neither party expresses a desire to renew the agreement in writing, the term of this Agreement shall expire automatically.",
    "explanation": "The section on TERM contains a duplication of the sentence, \"A party wishing to renew this Agreement shall give the other party notice thereof no 



[A[A


[A[A[A

🤖 Model response: ```json
[
  {
    "section": "2.4 RESTRICTIONS ON COMMUNICATIONS. i-Escrow may place banner advertising on the Co-Branded Site upon prior written approval of 2TheMart, which shall be at the discretion of 2TheMart. ",
    "explanation": "The term 'discretion of 2TheMart' is open to interpretation and could be seen as ambiguous. There are no standards of discretion, and the term 'discretion' is not defined, which could lead to conflict regarding refusal for advertising on the Co-Branded Site.",
    "location": "Section 2.4",
    "category": 1
  },
  {
    "section": "5.1 ADVERTISING FEES. After the Launch Date, i-Escrow shall pay 2TheMart advertising fees based on the number of Transaction Inquiries. This advertising fees shall consist of a per Transaction Inquiry amount calculated by multiplying 0.025% by the amount of the average Transaction from all Customers in the preceding quarter. The formula for arriving at the per Transaction Inquiry amount may be revised from 


[A

⚠️ API key AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k exhausted. Switching...
⚠️ API key AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k exhausted. Switching...
🤖 Model response: ```json
[
  {
    "section": "1.1 Provision of Services. (a) Provider agrees to provide the Services set forth on the Exhibit A attached hereto (as such Exhibit may be amended or supplemented pursuant to the terms of this Agreement, the \"Exhibit\") to Recipient for the respective periods and on the other terms and conditions set forth in this Agreement and in the Exhibit. However, Provider is not obligated to respond to any request by Recipient for access to any additional services and resources that are necessary for the operation of the Recipient and which are not currently contemplated in the Exhibit, unless Provider deems it fit and feasible.",
    "explanation": "This creates an ambiguity regarding the Provider's obligation to provide services. The first sentence states the Provider agrees to provide services outli





[A[A[A[A

🤖 Model response: ```json
[
  {
    "section": "2.4 RESTRICTIONS ON COMMUNICATIONS. i-Escrow may place banner advertising on the Co-Branded Site upon prior written approval of 2TheMart, which shall be at the discretion of 2TheMart. All advertising revenue arising from the banner ads shall be solely i-Escrow's. i-Escrow shall not run banner advertisements on the Co-Branded Site for any of 2TheMart's competitors. 2TheMart shall provide in writing, a list of companies they would like to exclude, including every time they wish to change this list.",
    "explanation": "The wording \"at the discretion of 2TheMart\" regarding approval for banner advertising is ambiguous. While it grants 2TheMart the power to approve or disapprove i-Escrow's banner ads, it doesn't specify the criteria for approval or disapproval. This means 2TheMart could arbitrarily disapprove ads, potentially hindering i-Escrow's ability to generate revenue or promote its services. This section gives 2TheMart control over i



[A[A

⚠️ API key AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k exhausted. Switching...





[A[A[A

🤖 Model response: ```json
[
  {
    "section": "1.1 Provision of Services.",
    "explanation": "In section 1.1(a), the contract states that the Provider agrees to provide the services set forth in Exhibit A. However, the following sentence states that the Provider is not obligated to respond to any request by the Recipient for access to any additional services and resources. This could be a contradiction because, on the one hand, the provider agrees to provide services, but then they are not obligated to respond. This is also a structural flaw because the contract is structured in such a way that the provider can't agree to the provision of services.",
    "location": "Section 1.1",
    "category": 9
  },
  {
    "section": "1.2 Standard of Service.",
    "explanation": "In section 1.2(c) The contract states that Provider Representatives shall be dedicated to solely providing the Services to Recipient and shall not provide any such services or resources to Provider or any other custom


[A

🤖 Model response: ```json
[
  {
    "section": "Except as agreed upon by mutual consent, the Joint Venturers shall not be required to make any capital contribution to the Joint Venture.",
    "explanation": "This statement is ambiguous because it does not specify the conditions or process for reaching 'mutual consent.' It's unclear what happens if mutual consent is not reached, leading to uncertainty about the financial obligations of the Joint Venturers. It does not define how the lack of capital contribution will affect the business, or if it will be possible to conduct any operations with no capital contributions.",
    "location": "Section 3",
    "category": 1
  },
  {
    "section": "On any termination of the Joint Venture, its debt shall be paid or provided for in a manner deemed reasonable by a third-party mediator, regardless of the satisfaction of the Joint Venturers. Then, any unexpended portion of Joint Venture funds shall be distributed to the Joint Venturers in accordance



[A[A



[A[A[A[A

🤖 Model response: ```json
[
  {
    "section": "1.1 Provision of Services.",
    "explanation": "Section 1.1(a) states Provider agrees to provide the Services set forth on the Exhibit A attached hereto. However, Provider is not obligated to respond to any request by Recipient for access to any additional services and resources that are necessary for the operation of the Recipient and which are not currently contemplated in the Exhibit, unless Provider deems it fit and feasible. This creates ambiguity and contradiction. If the services are 'necessary', it implies a level of obligation. The clause 'unless Provider deems it fit and feasible' gives the provider a wide discretion to refuse necessary services, undermining the initial agreement to provide services.",
    "location": "Section 1.1",
    "category": 1
  },
  {
    "section": "1.2 Standard of Service.",
    "explanation": "Section 1.2(c) states Provider Representatives shall be dedicated to solely providing the Services to Recipi




[A[A[A

🤖 Model response: ```json
[
  {
    "section": "It is agreed that either Joint Venturer shall have authority to execute instruments of any character relating to the affairs of the Joint Venture, even those instruments that could incur liability or indebtedness for borrowed funds, dispose of assets, or purchase any items for the Joint Venture. The need for written consent or approval is only required for strategic partnerships with other organizations.",
    "explanation": "This section presents an ambiguity because it grants broad authority to either Joint Venturer to execute instruments, including those incurring significant liability, without requiring the other's consent. However, it then states that written consent is required for strategic partnerships. This implies that other significant actions, such as incurring debt or disposing of assets, do not require mutual consent, which contradicts the fundamental principle of a joint venture requiring shared control and decision-making.


[A

⚠️ API key AIzaSyCWI7QJXWYBGGWGdL37W8ll0sDIwz0zqlo exhausted. Switching...
⚠️ API key AIzaSyDgafwAgDi2Zjvu6jdt_SIZ60VgK1Na32E exhausted. Switching...
⚠️ API key AIzaSyCWI7QJXWYBGGWGdL37W8ll0sDIwz0zqlo exhausted. Switching...
⚠️ API key AIzaSyDgafwAgDi2Zjvu6jdt_SIZ60VgK1Na32E exhausted. Switching...
⚠️ API key AIzaSyCWI7QJXWYBGGWGdL37W8ll0sDIwz0zqlo exhausted. Switching...
⚠️ API key AIzaSyDgafwAgDi2Zjvu6jdt_SIZ60VgK1Na32E exhausted. Switching...






[A[A[A[A

⚠️ API key AIzaSyCWI7QJXWYBGGWGdL37W8ll0sDIwz0zqlo exhausted. Switching...
🤖 Model response: ```json
[
  {
    "section": "5. Powers and Authority of the Joint Venturers.",
    "explanation": "The text states that neither joint venturer is entitled to compensation for services rendered to the Joint Venture, but each joint venturer shall be reimbursed for all direct expenses incurred. However, the following sentence grants either Joint Venturer authority to execute instruments relating to the affairs of the Joint Venture, even those that could incur liability or indebtedness for borrowed funds. This creates a potential contradiction because incurring liability could be considered a service rendered, thus warranting compensation, or it could cause expenses which should be reimbursed.",
    "location": "Section 5",
    "category": 1
  },
  {
    "section": "5. Powers and Authority of the Joint Venturers.",
    "explanation": "This section grants each Joint Venturer full and complete charg


[A

🤖 Model response: ```json
[
  {
    "section": "4.1.2. Shipment. All shipments shall be made F.C.A. Port of Oakland, California, USA. Transfer of risk from Accuray to Distributor shall occur at such F.C.A. location as provided in F.C.A. terms and transfer of title shall occur at the same time. Distributor may request Accuray to use a particular freight carrier, and Accuray agrees to do so, if feasible. If not feasible in Accuray's reasonable judgment, then Accuray shall promptly advise Distributor of the reasons. If no such request is made, Accuray shall ship in accordance with any instructions contained in the Purchase Order or via FedEx ground, with no extra insurance. Accuray shall bill any actual freight costs to Distributor. Any supplementary shipping costs arising from the need to meet the delivery deadline set forth in the Purchase Order by way of expedited delivery shall be borne by Accuray, if such delivery deadline was at least six months after the submission of such Purchase



[A[A

🤖 Model response: ```json
[
  {
    "section": "2.3.3. Purchase. To purchase Products or Services based on a Quote provided by Accuray, Distributor will issue a purchase order, which shall include specific references to the quote number of such Quote (the \"Purchase Order\"). Accuray shall either accept or reject such Purchase Order within two weeks after receipt thereof, with any failure to approve or disapprove of such Purchase Order in such period constituting disapproval. Each purchase of Accuray Components and Interfaces shall be accomplished and a Purchase Order may be accepted by the execution of the Purchase Order by an authorized representative of Accuray. To the extent of any inconsistency between the Quote and the related Purchase Order, the terms and conditions of such Quote shall govern and Distributor acknowledges and agrees that Accuray shall not be bound by any terms, conditions or boilerplate language included in a Distributor purchase order submitted to Accuray. The P




[A[A[A

🤖 Model response: ```json
[
  {
    "section": "1.1. \"Accuray Regions\" means Accuray's sales regions specifically for CyberKnife products and services (as of the Effective Date) of the Americas (North America and South America), APAC (Asia Pacific, excluding Australia, India, and Japan), EIMEA (Europe, India, Middle East, and Africa), and Japan.",
    "explanation": "The definition of \"Accuray Regions\" excludes Australia, India, and Japan from APAC, but then includes Japan as a separate region. This is an in-text contradiction as Japan cannot be both excluded from and included as a region.",
    "location": "Section 1.1",
    "category": 3
  },
  {
    "section": "Accuray shall either accept or reject such Purchase Order within two weeks after receipt thereof, with any failure to approve or disapprove of such Purchase Order in such period constituting disapproval. Each purchase of Accuray Components and Interfaces shall be accomplished and a Purchase Order may be accepted by the ex


[A

⚠️ API key AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE exhausted. Switching...




[A[A

⚠️ API key AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE exhausted. Switching...
⚠️ API key AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE exhausted. Switching...
🤖 Model response: ```json
[
  {
    "section": "The Term of this Agreement shall be for a period of [* ****] years and [*****] months commencing the 1st day of September 2004 and terminating the [*****] day of [*****].",
    "explanation": "The contract period is not clearly defined due to redactions. This makes it impossible to determine the exact duration of the agreement and creates ambiguity regarding the obligations and rights of both parties over time.",
    "location": "Section 1",
    "category": 1
  },
  {
    "section": "CONSULTANT shall achieve a satisfactory record of play in a minimum of [* ****] professional golf association events on the SPGA and/or PGA tour",
    "explanation": "The minimum number of tournaments to be played is not clearly defined due to redactions. This creates ambiguity on the obligations for the consult




[A[A[A

🤖 Model response: ```json
[
  {
    "section": "TERM OF CONTRACT",
    "explanation": "The contract states that the term begins on September 1, 2004, and lasts for a specified number of years and months, but the exact termination date is redacted i.e. the [*****] day of [*****]. Without a defined end date, the actual duration and obligations of both parties are unclear, potentially leading to disputes about when the contract ends and when obligations cease.",
    "location": "Section 1",
    "category": 7
  },
  {
    "section": "DEFINITIONS",
    "explanation": "In the definition of \"MANDATORY PRODUCTS\", specific products are listed as what the CONSULTANT must exclusively play/use. However, the actual products are redacted i.e. [***** ]. This lack of clarity creates ambiguity regarding the consultant's obligations and could lead to disputes over compliance.",
    "location": "Section 2",
    "category": 1
  },
  {
    "section": "EXCLUSIVE USE OF PRODUCT",
    "explanation": "This s


[A



[A[A[A[A

🤖 Model response: ```json
[
  {
    "section": "1.1.  \"Accuray Regions\" means Accuray's sales regions specifically for CyberKnife products and services (as of the Effective Date) of the Americas (North America and South America), APAC (Asia Pacific, excluding Australia, India, and Japan), EIMEA (Europe, India, Middle East, and Africa), and Japan.",
    "explanation": "The definition of \"Accuray Regions\" excludes certain countries (Australia, India, and Japan) from the APAC region. However, Japan is also listed as a separate region. This creates an ambiguity because Japan cannot simultaneously be excluded from APAC and be its own separate region.",
    "location": "Section 1.1",
    "category": 1
  },
  {
    "section": "2.3.2. Quote. Following receipt of Distributor's Quote request, Accuray will determine whether to approve the issuance of a Quote related to such request. Such determination shall be made in accordance with and subject to the conditions set forth in Schedule 2.3.2 a



[A[A

🤖 Model response: ```json
[
  {
    "section": "Notwithstanding paragraphs 4A, 4B and 4C above, CONSULTANT shall be entitled to endorse and play the [*****]. The parties expressly agree that CONSULTANT may permit [*****] the use of CONSULTANT'S name and/or likeness in [*****] print and/or television advertisement provided that this is executed in a manner consistent with [*****] past [*****] advertising practice using similarly situated professional golfers with competing golf club endorsement agreements that include [*****]. If [*****] use of CONSULTANT varies from past practice of utilizing similarly situated professionals, CONSULTANT will take action to remedy the situation [*****]. If CONSULTANT participates directly in any [*****] sponsored photo shoots for advertising or PR, CONSULTANT shall wear his ADAMS GOLF [*****]. The parties further expressly agree that CONSULTANT'S endorsement of the [*****] shall not include the right to place a [*****] logo on his ADAMS GOLF [*****] or 


[A


[A[A[A

🤖 Model response: ```json
[
  {
    "section": "4.1 ADVERTISING FEES. After the Launch Date, i-Escrow shall pay 2TheMart advertising fees based on the number of Transaction Inquiries. This advertising fees shall consist of a per Transaction Inquiry amount calculated by multiplying 0.025% by the amount of the average Transaction from all Customers in the preceding quarter. The formula for arriving at the per Transaction Inquiry amount may be revised from time to time during the term of this Agreement to reflect present market conditions (\"the Adjusted Rate\"), unilaterally by 2TheMart without requiring i-Escrow's consent. The Adjusted Rate will be communicated to i-Escrow and will be effective immediately.",
    "explanation": "This clause allows 2TheMart to unilaterally revise the advertising fee formula without i-Escrow's consent. This creates an imbalance of power and could lead to disputes if the Adjusted Rate becomes unreasonable, making the financial obligations of i-Escrow unpre





[A[A[A[A

🤖 Model response: ```json
[
  {
    "section": "1. TERM OF CONTRACT",
    "explanation": "The term of the contract starts in September 1, 2004 and there is a missing end date due to redaction. This is an omission that makes it difficult to ascertain the duration of the contract, leading to uncertainty about the parties' obligations and rights over time.",
    "location": "Section 1",
    "category": 7
  },
  {
    "section": "4. EXCLUSIVITY OF ENDORSEMENT",
    "explanation": "Section 4 outlines restrictions on the Consultant's ability to endorse competing products, but subsection D creates an exception for a specific product ([*****]). However, the conditions under which the Consultant can endorse this product are vaguely defined, particularly regarding the consistency with past advertising practices. The ambiguity arises from the lack of specific criteria defining 'similarly situated professional golfers' and 'competing golf club endorsement agreements,' making it difficult to determ



[A[A

🤖 Model response: ```json
[
  {
    "section": "3.1 Termination of Agreement. This Agreement be deemed effective as of the Effective Date, Agreement and shall terminate on December 31, 2020, unless terminated earlier in accordance with Section 3.2.",
    "explanation": "The sentence structure in Section 3.1 is flawed and unclear. The phrase \"This Agreement be deemed effective as of the Effective Date, Agreement\" is grammatically incorrect and does not clearly state when the agreement becomes effective. The intended meaning is likely that the agreement is effective as of the Effective Date, but the wording is confusing. Additionally, the extra \"Agreement\" thrown into the middle of the text makes this confusing to read.",
    "location": "Section 3.1",
    "category": 9
  },
  {
    "section": "5.1 Indemnification. Provider shall indemnify, defend, and hold harmless Recipient and its officers, directors, employees, agents, affiliates, successors, and permitted assigns (collectively, 


[A

⚠️ API key AIzaSyAjby-dj9aBsolOdTDpvU7_x5uje8l4yiQ exhausted. Switching...





[A[A[A

⚠️ API key AIzaSyCN-EJ7s6CIeEybjT3tM_zN0-4xx4Rcqqw exhausted. Switching...
🤖 Model response: ```json
[
  {
    "section": "5.  Confidentiality.",
    "explanation": "Section 4.1 states that each party shall protect the other party's Confidential Information with at least reasonable care, however it is never defined what reasonable care is. This is ambiguous and can cause issues with interpretations.",
    "location": "Section 4.1",
    "category": 1
  },
  {
    "section": "6. Miscellaneous.",
    "explanation": "Section 6.1 states that 'This Agreement, the Purchase Agreement and the documents referred to herein and therein constitute the entire agreement among the Parties', however the Purchase Agreement is not attached to this document. This creates an issue because the other party is not able to see what the Purchase Agreement fully entails and thus this is not the entire agreement.",
    "location": "Section 6.1",
    "category": 7
  },
  {
    "section": "3. Termination.",
    "ex



[A[A

⚠️ API key AIzaSyAcqO6uxgeIP5qyxcDZLAY2TC9xyTlBmC0 exhausted. Switching...
⚠️ API key AIzaSyC_86XS-IZzhdfmhBSThwQoYMoQuFeY4mQ exhausted. Switching...
⚠️ API key AIzaSyCKWwXUILaUvHkyppqY87-cqBad16vZb00 exhausted. Switching...
⚠️ API key AIzaSyCfYpaD89nvVJ6GIitszeWI0KXdlgEAv-Q exhausted. Switching...
🤖 Model response: ```json
[
  {
    "section": "5.1 ADVERTISING FEES. After the Launch Date, i-Escrow shall pay 2TheMart advertising fees based on the number of Transaction Inquiries. This advertising fees shall consist of a per Transaction Inquiry amount calculated by multiplying 0.025% by the amount of the average Transaction from all Customers in the preceding quarter. The formula for arriving at the per Transaction Inquiry amount may be revised from time to time during the term of this Agreement to reflect present market conditions (\"the Adjusted Rate\"), unilaterally by 2TheMart without requiring i-Escrow's consent. The Adjusted Rate will be communicated to i-Escrow and will be effecti


[A

🤖 Model response: ```json
[
  {
    "section": "5. Powers and Authority of the Joint Venturers. ... It is agreed that either Joint Venturer shall, except as provided for below, have authority to execute instruments of any character relating to the affairs of the Joint Venture; provided, that without the written consent or approval of both of the Joint Venturers: (i) the Joint Venture shall incur no liability of any sort, nor any indebtedness for borrowed funds; (ii) no assets owned in the name of the Joint Venture be disposed of; and (iii) no commitment to purchase any item for the Joint Venture shall be made.",
    "explanation": "This section grants each joint venturer the authority to execute instruments related to the Joint Venture's affairs, but then immediately restricts that authority by requiring written consent for key actions like incurring debt, disposing of assets, or making purchase commitments. The initial grant of authority is significantly undermined by the subsequent l





[A[A[A[A

🤖 Model response: ```json
[
  {
    "section": "4.1 Payments of Alliance Funding applicable to a Study will be made according to the terms specified in Sections 1.3 and 1.4 above.  Notwithstanding any other provision, all invoices must be paid within 15 days of receipt.",
    "explanation": "This creates an in-text contradiction. The first sentence states that payments will be made according to Sections 1.3 and 1.4. Then the second sentence states that payment is due within 15 days of reciept of invoice. The sections 1.3 and 1.4 should have that information, and not create a contradictory statement in section 4.1.",
    "location": "Section 4.1",
    "category": 3
  },
  {
    "section": "All payments will be paid by Adaptimmune within 45 days of receipt of an invoice from MD Anderson. However, if MD Anderson delivers the invoice in person, payments will be paid by Adaptimmune within 10 days of receipt of an invoice from MD Anderson.",
    "explanation": "This presents an in-text contr



Processing samples:  40%|████      | 10/25 [01:17<01:56,  7.79s/it]



[A[A[A

🤖 Model response: ```json
[
  {
    "section": "4.1 Payments of Alliance Funding applicable to a Study will be made according to the terms specified in Sections 1.3 and 1.4 above. Notwithstanding any other provision, all invoices must be paid within 15 days of receipt.",
    "explanation": "Section 1.3 and 1.4 does not mention anything about when the payment needs to be paid. However, the following sentence states that payments must be paid within 15 days of receipt. This can be confusing because the parties may need to refer to other documents to find the payment date. The terms in section 1.3 and 1.4 does not align with the sentence that follows.",
    "location": "Section 4.1",
    "category": 5
  },
  {
    "section": "All payments will be paid by Adaptimmune within 45 days of receipt of an invoice from MD Anderson. Such invoice shall be addressed to Adaptimmune and sent by electronic mail to accounts@adaptimmune.com with copies to lini.pandite@adaptimmune.com and susan cousounis@a

### **Analysis**

In [None]:
import pandas as pd

df = pd.DataFrame.from_dict(run_results, orient="index")
df

Unnamed: 0,ambiguity_inText,inconsistencies_inText,misaligned_terminalogy_inText,omissions_inText,structural_flaws_inText
zero-shot-self-verification-cot,"{'text_matches': 2, 'explanation_matches': 1, ...","{'text_matches': 5, 'explanation_matches': 3, ...","{'text_matches': 2, 'explanation_matches': 3, ...","{'text_matches': 2, 'explanation_matches': 1, ...","{'text_matches': 11, 'explanation_matches': 7,..."
zero-shot-self-verification,"{'text_matches': 2, 'explanation_matches': 1, ...","{'text_matches': 5, 'explanation_matches': 4, ...","{'text_matches': 4, 'explanation_matches': 1, ...","{'text_matches': 2, 'explanation_matches': 2, ...","{'text_matches': 11, 'explanation_matches': 6,..."
zero-shot-cot,"{'text_matches': 11, 'explanation_matches': 23...","{'text_matches': 27, 'explanation_matches': 19...","{'text_matches': 11, 'explanation_matches': 16...","{'text_matches': 8, 'explanation_matches': 9, ...","{'text_matches': 8, 'explanation_matches': 8, ..."
zero-shot,"{'text_matches': 12, 'explanation_matches': 20...","{'text_matches': 31, 'explanation_matches': 25...","{'text_matches': 18, 'explanation_matches': 16...","{'text_matches': 8, 'explanation_matches': 7, ...","{'text_matches': 9, 'explanation_matches': 6, ..."


In [None]:
text_match_df = df.copy()
for column in text_match_df.columns:
    text_match_df[column] = text_match_df[column].apply(
        lambda x: x["text_matches"] / x["total"] if x["total"] > 0 else 0
    )
text_match_df

Unnamed: 0,ambiguity_inText,inconsistencies_inText,misaligned_terminalogy_inText,omissions_inText,structural_flaws_inText
zero-shot-self-verification-cot,0.133333,0.333333,0.133333,0.133333,0.733333
zero-shot-self-verification,0.133333,0.333333,0.266667,0.133333,0.733333
zero-shot-cot,0.144737,0.36,0.146667,0.106667,0.533333
zero-shot,0.16,0.413333,0.24,0.106667,0.6


In [None]:
text_match_df = df.copy()
for column in text_match_df.columns:
    text_match_df[column] = text_match_df[column].apply(
        lambda x: x["correct"] / x["total"] if x["total"] > 0 else 0
    )
text_match_df

Unnamed: 0,ambiguity_inText,inconsistencies_inText,misaligned_terminalogy_inText,omissions_inText,structural_flaws_inText
zero-shot-self-verification-cot,0.0,0.133333,0.133333,0.066667,0.4
zero-shot-self-verification,0.0,0.2,0.066667,0.133333,0.266667
zero-shot-cot,0.144737,0.226667,0.133333,0.106667,0.266667
zero-shot,0.146667,0.266667,0.173333,0.08,0.333333


In [None]:
def aggregate_correct_score(row):
    total = 0
    correct = 0
    for col in row.index:
        total += row[col]["total"]
        correct += row[col]["correct"]
    return correct / total if total > 0 else 0
        
# Text Match
total_score = df.copy()
total_score.apply(aggregate_correct_score, axis=1)

zero-shot-self-verification-cot    0.146667
zero-shot-self-verification        0.133333
zero-shot-cot                      0.158228
zero-shot                          0.174603
dtype: float64

In [None]:
def aggregate_correct_score(row):
    total = 0
    correct = 0
    for col in row.index:
        total += row[col]["total"]
        correct += row[col]["text_matches"]
    return correct / total if total > 0 else 0
        
# Text Match
total_score = df.copy()
total_score.apply(aggregate_correct_score, axis=1)

zero-shot-self-verification-cot    0.293333
zero-shot-self-verification        0.320000
zero-shot-cot                      0.205696
zero-shot                          0.247619
dtype: float64

#### Few-shot variations

In [None]:
# # FS | FS + SC
# run(
#     model=GeminiModel(API_KEYS),
#     dataset=MiniEvalDataset(),
#     prompt=few_shot_prompt,
#     responses_dir="mini-eval/responses/few-shot/",
#     num_responses=1,    # SC
#     evaluation_model=GeminiModel(API_KEYS)
# )

# # FS + COT | FS + COT + SC
# run(
#     model=GeminiModel(API_KEYS),
#     dataset=MiniEvalDataset(),
#     prompt=few_shot_prompt + COT,
#     responses_dir="mini-eval/responses/few-shot-cot/",
#     num_responses=1,    # SC
#     evaluation_model=GeminiModel(API_KEYS)
# )

# # FS + SV | FS + SV + SC
# run(
#     model=SelfVerificationModel(GeminiModel(API_KEYS)),
#     dataset=MiniEvalDataset(),
#     prompt=few_shot_prompt,
#     responses_dir="mini-eval/responses/few-shot-self-verification/",
#     num_responses=1,    # SC
#     evaluation_model=GeminiModel(API_KEYS)
# )

# # FS + COT + SV | FS + COT + SV + SC
# run(
#     model=SelfVerificationModel(GeminiModel(API_KEYS)),
#     dataset=MiniEvalDataset(),
#     prompt=few_shot_prompt + COT,
#     responses_dir="mini-eval/responses/self-verification/",
#     num_responses=1,    # SC
#     evaluation_model=GeminiModel(API_KEYS)
# )

## TODO 
---
- Z ✅
- Z + COT ✅
- Z + SV ✅
- Z + COT + SV ✅
- Z + SC ✅
- Z + COT + SC ✅
---
- FS ✅⚠️
- FS + COT ✅⚠️
- FS + SV ✅⚠️
- FS + COT + SV ✅⚠️
- FS + SC ✅⚠️
- FS + COT + SC ✅⚠️
---
- Z + SV + SC (SKIP THIS FOR NOW) ✅
- Z + COT + SV + SC (SKIP THIS FOR NOW) ✅
- FS + SV + SC (SKIP THIS FOR NOW) ✅⚠️
- FS + COT + SV + SC (SKIP THIS FOR NOW) ✅⚠️
---
- **Output into a .csv**❌
- **Eventually need to repeat with different LLMs**❌

# Metrics
1) `text match` but `explanation !match` = -1
2) `text match` and `explanation match` = +1
3) `text !match` and `explanation match` = -1
4) `text !match` and `explanation !match` = -1