In [138]:
# %pip install google-generativeai

In [139]:
import os
import json
import shutil
import google.generativeai as genai

### Set up the mini-eval directory with the 'answers' (LLM-based ground truth) and 'documents' (perturbed documents without tags).


In [140]:
# Root folder that holds one sub-directory per "<perturbation>_<category>_contradiction" combination.
base_dir = "perturbed_legal_documents"

# Perturbation families. These strings are interpolated verbatim into directory names,
# so the spelling "misaligned_terminalogy" is kept as is (presumably it matches the
# folder name on disk -- verify before "fixing" it).
PERTURBATION_TYPES = [
    "ambiguity",
    "inconsistencies",
    "misaligned_terminalogy",
    "omission",
    "structural_flaws",
]

# Contradiction scopes; each one is combined with every perturbation type.
CATEGORIES = ["inText", "legal"]

In [None]:
# Number of (answer, document) pairs copied into the mini-eval set per folder.
SAMPLES_PER_FOLDER = 5

# File-name conventions of the source corpus: answers are "perturbed_<name>.pdf.json",
# tag-free documents are "modified_<name>.pdf.txt".
ANSWER_PREFIX, ANSWER_SUFFIX = "perturbed_", ".pdf.json"
DOCUMENT_PREFIX, DOCUMENT_SUFFIX = "modified_", ".pdf.txt"

for pt in PERTURBATION_TYPES:
    for ct in CATEGORIES:
        folder = f"{pt}_{ct}_contradiction"
        print(f"\nProcessing: {folder}")

        input_dir = f'{base_dir}/{folder}/'
        doc_dir = os.path.join(input_dir, 'modified_files_no_tags')

        if not os.path.exists(input_dir):
            print(f"Input dir not found: {input_dir}")
            continue
        if not os.path.exists(doc_dir):
            print(f"Document dir not found: {doc_dir}")
            continue

        output_answers = f'mini-eval/answers/{folder}/'
        output_documents = f'mini-eval/documents/{folder}/'

        # Skip folders that were already populated by a previous run.
        already_done = all(
            os.path.exists(out_dir) and len(os.listdir(out_dir)) >= SAMPLES_PER_FOLDER
            for out_dir in (output_answers, output_documents)
        )
        if already_done:
            print(f"Skipping {folder} — already processed.")
            continue

        # Collect answer/document pairs that share the same base name.
        json_files = sorted(f for f in os.listdir(input_dir) if f.endswith('.json'))
        print(f"🔎 Found {len(json_files)} JSON files")

        valid_pairs = []  # (base_name, answer file, document file)
        for json_file in json_files:
            if not (json_file.startswith(ANSWER_PREFIX) and json_file.endswith(ANSWER_SUFFIX)):
                print(f"  ⚠️ Skipping incorrectly named file: {json_file}")
                continue

            base_name = json_file[len(ANSWER_PREFIX):-len(ANSWER_SUFFIX)]
            txt_file = f"{DOCUMENT_PREFIX}{base_name}{DOCUMENT_SUFFIX}"

            if os.path.exists(os.path.join(doc_dir, txt_file)):
                valid_pairs.append((base_name, json_file, txt_file))
                print(f"  ✅ Matched: {json_file} <-> {txt_file}")
            else:
                print(f"  ❌ Missing TXT: {txt_file}")

            if len(valid_pairs) == SAMPLES_PER_FOLDER:
                break

        if not valid_pairs:
            print("Can't find corresponding files????")
            continue

        # Create the output folders only once there is something to put in them.
        os.makedirs(output_answers, exist_ok=True)
        os.makedirs(output_documents, exist_ok=True)

        # Copy each pair under a common stem ("<name>.pdf.json" / "<name>.pdf.txt").
        # MiniEvalDataset opens answers/<stem>.json and documents/<stem>.txt with the
        # same stem, so the "perturbed_" / "modified_" prefixes must not survive the copy.
        for base_name, json_file, txt_file in valid_pairs:
            dst_json = os.path.join(output_answers, f"{base_name}.pdf.json")
            dst_txt = os.path.join(output_documents, f"{base_name}.pdf.txt")

            shutil.copy(os.path.join(input_dir, json_file), dst_json)
            shutil.copy(os.path.join(doc_dir, txt_file), dst_txt)
            print(f"  📁 Copied: {json_file} and {txt_file}")


Processing: ambiguity_inText_contradiction
🔎 Found 25 JSON files
  ✅ Matched: perturbed_ArmstrongFlooringInc_20190107_8-K_EX-10.2_11471795_EX-10.2_Intellectual Property Agreement.pdf.json <-> modified_ArmstrongFlooringInc_20190107_8-K_EX-10.2_11471795_EX-10.2_Intellectual Property Agreement.pdf.txt
  ✅ Matched: perturbed_BellringBrandsInc_20190920_S-1_EX-10.12_11817081_EX-10.12_Manufacturing Agreement1.pdf.json <-> modified_BellringBrandsInc_20190920_S-1_EX-10.12_11817081_EX-10.12_Manufacturing Agreement1.pdf.txt
  ✅ Matched: perturbed_BellringBrandsInc_20190920_S-1_EX-10.12_11817081_EX-10.12_Manufacturing Agreement2.pdf.json <-> modified_BellringBrandsInc_20190920_S-1_EX-10.12_11817081_EX-10.12_Manufacturing Agreement2.pdf.txt
  ✅ Matched: perturbed_BellringBrandsInc_20190920_S-1_EX-10.12_11817081_EX-10.12_Manufacturing Agreement3.pdf.json <-> modified_BellringBrandsInc_20190920_S-1_EX-10.12_11817081_EX-10.12_Manufacturing Agreement3.pdf.txt
  ✅ Matched: perturbed_BerkshireHillsBanco

In [142]:
# Credentials must never be hard-coded in a notebook: read the key from the environment.
# NOTE(review): the key that was previously committed in this cell should be treated as
# leaked and rotated.
API_KEY = os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it in the shell that launches Jupyter "
        "before running this notebook."
    )

genai.configure(api_key=API_KEY)
model = genai.GenerativeModel("gemini-2.0-flash")

**A small dataset class that makes it easy to retrieve each document together with its ground-truth answers.**

In [143]:
class MiniEvalDataset:
    """Index-based access to the mini-eval answer/document pairs.

    Every ``*.json`` file found under ``<mini_eval_dir>/answers`` becomes one sample.
    The matching document is read from ``<mini_eval_dir>/documents`` using the same
    relative path with a ``.txt`` extension instead of ``.json``.

    Indexing past the end raises ``IndexError`` (from the underlying list), which is
    what lets ``for sample in dataset`` terminate.

    Args:
        mini_eval_dir: Root folder containing the ``answers`` and ``documents`` trees.
    """

    def __init__(self, mini_eval_dir="mini-eval"):
        self.mini_eval_dir = mini_eval_dir
        self.mini_eval_answers_dir = os.path.join(self.mini_eval_dir, "answers")
        self.mini_eval_documents_dir = os.path.join(self.mini_eval_dir, "documents")
        # Relative path of every answer file, minus the trailing ".json" only.
        # (str.replace would also remove ".json" occurring elsewhere in the path.)
        # Files with another extension are ignored because they cannot be loaded.
        self.files = sorted(
            os.path.relpath(os.path.join(root, file), self.mini_eval_answers_dir)[: -len(".json")]
            for root, _, files in os.walk(self.mini_eval_answers_dir)
            for file in files
            if file.endswith(".json")
        )

    def __len__(self):
        """Return the number of answer files that were discovered."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            dict with ``file_name`` (relative stem), ``answers`` (parsed JSON) and
            ``documents`` (document text). Non-ASCII characters are dropped from both.

        Raises:
            IndexError: if ``idx`` is out of range.
            FileNotFoundError: if the answer or the paired document file is missing.
            json.JSONDecodeError: if the answer file is not valid JSON.
        """
        stem = self.files[idx]

        # f.read() keeps the text as written; joining readlines() with "\n" would
        # double every line break because each line already ends with one.
        with open(os.path.join(self.mini_eval_answers_dir, stem + ".json"), "r", encoding="utf-8") as f:
            answers = json.loads(self.__remove_non_ascii(f.read()))

        with open(os.path.join(self.mini_eval_documents_dir, stem + ".txt"), "r", encoding="utf-8") as f:
            documents = self.__remove_non_ascii(f.read())

        return {
            "file_name": stem,
            "answers": answers,
            "documents": documents,
        }

    def __remove_non_ascii(self, s):
        """Return ``s`` with every character whose code point is >= 128 removed."""
        return "".join(ch for ch in s if ord(ch) < 128)

You retrieve elements from the dataset like this:

In [144]:
dataset = MiniEvalDataset()

# Load the first sample once, then show its ground-truth answers and its document text.
first_sample = dataset[0]
display(first_sample["answers"], first_sample["documents"])


[{'file_name': 'ArmstrongFlooringInc_20190107_8-K_EX-10.2_11471795_EX-10.2_Intellectual Property Agreement.pdf',
  'perturbation': [{'type': 'Ambiguities - In Text Contradiction',
    'original_text': '1.1 Certain Definitions. As used herein, capitalized terms have the meaning ascribed to them herein, including the following terms have the meanings set forth below. Capitalized terms that are not defined in this Agreement shall have the meaning set forth in the Stock Purchase Agreement.',
    'changed_text': '1.1 Certain Definitions. As used herein, capitalized terms have the meaning ascribed to them herein, including the following terms may have the meanings set forth below. Capitalized terms that are not explicitly defined in this Agreement may be interpreted based on common industry usage, irrespective of definitions found in the Stock Purchase Agreement.',
    'explanation': "The original text states capitalized terms not defined in the IP Agreement follow the Stock Purchase Agreeme

'Exhibit 10.2 Execution Version INTELLECTUAL PROPERTY AGREEMENT This INTELLECTUAL PROPERTY AGREEMENT (this Agreement), dated as of December 31, 2018 (the Effective Date) is entered into by and between Armstrong Flooring, Inc., a Delaware corporation (Seller) and AFI Licensing LLC, a Delaware limited liability company (Licensing and together with Seller, Arizona) and AHF Holding, Inc. (formerly known as Tarzan HoldCo, Inc.), a Delaware corporation (Buyer) and Armstrong Hardwood Flooring Company, a Tennessee corporation (the Company and together with Buyer the Buyer Entities) (each of Arizona on the one hand and the Buyer Entities on the other hand, a Party and collectively, the Parties). WHEREAS, Seller and Buyer have entered into that certain Stock Purchase Agreement, dated November 14, 2018 (the Stock Purchase Agreement) WHEREAS, pursuant to the Stock Purchase Agreement, Seller has agreed to sell and transfer, and Buyer has agreed to purchase and acquire, all of Sellers right, title a

**You check the length like this:**

In [145]:
# Only the last bare expression of a cell is echoed, so the length is printed explicitly.
print(len(dataset))
print(dataset[5]["file_name"])

ambiguity_legal_contradiction\ArcaUsTreasuryFund_20200207_N-2_EX-99.K5_11971930_EX-99.K5_Development Agreement.pdf


#### WIP: This needs to be adjusted to be few-shot.

In [None]:
def clean_and_parse_model_response(raw_response):
    """Extract and parse the JSON payload from a model reply.

    The reply is first parsed as a whole after removing surrounding backticks and an
    optional leading ``json`` tag (any letter case). If that fails and the reply
    contains a fenced block delimited by triple backticks, the content of the first
    such block is tried as well, so replies with prose around the fence still parse.

    Args:
        raw_response: Text returned by the model.

    Returns:
        The parsed JSON value, or ``None`` when the reply is empty, not a string, or
        no candidate could be parsed (a message is printed in that case).
    """
    if not isinstance(raw_response, str) or not raw_response.strip():
        print("Failed to parse JSON: empty model response")
        return None

    def _unwrap(text):
        # Drop surrounding whitespace/backticks and an optional "json" language tag.
        text = text.strip().strip("`").strip()
        if text[:4].lower() == "json":
            text = text[4:].strip()
        return text

    candidates = [_unwrap(raw_response)]

    # Fallback: the first ```...``` block, for replies that wrap the JSON in prose.
    fence_open = raw_response.find("```")
    if fence_open != -1:
        fence_close = raw_response.find("```", fence_open + 3)
        if fence_close != -1:
            candidates.append(_unwrap(raw_response[fence_open + 3:fence_close]))

    first_error = None
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as e:
            if first_error is None:
                first_error = e

    print("Failed to parse JSON:", first_error)
    return None


def add_section_identified_flag(predictions, ground_truth_perturbations):
    """Annotate each prediction with ``location_match`` and ``text_match`` flags.

    * ``location_match``: the prediction's ``location`` (stripped) equals the stripped
      ``location`` of some ground-truth perturbation.
    * ``text_match``: the prediction's ``section`` (stripped) contains, or is contained
      in, the ``changed_text`` of some ground-truth perturbation.

    Empty or missing fields never match. Without that guard an empty ``section`` would
    always be flagged as a match, because ``"" in some_string`` is ``True``.

    Args:
        predictions: Iterable of prediction dicts; dicts are modified in place.
            Entries that are not dicts are left untouched.
        ground_truth_perturbations: Iterable of dicts with ``location`` and
            ``changed_text`` keys.

    Returns:
        The same ``predictions`` object that was passed in.
    """
    def _as_text(value):
        # The model may return numbers or null for a field; normalise to stripped text.
        if value is None:
            return ""
        return value.strip() if isinstance(value, str) else str(value).strip()

    gt_locations = {_as_text(p.get("location")) for p in ground_truth_perturbations} - {""}
    gt_changed_texts = [
        text
        for text in (p.get("changed_text") for p in ground_truth_perturbations)
        if isinstance(text, str) and text.strip()
    ]

    for pred in predictions:
        if not isinstance(pred, dict):
            continue

        # LOCATION MATCH
        pred["location_match"] = _as_text(pred.get("location")) in gt_locations

        # TEXT MATCH (does the model's "section" overlap with the text that was perturbed?)
        pred_section = _as_text(pred.get("section"))
        pred["text_match"] = bool(pred_section) and any(
            pred_section in gt_text or gt_text in pred_section for gt_text in gt_changed_texts
        )

    return predictions

**TODO:** This part needs to become part of the original loop.

- This part shows if the "section" from the model's response matches the "section location" that was modified in the perturbed document.
- Also shows if the "text" that was caught by the model's response matches what was modified in the perturbed document.

In [149]:
# Instruction prompt prepended to every document.
# The JSON example must itself be valid JSON (commas between fields, "\n" written as an
# escape rather than a raw line break), because the reply is parsed with json.loads and
# the model tends to imitate the example's syntax.
PROMPT = """You are a legal contract expert and know how to check legal documents properly and find any discrepancies or contradictions within a file. You are also aware of all state and national laws when it comes to legal docuements.
The file is a legal document and you are to check for any discrepancies or contradictions within the file.
There are 10 categories when it comes to discrepancies or contradictions:
1. Ambiguity in text - this means that the text is not clear and can be interpreted in multiple ways.
2. Ambiguity in legal terms - this means that the legal terms used in the text are not clear and can be interpreted in multiple ways.
3. Inconsistencies in text - this means that the text is not consistent and contradicts itself.
4. Inconsistencies in legal terms - this means that the legal terms used in the text are not consistent and contradict some law.
5. Misaligned in text - this means that the text is not aligned with the rest of the document and does not make sense.
6. Misaligned in legal terms - this means that the legal terms used in the text are not aligned with the law and do not make sense.
7. Omission in text - this means that there is something missing in the text that should be there.
8. Omission in legal terms - this means that there is something missing in the legal terms that should be there.
9. Structural Flaws in text - this means that the text is not structured properly and does not make sense.
10. Structural Flaws in legal terms - this means that the legal terms used in the text are not structured properly and do not make sense.

Instructions:
1. Read the file and check for any discrepancies or contradictions within the file.
2. Provide a detailed explanation of why this is a discrepancy or contradiction.
3. Provide the section where the discrepancy or contradiction exists.
4. Provide the section location. Like Section 5.4.
5. Categorize the discrepancy or contradiction into one of the 10 categories above (return the number of the category).
There are 2-3 contradictions in each text.

Return the results in json format. Example:
[{
    "section": "Sponsor shall pay Club the Annual Fee for each Contract Year of this Agreement in six (6) equal installments, each\\ndue on or prior to the 1st of each month between June and November of the applicable Contract Year.",
    "explanation": "This change introduces a contradiction regarding the payment deadline. Section 3(a) states that all installments are due by November 1st, but the added sentence allows the final payment to be made as late as December 15th without penalty. This creates ambiguity as to the actual deadline for the final installment and whether late fees would apply between November 2nd and December 15th.",
    "location": "Section 5.2",
    "category": 3
}]

This is the document:
"""

for sample in dataset:
    # Responses are cached on disk so the model is not queried twice for the same file.
    output_path = os.path.join("mini-eval", "responses", sample["file_name"] + ".json")
    if os.path.exists(output_path):
        print(f"✅ File already exists: {output_path}. Skipping.")
        continue

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    print(f"📁 Saving response to: {output_path}")

    response = model.generate_content(PROMPT + sample["documents"])

    # A reply without any candidate/part has no text to evaluate (possibly a blocked
    # or empty generation -- not verified here); skip it instead of aborting the run.
    try:
        model_response = response.to_dict()["candidates"][0]["content"]["parts"][0]["text"]
    except (KeyError, IndexError):
        print("Model returned no text candidate. Skipping.")
        continue

    parsed_response = clean_and_parse_model_response(model_response)
    ground_truth = sample["answers"][0]["perturbation"]

    print(model_response)
    print(parsed_response)

    # Accept a single JSON object as a one-element list; anything that is not a list
    # of objects cannot be scored.
    if isinstance(parsed_response, dict):
        parsed_response = [parsed_response]
    if not isinstance(parsed_response, list):
        print("Could not parse model response.")
        continue
    parsed_response = [pred for pred in parsed_response if isinstance(pred, dict)]

    # An empty list is a valid answer ("nothing found") and is cached like any other.
    updated_predictions = add_section_identified_flag(parsed_response, ground_truth)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(updated_predictions, f, indent=4)

✅ File already exists: mini-eval\responses\ambiguity_inText_contradiction\ArmstrongFlooringInc_20190107_8-K_EX-10.2_11471795_EX-10.2_Intellectual Property Agreement.pdf.json. Skipping.
✅ File already exists: mini-eval\responses\ambiguity_inText_contradiction\BellringBrandsInc_20190920_S-1_EX-10.12_11817081_EX-10.12_Manufacturing Agreement1.pdf.json. Skipping.
✅ File already exists: mini-eval\responses\ambiguity_inText_contradiction\BellringBrandsInc_20190920_S-1_EX-10.12_11817081_EX-10.12_Manufacturing Agreement2.pdf.json. Skipping.
✅ File already exists: mini-eval\responses\ambiguity_inText_contradiction\BellringBrandsInc_20190920_S-1_EX-10.12_11817081_EX-10.12_Manufacturing Agreement3.pdf.json. Skipping.
✅ File already exists: mini-eval\responses\ambiguity_inText_contradiction\BerkshireHillsBancorpInc_20120809_10-Q_EX-10.16_7708169_EX-10.16_Endorsement Agreement.pdf.json. Skipping.
📁 Saving response to: mini-eval\responses\ambiguity_legal_contradiction\ArcaUsTreasuryFund_20200207_N-2