# Initialization and Set Up


In [210]:
!pip install google-generativeai
!pip install pymupdf

print("Install done")

Install done


## Test Gemini API Key
- gemini-2.0-flash

In [22]:
import google.generativeai as genai

In [23]:
import os
# Read the Gemini API key from the environment so it is never hard-coded in the notebook.
# NOTE(review): GOOGLE_API_KEY is not referenced again in the visible notebook; the
# configuration cell calls os.getenv("GOOGLE_API_KEY") again on its own.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")


In [24]:
# Read the API key from the environment; os.getenv returns None when the variable is unset.
api_key = os.getenv("GOOGLE_API_KEY")

# Hand the key to the google.generativeai client.
# NOTE(review): behaviour when api_key is None is decided by the library — confirm.
genai.configure(api_key=api_key)

# Initialize the Gemini model; `model` is used as a notebook-level global by
# generate_perturbation_new further down.
model = genai.GenerativeModel("gemini-2.0-flash")

In [25]:
# Smoke test: send a single prompt to confirm that the key and model work
response = model.generate_content("Explain quantum computing in simple terms.")

# Print the text of the model's reply
print(response.text)


Okay, imagine a regular computer bit is like a light switch: it can be either ON (1) or OFF (0).  That's it.  It's either one or the other.

Now, imagine a **quantum bit**, called a **qubit**, is like a dimmer switch. It can be:

*   **Mostly ON:** Like 70% ON and 30% OFF
*   **Mostly OFF:** Like 20% ON and 80% OFF
*   **Exactly ON:** Like 100% ON and 0% OFF (the same as a regular bit)
*   **Exactly OFF:** Like 0% ON and 100% OFF (the same as a regular bit)
*   **And anything in between!**

This "in-between-ness" is called **superposition**.  A qubit can be in a combination of both 0 and 1 *at the same time*.  Think of it like a coin spinning in the air - it's neither heads nor tails until it lands.

Here's why that's powerful:

*   **Doing many calculations at once:**  Because a qubit can be in multiple states (0 and 1, and everything in between) simultaneously, a quantum computer can effectively try out many different possibilities *at the same time*.  A regular computer has to try e

In [26]:
import fitz
import difflib
import re
import json

### Error log

In [28]:
# Notebook-level error log. The functions below append to this string through
# `global error_log`, and the final cell writes it to the file named by `error_log_name`.
error_log = """Error log\n"""

# Start of pipeline
1. Reads the legal documents
2. Calls the LLM to add perturbations
3. Creates the output in JSON format
4. Stores the output files in the benchmark dataset

## Change source folder and destination folder


In [73]:
# Edit these as needed

# Folder paths
#   folder_path_read: root folder to read the documents (.pdf / .txt) from; every end
#                     folder found beneath it is processed
#   folder_path_json: folder where the perturbation JSON files are saved
#   folder_path_save: folder where the modified (perturbed) text files are saved
folder_path_read = "full_contract_txt/"

folder_path_json = "test_benchmark_dataset/inconsistencies_legal_contradiction_json"

folder_path_save = "test_benchmark_dataset/inconsistencies_legal_contradiction"

# Switch to limit read
file_read_limit_flag = False
# Limit number of files to read (only applied when file_read_limit_flag is True)
file_read_limit = 10

# Error log file name
error_log_name = "error_log_inconsistencies_legal.txt"

# Change this as needed
#start_folder = ""
# start_folder = "full_contract_pdf/Part_I/License_Agreements/"

# Index of subfolder to start with from the root folder in folder_path_read
start_index = 0

# Prompts
# See function generate_perturbation_new









'Prompts'

## Retrieve content for each legal document


In [75]:
def get_end_folders(root_folder, skip_folder=".ipynb_checkpoints"):
    """
    Collects the leaf folders found under a root folder.

    Args:
        root_folder (str): Folder to start walking from.
        skip_folder (str): Directory name that is never descended into and is
            ignored when deciding whether a folder has subfolders.

    Returns:
        (list[str]): Path of every folder that has no remaining subfolders, each
        ending with the OS path separator.
    """
    leaves = []
    walker = os.walk(root_folder, topdown=True)

    for current_dir, subdirs, _files in walker:
        # Prune in place so os.walk does not descend into the skipped folder
        kept = [name for name in subdirs if name != skip_folder]
        subdirs[:] = kept

        # Nothing left to descend into: this is an end folder
        if len(kept) == 0:
            # Joining with "" appends the trailing path separator
            leaves.append(os.path.join(current_dir, ""))

    return leaves

# Example usage:
# result = get_end_folders("full_contract_pdf")
# print(result)

### Reading the txt versions

In [77]:
def read_txt(file_path):
    """
    Loads a UTF-8 text file and returns everything in it.

    Args:
        file_path (str): Location of the text file.

    Returns:
        (str): The file's content. When the file does not exist, the problem is
        appended to the global error log and a fixed error message is returned
        instead of the content.
    """
    global error_log
    try:
        with open(file_path, mode='r', encoding="utf8") as handle:
            contents = handle.read()
    except FileNotFoundError:
        message = "File not found. Please check the file path."
        error_log += f"""\nIn {file_path}: 
        Error name: FileNotFoundError
        Error message: {message}\n"""
        return message
    else:
        return contents


### Reading the pdf versions

In [79]:
def read_pdf(file_path):
    """
    Reads a PDF file and returns the text of all its pages.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        (str): The text of every page, joined with newlines.

    Note:
        Errors raised while opening or reading the PDF are not caught here; they
        propagate to the caller.
    """
    # The context manager closes the document even if text extraction fails,
    # so file handles are not leaked when many PDFs are processed.
    with fitz.open(file_path) as doc:
        return "\n".join(page.get_text() for page in doc)

In [80]:
def read_legal_files(folder_path):
    """
    Loads every supported document that sits directly inside a folder.

    Reads the notebook-level settings `file_read_limit_flag` and `file_read_limit`:
    when the flag is set, reading stops once that many documents have been loaded.

    Args:
        folder_path (str): Folder whose entries are scanned (not recursive).

    Returns:
        (dict): Maps each file name ending in ".pdf" or ".txt" to the content
        returned by read_pdf / read_txt. Other entries are reported and skipped.
    """
    # Local countdown so the notebook-level limit itself is never modified
    remaining = file_read_limit
    documents = {}

    for entry in os.listdir(folder_path):
        # Stop early when the experimentation limit has been used up
        if file_read_limit_flag and remaining <= 0:
            break

        full_path = os.path.join(folder_path, entry)

        # Choose the reader from the file extension
        if entry.endswith(".pdf"):
            reader = read_pdf
        elif entry.endswith(".txt"):
            reader = read_txt
        else:
            print(f"Skipping unsupported file: {entry}")
            continue

        documents[entry] = reader(full_path)
        remaining -= 1

    return documents

In [81]:
# Read legal files directly inside the configured read folder
legal_docs = read_legal_files(folder_path_read)

# Display the first 500 characters of the first document only (the loop breaks
# after one iteration). Note: `file_name` and `content` stay bound as notebook globals.
for file_name, content in legal_docs.items():
    print(f"--- {file_name} ---\n{content[:500]}...\n")
    break

Skipping unsupported file: .ipynb_checkpoints
--- 2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-Branding Agreement_ Agency Agreement.txt ---
CO-BRANDING AND ADVERTISING AGREEMENT

THIS CO-BRANDING AND ADVERTISING AGREEMENT (the "Agreement") is made as of June 21, 1999 (the "Effective Date") by and between I-ESCROW, INC., with its principal place of business at 1730 S. Amphlett Blvd., Suite 233, San Mateo, California 94402 ("i-Escrow"), and 2THEMART.COM, INC. having its principal place of business at 18301 Von Karman Avenue, 7th Floor, Irvine, California 92612 ("2TheMart").

1. DEFINITIONS.

(a) "CONTENT" means all content or informat...



In [82]:
# Printing all file names that were accepted, preceded by the total count.
# Note: this prints one line per document, so the output is long for large folders.
print(f"Total files from {folder_path_read}: {len(legal_docs)}") 
for file_name, content in legal_docs.items():
    print(f"--- {file_name} ---...\n")

Total files from full_contract_txt/: 510
--- 2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-Branding Agreement_ Agency Agreement.txt ---...

--- ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEMENT.txt ---...

--- ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINT VENTURE AGREEMENT.txt ---...

--- ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTOR AGREEMENT.txt ---...

--- ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT AGREEMENT.txt ---...

--- ADAPTIMMUNETHERAPEUTICSPLC_04_06_2017-EX-10.11-STRATEGIC ALLIANCE AGREEMENT.txt ---...

--- ADIANUTRITION,INC_04_01_2005-EX-10.D2-RESELLER AGREEMENT.txt ---...

--- ADMA BioManufacturing, LLC -  Amendment #3 to Manufacturing Agreement .txt ---...

--- ADUROBIOTECH,INC_06_02_2020-EX-10.7-CONSULTING AGREEMENT(1).txt ---...

--- ADUROBIOTECH,INC_06_02_2020-EX-10.7-CONSULTING AGREEMENT.txt ---...

--- AFSALABANCORPINC_08_01_1996-EX-1.1-AGENCY AGREEMENT.txt ---...

--- AgapeAtpCorp_20191202_10-KA_EX-10.1_11911128_EX-10.1_Supply Agreement.txt

## Prompt to read through legal file and insert different types of perturbations
- There are 10 different types of prompts to switch between
- Returns the output in JSON format, which is treated as the lock file

In [84]:
def generate_perturbation_new(original_text, file_name):    
    """
    Asks the Gemini model for a perturbed version of a legal document in structured JSON format.

    Builds a prompt that instructs the model to introduce "Inconsistencies - Legal
    Contradiction" perturbations into the document and sends it through the
    notebook-level `model`.

    Args:
        original_text (str): Full text of the legal document.
        file_name (str): Name of the document; embedded in the prompt's JSON template
            and used when logging errors.

    Returns:
        (str): The raw text of the model's reply, or the sentinel
        "ERROR: No response from API" when no reply text could be obtained.
        Failures are appended to the global error log rather than raised.
    """

    # The file name is quoted inside the JSON template so that the example shown to
    # the model is itself valid JSON.
    prompt = f"""You are an employment law specialist ensuring that contractual deadlines comply with legal regulations. Your task is to modify a timeline in the contract so that it contradicts state or federal laws.
    
    Before modifying the text:
    - **Read the file** to determine what city, state, or country the contract applies to.
    - If the jurisdiction is unclear, default to **United States law**.
    - Make sure that when taking the original texts, there should be no jumps between sentences. Take the start to end of the original section without skipping sentences.
    
    ### **Definition:**
    Inconsistencies arise when **time-sensitive obligations** in a contract do not align with legal requirements. A **legal contradiction** in this category happens when a contract sets **a deadline or requirement that violates federal or state law**, making the contractual terms unenforceable or illegal.
    
    ### **Step-by-Step Instructions:**
    1. Identify a contractual deadline that is **regulated by law** (e.g., payment terms, claims deadlines, notice periods).  
    2. Modify the deadline to **conflict with state or federal legal requirements**.  
    3. Ensure that the change **creates non-compliance with regulatory standards**.  
    4. For that perturbation, make sure in the file there should be **2-3** of them. 
    5. Output the modified contract in structured JSON format.
    
    ---
    
    ### **Examples of Conflicts with Regulatory Timelines:**
    
    **Example 1:**
    - **Original:** "Employees must submit harassment claims within 15 days."
    - **Modified:** "Employees must submit harassment claims within 5 days."
    - **Explanation:** Some states (e.g., California) require **at least 30 days** for harassment claims.
    
    **Example 2:**
    - **Original:** "Landlords must return security deposits within 21 days of lease termination."
    - **Modified:** "Landlords must return security deposits within 60 days of lease termination."
    - **Explanation:** Many states **mandate 14-30 days** for deposit refunds.
    
    **Example 3:**
    - **Original:** "Customers have the right to cancel a contract within 10 days of signing."
    - **Modified:** "Customers may cancel contracts within 48 hours."
    - **Explanation:** Some **consumer protection laws** require **at least 7-10 days** for contract cancellations.
    
    **Example 4:**
    - **Original:** "Workers must receive final wages within 72 hours of termination."
    - **Modified:** "Workers will receive final wages at the company's discretion."
    - **Explanation:** Federal and state laws **require clear final paycheck deadlines**.
    
    **Example 5:**
    - **Original:** "Loan repayment plans must allow at least 90 days for late payments before default."
    - **Modified:** "Loan repayment plans may declare default after 30 days of non-payment."
    - **Explanation:** Some **loan regulations require longer grace periods**.
    
    **Example 6:**
    - **Original:** "The term 'Confidential Information' shall refer to any proprietary business, financial, and technical data disclosed by one party to the other, including but not limited to customer lists, pricing strategies, trade secrets, and non-public financial records. Confidential Information shall be protected for a period of five (5) years from the date of disclosure."
    - **Modified:** "The term 'Confidential Information' shall refer to sensitive business information disclosed by one party to the other. Each party shall determine what constitutes Confidential Information based on its internal policies. Confidentiality obligations shall remain in effect for a commercially reasonable period."
    - **Explanation:** The modified text **weakens the definition** of **"Confidential Information"** by removing specific protected categories (e.g., customer lists, trade secrets). Additionally, allowing **each party to define confidentiality based on "internal policies"** introduces **uncertainty and contradictions** with other parts of the agreement. The term **"commercially reasonable period"** replaces the **clear five-year obligation**, which conflicts with standard contract law principles that require clearly defined confidentiality periods.
    ---
    
    ### **Return JSON Format**
    {{
        "file_name": "{file_name}",
        "perturbation": [
            {{
                "type": "Inconsistencies - Legal Contradiction",
                "original_text": "EXCERPT BEFORE CHANGE",
                "changed_text": "EXCERPT AFTER CHANGE",
                "explanation": "WHY THIS CHANGE INTRODUCES A PERTURBATION",
                "contradicted_law": "SPECIFIC LAW OR REGULATION BEING VIOLATED",
                "location": "SECTION OR PARAGRAPH NUMBER"
            }}
        ]
    }}
    
    Below is the original legal text:
    -------------------
    {original_text}
    -------------------
    
    Now, return ONLY the structured JSON object with the modified text and explanation.
    """
    
    global error_log
    response_text = None
    try:
        response = model.generate_content(prompt)
        # Accessing .text can itself raise ValueError, so it stays inside the try
        response_text = response.text
    except ValueError as e:
        if "reciting from copyrighted material" in str(e):
            print("Error: The model was reciting from copyrighted material. Please modify your prompt.")
        error_log += f"""\nIn {file_name}: 
        Error name: ValueError
        Error message: {e}\n"""
        
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        error_log += f"""\nIn {file_name}: 
        Error name: Exception
        Error message: {e}\n"""
    
    # Decide on the extracted text, not on the response object: a response whose
    # .text access failed must yield the sentinel, never None.
    if response_text is None:
        return "ERROR: No response from API"
    return response_text

## Applies perturbations to files and stores in json format

In [86]:
def apply_perturbations(folder_path_read, folder_path_json, folder_path_save):
    """
    Runs the perturbation pipeline over every supported document in one folder.

    For each document: asks the model for perturbations, parses the reply as JSON,
    saves it as "perturbed_<file name>.json" in `folder_path_json`, then applies the
    perturbations to the text via apply_perturbation_from_json. Documents that fail
    at any step are recorded in the global error log and skipped.

    Args:
        folder_path_read (str): Folder to read the documents from.
        folder_path_json (str): Existing folder for the perturbation JSON files.
        folder_path_save (str): Existing folder for the modified text files.

    Returns:
        (str): The raw model reply for the last processed document, or
        "No perturbations applied" when the folder contained no readable documents.
    """
    global error_log

    legal_docs = read_legal_files(folder_path_read)

    print('We have got the legal docs:')

    no_response = "ERROR: No response from API"
    # Bound before the loop so the final return cannot fail on an empty folder
    perturbed_json = "No perturbations applied"

    for file_name, content in legal_docs.items():
        print("________________________________________________________________________")

        # Strip surrounding whitespace from the file name
        file_name = file_name.strip()
        print(f"Processing {file_name}...")
        perturbed_json = generate_perturbation_new(content, file_name)

        # A missing reply is treated the same as the explicit sentinel
        if perturbed_json is None:
            perturbed_json = no_response
        if perturbed_json == no_response:
            continue

        # Remove Markdown code fences the model may wrap around the JSON
        clean_json_text = re.sub(r"```json|```", "", perturbed_json).strip()

        try:
            # Convert response into a Python dictionary
            perturbed_data = json.loads(clean_json_text)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON for {file_name}, writing into logs and skipping...")
            error_log += f"""\nIn {file_name}: {e}"""
            continue

        # Save the JSON output as a one-element list (the format the reader expects)
        json_output_path = os.path.join(folder_path_json, f"perturbed_{file_name}.json")
        try:
            with open(json_output_path, "w", encoding="utf-8") as f:
                json.dump([perturbed_data], f, indent=4, ensure_ascii=False)
        except FileNotFoundError as e:
            print(f"An error occurred while writing to the file: {e}")
            error_log += f"""\nIn {file_name}: 
            Error name: FileNotFoundError
            Error message: {e}"""
            continue
        except IOError as e:
            print(f"An error occurred while writing to the file: {e}")
            error_log += f"""\nIn {file_name}:
            Error name: IOError
            Error message: {e}"""
            continue
        except (TypeError, ValueError) as e:
            # json.dump reports encoding problems with these built-in exceptions;
            # the json module has no JSONEncodeError class.
            print(f"An error occurred while encoding JSON: {e}")
            error_log += f"""\nIn {file_name}:
            Error name: {type(e).__name__}
            Error message: {e}"""
            continue

        # Apply the perturbation from json to text and save the modified document
        apply_perturbation_from_json(content, json_output_path, folder_path_save)

    print(f"All perturbations saved in {folder_path_save}")

    return perturbed_json

In [87]:
def normalize_text(text):
    """
    Helper that flattens text onto one line: every run of whitespace (spaces, tabs,
    line breaks) becomes a single space, and leading/trailing whitespace is dropped.
    """
    # split() with no argument cuts on any whitespace run and discards empty edges
    words = text.split()
    return " ".join(words)

## Create and store tagged modified file from its respective json log file

In [89]:
def apply_perturbation_from_json(original_text, json_file, output_folder="test_benchmark_dataset/"):
    """
    Reads the JSON metadata and applies the described perturbations to the original document,
    adding unique <*$p$*> markers around the modified sections.

    Both the document and the excerpts are whitespace-normalized before matching, so
    the saved document is the normalized (single-line) text.

    Parameters:
    - original_text (str): The original contract text.
    - json_file (str): Path to the JSON file containing the perturbation details.
    - output_folder (str): Existing folder to save the modified contract in.

    Returns:
    - modified_text (str): The full modified document.
    - None when `json_file` does not exist.
    - The string "FileModifyError" when an excerpt cannot be found in the document;
      in that case nothing is saved and the problem is appended to the global error log.
    """

    global error_log

    if not os.path.exists(json_file):  # Check if the file does NOT exist
        print(f"File '{json_file}' does not exist. Skipping execution.")
        return None
    
    print("json file:", json_file)
    
    # Load the JSON metadata
    with open(json_file, "r", encoding="utf-8") as file:
        json_data = json.load(file)

    # The metadata is stored as a one-element list; unwrap it
    if isinstance(json_data, list) and len(json_data) > 0:
        json_data = json_data[0]
    
    # Start from the normalized contract text and apply each modification to it
    modified_text = normalize_text(original_text)
    
    for perturbation in json_data["perturbation"]:
        # Normalize both the original and the changed excerpt so they can match
        original_section = normalize_text(perturbation["original_text"])  
        changed_section = normalize_text(perturbation["changed_text"])
        
        # Wrap changed section with unique <*$p$*> markers
        marked_section = f"<*$p$*>{changed_section}<*$p$*>"

        # Replace original section with marked modified section
        if original_section in modified_text:
            modified_text = modified_text.replace(original_section, marked_section)
        else:
            error_name = "FileModifyError"
            e = f"Could not find section in text: {original_section}"
            print("Warning: " + e)
            error_log += f"""\nIn {json_file}: 
            Error name: {error_name}
            Error message: {e}"""
            return error_name 

    print("File modified, saving...")
    # Save the modified contract as a new file named after the JSON's "file_name"
    modified_file_name = f"modified_{json_data['file_name']}.txt"
    modified_file_path = os.path.join(output_folder, modified_file_name)

    try:
        with open(modified_file_path, "w", encoding="utf-8") as file:
            file.write(modified_text)
    except FileNotFoundError as e:
        print(f"An error occurred while writing to the file: {e}")
        # Log against the path that failed; no `file_name` variable exists in this function
        error_log += f"""\nIn {modified_file_path}: 
        Error name: FileNotFoundError
        Error message: {e}"""

    print(f"File '{json_file}' loaded and written.") 
    return modified_text

## Functions to clean and apply highlighting to the perturbed legal documents

In [91]:
def highlight_changes(original, modified):
    """
    Compares original and modified text line by line and marks the changes.

    Args:
        original (str): Text before the change.
        modified (str): Text after the change.

    Returns:
        (str): The diff joined with newlines, where lines only present in `modified`
        are prefixed with "[MODIFIED] ", lines only present in `original` are prefixed
        with "[REMOVED] ", and unchanged lines are kept as they are.
    """
    diff = difflib.ndiff(original.split("\n"), modified.split("\n"))
    highlighted = []

    for line in diff:
        tag, body = line[:2], line[2:]
        if tag == "? ":
            # ndiff hint lines only point at intraline differences; they are not
            # part of either text and must not end up in the output.
            continue
        if tag == "+ ":  # Added text
            highlighted.append(f"[MODIFIED] {body}")
        elif tag == "- ":  # Removed text
            highlighted.append(f"[REMOVED] {body}")
        else:  # Unchanged text
            highlighted.append(body)

    return "\n".join(highlighted)

In [92]:
def extract_clean_text(perturbed_text):
    """
    Strips the [MODIFIED] / [REMOVED] tags and the explanation from a perturbed text,
    returning only the modified document itself.
    """
    # Drop every change marker
    marker_pattern = re.compile(r"\[MODIFIED\]|\[REMOVED\]")
    without_markers = marker_pattern.sub("", perturbed_text)

    # Keep only what comes before the first "Explanation:" label, if there is one
    before_explanation, _label, _tail = without_markers.partition("Explanation:")

    # Squash blank (whitespace-only) lines left behind, then trim both ends
    blank_run = re.compile(r"\n\s*\n")
    return blank_run.sub("\n", before_explanation).strip()

# Main

In [94]:
# Destination directory creation and check (no error if they already exist)
os.makedirs(folder_path_json, exist_ok=True)
os.makedirs(folder_path_save, exist_ok=True)

# Get all end folders under the read root (folders with no subfolders left to descend into)
end_folder_names = get_end_folders(folder_path_read)
# perturbation_type = "contradiction"  # Change to "ambiguity", "omission", etc.
# perturbed_legal_docs = apply_perturbations(folder_path_read, folder_path_json, folder_path_save, perturbation_type, prompt)

# Find the index of the start folder
# if start_folder in end_folder_names:
#     start_index = end_folder_names.index(start_folder)
# else:
#     start_index = 0  # Default to starting from the beginning if folder not found

# NOTE(review): this notebook-level name is separate from the local variable of the
# same name inside apply_perturbations and is not read anywhere in the visible notebook.
perturbed_json = "No perturbations applied"

# Process each end folder in turn, starting at the configured index
for folder_name in end_folder_names[start_index:]:
    print("\nCurrently in " + folder_name + "\n")
    perturbed_check = apply_perturbations(folder_name, folder_path_json, folder_path_save)
    # A falsy return value stops the whole run, skipping the remaining folders
    if not perturbed_check:
        print("Something is wrong man...")
        break


Currently in full_contract_txt/

Skipping unsupported file: .ipynb_checkpoints
We have got the legal docs:
________________________________________________________________________
Processing 2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-Branding Agreement_ Agency Agreement.txt...
json file: test_benchmark_dataset/inconsistencies_legal_contradiction_json\perturbed_2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-Branding Agreement_ Agency Agreement.txt.json
File modified, saving...
File 'test_benchmark_dataset/inconsistencies_legal_contradiction_json\perturbed_2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-Branding Agreement_ Agency Agreement.txt.json' loaded and written.
________________________________________________________________________
Processing ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEMENT.txt...
json file: test_benchmark_dataset/inconsistencies_legal_contradiction_json\perturbed_ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEMENT.txt.jso

# End of Program

In [96]:
# Write the accumulated error log string to the configured file (overwrites any existing file)
# print(error_log)
with open(error_log_name, "w", encoding="utf-8") as file:
    file.write(error_log)

# Confirmation message includes the log file name
print(f"{error_log_name} written successfully.")

Error log written successfully.


In [97]:
# # Count files as needed
# def count_files(folder_path):
#     files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
#     return len(files)

# # Example usage:
# folder_path = "benchmark_dataset_v2/misaligned_terminalogy_inText/"
# print("Number of files:", count_files(folder_path))

In [98]:
# Marker printed when execution reaches the last cell ("EOP" = end of program)
print("EOP")

EOP
