# Initialization and Set Up


In [None]:
!pip install google-generativeai
!pip install pymupdf

print("Install done")

## Test Gemini API Key
- gemini-2.0-flash

In [6]:
import google.generativeai as genai

In [7]:
import os
os.environ["GOOGLE_API_KEY"] = "AIzaSyDn3S7Ltgw_ABPmw6cfD_qMv7PN8KXSlwA"
# 1. AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE
# 2. AIzaSyCKtZRj1pJMu1JVO7siNYcqG15oTgPSj3k
# 3. AIzaSyAjby-dj9aBsolOdTDpvU7_x5uje8l4yiQ
# 4. AIzaSyAGHtD2RAI1geToBsVjk-mIzVeuhlZQtA4 (Noel_)
# 5. AIzaSyBTYgTD42xCABfJy1jsHchkZEhFaw8X1_c (Mannan_)
# 7. AIzaSyDgafwAgDi2Zjvu6jdt_SIZ60VgK1Na32E (adi)
# 8. AIzaSyCWI7QJXWYBGGWGdL37W8ll0sDIwz0zqlo(adi)

In [8]:
api_key = os.getenv("GOOGLE_API_KEY")

genai.configure(api_key=api_key)

# Initialize the Gemini model
model = genai.GenerativeModel("gemini-2.0-flash")

In [9]:
# Ask a question
response = model.generate_content("Explain quantum computing in simple terms.")

# Print the response
print(response.text)


Imagine a light switch. A normal computer bit is like that light switch: it can be either ON (representing 1) or OFF (representing 0).  It's one or the other, and it's always definitely one of those states.

Quantum computers use **qubits**.  Imagine a dimmer switch instead of a regular light switch. A qubit can be ON, OFF, *or somewhere in between*.  It's not just one thing or the other, but a combination of both at the same time, until you actually look at it.  This "in-between" state is called **superposition**.

Think of it like flipping a coin. Before it lands, it's sort of both heads AND tails at the same time. That's superposition.

Another key concept is **entanglement**. Imagine two of these dimmer switches magically linked. If you change one, the other instantly changes too, even if they're far apart.  Entangled qubits are linked in a similar way, allowing them to work together in a coordinated fashion.

**So, why is this useful?**

Because qubits can be in multiple states at

In [10]:
import fitz
import difflib
import re
import json

### Error log

In [11]:
# Let it be global
error_log = """Error log\n"""

# Starting of pipeline 
1. reads legal documents
2. calls LLM to add perturbations
3. creates output in json format
4. stores output files in benchmark dataset

## Change source folder and destination folder


In [102]:
# Edit these as needed

# Perturbation type - only determines file name changes
perturbation_type = "omissions_legal"

# Folder paths
"""
folder_path_read = folder path to read the pdfs, put the root folder here and it picks out all pdfs

folder_path_json = folder path to save the perturbation json files

folder_path_save = folder path to save the modified perturbed text files
"""

folder_path_read = "full_contract_txt/"

folder_path_json = os.path.join("benchmark_dataset_v2", perturbation_type)

folder_path_save = os.path.join(folder_path_json, "modified_files")

# Switch to limit read number of files for testing
#file_read_limit_flag = False
# Limit of number of files to read
#file_read_limit = 20

# Error log file name
error_log_name = f"error_log_{perturbation_type}.txt"

# Change this as needed
#start_folder = ""
# start_folder = "full_contract_pdf/Part_I/License_Agreements/"

# Index of subfolder to start with from the root folder in folder_path_read
start_index = 0

# Prompts
# See function generate_perturbation_new









## Retrieve content for each legal document


In [103]:
def get_end_folders(root_folder, skip_folder=".ipynb_checkpoints"):
    end_folders = []

    for dirpath, dirnames, _ in os.walk(root_folder, topdown=True):
        # Remove the folders that should be skipped
        dirnames[:] = [d for d in dirnames if d != skip_folder]

        # If there are no subdirectories left, it's an end folder
        if not dirnames:
            end_folders.append(os.path.join(dirpath, ""))  # Ensure trailing backslash

    return end_folders

# Example usage:
# result = get_end_folders("full_contract_pdf")
# print(result)

### Clean Text

In [14]:
def normalize_text(text):
    """
    Normalizes text by removing extra spaces, line breaks, and ensuring consistent spacing. Helper function.
    """
    text = text.replace("\n", " ")  # Replace newlines with space
    text = re.sub(r"\s+", " ", text).strip()  # Replace multiple spaces with a single space
    return text

### Reading the pdf versions


In [15]:
def read_pdf(file_path):
    """Reads a PDF file"""
    doc = fitz.open(file_path)
    return "\n".join([page.get_text() for page in doc])
    # no error catch

### Reading the txt versions

In [16]:
def read_txt(file_path):
    """
    Reads the entire content of a text file.

    Args:
        file_path (str): Path to the text file.

    Returns:
        (str): Content of the file as a string.
    """
    global error_log
    try:
        with open(file_path, 'r', encoding="utf8") as file:
            all_text = file.read()
            clean_text = normalize_text(all_text)
        return clean_text
    except FileNotFoundError:
        e = "File not found. Please check the file path."
        error_log += f"""\nIn {file_path}: 
        Error name: FileNotFoundError
        Error message: {e}\n"""
        return e


In [21]:
def read_legal_files(folder_path):
    """Reads all legal files in the folder and returns a dictionary with file names and content."""
    #global file_read_limit
    #global file_read_limit_flag
    #file_read_limit_local = file_read_limit
    legal_documents = {}
    # file_start_ = 350
    # print(str(file_read_limit_flag) + " and " + str(file_read_limit_local))
    #Goes through the directory, looking for supported files to read
    for file_name in os.listdir(folder_path):
        # if file_start_ > 0:
        #     file_start_ -= 1
        #     continue
        
    # limit read files for experimentation
    # if file_read_limit_flag and file_read_limit_local <= 0:
    #     break;
        file_path = os.path.join(folder_path, file_name)

        # if block to determine which read function to call
        if file_name.endswith(".pdf"):
            legal_documents[file_name] = read_pdf(file_path)
            #file_read_limit_local -= 1
        elif file_name.endswith(".txt"):
            legal_documents[file_name] = read_txt(file_path)
            #file_read_limit_local -= 1
        else:
            print(f"Skipping unsupported file: {file_name}")

    return legal_documents

In [22]:
# Read legal files
legal_docs = read_legal_files(folder_path_read)

# Display first document
# for file_name, content in legal_docs.items():
#     print(f"--- {file_name} ---\n{content[:500]}...\n")
#     break

Skipping unsupported file: .ipynb_checkpoints


In [23]:
# Printing all file names that were accepted
print(f"Total files from {folder_path_read}: {len(legal_docs)}") 
for file_name, content in legal_docs.items():
    print(f"--- {file_name} ---...\n")

Total files from full_contract_txt/: 510
--- LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEMENT.txt ---...

--- WHITESMOKE,INC_11_08_2011-EX-10.26-PROMOTION AND DISTRIBUTION AGREEMENT.txt ---...

--- LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_Supply Agreement.txt ---...

--- CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEB SITE HOSTING AGREEMENT.txt ---...

--- NELNETINC_04_08_2020-EX-1-JOINT FILING AGREEMENT.txt ---...

--- ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT AGREEMENT.txt ---...

--- KIROMICBIOPHARMA,INC_05_11_2020-EX-10.23-CONSULTING AGREEMENT.txt ---...

--- VEONEER,INC_02_21_2020-EX-10.11-JOINT VENTURE AGREEMENT.txt ---...

--- DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement.txt ---...

--- PACIRA PHARMACEUTICALS, INC. - A_R STRATEGIC LICENSING, DISTRIBUTION AND MARKETING AGREEMENT .txt ---...

--- MetLife, Inc. - Remarketing Agreement.txt ---...

--- FTENETWORKS,INC_02_18_2016-EX-99.4-STRATEGIC ALLIANCE AGREEMENT.txt ---...



## Prompt to read through legal file and insert different types of perturbations
- 10 different types of prompts to switch
- returns output in json format, which would be considered as lock-file

In [100]:
def generate_perturbation_new(original_text, file_name, prompt):    
    """Generates a perturbed version of the legal document section in structured JSON format."""

    prompt = f"""You are a corporate legal advisor reviewing a contract for compliance. Your task is to introduce a contractual omission that removes a legally required consumer right, thereby violating state or national law.
    
    Before modifying the text:
    - **Read the file** to determine what city, state, or country the contract applies to.
    - If the jurisdiction is unclear, default to **United States law**.
    - Make sure that when taking the original texts, there should be no jumps between sentences. Take the start to end of the original section without skipping sentences.
   
    ### **Definition:**
    Omissions occur when a contract **removes essential information**, creating legal loopholes. A **legal contradiction** in this category happens when a contract omits **a legally mandated consumer protection**, making it non-compliant.
    
    ### **Step-by-Step Instructions:**
    1. Identify a clause that **grants consumers a legally required right**.
    2. **Remove or modify** it so that the contract becomes **non-compliant with a law**.
    3. Ensure that the omission introduces **legal risk** in the identified jurisdiction.
    4. For that perturbation, make sure in the file there should be **2-3** of them. 
    5. Output the modified contract in structured JSON format.
    
    ---
    
    ### **Examples of Omitting Consumer Rights:**
    
    **Example 1:**
    - **Original:** "Customers have the right to return the product within 30 days, as required by state law."
    - **Modified:** "Customers have the right to return the product at the company’s discretion."
    - **Explanation:** Many states (e.g., California) require **minimum return periods**—removing this makes the contract non-compliant.
    
    **Example 2:**
    - **Original:** "The landlord shall provide a habitable living environment as required by law."
    - **Modified:** "The landlord shall make reasonable efforts to provide a habitable living environment."
    - **Explanation:** "Reasonable efforts" **weakens tenant rights**, violating **housing laws**.
    
    **Example 3:**
    - **Original:** "Bank deposits are insured up to $250,000 under federal law."
    - **Modified:** "Bank deposits may be insured based on company policy."
    - **Explanation:** Federal law **mandates insurance coverage**—removing this statement misleads consumers.
    
    **Example 4:**
    - **Original:** "Employees are entitled to paid overtime after 40 hours per week."
    - **Modified:** "Overtime compensation shall be determined at the company's discretion."
    - **Explanation:** **Federal labor laws** require **clear overtime pay guidelines**.
    
    **Example 5:**
    - **Original:** "All loan agreements must provide an Annual Percentage Rate (APR) disclosure."
    - **Modified:** "Loan agreements may include an APR disclosure."
    - **Explanation:** Some laws **mandate APR transparency**—removing this makes loans non-compliant.
    
    **Example 6:**
    - **Original:** "The tenant shall maintain comprehensive liability insurance covering all damages arising from negligence or misconduct on the premises, with a minimum coverage amount of $2 million per occurrence. The tenant shall provide proof of insurance annually to the landlord. Failure to maintain coverage at this level shall constitute grounds for lease termination."
    - **Modified:** "The tenant shall maintain liability insurance covering damages arising from negligence or misconduct on the premises. Proof of insurance may be requested by the landlord as necessary."
    - **Explanation:** The removal of 'comprehensive' and the 'minimum coverage amount of $2 million per occurrence' significantly weakens the insurance requirement. The deletion of the **annual proof requirement** eliminates an important compliance measure. This creates a **legal contradiction** with state laws that mandate specific minimum coverage amounts for commercial leases.
    
    ---
    
    ### **Return JSON Format**
    {{
        "file_name": {file_name},
        "perturbation": [
            {{
                "type": "Omissions - Omission Legal Contradiction",
                "original_text": "EXCERPT BEFORE CHANGE",
                "changed_text": "EXCERPT AFTER CHANGE",
                "explanation": "WHY THIS CHANGE INTRODUCES A PERTURBATION",
                "contradicted_law": "SPECIFIC LAW OR REGULATION BEING VIOLATED",
                "location": "SECTION OR PARAGRAPH NUMBER"
            }}
        ]
    }}
    
    Below is the original legal text:
    -------------------
    {original_text}
    -------------------
    
    Now, return ONLY the structured JSON object with the modified text and explanation.
    """
    
    response = None
    response_text = None
    try:
        response = model.generate_content(prompt)
        response_text = response.text
    except ValueError as e:
        if "reciting from copyrighted material" in str(e):
            print("Error: The model was reciting from copyrighted material. Please modify your prompt.")
        else:
            print(f"Unexpected ValueError: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    
    return response_text if response else "ERROR: No response from API"

## Applies perturbations to files and stores in json format

In [101]:
def apply_perturbations(folder_path_read, folder_path_json, folder_path_save, prompt):
    legal_docs = read_legal_files(folder_path_read)
    
    print('We have got the legal docs:')

    global error_log

    for i, (file_name, content) in enumerate(legal_docs.items()):
        print("________________________________________________________________________")
        results = []
        # if i >= 50:  # Stop after processing 5 documents
        #      break

        # Strip file name whitespaces
        file_name = file_name.replace(" ", "")
        
        output_file = f"perturbed_{file_name}.json"
        output_path = os.path.join(folder_path_json, output_file)

        if os.path.exists(output_path):
            print(f"Skipping {file_name} — already processed.")
            continue
        
        print("This is file: ", i)
        print(f"Processing {file_name}...")
        #print('This is the content:', content)
        #perturbed_json = generate_perturbation(content, file_name, perturbation_type)
        perturbed_json = generate_perturbation_new(content, file_name, prompt)
        
        #print('This is the perturbed json for this file:', perturbed_json)

        # If there is no returned json, return this message
        if perturbed_json.__eq__("ERROR: No response from API"):
            continue
        
        #print("This is the perturbed json:", perturbed_json)
        clean_json_text = re.sub(r"```json|```", "", perturbed_json).strip()

        # print('this is json:', clean_json_text)
        try:
            # Convert response into a Python dictionary
            perturbed_data = json.loads(clean_json_text)
            results.append(perturbed_data)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON for {file_name}, writing into logs and skipping...")
            error_log += f"""\nIn {file_name}: 
            Error name: JSONDecodeError
            Error message: {e}\n"""
            continue
            
        # Save the JSON output
        json_output_path = os.path.join(folder_path_json, f"perturbed_{file_name}.json")
        # json_output_path = json_output_path.strip()
        try:
            with open(json_output_path, "w", encoding="utf-8") as f:
                json.dump(results, f, indent=4, ensure_ascii=False)
        except FileNotFoundError as e:
            print(f"An error occurred while writing to the file: {e}")
            error_log += f"""\nIn {file_name}: 
            Error name: FileNotFoundError
            Error message: {e}\n"""
            continue
        except IOError as e:
            print(f"An error occurred while writing to the file: {e}")
            error_log += f"""\nIn {file_name}:
            Error name: IOError
            Error message: {e}\n"""
            continue
        except json.JSONEncodeError as e:
            print(f"An error occurred while encoding JSON: {e}")
            error_log += f"""\nIn {file_name}:
            Error name: JSONEncodeError
            Error message: {e}\n"""
            continue

        
        # Apply the perturbation from json to text
        modified_contract = apply_perturbation_from_json(content, json_output_path, folder_path_save)

        # Write to a log file for this folder
    
    print(f"All perturbations saved in {folder_path_save}")

    return perturbed_json

In [26]:
# def normalize_text(text):
#     """
#     Normalizes text by removing extra spaces, line breaks, and ensuring consistent spacing. Helper function.
#     """
#     text = text.replace("\n", " ")  # Replace newlines with space
#     text = re.sub(r"\s+", " ", text).strip()  # Replace multiple spaces with a single space
#     return text

## Create and store tagged modified file from its respective json log file

In [78]:
def apply_perturbation_from_json(original_text, json_file, output_folder="test_benchmark_dataset/"):
    """
    Reads the JSON metadata and applies the described perturbations to the original document,
    adding unique <*$p$*> markers around the modified sections.

    Parameters:
    - original_text (str): The original contract text.
    - json_file (str): Path to the JSON file containing the perturbation details. Stripped
    - output_folder (str): Folder to save the modified contract.

    Returns:
    - modified_text (str): The full modified document.
    """

    global error_log
    # Ensure the output directory exists
    #os.makedirs(output_folder, exist_ok=True)

    if not os.path.exists(json_file):  # Check if the file does NOT exist
        print(f"File '{json_file}' does not exist. Skipping execution.")
        return null
    
    print("json file:", json_file)
    
    # Load the JSON metadata
    with open(json_file, "r", encoding="utf-8") as file:
        json_data = json.load(file)

    #print("File successfully loaded") 
    if isinstance(json_data, list) and len(json_data) > 0:
        json_data = json_data[0]  # Extract the first item in the list
    
    # Normalize the original contract text
    normalized_text = normalize_text(original_text)

    # Apply modifications with unique markers
    modified_text = normalized_text
    
    for perturbation in json_data["perturbation"]:
        # Normalize both original and the changed section of text
        original_section = normalize_text(perturbation["original_text"])  
        #print("this is original text:", original_section)
        changed_section = normalize_text(perturbation["changed_text"])
        #print("this is the changed text:", changed_section)
        
        # Wrap changed section with unique <*$p$*> markers
        marked_section = f"<*$p$*>{changed_section}<*$p$*>"

        # Replace original section with marked modified section
        if original_section in modified_text:
            modified_text = modified_text.replace(original_section, marked_section)
        else:
            error_name = "FileModifyError"
            e = f"Could not find section in text: {original_section}"
            print("Warning: " + e)
            error_log += f"""\nIn {json_file}: 
            Error name: {error_name}
            Error message: {e}\n"""
            return error_name 

    print("File modified, saving...")
    # Save the modified contract as a new file
    modified_file_name = f"modified_{json_data['file_name']}.txt"
    modified_file_path = os.path.join(output_folder, modified_file_name)

    try:
        with open(modified_file_path, "w", encoding="utf-8") as file:
            file.write(modified_text)
    except FileNotFoundError as e:
        print(f"An error occurred while writing to the file: {e}")
        error_log += f"""\nIn {file_name}: 
        Error name: FileNotFoundError
        Error message: {e}\n"""
        # continue
        

    print(f"File '{json_file}' loaded and written.") 
    return modified_text

## Functions to clean and apply highlighting to the perturbed legal documents

In [79]:
def highlight_changes(original, modified):
    """Compares original and modified text and marks changes."""
    original_lines = original.split("\n")
    modified_lines = modified.split("\n")

    diff = difflib.ndiff(original_lines, modified_lines)
    highlighted = []
    
    for line in diff:
        if line.startswith("+ "):  # Added text
            highlighted.append(f"[MODIFIED] {line[2:]}")
        elif line.startswith("- "):  # Removed text
            highlighted.append(f"[REMOVED] {line[2:]}")
        else:
            highlighted.append(line[2:])  
    
    return "\n".join(highlighted)

In [104]:
def extract_clean_text(perturbed_text):
    """
    Removes [MODIFIED], [REMOVED] tags and explanations, leaving only the modified version.
    """
    # Remove any [MODIFIED] or [REMOVED] markers
    clean_text = re.sub(r"\[MODIFIED\]|\[REMOVED\]", "", perturbed_text)
    
    # Remove explanations (assuming they are after a certain marker like "Explanation:")
    clean_text = re.sub(r"Explanation:.*", "", clean_text, flags=re.DOTALL)
    
    # Clean up extra spaces that may remain after removal
    clean_text = re.sub(r"\n\s*\n", "\n", clean_text).strip()
    
    
    
    return clean_text

In [106]:
# Destination directory creation and check
os.makedirs(folder_path_json, exist_ok=True)
os.makedirs(folder_path_save, exist_ok=True)

# Get all end folders, make it quick
end_folder_names = get_end_folders(folder_path_read)
# perturbation_type = "contradiction"  # Change to "ambiguity", "omission", etc.
# perturbed_legal_docs = apply_perturbations(folder_path_read, folder_path_json, folder_path_save, perturbation_type, prompt)

# Find the index of the start folder
# if start_folder in end_folder_names:
#     start_index = end_folder_names.index(start_folder)
# else:
#     start_index = 0  # Default to starting from the beginning if folder not found

for folder_name in end_folder_names[start_index:]:
    print("\nCurrently in " + folder_name + "\n")
    perturbed_legal_docs = apply_perturbations(folder_name, folder_path_json, folder_path_save, "")


Currently in full_contract_txt/

Skipping unsupported file: .ipynb_checkpoints
We have got the legal docs:
________________________________________________________________________
Skipping LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTORAGREEMENT.txt — already processed.
________________________________________________________________________
Skipping WHITESMOKE,INC_11_08_2011-EX-10.26-PROMOTIONANDDISTRIBUTIONAGREEMENT.txt — already processed.
________________________________________________________________________
Skipping LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_SupplyAgreement.txt — already processed.
________________________________________________________________________
Skipping CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEBSITEHOSTINGAGREEMENT.txt — already processed.
________________________________________________________________________
Skipping NELNETINC_04_08_2020-EX-1-JOINTFILINGAGREEMENT.txt — already processed.
___________________________________________________

# End of Program

In [106]:
# Output error log txt file
# print(error_log)
with open(error_log_name, "w", encoding="utf-8") as file:
    file.write(error_log)

print(f"{error_log_name} written successfully.")

error_log_omissions_inText.txt written successfully.


In [107]:
# # Count files as needed
# def count_files(folder_path):
#     files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
#     return len(files)

# # Example usage:
# folder_path = "benchmark_dataset_v2/misaligned_terminalogy_inText/"
# print("Number of files:", count_files(folder_path))

In [108]:
print("EOP")

EOP
