# Initialization and Set Up


In [1]:
!pip install google-generativeai
!pip install pymupdf

print("Install done")

Install done


## Test Gemini API Key
- gemini-2.0-flash

In [2]:
import google.generativeai as genai

In [3]:
import os
os.environ["GOOGLE_API_KEY"] = "AIzaSyCVjSqp_8WwJMVaIi3dVSQDRic5I1869kE"


In [4]:
api_key = os.getenv("GOOGLE_API_KEY")

genai.configure(api_key=api_key)

# Initialize the Gemini model
model = genai.GenerativeModel("gemini-2.0-flash")

In [5]:
# Ask a question
response = model.generate_content("Explain quantum computing in simple terms.")

# Print the response
print(response.text)


In [6]:
import fitz
import difflib
import re
import json

# Starting of pipeline 
1. reads legal documents
2. calls LLM to add perturbations
3. creates output in json format
4. stores output files in benchmark dataset

## Change source folder and destination folder


In [257]:
# Edit these as needed
"""Folder paths"""
folder_path_read = "full_contract_pdf/"

folder_path_json = "test_benchmark_dataset/ambiguity_legal_contradication_json/"

folder_path_save = "test_benchmark_dataset/ambiguity_legal_contradication/"

"""Prompts"""
# See function generate_perturbation_new


'Prompts'

## Retrieve content for each legal document


In [259]:
def get_end_folders(root_folder, skip_folder=".ipynb_checkpoints"):
    end_folders = []

    for dirpath, dirnames, _ in os.walk(root_folder, topdown=True):
        # Remove the folders that should be skipped
        dirnames[:] = [d for d in dirnames if d != skip_folder]

        # If there are no subdirectories left, it's an end folder
        if not dirnames:
            end_folders.append(os.path.join(dirpath, ""))  # Ensure trailing backslash

    return end_folders

# Example usage:
# result = get_end_folders("full_contract_pdf")
# print(result)

In [260]:
def read_pdf(file_path):
    """Reads a PDF file"""
    doc = fitz.open(file_path)
    return "\n".join([page.get_text() for page in doc])

def read_legal_files(folder_path):
    """Reads all legal files in the folder and returns a dictionary with file names and content."""
    legal_documents = {}

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if file_name.endswith(".pdf"):
            legal_documents[file_name] = read_pdf(file_path)
        else:
            print(f"Skipping unsupported file: {file_name}")

    return legal_documents

In [261]:
# Read legal files
legal_docs = read_legal_files(folder_path_read)

# Display first document
# for file_name, content in legal_docs.items():
#     print(f"--- {file_name} ---\n{content[:500]}...\n")
#     break

Skipping unsupported file: Part_I
Skipping unsupported file: Part_II
Skipping unsupported file: Part_III


In [262]:
# Printing all file names that were accepted
print(f"Total files from {folder_path_read}: {len(legal_docs)}") 
for file_name, content in legal_docs.items():
    print(f"--- {file_name} ---...\n")

Total files from full_contract_pdf/: 0


## Prompt to read through legal file and insert different types of perturbations
- 10 different types of prompts to switch
- returns output in json format, which would be considered as lock-file

In [264]:
def generate_perturbation_new(original_text, file_name, prompt):    
    """Generates a perturbed version of the legal document section in structured JSON format."""

    prompt = f"""You are a senior compliance officer reviewing a legal contract. Your task is to modify a section of the contract by introducing an ambiguous legal obligation while ensuring that the ambiguity contradicts a state or national law.
    
    Before modifying the text:
    - **Read the file** to determine what city, state, or country the contract applies to.
    - If the jurisdiction is unclear, default to **United States law**.
    - Make sure that when taking the original texts, there should be no jumps between sentences. Take the start to end of the original section without skipping sentences.
    
    ### **Definition:**
    Ambiguities occur when a legal statement is vague, leading to multiple interpretations. A **legal contradiction** under this category happens when an obligation is introduced ambiguously, making it difficult to enforce under state or national law. This can result in non-compliance with regulatory requirements, leaving legal obligations open to dispute.
    
    ### **Step-by-Step Instructions:**
    1. Identify a clear legal obligation in the contract.
    2. Modify the wording to make it **vague or open to multiple interpretations**.
    3. Ensure that this ambiguity creates **non-compliance with a specific law** in the identified jurisdiction.
    4. For that perturbation, make sure in the file there should be **2-3** of them. 
    5. Output the modified contract in structured JSON format.
    
    ---
    
    ### **Examples of Ambiguous Legal Obligations:**
    
    **Example 1:**
    - **Original:** "The company shall provide necessary accommodations for disabled employees."
    - **Modified:** "The company shall provide accommodations for disabled employees as deemed appropriate."
    - **Explanation:** The term "as deemed appropriate" introduces ambiguity, conflicting with **ADA (Americans with Disabilities Act)**, which mandates **clear, non-discretionary accommodations**.
    
    **Example 2:**
    - **Original:** "All contractors must comply with local zoning laws."
    - **Modified:** "All contractors must make reasonable efforts to comply with zoning laws."
    - **Explanation:** "Reasonable efforts" is vague—some zoning laws require strict adherence.
    
    **Example 3:**
    - **Original:** "The landlord shall ensure habitable living conditions in compliance with state law."
    - **Modified:** "The landlord shall make efforts to maintain habitable conditions."
    - **Explanation:** "Make efforts" does not guarantee habitability, which violates **tenant protection laws**.
    
    **Example 4:**
    - **Original:** "The company shall maintain data security measures that meet industry standards."
    - **Modified:** "The company shall maintain data security measures that it deems sufficient."
    - **Explanation:** "Deems sufficient" is subjective and contradicts **GDPR and CCPA** requirements for **specific security standards**.
    
    **Example 5:**
    - **Original:** "Employees shall be provided meal breaks as required by law."
    - **Modified:** "Employees shall be encouraged to take meal breaks."
    - **Explanation:** Some states require **mandatory** meal breaks (e.g., California).
    
    **Example 6:**
    - **Original:** "The Contractor shall comply with all federal and state regulations governing workplace safety, ensuring all necessary precautions are taken to protect employees from occupational hazards. The Contractor must conduct quarterly safety inspections and submit reports to regulatory authorities. Any violations of safety standards shall result in corrective actions and potential penalties. The company’s leadership is responsible for ensuring full compliance at all levels."
    - **Modified:** "The Contractor should make reasonable efforts to comply with applicable federal and state regulations governing workplace safety. The Contractor may conduct periodic safety inspections and submit reports when deemed necessary. Violations of safety standards will be assessed on a case-by-case basis, and corrective actions may be recommended where appropriate."
    - **Explanation:** This change weakens the legal obligation by replacing 'shall comply' with 'should make reasonable efforts,' making compliance discretionary rather than mandatory. The removal of 'quarterly safety inspections' eliminates a clear legal requirement, and replacing 'shall result in corrective actions' with 'may be recommended' creates uncertainty. This contradicts **OSHA regulations**, which mandate strict compliance and routine reporting on workplace safety violations.
    ---
    
    
    ### **Return JSON Format**
    {{
        "file_name": {file_name},
        "perturbation": [
            {{
                "type": "Ambiguities - Ambiguous Legal Obligation",
                "original_text": "EXCERPT BEFORE CHANGE",
                "changed_text": "EXCERPT AFTER CHANGE",
                "explanation": "WHY THIS CHANGE INTRODUCES A PERTURBATION",
                "contradicted_law": "SPECIFIC LAW OR REGULATION BEING VIOLATED",
                "location": "SECTION OR PARAGRAPH NUMBER"
            }}
        ]
    }}
    
    
    Below is the original legal text:
    -------------------
    {original_text}
    -------------------
    
    Now, return ONLY the structured JSON object with the modified text and explanation.
    """
    
    response = model.generate_content(prompt)
    return response.text if response else "ERROR: No response from API"

## Applies perturbations to files and stores in json format

In [266]:
def apply_perturbations(folder_path_read, folder_path_json, folder_path_save, prompt):
    legal_docs = read_legal_files(folder_path_read)
    

    for i, (file_name, content) in enumerate(legal_docs.items()):
        results = []
        # if i >= 10:  # Stop after processing 5 documents
        #      break
        print(f"Processing {file_name}...")
        #perturbed_json = generate_perturbation(content, file_name, perturbation_type)
        perturbed_json = generate_perturbation_new(content, file_name, prompt)
        #print("This is the perturbed json:", perturbed_json)
        clean_json_text = re.sub(r"```json|```", "", perturbed_json).strip()

        # print('this is json:', clean_json_text)
        try:
            # Convert response into a Python dictionary
            perturbed_data = json.loads(clean_json_text)
            results.append(perturbed_data)
        except json.JSONDecodeError:
            print(f"Error parsing JSON for {file_name}, skipping...")
            continue
            
        # Save the JSON output
        json_output_path = os.path.join(folder_path_json, f"perturbed_{file_name}.json")
        with open(json_output_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=4, ensure_ascii=False)

        # Apply the perturbation from json to text
        modified_contract = apply_perturbation_from_json(content, json_output_path, folder_path_save)
        print("________________________________________________________________________")

    print(f"All perturbations saved in {folder_path_save}")

    return perturbed_json

In [267]:
def normalize_text(text):
    """
    Normalizes text by removing extra spaces, line breaks, and ensuring consistent spacing. Helper function.
    """
    text = text.replace("\n", " ")  # Replace newlines with space
    text = re.sub(r"\s+", " ", text).strip()  # Replace multiple spaces with a single space
    return text

## Create and store tagged modified file from its respective json log file

In [269]:
def apply_perturbation_from_json(original_text, json_file, output_folder="test_benchmark_dataset/"):
    """
    Reads the JSON metadata and applies the described perturbations to the original document,
    adding unique <*$p$*> markers around the modified sections.

    Parameters:
    - original_text (str): The original contract text.
    - json_file (str): Path to the JSON file containing the perturbation details.
    - output_folder (str): Folder to save the modified contract.

    Returns:
    - modified_text (str): The full modified document.
    """

    # Ensure the output directory exists
    os.makedirs(output_folder, exist_ok=True)
    
    print("json file:", json_file)
    
    # Load the JSON metadata
    with open(json_file, "r", encoding="utf-8") as file:
        json_data = json.load(file)

    print("File successfully loaded") 
    if isinstance(json_data, list) and len(json_data) > 0:
        json_data = json_data[0]  # Extract the first item in the list
    
    # Normalize the original contract text
    normalized_text = normalize_text(original_text)

    # Apply modifications with unique markers
    modified_text = normalized_text
    
    for perturbation in json_data["perturbation"]:
        # Normalize both original and the changed section of text
        original_section = normalize_text(perturbation["original_text"])  
        #print("this is original text:", original_section)
        changed_section = normalize_text(perturbation["changed_text"])
        #print("this is the changed text:", changed_section)
        
        # Wrap changed section with unique <*$p$*> markers
        marked_section = f"<*$p$*>{changed_section}<*$p$*>"

        # Replace original section with marked modified section
        if original_section in modified_text:
            modified_text = modified_text.replace(original_section, marked_section)
        else:
            print(f"Warning: Could not find section in text: {original_section}")
            error = "COULD NOT MODIFY FILE"
            return error

    print("File modified, saving...")
    # Save the modified contract as a new file
    modified_file_name = f"modified_{json_data['file_name']}.txt"
    modified_file_path = os.path.join(output_folder, modified_file_name)

    with open(modified_file_path, "w", encoding="utf-8") as file:
        file.write(modified_text)

    return modified_text

## Functions to clean and apply highlighting to the perturbed legal documents

In [271]:
def highlight_changes(original, modified):
    """Compares original and modified text and marks changes."""
    original_lines = original.split("\n")
    modified_lines = modified.split("\n")

    diff = difflib.ndiff(original_lines, modified_lines)
    highlighted = []
    
    for line in diff:
        if line.startswith("+ "):  # Added text
            highlighted.append(f"[MODIFIED] {line[2:]}")
        elif line.startswith("- "):  # Removed text
            highlighted.append(f"[REMOVED] {line[2:]}")
        else:
            highlighted.append(line[2:])  
    
    return "\n".join(highlighted)

In [272]:
def extract_clean_text(perturbed_text):
    """
    Removes [MODIFIED], [REMOVED] tags and explanations, leaving only the modified version.
    """
    # Remove any [MODIFIED] or [REMOVED] markers
    clean_text = re.sub(r"\[MODIFIED\]|\[REMOVED\]", "", perturbed_text)
    
    # Remove explanations (assuming they are after a certain marker like "Explanation:")
    clean_text = re.sub(r"Explanation:.*", "", clean_text, flags=re.DOTALL)
    
    # Clean up extra spaces that may remain after removal
    clean_text = re.sub(r"\n\s*\n", "\n", clean_text).strip()
    
    
    
    return clean_text

In [278]:
# Destination directory creation and check
os.makedirs(folder_path_json, exist_ok=True)
os.makedirs(folder_path_save, exist_ok=True)

# Get all end folders, make it quick
end_folder_names = get_end_folders(folder_path_read)
perturbation_type = "contradiction"  # Change to "ambiguity", "omission", etc.
# perturbed_legal_docs = apply_perturbations(folder_path_read, folder_path_json, folder_path_save, perturbation_type, prompt)
for folder_name in end_folder_names:
    print("Currently in " + folder_name + "\n")
    perturbed_legal_docs = apply_perturbations(folder_name, folder_path_json, folder_path_save, "")

Currently in full_contract_pdf/Part_I\Affiliate_Agreements\

Skipping unsupported file: .ipynb_checkpoints
Processing CreditcardscomInc_20070810_S-1_EX-10.33_362297_EX-10.33_Affiliate Agreement.pdf...
json file: test_benchmark_dataset/ambiguity_legal_contradication_json/perturbed_CreditcardscomInc_20070810_S-1_EX-10.33_362297_EX-10.33_Affiliate Agreement.pdf.json
File successfully loaded
File modified, saving...
________________________________________________________________________
Processing CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605784_EX-10.27_Affiliate Agreement.pdf...
json file: test_benchmark_dataset/ambiguity_legal_contradication_json/perturbed_CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605784_EX-10.27_Affiliate Agreement.pdf.json
File successfully loaded
File modified, saving...
________________________________________________________________________
Processing DigitalCinemaDestinationsCorp_20111220_S-1_EX-10.10_7346719_EX-10.10_Affiliate Agreement.pdf...
json file: test_

UnboundLocalError: cannot access local variable 'perturbed_json' where it is not associated with a value

# End of Program

In [None]:
print("EOP")