# Initialization and Set Up


In [161]:
!pip install google-generativeai
!pip install pymupdf
!pip install google-generativeai pymupdf tldextract

print("Install done")

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Install done


## Test Gemini API Key
- gemini-2.0-flash

In [162]:
import google.generativeai as genai

In [163]:
import os
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")


In [164]:
api_key = os.getenv("GOOGLE_API_KEY")

genai.configure(api_key=api_key)

# Initialize the Gemini model
model = genai.GenerativeModel("gemini-2.0-flash")

In [165]:
# Ask a question
response = model.generate_content("Explain quantum computing in simple terms.")

# Print the response
print(response.text)


Imagine a light switch. A regular computer bit is like that switch: it can be either ON (1) or OFF (0). That's it.

Now imagine a dimmer switch, but instead of just controlling brightness, it can be *both* ON and OFF at the same time, and in many different amounts of each! That's like a **qubit**, the basic unit of quantum information.

Here's the breakdown:

* **Normal Computers (Classical Computers):**
    * Use **bits** which are either 0 or 1. Think of it like a light switch: either ON or OFF.
    * Solve problems by doing calculations step-by-step, in a specific order.

* **Quantum Computers:**
    * Use **qubits** which can be 0, 1, or *both* at the same time.  This "both at the same time" thing is called **superposition**. Think of it like that dimmer switch being partially on and partially off.
    *  Qubits can also be linked together in a strange way called **entanglement**.  If you know the state of one entangled qubit, you instantly know the state of the other, even if they

In [166]:
import fitz
import difflib
import re
import json

In [167]:
# -----------------------------------------------------------
# Utilities shared with scraper
# -----------------------------------------------------------
import tldextract, re

_GOV_ALLOW = re.compile(
    r"""(
        (\.gov|\.mil)$ |
        \.state\.[a-z]{2}\.us$ |
        ^uscode\.house\.gov$ |
        ^ecfr\.gov$ |
        ^govinfo\.gov$ |
        ^law\.cornell\.edu$
    )""",
    re.VERBOSE,
)

def is_official_site(url: str) -> bool:
    host = tldextract.extract(url).fqdn
    return bool(_GOV_ALLOW.search(host))

def passes_verification(pert: dict) -> bool:
    """
    Return True when:
      • an official primary link is present, AND
      • at least one snippet was successfully scraped.
    """
    ok_primary = pert.get("law_url1") and is_official_site(pert["law_url1"][0])
    return ok_primary and pert.get("scrape_success", 0) > 0


### Error log

In [168]:
# Let it be global
error_log = """Error log\n"""

# Starting of pipeline 
1. reads legal documents
2. calls LLM to add perturbations
3. creates output in json format
4. stores output files in benchmark dataset

## Change source folder and destination folder


In [169]:
# Edit these as needed

# Perturbation type - only determines file name changes
perturbation_type = "inconsistencies_legal"

# Folder paths
"""
folder_path_read = folder path to read the pdfs, put the root folder here and it picks out all pdfs

folder_path_json = folder path to save the perturbation json files

folder_path_save = folder path to save the modified perturbed text files
"""

folder_path_read = "full_contract_txt/"

folder_path_json = os.path.join("Legal_Official_v1", perturbation_type)

folder_path_save = os.path.join(folder_path_json, "modified_files")

# Switch to limit read number of files for testing
#file_read_limit_flag = False
# Limit of number of files to read
#file_read_limit = 20

# Error log file name
error_log_name = f"error_log_{perturbation_type}.txt"

# Change this as needed
#start_folder = ""
# start_folder = "full_contract_pdf/Part_I/License_Agreements/"

# Index of subfolder to start with from the root folder in folder_path_read
start_index = 0

# Prompts
# See function generate_perturbation_new









## Retrieve content for each legal document


In [170]:
def get_end_folders(root_folder, skip_folder=".ipynb_checkpoints"):
    end_folders = []

    for dirpath, dirnames, _ in os.walk(root_folder, topdown=True):
        # Remove the folders that should be skipped
        dirnames[:] = [d for d in dirnames if d != skip_folder]

        # If there are no subdirectories left, it's an end folder
        if not dirnames:
            end_folders.append(os.path.join(dirpath, ""))  # Ensure trailing backslash

    return end_folders

# Example usage:
# result = get_end_folders("full_contract_pdf")
# print(result)

### Clean Text

In [171]:
def normalize_text(text):
    """
    Normalizes text by removing extra spaces, line breaks, and ensuring consistent spacing. Helper function.
    """
    text = text.replace("\n", " ")  # Replace newlines with space
    text = re.sub(r"\s+", " ", text).strip()  # Replace multiple spaces with a single space
    return text

### Reading the pdf versions


In [172]:
def read_pdf(file_path):
    """Reads a PDF file"""
    doc = fitz.open(file_path)
    return "\n".join([page.get_text() for page in doc])
    # no error catch

### Reading the txt versions

In [173]:
def read_txt(file_path):
    """
    Reads the entire content of a text file.

    Args:
        file_path (str): Path to the text file.

    Returns:
        (str): Content of the file as a string.
    """
    global error_log
    try:
        with open(file_path, 'r', encoding="utf8") as file:
            all_text = file.read()
            clean_text = normalize_text(all_text)
        return clean_text
    except FileNotFoundError:
        e = "File not found. Please check the file path."
        error_log += f"""\nIn {file_path}: 
        Error name: FileNotFoundError
        Error message: {e}\n"""
        return e


In [174]:
def read_legal_files(folder_path):
    """Reads all legal files in the folder and returns a dictionary with file names and content."""
    #global file_read_limit
    #global file_read_limit_flag
    #file_read_limit_local = file_read_limit
    legal_documents = {}
    # file_start_ = 350
    # print(str(file_read_limit_flag) + " and " + str(file_read_limit_local))
    #Goes through the directory, looking for supported files to read
    for file_name in os.listdir(folder_path):
        # if file_start_ > 0:
        #     file_start_ -= 1
        #     continue
        
    # limit read files for experimentation
    # if file_read_limit_flag and file_read_limit_local <= 0:
    #     break;
        file_path = os.path.join(folder_path, file_name)

        # if block to determine which read function to call
        if file_name.endswith(".pdf"):
            legal_documents[file_name] = read_pdf(file_path)
            #file_read_limit_local -= 1
        elif file_name.endswith(".txt"):
            legal_documents[file_name] = read_txt(file_path)
            #file_read_limit_local -= 1
        else:
            print(f"Skipping unsupported file: {file_name}")

    return legal_documents

In [175]:
# Read legal files
legal_docs = read_legal_files(folder_path_read)

# Display first document
# for file_name, content in legal_docs.items():
#     print(f"--- {file_name} ---\n{content[:500]}...\n")
#     break

Skipping unsupported file: .ipynb_checkpoints


In [176]:
# Printing all file names that were accepted
print(f"Total files from {folder_path_read}: {len(legal_docs)}") 
for file_name, content in legal_docs.items():
    print(f"--- {file_name} ---...\n")

Total files from full_contract_txt/: 510
--- LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEMENT.txt ---...

--- WHITESMOKE,INC_11_08_2011-EX-10.26-PROMOTION AND DISTRIBUTION AGREEMENT.txt ---...

--- LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_Supply Agreement.txt ---...

--- CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEB SITE HOSTING AGREEMENT.txt ---...

--- NELNETINC_04_08_2020-EX-1-JOINT FILING AGREEMENT.txt ---...

--- ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT AGREEMENT.txt ---...

--- KIROMICBIOPHARMA,INC_05_11_2020-EX-10.23-CONSULTING AGREEMENT.txt ---...

--- VEONEER,INC_02_21_2020-EX-10.11-JOINT VENTURE AGREEMENT.txt ---...

--- DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement.txt ---...

--- PACIRA PHARMACEUTICALS, INC. - A_R STRATEGIC LICENSING, DISTRIBUTION AND MARKETING AGREEMENT .txt ---...

--- MetLife, Inc. - Remarketing Agreement.txt ---...

--- FTENETWORKS,INC_02_18_2016-EX-99.4-STRATEGIC ALLIANCE AGREEMENT.txt ---...



## Prompt to read through legal file and insert different types of perturbations
- 10 different types of prompts to switch
- returns output in json format, which would be considered as lock-file

In [177]:
def generate_perturbation_new(original_text, file_name, prompt):    
    """Generates a perturbed version of the legal document section in structured JSON format."""

    prompt = f"""
You are an employment law specialist ensuring that contractual deadlines comply with legal regulations. Your task is to modify a timeline in the contract so that it contradicts state or federal laws.

Before modifying the text:
- **Read the file** to determine what city, state, or country the contract applies to.
- If the jurisdiction is unclear, default to **United States law**.
- Make sure that when taking the original texts, there should be no jumps between sentences. Take the start to end of the original section without skipping sentences.

### **Definition:**
Inconsistencies arise when **time-sensitive obligations** in a contract do not align with legal requirements. A **legal contradiction** in this category happens when a contract sets **a deadline or requirement that violates federal or state law**, making the contractual terms unenforceable or illegal.

### **Step-by-Step Instructions:**
    1. Identify a **contractual deadline** that is regulated by law (e.g., payment terms, claims deadlines, notice periods).
    2. Modify the deadline to **conflict with state or federal legal requirements**.
    3. Ensure that the change **creates non-compliance** with regulatory standards.
    4. Use **online legal databases or verified government resources** to **find a real law** that is contradicted by this new timeline.
   5. For that contradiction, extract:
   - **The citation/title of the law** (e.g., "29 CFR § 516.2" or "CA Labor Code § 201")
   - **Two direct links (URLs)** to official or government sources for the law (**these links must be accessible. The relevant citation should be inside this URL. The page should be found - we should not get a Page Not Found or similar error.**)
   - **The first URL must come from an official government or legislature domain (ends in .gov, .mil, .state.<XX>.us, or one of uscode.house.gov, ecfr.gov, govinfo.gov, law.cornell.edu). Use the most recent version of the statute/regulation.**
   - **The second URL should be a secondary link to serve as a backup in case the first link falters, but we wish for that to not happen.**
   - **A brief explanation** of how the modified text contradicts the law (**law_explanation**)
    6. Make 2–3 such perturbations for the file.
    7. Output only the structured JSON object as shown below.
    
    ---
### **Examples of Conflicts with Regulatory Timelines:**

**Example 1:**
- **Original:** "Employees must submit harassment claims within 15 days."
- **Modified:** "Employees must submit harassment claims within 5 days."
- **Explanation:** Some states (e.g., California) require **at least 30 days** for harassment claims.

**Example 2:**
- **Original:** "Landlords must return security deposits within 21 days of lease termination."
- **Modified:** "Landlords must return security deposits within 60 days of lease termination."
- **Explanation:** Many states **mandate 14-30 days** for deposit refunds.

**Example 3:**
- **Original:** "Customers have the right to cancel a contract within 10 days of signing."
- **Modified:** "Customers may cancel contracts within 48 hours."
- **Explanation:** Some **consumer protection laws** require **at least 7-10 days** for cancellations.

**Example 4:**
- **Original:** "Workers must receive final wages within 72 hours of termination."
- **Modified:** "Workers will receive final wages at the company's discretion."
- **Explanation:** Federal and state laws **require clear final paycheck deadlines**.

**Example 5:**
- **Original:** "Loan repayment plans must allow at least 90 days for late payments before default."
- **Modified:** "Loan repayment plans may declare default after 30 days of non-payment."
- **Explanation:** Some **loan regulations** require longer grace periods.

**Example 6:**
- **Original:** "The term 'Confidential Information' shall refer to any proprietary business, financial, and technical data... Confidential Information shall be protected for a period of five (5) years from the date of disclosure."
- **Modified:** "The term 'Confidential Information' shall refer to sensitive business information disclosed by one party to the other. Each party shall determine what constitutes Confidential Information based on its internal policies. Confidentiality obligations shall remain in effect for a commercially reasonable period."
- **Explanation:** Replacing a **clear five-year obligation** with "commercially reasonable period" creates **uncertainty**. Allowing each party to define confidentiality based on internal policies **contradicts** standard contract law principles requiring **definitive** confidentiality terms.

---
### **Return JSON Format**
    {{
        "file_name": {file_name},
        "perturbation": [
            {{
                "type": "Inconsistencies - Legal Contradiction",
                "original_text": "EXCERPT BEFORE CHANGE",
                "changed_text": "EXCERPT AFTER CHANGE",
                "explanation": "WHY THIS CHANGE INTRODUCES A PERTURBATION",
                "contradicted_law": "SPECIFIC LAW OR REGULATION BEING VIOLATED",
                "law_citation": "TITLE OR SECTION OF THE LAW (e.g., '29 CFR § 516.2')",
                "law_url1": [
                    "OFFICIAL_LEGAL_REFERENCE_URL_1"
                ],
                "law_url2": [
                    "OFFICIAL_LEGAL_REFERENCE_URL_2"
                ],
                "law_explanation": "HOW AND WHY THIS MODIFICATION CONTRADICTS THAT LAW",
                "location": "SECTION OR PARAGRAPH NUMBER"
            }}
        ]
    }}

Below is the original legal text:
-------------------
{original_text}
-------------------

Now, return **ONLY** the structured JSON object with the modified text and explanation.
"""






    
    response = None
    response_text = None
    try:
        response = model.generate_content(prompt)
        response_text = response.text
    except ValueError as e:
        if "reciting from copyrighted material" in str(e):
            print("Error: The model was reciting from copyrighted material. Please modify your prompt.")
        else:
            print(f"Unexpected ValueError: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    
    return response_text if response else "ERROR: No response from API"

## Applies perturbations to files and stores in json format

In [178]:
def apply_perturbations(folder_path_read, folder_path_json,
                        folder_path_save, prompt):
    """
    Read every txt/pdf in `folder_path_read`, ask the LLM to perturb it,
    write the perturbation JSON + a marked-up contract,
    and keep an `error_log` (global str).
    """
    file_read_limit = 20                      # tweak if needed
    legal_docs      = read_legal_files(folder_path_read)
    print(f"Collected {len(legal_docs)} docs from {folder_path_read}/")

    global error_log
    os.makedirs(folder_path_json, exist_ok=True)
    os.makedirs(folder_path_save, exist_ok=True)

    for i, (file_name, content) in enumerate(legal_docs.items(), 1):
        if i > file_read_limit:
            print(f"Reached read limit of {file_read_limit}.")
            break

        print("─" * 70)
        print(f"[{i-1}] processing {file_name}")

        # ---------- skip if we already have a JSON ----------
        json_path = os.path.join(folder_path_json,
                                 f"perturbed_{file_name}.json")
        if os.path.exists(json_path):
            print("✓ already processed – skipping")
            continue

        # ---------- ask the model ----------
        perturbed_json_raw = generate_perturbation_new(content, file_name,
                                                       prompt)

        # 1) model returned None   2) model returned a dict
        # 3) model returned "ERROR: …"  – all mean “skip this file”
        if not isinstance(perturbed_json_raw, str):
            error_log += f"\nIn {file_name}: LLM returned non-text → skipped\n"
            continue
        if perturbed_json_raw.startswith("ERROR"):
            error_log += f"\nIn {file_name}: {perturbed_json_raw}\n"
            continue

        # ---------- clean the triple-backtick fence ----------
        clean_json_text = re.sub(r"```json|```", "",
                                 perturbed_json_raw).strip()

        # ---------- parse ----------
        try:
            perturbed_data = json.loads(clean_json_text)
        except json.JSONDecodeError as e:
            error_log += (f"\nIn {file_name}:\n"
                          f"  JSONDecodeError: {e}\n")
            print("✗ JSON decode error → logged & skipped")
            continue

        # LLM sometimes wraps the object in a list
        if not isinstance(perturbed_data, list):
            perturbed_data = [perturbed_data]

        # ---------- write JSON ----------
        with open(json_path, "w", encoding="utf-8") as jf:
            json.dump(perturbed_data, jf, indent=4, ensure_ascii=False)

        # ---------- make tagged contract ----------
        modified = apply_perturbation_from_json(content, json_path,
                                                folder_path_save)
        # `apply_perturbation_from_json` already logs its own warnings

    print(f"\nAll verified perturbations saved in → {folder_path_save}")
    return


In [179]:
# def normalize_text(text):
#     """
#     Normalizes text by removing extra spaces, line breaks, and ensuring consistent spacing. Helper function.
#     """
#     text = text.replace("\n", " ")  # Replace newlines with space
#     text = re.sub(r"\s+", " ", text).strip()  # Replace multiple spaces with a single space
#     return text

## Create and store tagged modified file from its respective json log file

In [180]:
def apply_perturbation_from_json(original_text, json_file, output_folder="test_benchmark_dataset/"):
    """
    Reads the JSON metadata and applies the described perturbations to the original document,
    adding unique <*$p$*> markers around the modified sections.

    Parameters:
    - original_text (str): The original contract text.
    - json_file (str): Path to the JSON file containing the perturbation details. Stripped
    - output_folder (str): Folder to save the modified contract.

    Returns:
    - modified_text (str): The full modified document.
    """

    global error_log
    # Ensure the output directory exists
    #os.makedirs(output_folder, exist_ok=True)

    if not os.path.exists(json_file):  # Check if the file does NOT exist
        print(f"File '{json_file}' does not exist. Skipping execution.")
        return null
    
    print("json file:", json_file)
    
    # Load the JSON metadata
    with open(json_file, "r", encoding="utf-8") as file:
        json_data = json.load(file)

    #print("File successfully loaded") 
    if isinstance(json_data, list) and len(json_data) > 0:
        json_data = json_data[0]  # Extract the first item in the list
    
    # Normalize the original contract text
    normalized_text = normalize_text(original_text)

    # Apply modifications with unique markers
    modified_text = normalized_text
    
    for perturbation in json_data["perturbation"]:
        # Normalize both original and the changed section of text
        original_section = normalize_text(perturbation["original_text"])  
        #print("this is original text:", original_section)
        changed_section = normalize_text(perturbation["changed_text"])
        #print("this is the changed text:", changed_section)
        
        # Wrap changed section with unique <*$p$*> markers
        marked_section = f"<*$p$*>{changed_section}<*$p$*>"

        # Replace original section with marked modified section
        if original_section in modified_text:
            modified_text = modified_text.replace(original_section, marked_section)
        else:
            error_name = "FileModifyError"
            e = f"Could not find section in text: {original_section}"
            print("Warning: " + e)
            error_log += f"""\nIn {json_file}: 
            Error name: {error_name}
            Error message: {e}\n"""
            return error_name 

    print("File modified, saving...")
    # Save the modified contract as a new file
    modified_file_name = f"modified_{json_data['file_name']}.txt"
    modified_file_path = os.path.join(output_folder, modified_file_name)

    try:
        with open(modified_file_path, "w", encoding="utf-8") as file:
            file.write(modified_text)
    except FileNotFoundError as e:
        print(f"An error occurred while writing to the file: {e}")
        error_log += f"""\nIn {file_name}: 
        Error name: FileNotFoundError
        Error message: {e}\n"""
        # continue
        

    print(f"File '{json_file}' loaded and written.") 
    return modified_text

## Functions to clean and apply highlighting to the perturbed legal documents

In [181]:
def highlight_changes(original, modified):
    """Compares original and modified text and marks changes."""
    original_lines = original.split("\n")
    modified_lines = modified.split("\n")

    diff = difflib.ndiff(original_lines, modified_lines)
    highlighted = []
    
    for line in diff:
        if line.startswith("+ "):  # Added text
            highlighted.append(f"[MODIFIED] {line[2:]}")
        elif line.startswith("- "):  # Removed text
            highlighted.append(f"[REMOVED] {line[2:]}")
        else:
            highlighted.append(line[2:])  
    
    return "\n".join(highlighted)

In [182]:
def extract_clean_text(perturbed_text):
    """
    Removes [MODIFIED], [REMOVED] tags and explanations, leaving only the modified version.
    """
    # Remove any [MODIFIED] or [REMOVED] markers
    clean_text = re.sub(r"\[MODIFIED\]|\[REMOVED\]", "", perturbed_text)
    
    # Remove explanations (assuming they are after a certain marker like "Explanation:")
    clean_text = re.sub(r"Explanation:.*", "", clean_text, flags=re.DOTALL)
    
    # Clean up extra spaces that may remain after removal
    clean_text = re.sub(r"\n\s*\n", "\n", clean_text).strip()
    
    
    
    return clean_text

In [183]:
# Destination directory creation and check
os.makedirs(folder_path_json, exist_ok=True)
os.makedirs(folder_path_save, exist_ok=True)

# Get all end folders, make it quick
end_folder_names = get_end_folders(folder_path_read)
# perturbation_type = "contradiction"  # Change to "ambiguity", "omission", etc.
# perturbed_legal_docs = apply_perturbations(folder_path_read, folder_path_json, folder_path_save, perturbation_type, prompt)

# Find the index of the start folder
# if start_folder in end_folder_names:
#     start_index = end_folder_names.index(start_folder)
# else:
#     start_index = 0  # Default to starting from the beginning if folder not found

for folder_name in end_folder_names[start_index:]:
    print("\nCurrently in " + folder_name + "\n")
    perturbed_legal_docs = apply_perturbations(folder_name, folder_path_json, folder_path_save, "")


Currently in full_contract_txt/

Skipping unsupported file: .ipynb_checkpoints
Collected 510 docs from full_contract_txt//
──────────────────────────────────────────────────────────────────────
[0] processing LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEMENT.txt
json file: Legal_Official_v1/inconsistencies_legal/perturbed_LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEMENT.txt.json
File modified, saving...
File 'Legal_Official_v1/inconsistencies_legal/perturbed_LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEMENT.txt.json' loaded and written.
──────────────────────────────────────────────────────────────────────
[1] processing WHITESMOKE,INC_11_08_2011-EX-10.26-PROMOTION AND DISTRIBUTION AGREEMENT.txt
json file: Legal_Official_v1/inconsistencies_legal/perturbed_WHITESMOKE,INC_11_08_2011-EX-10.26-PROMOTION AND DISTRIBUTION AGREEMENT.txt.json
File modified, saving...
File 'Legal_Official_v1/inconsistencies_legal/perturbed_WHITESMOKE,INC_11_08_2011-EX-10.26-PROMOTION AND DISTRIBUTION AG

# End of Program

In [184]:
# Output error log txt file
# print(error_log)
with open(error_log_name, "w", encoding="utf-8") as file:
    file.write(error_log)

print(f"{error_log_name} written successfully.")

error_log_inconsistencies_legal.txt written successfully.


In [185]:
# # Count files as needed
# def count_files(folder_path):
#     files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
#     return len(files)

# # Example usage:
# folder_path = "benchmark_dataset_v2/misaligned_terminalogy_inText/"
# print("Number of files:", count_files(folder_path))

In [186]:
print("EOP")

EOP
