In [2]:
!pip install google-generativeai




In [3]:
!pip install pymupdf



In [4]:
import google.generativeai as genai

In [5]:
import os
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")


In [6]:
api_key = os.getenv("GOOGLE_API_KEY")


In [7]:
genai.configure(api_key=api_key)

# Initialize the Gemini model
model = genai.GenerativeModel("gemini-2.0-flash")

# Ask a question
response = model.generate_content("Explain quantum computing in simple terms.")

# Print the response
print(response.text)

Imagine a light switch. A regular computer bit is like that light switch – it can be either on (1) or off (0). Quantum computers use something called a "qubit," which is like a dimmer switch.

**Instead of just being on or off, a qubit can be on, off, *or somewhere in between, all at the same time!* This "somewhere in between" state is called superposition.**

Think of it like a coin spinning in the air. While it's spinning, it's neither heads nor tails, it's both at the same time, until it lands. A qubit is similar, it exists in multiple states simultaneously.

**Why is this powerful?**

*   **Parallel Processing:** Because a qubit can be in multiple states at once, a quantum computer can explore many possibilities simultaneously. This is like trying many different keys on a lock at the same time, instead of one at a time.
*   **Complex Problems:** This parallel processing allows quantum computers to tackle complex problems that are too difficult for regular computers, like:
    *   *

In [13]:
import fitz
import difflib
import re
import json

In [9]:
def read_pdf(file_path):
    """Reads a PDF file"""
    doc = fitz.open(file_path)
    return "\n".join([page.get_text() for page in doc])

def read_legal_files(folder_path):
    """Reads all legal files in the folder and returns a dictionary with file names and content."""
    legal_documents = {}

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if file_name.endswith(".pdf"):
            legal_documents[file_name] = read_pdf(file_path)
        else:
            print(f"Skipping unsupported file: {file_name}")

    return legal_documents

In [10]:
folder_path = "full_contract_pdf/Part_I/Affiliate_Agreements/"
legal_docs = read_legal_files(folder_path)

# Display first document
for file_name, content in legal_docs.items():
    print(f"--- {file_name} ---\n{content[:500]}...\n")
    break  # Only show the first one for preview

Skipping unsupported file: perturbed_UsioInc_20040428_SB-2_EX-10.11_1723988_EX-10.11_Affiliate Agreement 2.pdf.txt
Skipping unsupported file: .ipynb_checkpoints
--- UsioInc_20040428_SB-2_EX-10.11_1723988_EX-10.11_Affiliate Agreement 2.pdf ---
                                                                   EXHIBIT 10.11
                         NETWORK 1 FINANCIAL CORPORATION
                           AFFILIATE OFFICE AGREEMENT
THIS  AGREEMENT  is  entered  into  by  and  between  NETWORK  1 FINANCIAL, INC.
("NETWORK  1"),  a  Virginia Corporation with its principal place of business at
1501  Farm  Credit  Drive,  Suite 1500, McLean, Virginia 22102-5004, and Payment
Data  Systems,  Inc.,  the  Affiliate Office ("AFFILIATE"), a Nev...



In [11]:
for file_name, content in legal_docs.items():
    print(f"--- {file_name} ---...\n")


--- UsioInc_20040428_SB-2_EX-10.11_1723988_EX-10.11_Affiliate Agreement 2.pdf ---...

--- DigitalCinemaDestinationsCorp_20111220_S-1_EX-10.10_7346719_EX-10.10_Affiliate Agreement.pdf ---...

--- LinkPlusCorp_20050802_8-K_EX-10_3240252_EX-10_Affiliate Agreement.pdf ---...

--- CreditcardscomInc_20070810_S-1_EX-10.33_362297_EX-10.33_Affiliate Agreement.pdf ---...

--- SouthernStarEnergyInc_20051202_SB-2A_EX-9_801890_EX-9_Affiliate Agreement.pdf ---...

--- UnionDentalHoldingsInc_20050204_8-KA_EX-10_3345577_EX-10_Affiliate Agreement.pdf ---...

--- SteelVaultCorp_20081224_10-K_EX-10.16_3074935_EX-10.16_Affiliate Agreement.pdf ---...

--- TubeMediaCorp_20060310_8-K_EX-10.1_513921_EX-10.1_Affiliate Agreement.pdf ---...

--- CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605784_EX-10.27_Affiliate Agreement.pdf ---...



In [70]:
def generate_perturbation(original_text, file_name, perturbation_type="contradiction"):
    """Generates a perturbed version of the legal document section in structured JSON format."""

    prompt = f"""
    You are a legal expert trained in contract analysis. Below is a section from a legal document. 
    Your task is to modify it by introducing a {perturbation_type} and return the result in a structured JSON format.

    Examples:
    - Contradiction: Add a clause that conflicts with an earlier statement or national law.
    - Ambiguity: Make a sentence vague so it can have multiple interpretations.
    - Omission: Remove a key clause that changes the meaning significantly.
    
    Instructions:
    - Apply the perturbation directly into the text.
    - Identify and extract the exact original excerpt that was changed.
    - Provide the changed version of the excerpt.
    - Explain why the change introduces a {perturbation_type}.
    - Format the output in the following JSON structure:
    
    {{
        "file_name": {file_name},
        "text": "FULL MODIFIED DOCUMENT HERE",
        "explanation": [
            {{
                "location": "LOCATION OF CHANGE",
                "original_text": "EXCERPT BEFORE CHANGE",
                "changed_text": "EXCERPT AFTER CHANGE",
                "explanation": "WHY THIS CHANGE INTRODUCES A PERTURBATION"
            }}
        ]
    }}

    Below is the original legal text:
    -------------------
    {original_text}
    -------------------

    Now, return ONLY the structured JSON object with the modified text and explanation.
    """

    response = model.generate_content(prompt)
    return response.text if response else "ERROR: No response from API"


In [35]:
def highlight_changes(original, modified):
    """Compares original and modified text and marks changes."""
    original_lines = original.split("\n")
    modified_lines = modified.split("\n")

    diff = difflib.ndiff(original_lines, modified_lines)
    highlighted = []
    
    for line in diff:
        if line.startswith("+ "):  # Added text
            highlighted.append(f"[MODIFIED] {line[2:]}")
        elif line.startswith("- "):  # Removed text
            highlighted.append(f"[REMOVED] {line[2:]}")
        else:
            highlighted.append(line[2:])  
    
    return "\n".join(highlighted)

In [53]:
def extract_clean_text(perturbed_text):
    """
    Removes [MODIFIED], [REMOVED] tags and explanations, leaving only the modified version.
    """
    # Remove any [MODIFIED] or [REMOVED] markers
    clean_text = re.sub(r"\[MODIFIED\]|\[REMOVED\]", "", perturbed_text)
    
    # Remove explanations (assuming they are after a certain marker like "Explanation:")
    clean_text = re.sub(r"Explanation:.*", "", clean_text, flags=re.DOTALL)
    
    # Clean up extra spaces that may remain after removal
    clean_text = re.sub(r"\n\s*\n", "\n", clean_text).strip()
    
    
    
    return clean_text

In [78]:
def apply_perturbations(folder_path, perturbation_type="contradiction"):
    legal_docs = read_legal_files(folder_path)
    results = []

    for i, (file_name, content) in enumerate(legal_docs.items()):
        if i >= 3:  # Stop after processing 5 documents
            break
        print(f"Processing {file_name}...")
        perturbed_json = generate_perturbation(content, file_name, perturbation_type)
        
        clean_json_text = re.sub(r"```json|```", "", perturbed_json).strip()

        # print('this is json:', clean_json_text)
        try:
            # Convert response into a Python dictionary
            perturbed_data = json.loads(clean_json_text)
            results.append(perturbed_data)
        except json.JSONDecodeError:
            print(f"Error parsing JSON for {file_name}, skipping...")
            continue
            
            
    # Save the modified text
    folder_path = "perturbed_full_contract_pdf/Part_I/Affiliate_Agreements/"
        
    # Save the JSON output
    json_output_path = os.path.join(folder_path, f"perturbed_{perturbation_type}_legal_docs.json")
    with open(json_output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4, ensure_ascii=False)

    print(f"All perturbations saved in {folder_path}")

    return perturbed_json

In [79]:
folder_path = "full_contract_pdf/Part_I/Affiliate_Agreements/"
perturbation_type = "contradiction"  # Change to "ambiguity", "omission", etc.
perturbed_legal_docs = apply_perturbations(folder_path, perturbation_type)

Skipping unsupported file: perturbed_UsioInc_20040428_SB-2_EX-10.11_1723988_EX-10.11_Affiliate Agreement 2.pdf.txt
Skipping unsupported file: .ipynb_checkpoints
Processing UsioInc_20040428_SB-2_EX-10.11_1723988_EX-10.11_Affiliate Agreement 2.pdf...
Processing DigitalCinemaDestinationsCorp_20111220_S-1_EX-10.10_7346719_EX-10.10_Affiliate Agreement.pdf...
Error parsing JSON for DigitalCinemaDestinationsCorp_20111220_S-1_EX-10.10_7346719_EX-10.10_Affiliate Agreement.pdf, skipping...
Processing LinkPlusCorp_20050802_8-K_EX-10_3240252_EX-10_Affiliate Agreement.pdf...
All perturbations saved in perturbed_full_contract_pdf/Part_I/Affiliate_Agreements/


In [63]:
json_text = """{
    "text": "hello",
    "explain": [
        {
            "from": "tucson"
        }
    ]
}"""


In [61]:
json_output = json.loads(json_text)  # Converts to JSON with formatting
print(json_output)

{'text': 'hello', 'explain': [{'from': 'tucson'}]}


In [41]:
print(results)

[<module 'json' from '/Users/adithmouli/anaconda3/lib/python3.11/json/__init__.py'>]
