# In [None]:
import json
import os
import random
import re
import shutil
import subprocess
import sys

import fitz
import pandas as pd
from tqdm import tqdm

# PORTABLE PATHS FOR GITHUB
# Relative paths so the script runs from the repository root on any machine.

source_root = "./data/full_contract_pdf"
dest_folder = "./data/CUAD_50"
CONTRACT_DIR = dest_folder

# Number of contracts to sample into dest_folder.
NUM_CONTRACTS = 50

os.makedirs(dest_folder, exist_ok=True)

# Recursively collect all PDFs (extension match is case-insensitive)
all_pdfs = []
for root, _dirs, files in os.walk(source_root):
    for file in files:
        if file.lower().endswith(".pdf"):
            all_pdfs.append(os.path.join(root, file))

print("Total PDFs found:", len(all_pdfs))

# Select NUM_CONTRACTS PDFs, or every PDF when fewer are available.
# random.sample raises ValueError when asked for more items than the
# population holds, so the sample size is clamped first.
sample_size = min(NUM_CONTRACTS, len(all_pdfs))
if sample_size < NUM_CONTRACTS:
    print(
        f"WARNING: only {len(all_pdfs)} PDFs found under {source_root}; "
        f"copying {sample_size} instead of {NUM_CONTRACTS}."
    )
selected = random.sample(all_pdfs, sample_size)

# Copy into destination under uniform names contract_0.pdf, contract_1.pdf, ...
# NOTE: files left in dest_folder by an earlier run are overwritten when the
# index matches but are never removed.
for i, pdf_path in enumerate(selected):
    dst_path = os.path.join(dest_folder, f"contract_{i}.pdf")
    shutil.copy(pdf_path, dst_path)

print(f"Successfully copied {len(selected)} PDFs into:", dest_folder)

# OLLAMA LLM WRAPPER

def call_llm(prompt: str, model: str = "qwen2.5:1.5b-instruct") -> str:
    """Send ``prompt`` to a local Ollama model and return its reply.

    Runs ``ollama run <model> <prompt>`` as a subprocess (argument list, no
    shell) and captures both output streams as UTF-8 text, dropping bytes
    that cannot be decoded.

    Args:
        prompt: Full prompt text, passed as a single command-line argument.
        model: Name of the Ollama model to run.

    Returns:
        The command's standard output with surrounding whitespace stripped.
        An empty string is returned when the command exits with a non-zero
        status; in that case a warning with the captured stderr is written
        to ``sys.stderr``.
    """
    command = ["ollama", "run", model, prompt]

    result = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        encoding="utf-8",
        errors="ignore",
    )

    if result.returncode != 0:
        # Without this check a failed run is indistinguishable from a model
        # that answered with nothing, and the failure reason is lost.
        print(
            f"WARNING: 'ollama run {model}' exited with status "
            f"{result.returncode}: {result.stderr.strip()}",
            file=sys.stderr,
        )
        return ""

    return result.stdout.strip()

# PDF EXTRACTION HELPERS

def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of a PDF, joined with single spaces.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The per-page ``get_text()`` results joined by ``" "``.
    """
    # Open via the context manager so the document is closed once the text
    # has been read, including when a page raises during extraction.
    with fitz.open(pdf_path) as doc:
        return " ".join(page.get_text() for page in doc)

def normalize_text(text):
    """Collapse each run of whitespace in ``text`` to one space and trim the ends."""
    whitespace_run = re.compile(r"\s+")
    collapsed = whitespace_run.sub(" ", text)
    return collapsed.strip()

def chunk_text(text, size=1500):
    """Split ``text`` into consecutive pieces of at most ``size`` characters.

    The pieces do not overlap and the final one may be shorter than ``size``.
    An empty ``text`` yields an empty list.
    """
    pieces = []
    for start in range(0, len(text), size):
        stop = start + size
        pieces.append(text[start:stop])
    return pieces

# CLAUSE EXTRACTION

def extract_clause(chunk, clause_type):
    """Ask the LLM to quote the ``clause_type`` clause found in ``chunk``.

    The prompt instructs the model to answer with exact text from the chunk,
    or with the literal "NOT FOUND". The reply from ``call_llm`` is returned
    unchanged.
    """
    template = (
        "\n"
        "You are a legal clause extraction model.\n"
        "Extract the **{clause_type}** from the following contract chunk.\n"
        'Return ONLY exact text from the chunk. If not found, return "NOT FOUND".\n'
        "\n"
        "Chunk:\n"
        "{chunk}\n"
    )
    llm_prompt = template.format(clause_type=clause_type, chunk=chunk)
    return call_llm(llm_prompt)

def extract_clauses(text):
    """Find the termination, confidentiality and liability clauses in ``text``.

    The text is split with ``chunk_text`` and each chunk is sent to
    ``extract_clause`` for every clause type that has not been found yet.
    A reply is accepted when it is longer than 20 characters and does not
    contain "NOT FOUND"; the first accepted reply for a type is kept.

    Returns a dict with the keys ``termination_clause``,
    ``confidentiality_clause`` and ``liability_clause``; a type with no
    accepted reply maps to "NOT FOUND".
    """
    # Result key -> clause name inserted into the prompt. Dict order fixes
    # the order in which the clause types are queried for each chunk.
    clause_labels = {
        "termination_clause": "termination clause",
        "confidentiality_clause": "confidentiality clause",
        "liability_clause": "liability clause",
    }
    found = dict.fromkeys(clause_labels, "")

    for piece in chunk_text(text):
        for key, label in clause_labels.items():
            if found[key]:
                # Already have this clause; no further LLM call for it.
                continue
            answer = extract_clause(piece, label)
            if len(answer) > 20 and "NOT FOUND" not in answer:
                found[key] = answer

    return {key: value or "NOT FOUND" for key, value in found.items()}

# SUMMARY WITH LENGTH ENFORCEMENT

def summarize_contract(text: str) -> str:
    """Summarize a contract with the LLM, retrying once if the result is short.

    Only the first 3000 characters of ``text`` are included in the prompt.
    If the first reply has fewer than 120 whitespace-separated words, the
    model is asked once to rewrite it into 130–150 words. Replies longer
    than 150 words are returned as they are.

    Args:
        text: Contract text to summarize.

    Returns:
        The summary, or an empty string when the first reply was empty.
    """
    prompt = f"""
You are a legal contract analysis model.

Write a **detailed summary of 120–150 words**.
Strict: fewer than 120 words or more than 150 is not allowed.

Include:
- Purpose of the agreement
- Obligations of each party
- Risks, penalties, termination or breach consequences

TEXT:
{text[:3000]}
"""

    summary = call_llm(prompt)

    # An empty reply leaves nothing to rewrite: the retry prompt carries no
    # contract text, so any reply to it would not be grounded in the contract.
    if not summary:
        return summary

    # retry if too short
    if len(summary.split()) < 120:
        retry_prompt = f"""
Rewrite the following summary into **130–150 words**, keeping all meaning:

{summary}
"""
        rewritten = call_llm(retry_prompt)
        # Keep the first draft when the rewrite comes back empty.
        if rewritten:
            summary = rewritten

    return summary

# PROCESS ALL CONTRACTS

def process_contracts(folder: str) -> list:
    """Extract clauses and a summary for every PDF directly inside ``folder``.

    Files are selected by a case-insensitive ``.pdf`` suffix and processed in
    sorted path order, with a tqdm progress bar.

    Args:
        folder: Directory containing the contract PDFs (not searched
            recursively).

    Returns:
        A list with one dict per PDF holding ``contract_id``, ``summary``,
        ``termination_clause``, ``confidentiality_clause`` and
        ``liability_clause``.
    """
    pdfs = sorted(
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith(".pdf")
    )

    results = []

    for pdf in tqdm(pdfs, total=len(pdfs)):
        raw = extract_pdf_text(pdf)
        cleaned = normalize_text(raw)

        clauses = extract_clauses(cleaned)
        summary = summarize_contract(cleaned)

        # Take the ID from the file name, not the loop position: the paths
        # are sorted as strings, so "contract_10.pdf" comes before
        # "contract_2.pdf" and a positional index would label rows with the
        # wrong contract number.
        contract_id = os.path.splitext(os.path.basename(pdf))[0]

        results.append({
            "contract_id": contract_id,
            "summary": summary,
            "termination_clause": clauses["termination_clause"],
            "confidentiality_clause": clauses["confidentiality_clause"],
            "liability_clause": clauses["liability_clause"],
        })

    return results

# RUN PIPELINE
# Process every PDF in CONTRACT_DIR and write one CSV row per contract.
OUTPUT_CSV = "CUAD_LOCAL_NO_ERRORS.csv"

output = process_contracts(CONTRACT_DIR)
df = pd.DataFrame(output)
df.to_csv(OUTPUT_CSV, index=False)

print(f"DONE! Saved to {OUTPUT_CSV}")
