In [4]:
!pip install -qU langchain==0.2.0 langchain-community==0.2.0 langchain-core==0.2.0 langchain-text-splitters==0.2.0 langchain-huggingface chromadb pandas==2.2.2 sentence-transformers datasets ctransformers[cuda] huggingface_hub

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/12.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/12.7 MB[0m [31m38.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m8.3/12.7 MB[0m [31m121.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m12.7/12.7 MB[0m [31m233.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m135.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
db-dtypes 1.4.4 requires packaging>=24.2.0, but you have packaging 23.2 which is incompatible.
xarray 2025.11.0 requires packaging>=24.1, but you have packaging 23.2 which is incom

In [2]:
!rm -rf /kaggle/working/*

In [1]:
# Install required packages (run this cell first)
!pip install langchain_core langchain_text_splitters langchain_huggingface langchain_community chromadb

!pip install pandas requests

# ==================================================
# Imports
# ==================================================
import pandas as pd
import os
import shutil
import json
import requests
import zipfile
import io
import random

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# ==================================================
# Paths and URLs
# ==================================================
DB_PATH = "/content/compliance_db"
CUAD_URL = "https://github.com/TheAtticusProject/cuad/raw/main/data.zip"

# ==================================================
# Function to setup database
# ==================================================
def setup_compliance_db():
    if os.path.exists(DB_PATH):
        shutil.rmtree(DB_PATH)

    print("Downloading dataset...")
    try:
        r = requests.get(CUAD_URL)
        z = zipfile.ZipFile(io.BytesIO(r.content))
        z.extractall("/content/cuad_data")

        json_path = "/content/cuad_data/test.json"
        with open(json_path, 'r') as f:
            raw_data = json.load(f)
        print("Download complete.")
    except Exception as e:
        print(f"Error fetching data: {e}")
        return None

    print("Picking a random contract...")

    max_retries = 50
    contract_data = None

    for _ in range(max_retries):
        rand_idx = random.randint(0, len(raw_data['data']) - 1)
        candidate = raw_data['data'][rand_idx]
        text_content = candidate['paragraphs'][0]['context']
        if len(text_content) > 5000:
            contract_data = candidate
            print(f"Selected document at index {rand_idx}")
            break

    if not contract_data:
        print("Couldn't find a suitable doc. Picking default index 10.")
        contract_data = raw_data['data'][10]

    contract_title = contract_data.get('title', 'Unknown Contract')
    contract_text = contract_data['paragraphs'][0]['context']

    print(f"Analyzing: {contract_title}")
    print(f"Doc length: {len(contract_text)} chars")

    doc = Document(page_content=contract_text, metadata={"source": contract_title})
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents([doc])

    print("Generating embeddings and building DB...")
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    vectorstore = Chroma.from_documents(chunks, embeddings, persist_directory=DB_PATH)
    print("Vector store ready.")

    return vectorstore

# ==================================================
# Run setup
# ==================================================
vectorstore = setup_compliance_db()
if vectorstore:
    retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
    print("Retriever is ready!")


Downloading dataset...
Download complete.
Picking a random contract...
Selected document at index 30
Analyzing: BABCOCK_WILCOXENTERPRISES,INC_08_04_2015-EX-10.17-INTELLECTUAL PROPERTY AGREEMENT between THE BABCOCK _ WILCOX COMPANY and BABCOCK _ WILCOX ENTERPRISES, INC.
Doc length: 112447 chars
Generating embeddings and building DB...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector store ready.
Retriever is ready!


In [2]:
import os
import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.llms import CTransformers


print("Loading Embeddings Model...")
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

config = {'max_new_tokens': 512, 'temperature': 0.0, 'context_length': 2048, 'gpu_layers': 40}
llm = CTransformers(
    model="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    model_file="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    model_type="mistral",
    config=config
)
print("Models Loaded.")

Loading Embeddings Model...


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

mistral-7b-instruct-v0.2.Q4_K_M.gguf:   0%|          | 0.00/4.37G [00:00<?, ?B/s]

Models Loaded.


In [3]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = """You are a strict Legal Compliance Auditor.
Your task is to check if the Document Context complies with the specific Rule.

Instructions:
1. Analyze the Context.
2. Determine if the Rule is met (PASS) or violated/missing (FAIL).
3. Extract the exact text that proves your decision (Evidence).
4. If FAIL, suggest exactly what clause needs to be added (Remediation).

Output format:
Status: [PASS or FAIL]
Evidence: [Quote from text or "None"]
Remediation: [Suggestion or "None"]

Rule to Check: {input}

Document Context:
{context}
"""

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful AI legal assistant."),
    ("human", system_prompt),
])

audit_chain = create_retrieval_chain(retriever, create_stuff_documents_chain(llm, prompt))

def check_compliance(rule):
    try:
        response = audit_chain.invoke({"input": rule})
        return response["answer"]
    except Exception as e:
        return f"Error: {e}"

print("Compliance Agent Ready.")

Compliance Agent Ready.


In [4]:
import time

rules = [
    "1. Document Name: The name of the contract must be clearly stated.",
    "2. Parties: The two or more parties who signed the contract must be identified.",
    "3. Agreement Date: The date of the contract must be stated.",
    "4. Effective Date: The date when the contract becomes effective must be stated.",
    "5. Expiration Date: The date on which the contract's initial term expires must be specified.",
    "6. Renewal Term: The renewal term after the initial term expires must be defined.",
    "7. Notice to Terminate Renewal: The notice period required to terminate renewal must be specified.",
    "8. Governing Law: The state or country's law governing the contract must be specified.",
    "9. Most Favored Nation: Is there a 'Most Favored Nation' clause ensuring best terms?",
    "10. Non-Compete: Is there a restriction on the ability of a party to compete?",
    "11. Exclusivity: Is there an exclusive dealing commitment or prohibition on working with others?",
    "12. No-Solicit of Customers: Is a party restricted from soliciting customers of the counterparty?",
    "13. Competitive Restriction Exception: Are there exceptions to non-compete or exclusivity restrictions?",
    "14. No-Solicit of Employees: Is a party restricted from soliciting/hiring employees of the counterparty?",
    "15. Non-Disparagement: Is there a requirement not to disparage the counterparty?",
    "16. Termination for Convenience: Can a party terminate without cause (solely by notice)?",
    "17. Right of First Refusal (ROFR/ROFO/ROFN): Is there a clause granting a right of first refusal/offer/negotiation?",
    "18. Change of Control: Is consent/notice required if a party undergoes a change of control (e.g., merger)?",
    "19. Anti-Assignment: Is consent/notice required if the contract is assigned to a third party?",
    "20. Revenue/Profit Sharing: Is one party required to share revenue or profit?",
    "21. Price Restriction: Is there a restriction on the ability to raise/reduce prices?",
    "22. Minimum Commitment: Is there a minimum order size or amount required?",
    "23. Volume Restriction: Is there a fee increase or consent required if use exceeds a threshold?",
    "24. IP Ownership Assignment: Does IP created by one party become property of the counterparty?",
    "25. Joint IP Ownership: Is there a clause for joint/shared ownership of IP?",
    "26. License Grant: Does the contract contain a license granted by one party?",
    "27. Non-Transferable License: Does the contract limit the transferability of the license?",
    "28. Affiliate IP License-Licensor: Does the license grant include affiliates of the licensor?",
    "29. Affiliate IP License-Licensee: Does the license grant extend to affiliates of the licensee?",
    "30. Unlimited/All-You-Can-Eat License: Is there an 'enterprise' or unlimited usage license?",
    "31. Irrevocable or Perpetual License: Is the license grant irrevocable or perpetual?",
    "32. Source Code Escrow: Is one party required to deposit source code into escrow?",
    "33. Post-Termination Services: Are there obligations (transition, payment) after termination?",
    "34. Audit Rights: Does a party have the right to audit books/records/locations?",
    "35. Uncapped Liability: Is liability uncapped upon breach (e.g., for IP infringement)?",
    "36. Cap on Liability: Does the contract include a cap (limit) on liability?",
    "37. Liquidated Damages: Does the contract award liquidated damages or termination fees?",
    "38. Warranty Duration: What is the duration of warranties provided?",
    "39. Insurance: Is there a requirement for insurance to be maintained?",
    "40. Covenant Not to Sue: Is a party restricted from contesting IP validity or bringing claims?",
    "41. Third Party Beneficiary: Is there a non-contracting party who is a beneficiary?"
]

results = []
print(f"Starting Full Compliance Audit on {len(rules)} rules...\n")

for i, rule in enumerate(rules):
    category_name = rule.split(":")[0]
    print(f"[{i+1}/{len(rules)}] Checking: {category_name}...")
    raw_output = check_compliance(rule)
    status = "Unknown"
    evidence = "See raw output"
    remediation = "None"
    lines = raw_output.strip().split('\n')
    for line in lines:
        if "Status:" in line: status = line.split("Status:")[1].strip()
        if "Evidence:" in line: evidence = line.split("Evidence:")[1].strip()
        if "Remediation:" in line: remediation = line.split("Remediation:")[1].strip()

    results.append({
        "Rule_ID": category_name,
        "Full_Rule": rule,
        "Status": status,
        "Evidence": evidence,
        "Remediation": remediation
    })
audit_df = pd.DataFrame(results)
audit_df.to_csv("compliance_audit_report.csv", index=False)

print("\nAudit Complete! Report saved to 'compliance_audit_report.csv'")

Starting Full Compliance Audit on 41 rules...

[1/41] Checking: 1. Document Name...
[2/41] Checking: 2. Parties...
[3/41] Checking: 3. Agreement Date...
[4/41] Checking: 4. Effective Date...
[5/41] Checking: 5. Expiration Date...
[6/41] Checking: 6. Renewal Term...
[7/41] Checking: 7. Notice to Terminate Renewal...
[8/41] Checking: 8. Governing Law...
[9/41] Checking: 9. Most Favored Nation...
[10/41] Checking: 10. Non-Compete...
[11/41] Checking: 11. Exclusivity...
[12/41] Checking: 12. No-Solicit of Customers...
[13/41] Checking: 13. Competitive Restriction Exception...
[14/41] Checking: 14. No-Solicit of Employees...
[15/41] Checking: 15. Non-Disparagement...
[16/41] Checking: 16. Termination for Convenience...
[17/41] Checking: 17. Right of First Refusal (ROFR/ROFO/ROFN)...
[18/41] Checking: 18. Change of Control...
[19/41] Checking: 19. Anti-Assignment...
[20/41] Checking: 20. Revenue/Profit Sharing...
[21/41] Checking: 21. Price Restriction...
[22/41] Checking: 22. Minimum Commit

In [5]:
from IPython.display import display

pd.set_option('display.max_colwidth', None)

cols_to_show = [c for c in ['Rule_ID', 'Rule', 'Status', 'Evidence', 'Remediation'] if c in audit_df.columns]

def highlight_status(val):
    if "PASS" in str(val).upper():
        return 'background-color: #d4edda; color: #155724; font-weight: bold; padding: 10px'
    elif "FAIL" in str(val).upper():
        return 'background-color: #f8d7da; color: #721c24; font-weight: bold; padding: 10px'
    return ''

print(f"Compliance Report Summary")


display(audit_df[cols_to_show].style.map(highlight_status, subset=['Status']))

Compliance Report Summary


Unnamed: 0,Rule_ID,Status,Evidence,Remediation
0,1. Document Name,PASS,"The document clearly states ""ARTICLE II INTELLECTUAL PROPERTY ASSIGNMENT AND OWNERSHIP"" as the title for a section in the contract.",
1,2. Parties,PASS,"""The Parties"" is mentioned multiple times throughout the document.",
2,3. Agreement Date,FAIL,"""This Agreement and the Schedules referenced herein or therein or attached hereto or thereto, constitute the entire agreement and understanding between the Parties with respect to the subject matter hereof and supersedes all prior written and oral and all contemporaneous oral agreements and understandings with respect to the subject matter hereof."" (Section 10.2)","Add a clause stating the effective date or agreement date in the document. For example, ""This Agreement shall be effective as of [insert date]."""
3,4. Effective Date,PASS,The document includes a clear statement of effective date in section 2.5 Rights Arising in the Future under clause (c).,
4,5. Expiration Date,FAIL,None (The document does not contain an expiration date for the contract's initial term),"Add a clause specifying the expiration date for the contract's initial term, such as ""This Agreement shall commence on [Start Date] and shall continue until [End Date]."""
5,6. Renewal Term,FAIL,"""For clarity, SpinCo shall not be obligated to affirmatively abandon such Intellectual Property prior to the date that any renewal fees are due in the future."" - Section 2.6","The document should include a clause defining the renewal term for the abandoned intellectual property after the initial term expires. For example, ""The abandoned Intellectual Property shall be deemed abandoned and no longer subject to maintenance or prosecution as of [insert date]."""
6,7. Notice to Terminate Renewal,FAIL,Sections 5.9(a) and 5.10(a) state that no notice is required for abandoning or dedicating intellectual property to the public.,"Add a clause specifying the notice period required to terminate renewal of intellectual property rights in Section 2.6, similar to the notice requirements outlined in Section 10.6. For example: ""SpinCo will provide written notice to RemainCo at least [insert number] months prior to any applicable renewal fees for Intellectual Property set forth on Schedule 2.6 are due."""
7,8. Governing Law,PASS,"""Section 10.9 Governing Law. This Agreement shall be governed by, and construed and enforced in accordance with, the substantive laws of the State of Delaware, without regard to any conflicts of law provisions thereof that would result in the application of the laws of any other jurisdiction.""",
8,9. Most Favored Nation,FAIL,"""This Agreement does not and is not intended to confer any rights or remedies upon any Person other than the Parties."" - Section 10.3, Third-Party Beneficiaries.","Add a Most Favored Nation clause ensuring that the best terms are granted to all third-party beneficiaries as well. For example: ""This Agreement grants to each third party beneficiary named in this Agreement (the 'Third Party Beneficiaries') the right to enforce the provisions of this Agreement directly against the Parties, and the Parties agree that any such Third Party Beneficiary shall have all the rights and remedies under this Agreement as if it were a party hereto."""
9,10. Non-Compete,PASS,"The document also includes a clause where RemainCo agrees not to oppose SpinCo's use or registration of its house marks, as long as it does not make use of RemainCo's house marks. (Section 10.10(b))",


In [None]:
import gradio as gr

# 1. Define the wrapper function for the UI
def audit_rule(rule_text):
    if not rule_text.strip():
        return "Please enter a rule to check."

    status_msg = f"🕵️ Auditing contract for rule: '{rule_text}'..."
    # We yield the status message first so the user knows it's working
    yield status_msg

    try:
        # Reuse the 'check_compliance' logic from Cell 4/5
        # Note: We use the global 'audit_chain' we created earlier
        response = audit_chain.invoke({"input": rule_text})
        raw_answer = response["answer"]

        # Format the output for Markdown
        formatted_output = f"""
### 📋 Audit Result

**Rule:** {rule_text}

---
{raw_answer}
---

*Generated by Mistral-7B Compliance Agent*
"""
        yield formatted_output

    except Exception as e:
        yield f"❌ Error: {str(e)}"

# 2. Build the Interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ⚖️ AI Policy Compliance Auditor")
    gr.Markdown(
        "This agent checks the **currently loaded CUAD contract** against your specific compliance rules.\n"
        "It uses **RAG (Retrieval-Augmented Generation)** to find evidence in the document."
    )

    with gr.Row():
        with gr.Column(scale=4):
            rule_input = gr.Textbox(
                label="Enter Compliance Rule",
                placeholder="e.g., 'The agreement must have a Force Majeure clause.'",
                lines=2
            )
        with gr.Column(scale=1):
            check_btn = gr.Button("Check Compliance", variant="primary")

    output_area = gr.Markdown(label="Audit Report")

    # Example buttons to quickly test
    gr.Examples(
        examples=[
            ["The agreement must specify the Governing Law."],
            ["There must be a Non-Compete clause."],
            ["Payment terms must be Net 30 days."],
            ["Assignment to third parties is prohibited without consent."]
        ],
        inputs=rule_input
    )

    # Link buttons to function
    check_btn.click(fn=audit_rule, inputs=rule_input, outputs=output_area)
    rule_input.submit(fn=audit_rule, inputs=rule_input, outputs=output_area)

# 3. Launch
print("🚀 Launching Interface...")
demo.queue().launch(share=True, debug=True)

  with gr.Blocks(theme=gr.themes.Soft()) as demo:


🚀 Launching Interface...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://46dc68b860dfb0489a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
