In [None]:
!pip install --upgrade pip
!pip install langchain langchain-groq chromadb pandas tiktoken
!pip install streamlit




In [None]:
from google.colab import drive

# Google Drive is where the CUAD JSON is read from and where the reports are written.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json

file_path = '/content/drive/MyDrive/CUAD_v1.json'

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale, which makes the load machine-dependent.
with open(file_path, 'r', encoding='utf-8') as f:
    cuad_data = json.load(f)

# Check structure: title of the first contract and how many paragraphs it has.
print(cuad_data['data'][0]['title'])
print(len(cuad_data['data'][0]['paragraphs']))


LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEMENT
1


In [None]:
from langchain_core.documents import Document

# One Document per CUAD paragraph. Its text is every annotated answer span of
# that paragraph joined by single spaces; the contract title goes in metadata.
# NOTE(review): only answer spans are indexed here, not the paragraph body
# itself - confirm that is the intended corpus for retrieval.
documents = []

for item in cuad_data['data']:
    title = item['title']
    for para in item['paragraphs']:
        answer_spans = []
        for question in para['qas']:
            for answer in question.get('answers', []):
                answer_spans.append(answer['text'])
        para_text = " ".join(answer_spans)

        # A paragraph without any annotated answer would give a blank document.
        if not para_text.strip():
            continue
        documents.append(Document(page_content=para_text, metadata={"title": title}))

print(f"Total documents/paragraphs for RAG: {len(documents)}")


Total documents/paragraphs for RAG: 510


In [None]:
# !pip install langchain-text-splitters
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters handed to the splitter.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split every document into overlapping chunks for embedding.
docs_chunks = text_splitter.split_documents(documents)
print(f"Total chunks: {len(docs_chunks)}")


Total chunks: 8600


In [None]:
# langchain-groq is already listed in the install command of the notebook's
# first cell, so this repeat install only matters when this cell is run on its own.
!pip install langchain-groq



In [None]:
import os
from getpass import getpass

# Never store the key in the notebook. Keep a key that is already present in
# the environment and only prompt (without echo) when none is set; assigning
# "" unconditionally would wipe a valid key and make every Groq call fail.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Groq API key: ")

In [None]:
from langchain_groq import ChatGroq
import os
from getpass import getpass

# Keep an existing key; prompt (without echo) only when none is set.
# Assigning "" here would overwrite a valid key right before the call below.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Groq API key: ")

# Smoke test: one round trip to confirm the key and the model name work.
print("Testing Groq LLM...")
llm = ChatGroq(model="llama-3.3-70b-versatile")
response = llm.invoke("What is machine learning?")
print(response.content)

Testing Groq LLM...
**Machine Learning Overview**

Machine learning is a subset of artificial intelligence (AI) that involves training algorithms to learn from data and make predictions or decisions without being explicitly programmed. It enables computers to automatically improve their performance on a task by learning from experience, rather than relying on manual programming.

**Key Characteristics:**

1. **Data-driven**: Machine learning relies on large amounts of data to train and improve models.
2. **Algorithmic**: Machine learning uses algorithms to analyze data and make predictions or decisions.
3. **Self-improving**: Machine learning models can improve their performance over time as they learn from new data.

**Types of Machine Learning:**

1. **Supervised Learning**: The model is trained on labeled data to learn the relationship between input and output.
2. **Unsupervised Learning**: The model is trained on unlabeled data to discover patterns and relationships.
3. **Reinforce

In [None]:
from langchain_community.embeddings.fake import FakeEmbeddings

# Stand-in embedder used to exercise the embed step and check the vector size.
# NOTE(review): `embeddings`, `texts` and `vectors` are all reassigned by the
# Hugging Face cell that follows, so nothing computed here reaches the vector
# store when the cells run in order.
embeddings = FakeEmbeddings(size=384)
texts = [chunk.page_content for chunk in docs_chunks]
vectors = embeddings.embed_documents(texts)
first_vector = vectors[0]
print(f"First vector length: {len(first_vector)}")

First vector length: 384


In [None]:

# !pip install langchain-huggingface
from langchain_huggingface import HuggingFaceEndpointEmbeddings
import os
from getpass import getpass

# Keep a token that is already in the environment; prompt (without echo) only
# when none is set. Assigning "" would overwrite a valid token.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

embeddings = HuggingFaceEndpointEmbeddings(
    model="sentence-transformers/all-MiniLM-L6-v2"
)

# Embed every chunk; vectors[i] is later stored next to docs_chunks[i].
texts = [doc.page_content for doc in docs_chunks]
vectors = embeddings.embed_documents(texts)

# Ingestion pairs vectors with chunks by position, so a short or partial
# response must stop here rather than silently misalign the index.
assert len(vectors) == len(texts), (
    f"Expected {len(texts)} vectors, got {len(vectors)}"
)
print(f"First vector length: {len(vectors[0])}")

First vector length: 384


In [None]:
from langchain_community.vectorstores import Chroma
from tqdm import tqdm
import chromadb

# Name under which the chunks are stored. Any cell that opens the collection
# by name must use this exact string, otherwise it queries a different one.
COLLECTION_NAME = "cuad_compliances"
# Chunks sent to Chroma per call; batching avoids one round trip per chunk.
INGEST_BATCH_SIZE = 500

if len(vectors) != len(docs_chunks):
    raise ValueError(
        f"{len(vectors)} vectors for {len(docs_chunks)} chunks; re-run the embedding cell"
    )

client = chromadb.Client()
# get_or_create keeps this cell re-runnable: create_collection raises when the
# collection already exists in the running client.
collection = client.get_or_create_collection(name=COLLECTION_NAME)

for start in tqdm(range(0, len(docs_chunks), INGEST_BATCH_SIZE), desc="Indexing chunks"):
    batch = docs_chunks[start:start + INGEST_BATCH_SIZE]
    stop = start + len(batch)
    # upsert (not add) so re-running the cell replaces rows instead of
    # colliding with ids written by a previous run.
    collection.upsert(
        ids=[str(i) for i in range(start, stop)],
        metadatas=[doc.metadata for doc in batch],
        documents=[doc.page_content for doc in batch],
        embeddings=vectors[start:stop],
    )

# Wrap the populated collection in LangChain's Chroma
vector_store = Chroma(
    client=client,
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings
)

# Now use similarity_search directly
query = "What is compliance?"
results = vector_store.similarity_search(query, k=3)

print("✓ Vector store ready!")
print(f"✓ Found {len(results)} results")

for rank, doc in enumerate(results, start=1):
    print(f"\n--- Result {rank} ---")
    print(doc.page_content[:300])

✓ Vector store ready!
✓ Found 3 results


Processing results: 100%|██████████| 3/3 [00:00<00:00, 22712.84it/s]


--- Result 1 ---
comply with any request of a governmental body or self-regulatory organization or a Plan, (c) verify compliance by Administrator with the terms of this Agreement, (d) make required regulatory reports, or (e) perform general customer supervision. Administrator agrees that it will permit the Trust or 

--- Result 2 ---
premises into full compliance with the then-current specifications and standards before the expiration date of this Agreement;

--- Result 3 ---
compliance with the terms of this Agreement. foregoing sentence, Customer may more frequently conduct "for cause" physical inspections or audits of a Facility with five (5) days' advance written notice to Manufacturer if Customer has reasonable cause to believe that an inspection or audit of such Fa





In [None]:
import pandas as pd

# Rule catalogue for the keyword compliance check.
# Each entry maps a rule id to:
#   "rule"     - human-readable statement of the requirement
#   "keywords" - phrases whose presence in a document counts as evidence
#   "severity" - "HIGH", "MEDIUM" or "LOW"
COMPLIANCE_RULES = {
    "confidentiality": {
        "rule": "Document must contain confidentiality/NDA clauses",
        "keywords": ["confidential", "proprietary", "trade secret", "non-disclosure"],
        "severity": "HIGH",
    },
    "termination": {
        "rule": "Clear termination conditions and notice periods must be specified",
        "keywords": ["termination", "notice period", "end of contract", "effective date"],
        "severity": "HIGH",
    },
    "liability_limitation": {
        "rule": "Liability limitations must be explicitly defined",
        "keywords": ["liability", "limitation of liability", "exclude", "not liable"],
        "severity": "HIGH",
    },
    "governing_law": {
        "rule": "Governing law and jurisdiction must be specified",
        "keywords": ["governing law", "jurisdiction", "applicable law", "shall be governed"],
        "severity": "HIGH",
    },
    "indemnification": {
        "rule": "Indemnification obligations must be clearly stated",
        "keywords": ["indemnify", "indemnification", "defend", "hold harmless"],
        "severity": "MEDIUM",
    },
    "payment_terms": {
        "rule": "Payment terms, schedule, and methods must be defined",
        "keywords": ["payment", "invoice", "due date", "payment terms", "price"],
        "severity": "HIGH",
    },
    "data_protection": {
        "rule": "Data protection and privacy measures must be documented",
        "keywords": ["data protection", "privacy", "GDPR", "personal data", "processing"],
        "severity": "HIGH",
    },
    "warranty": {
        "rule": "Warranty disclaimers or warranties must be present",
        "keywords": ["warranty", "warrant", "guarantee", "representation", "as-is"],
        "severity": "MEDIUM",
    },
    "dispute_resolution": {
        "rule": "Dispute resolution mechanism must be specified",
        "keywords": ["dispute", "arbitration", "mediation", "litigation", "resolution"],
        "severity": "MEDIUM",
    },
    "force_majeure": {
        "rule": "Force majeure clause should be included",
        "keywords": ["force majeure", "unforeseen", "circumstances beyond", "act of god"],
        "severity": "LOW",
    },
    "renewal_terms": {
        "rule": "Renewal and extension terms must be clear",
        "keywords": ["renew", "extension", "renewal", "continue", "successive"],
        "severity": "MEDIUM",
    },
    "intellectual_property": {
        "rule": "Intellectual property rights must be addressed",
        "keywords": ["intellectual property", "ip", "copyright", "patent", "ownership"],
        "severity": "HIGH",
    },
    "insurance": {
        "rule": "Insurance requirements should be specified",
        "keywords": ["insurance", "insure", "coverage", "premium", "policy"],
        "severity": "MEDIUM",
    },
    "modification": {
        "rule": "Amendment and modification procedures must be outlined",
        "keywords": ["amendment", "modify", "modification", "change", "waiver"],
        "severity": "MEDIUM",
    },
    "termination_for_cause": {
        "rule": "Grounds for termination for cause must be defined",
        "keywords": ["termination for cause", "breach", "material breach", "default"],
        "severity": "HIGH",
    },
}

import re

# Keywords this short (e.g. "ip") are matched as whole words only; as plain
# substrings they hit unrelated words such as "shipment" or "relationship".
_WHOLE_WORD_MAX_LEN = 3


def _keyword_matches(keyword, lowered_text):
    """Return True when `keyword` occurs in `lowered_text` (already lower-cased).

    Longer keywords keep substring matching so inflected forms still count
    ("confidential" matches "confidentiality"). Keywords of at most
    `_WHOLE_WORD_MAX_LEN` characters must stand alone as a word.
    """
    needle = keyword.lower()
    if len(needle) > _WHOLE_WORD_MAX_LEN:
        return needle in lowered_text
    pattern = r"(?<!\w)" + re.escape(needle) + r"(?!\w)"
    return re.search(pattern, lowered_text) is not None


def check_compliance(text, rules=None):
    """Run the keyword compliance check on one piece of text.

    Args:
        text: Document text to scan. ``None`` is treated as empty text.
        rules: Mapping of rule id to a dict with "rule", "keywords" and
            "severity". Defaults to the module-level ``COMPLIANCE_RULES``,
            looked up at call time so an updated catalogue is picked up
            without redefining this function.

    Returns:
        A pandas DataFrame with one row per rule and the columns "Rule",
        "Severity", "Compliant" ("YES"/"NO") and "Matched Keywords"
        (comma-separated keywords, or "None" when nothing matched).
    """
    if rules is None:
        rules = COMPLIANCE_RULES

    # Lower-case once instead of once per keyword.
    lowered_text = (text or "").lower()

    report = []
    for rule in rules.values():
        matched_keywords = [
            kw for kw in rule['keywords'] if _keyword_matches(kw, lowered_text)
        ]
        report.append({
            "Rule": rule['rule'],
            "Severity": rule['severity'],
            "Compliant": "YES" if matched_keywords else "NO",
            "Matched Keywords": ", ".join(matched_keywords) if matched_keywords else "None"
        })
    return pd.DataFrame(report)


In [None]:
# Keyword-check every document and tag each per-document report with the
# contract title it came from.
all_reports = [
    check_compliance(doc.page_content).assign(Document=doc.metadata['title'])
    for doc in documents
]

# Stack the per-document reports, persist them to Drive, and preview the top rows.
final_report = pd.concat(all_reports, ignore_index=True)
final_report.to_csv("/content/drive/MyDrive/cuad_compliance_report.csv", index=False)
final_report.head()


Unnamed: 0,Rule,Severity,Compliant,Matched Keywords,Document
0,Document must contain confidentiality/NDA clauses,HIGH,YES,nda,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...
1,Clear termination conditions and notice period...,HIGH,YES,"termination, terminate",LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...
2,Liability limitations must be explicitly defined,HIGH,YES,liability,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...
3,Governing law and jurisdiction must be specified,HIGH,NO,,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...
4,Indemnification obligations must be clearly st...,MEDIUM,NO,,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...


In [None]:
# !pip install langchain langchain-core
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import Chroma
import os
from getpass import getpass

# Keep an existing key; prompt (without echo) only when none is set.
# Assigning "" here would overwrite a valid key right before the LLM call.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Groq API key: ")

# Your vector store
vector_store = Chroma(
    client=client,
    # Open the collection the ingestion cell filled, by its actual name,
    # instead of a re-typed string literal that can drift from it.
    collection_name=collection.name,
    embedding_function=embeddings
)

# Initialize Groq LLM
llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0)

# Search for relevant documents
query = "What are the compliance requirements?"
docs = vector_store.similarity_search(query, k=3)
if not docs:
    # An empty context would make the model answer from nothing.
    raise RuntimeError("Vector store returned no passages; run the ingestion cell first.")

# Combine documents into context
context = "\n\n".join([doc.page_content for doc in docs])

# Create prompt and get response
prompt = f"""Answer the question based on this context:

Context:
{context}

Question: {query}

Answer:"""

response = llm.invoke(prompt)
print(response.content)

The compliance requirements include:

1. Complying with requests from governmental bodies, self-regulatory organizations, or Plans.
2. Verifying compliance with the terms of the Agreement.
3. Making required regulatory reports.
4. Performing general customer supervision.
5. Permitting access to personnel and records for monitoring the quality of services.
6. Allowing inspections or audits to determine compliance, including access to:
   - Relevant records
   - Personnel
   - Manufacturing operations
   - Validation
   - Cleaning
   - Sampling
   - Laboratory testing
   - Warehouse receiving and storage
7. Bringing premises into full compliance with current specifications and standards before the expiration date of the Agreement.


In [None]:
from langchain_groq import ChatGroq
from langchain_community.vectorstores import Chroma
import os
from getpass import getpass

# Keep an existing key; prompt (without echo) only when none is set.
# Assigning "" here would overwrite a valid key right before the LLM call.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Groq API key: ")

# Your vector store
vector_store = Chroma(
    client=client,
    # Open the collection the ingestion cell filled, by its actual name,
    # instead of a re-typed string literal that can drift from it.
    collection_name=collection.name,
    embedding_function=embeddings
)

# Initialize Groq LLM
llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0)

# Your rule
rule_text = COMPLIANCE_RULES["confidentiality"]["rule"]

# Search for relevant documents
docs = vector_store.similarity_search(rule_text, k=3)
if not docs:
    # An empty context would make the model invent evidence.
    raise RuntimeError("Vector store returned no passages; run the ingestion cell first.")

# Combine documents into context
context = "\n\n".join([doc.page_content for doc in docs])

# Create prompt and get response
prompt = f"""Based on this compliance rule, find relevant evidence in the documents:

Rule: {rule_text}

Context from documents:
{context}

Analysis:"""

response = llm.invoke(prompt)
print("Evidence from documents:\n", response.content)

Evidence from documents:
 Based on the provided context from the documents, the relevant evidence for the compliance rule "Document must contain confidentiality/NDA clauses" is:

1. The mention of "CONFIDENTIALITY OBLIGATIONS SET FORTH IN SECTION 12" in the first excerpt, which suggests that the document contains a section dedicated to confidentiality obligations.
2. The reference to "a party's breach of its obligations relating to confidentiality under Article XI" in the second excerpt, which implies that the document includes provisions related to confidentiality and the consequences of breaching those obligations.
3. The phrase "CONFIDENTIALITY OBLIGATIONS" itself, which is explicitly mentioned in the excerpts, indicating that the document addresses confidentiality.

However, it is worth noting that the provided context does not include the actual confidentiality/NDAs clauses, but rather references to them in other sections of the document. To fully confirm compliance with the rule,

In [None]:
# =======================
# Imports & Setup
# =======================
from langchain_groq import ChatGroq
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
import pandas as pd
import os
from tqdm import tqdm
import json

# =======================
# Environment
# =======================
from getpass import getpass

# Keep an existing key; prompt (without echo) only when none is set.
# Assigning "" here would overwrite a valid key before the batch run below.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Groq API key: ")

# =======================
# Load CUAD JSON into Documents
# =======================
cuad_path = "/content/drive/MyDrive/CUAD_v1.json"
documents = []

# Decode explicitly as UTF-8 so the load does not depend on the platform locale.
with open(cuad_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# One Document per paragraph: each question followed by its answer spans.
for item in data["data"]:
    title = item.get("title", "Unknown")
    for para in item.get("paragraphs", []):
        content = " ".join(
            [qa.get("question","") + " " + " ".join([a["text"] for a in qa.get("answers",[])])
             for qa in para.get("qas",[])]
        )
        if content.strip():
            # "source" is what the checker below reads. "title" is kept as well
            # because this rebinds `documents`, and the keyword-report cell
            # reads doc.metadata['title'] from the same variable.
            documents.append(
                Document(page_content=content, metadata={"source": title, "title": title})
            )

print(f"✅ Loaded {len(documents)} documents")

# =======================
# Vector Store & LLM
# =======================
vector_store = Chroma(
    client=client,
    # Open the collection the ingestion cell filled, by its actual name,
    # instead of a re-typed string literal that can drift from it.
    collection_name=collection.name,
    embedding_function=embeddings
)

# Use smaller model to avoid rate limits
llm = ChatGroq(model="llama-3.1-8b-instant", temperature=0)

# =======================
# Compliance Checker
# =======================
import re

# Keywords this short (e.g. "ip") are matched as whole words only; as plain
# substrings they hit unrelated words such as "shipment" or "relationship".
_WHOLE_WORD_MAX_LEN = 3


def _keyword_matches(keyword, lowered_text):
    """Return True when `keyword` occurs in `lowered_text` (already lower-cased).

    Longer keywords keep substring matching so inflected forms still count.
    Keywords of at most `_WHOLE_WORD_MAX_LEN` characters must stand alone as a word.
    """
    needle = keyword.lower()
    if len(needle) > _WHOLE_WORD_MAX_LEN:
        return needle in lowered_text
    pattern = r"(?<!\w)" + re.escape(needle) + r"(?!\w)"
    return re.search(pattern, lowered_text) is not None


def check_compliance_with_groq(doc, rules=None):
    """Keyword-check one document, then ask the LLM about the doubtful rules.

    A rule is sent to the LLM when no keyword matched or when its severity is
    "HIGH". All such rules are batched into a single prompt, and the raw LLM
    answer is stored as the evidence of exactly those rows. Rows that were not
    sent to the LLM get a fixed keyword-match note.

    Uses the module-level ``vector_store`` and ``llm``.

    Args:
        doc: Object with ``page_content`` (str) and ``metadata`` (dict); the
            contract name is read from ``metadata['source']``.
        rules: Mapping of rule id to a dict with "rule", "keywords" and
            "severity". Defaults to the module-level ``COMPLIANCE_RULES``,
            looked up at call time.

    Returns:
        A pandas DataFrame with one row per rule and the columns "Document",
        "Rule", "Severity", "Compliant", "Matched Keywords" and "Evidence".
        An LLM failure is recorded in "Evidence" as "LLM error: ..." rather
        than raised.
    """
    if rules is None:
        rules = COMPLIANCE_RULES

    source = doc.metadata.get('source', 'Unknown')
    # Lower-case once instead of once per keyword.
    lowered_text = doc.page_content.lower()

    report = []
    llm_rows = []  # the rows (same dict objects as in `report`) that go to the LLM

    # Step 1: Keyword-based compliance check
    for rule in rules.values():
        matched_keywords = [
            kw for kw in rule['keywords'] if _keyword_matches(kw, lowered_text)
        ]
        compliant = "YES" if matched_keywords else "NO"

        row = {
            "Document": source,
            "Rule": rule['rule'],
            "Severity": rule['severity'],
            "Compliant": compliant,
            "Matched Keywords": ", ".join(matched_keywords) if matched_keywords else "None",
            "Evidence": None  # fill later
        }
        report.append(row)

        if compliant == "NO" or rule['severity'] == "HIGH":
            llm_rows.append(row)

    # Step 2: LLM evaluation for non-compliant/high rules (batch them)
    if llm_rows:
        rules_text = "\n".join([f"- {row['Rule']}" for row in llm_rows])
        search_query = " ".join([row['Rule'] for row in llm_rows])
        # Restrict retrieval to passages of the contract under review; an
        # unfiltered search returns the closest chunks of *any* contract.
        # Assumes indexed chunks carry the contract name under the "title"
        # metadata key, as the documents built for chunking do.
        passages = vector_store.similarity_search(
            search_query, k=2, filter={"title": source}
        )
        context = "\n\n".join([p.page_content for p in passages])

        if context.strip():
            prompt = f"""Analyze the following document for compliance:

Document context:
{context}

Rules to check:
{rules_text}

For each rule, provide: YES/NO compliance and 1-2 sentence evidence."""

            try:
                response = llm.invoke(prompt)
                evidence_text = response.content
            except Exception as e:
                evidence_text = f"LLM error: {str(e)}"
        else:
            # Do not spend an LLM call on an empty context.
            evidence_text = "No indexed passages found for this document; LLM check skipped."

        # Only the rows that were actually sent to the LLM receive its answer.
        for row in llm_rows:
            row["Evidence"] = evidence_text

    # Step 3: Fill remaining rules with keyword-match evidence
    for row in report:
        if row["Evidence"] is None:
            row["Evidence"] = "Keyword match found, rule likely satisfied."

    return pd.DataFrame(report)

# =======================
# Process all documents
# =======================
MAX_DOCS = 100  # Start with 100 to avoid rate limits

# Run the checker on the first MAX_DOCS documents. A failure on one document
# is reported and skipped so the rest of the batch still completes.
all_reports = []
progress = tqdm(documents[:MAX_DOCS], desc="Processing documents")
for doc in progress:
    try:
        all_reports.append(check_compliance_with_groq(doc))
    except Exception as e:
        print(f"Error processing {doc.metadata.get('source')}: {str(e)}")

# Stack the per-document reports into one table.
final_report = pd.concat(all_reports, ignore_index=True)

# Preview the first rows under a separator line.
separator = "=" * 80
print("\n" + separator)
print(final_report.head(10))

# Persist the combined report to Drive.
final_report.to_csv("/content/drive/MyDrive/cuad_compliance_report_groq.csv", index=False)
print(f"\n✓ Report saved! Total records: {len(final_report)}")

✅ Loaded 510 documents


Processing documents: 100%|██████████| 100/100 [07:22<00:00,  4.43s/it]


                                            Document  \
0  LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...   
1  LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...   
2  LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...   
3  LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...   
4  LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...   
5  LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...   
6  LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...   
7  LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...   
8  LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...   
9  LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...   

                                                Rule Severity Compliant  \
0  Document must contain confidentiality/NDA clauses     HIGH       YES   
1  Clear termination conditions and notice period...     HIGH       YES   
2   Liability limitations must be explicitly defined     HIGH       YES   
3   Governing law and jurisdiction must be specified     HIGH       YES   
4  Inde


