In [16]:
# pip install -U langchain langchain-community faiss-cpu langchain-ollama openai tiktoken pypdf
# (pypdf is required at runtime by PyPDFLoader)

from langchain_ollama.embeddings import OllamaEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.faiss import FAISS
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.chat_models import ChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate
from langchain.schema import Document
import csv, json
import os


In [None]:
# 0️⃣ Require OPENAI_API_KEY to be present in the environment.
# Never inject a placeholder here: a fake key only postpones the failure to the
# first OpenAI request, and a literal in the notebook invites committing a secret.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment that launches "
        "Jupyter before running this notebook."
    )

# ✅ 1. Load controls from CSV
def load_controls_from_csv(path="ISO_27001_2022_Controls_List.csv"):
    """Read the control catalogue from a CSV file.

    The file must have a header row containing the columns ``Control ID`` and
    ``Control Title``; any other columns are ignored.

    Args:
        path: Location of the CSV file.

    Returns:
        A list of ``{"id": ..., "title": ...}`` dicts in file order, with
        surrounding whitespace stripped. Records whose ID and title are both
        empty are skipped. An empty file yields an empty list.

    Raises:
        KeyError: If the header row lacks one of the required columns.
        OSError: If the file cannot be opened.
    """
    required = ("Control ID", "Control Title")
    # "utf-8-sig" drops a leading byte-order mark (written by Excel's
    # "CSV UTF-8" export) that would otherwise be glued onto the first header
    # name; files without a BOM are read exactly as with plain "utf-8".
    with open(path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        header = reader.fieldnames or []
        missing = [column for column in required if column not in header]
        if header and missing:
            raise KeyError(f"{path}: missing required column(s): {', '.join(missing)}")

        controls = []
        for row in reader:
            # DictReader fills absent trailing cells with None on short rows.
            control_id = (row["Control ID"] or "").strip()
            title = (row["Control Title"] or "").strip()
            if not control_id and not title:
                continue  # nothing to audit in a blank record
            controls.append({"id": control_id, "title": title})
    return controls

# Read the control catalogue, then cut it into consecutive batches of at most
# 20 controls each (the final batch may be shorter).
controls = load_controls_from_csv()
groups = [
    controls[start:start + 20]
    for start in range(0, len(controls), 20)
]

# 🔄 2. Load the PDF and split it into overlapping text chunks
PDF_PATH = "your_doc.pdf"  # document to audit; change this to point at your own file
loader = PyPDFLoader(PDF_PATH)
docs = loader.load()
# 800-character chunks with a 150-character overlap, measured with len().
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150, length_function=len)
chunks = splitter.split_documents(docs)
if not chunks:
    # e.g. a scanned / image-only PDF with no text layer. Stop here with a clear
    # message, because a vector index cannot be built from zero documents.
    raise ValueError(f"No text could be extracted from {PDF_PATH!r}; nothing to index.")

# 💡 3. Build the FAISS vector store with Ollama embeddings
embeddings = OllamaEmbeddings(model="llama3")
faiss_store = FAISS.from_documents(chunks, embeddings)
# Each query returns the 5 most similar chunks.
base_retriever = faiss_store.as_retriever(search_kwargs={"k": 5})

# ⚙️ 4. Layer LLM-based contextual compression on top of the FAISS retriever.
# The extractor is driven by a deterministic (temperature=0) ChatOpenAI model;
# presumably it trims each retrieved chunk down to the query-relevant text —
# confirm against the LangChain documentation.
compressor = LLMChainExtractor.from_llm(
    ChatOpenAI(model_name="gpt-4.1-mini", temperature=0)
)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=base_retriever,
)

# 🧠 5. Build the retrieval chain for the grouped control audit
llm = ChatOpenAI(model_name="gpt-4.1-mini", temperature=0)
system_prompt = "You are an ISO 27001 auditor. Answer in JSON using the context and provided controls."
# Template variables:
#   context - filled by the stuff-documents chain with the retrieved text
#   input   - the key passed to qa_chain.invoke(); it carries the control list
#             and is also what the retriever is queried with
# Literal JSON braces are doubled so the template engine does not read them
# as variables.
human_template = """
Context:
{context}

Please audit these controls:
{input}

Return JSON:
[
  {{
    "Control ID": "...",
    "Control Title": "...",
    "Compliance": "✔️ or ❌",
    "Notes / Gaps": "...",
    "Suggested Implementation": "..."
  }},
  …
]
"""
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", human_template),
])
combine_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
# create_retrieval_chain names its second parameter `combine_docs_chain`;
# passing `combine_documents_chain` raises a TypeError.
qa_chain = create_retrieval_chain(
    retriever=compression_retriever,
    combine_docs_chain=combine_chain,
)

# 6. Execute audits group-by-group
all_results = []
for group in groups:
    controls_list = "\n".join(f"{c['id']} - {c['title']}" for c in group)
    # "input" is the key the retrieval chain queries the retriever with;
    # "controls_list" is passed as well so a prompt placeholder of that name
    # is also satisfied.
    result = qa_chain.invoke({"input": controls_list, "controls_list": controls_list})
    # The retrieval chain returns a dict; the model's reply is under "answer".
    answer = result["answer"].strip()
    if answer.startswith("```"):
        # Chat models often wrap JSON in a Markdown fence (``` or ```json):
        # drop the opening fence line and everything from the closing fence on.
        answer = answer.split("\n", 1)[-1].rsplit("```", 1)[0]
    parsed = json.loads(answer)
    # A lone object instead of an array would make extend() append its keys.
    all_results.extend([parsed] if isinstance(parsed, dict) else parsed)

# 7. Persist the collected findings to audit.json as indented JSON
with open("audit.json", mode="w") as f:
    f.write(json.dumps(all_results, indent=2))

# Report how many control entries ended up in the output.
print(f"✅ Completed audit: {len(all_results)} controls evaluated")


ImportError: Could not import chromadb python package. Please install it with `pip install chromadb`.

In [None]:


# 5. Build the retrieval chain
# Relies on `compression_retriever` created in an earlier cell.
llm = ChatOpenAI(model="gpt-4.1-mini", temperature=0.3, api_key=os.getenv("OPENAI_API_KEY"))
system_prompt = "You are an ISO 27001 auditor. Use context to audit controls and output JSON."
# Template variables:
#   context - filled by the stuff-documents chain with the retrieved text
#   input   - the key passed to qa_chain.invoke(); it carries the control list
#             and is also what the retriever is queried with
# Literal JSON braces must be doubled; single braces are parsed as template
# variables and break prompt formatting.
human_prompt = """
Context:
{context}

Audit these controls:
{input}

Return a JSON array:
[
  {{
    "Control ID": "...",
    "Control Title": "...",
    "Compliance": "✔️ or ❌",
    "Notes / Gaps": "...",
    "Suggested Implementation": "..."
  }},
  ...
]
"""
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", human_prompt),
])
combine_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
# create_retrieval_chain names its second parameter `combine_docs_chain`;
# passing `combine_documents_chain` raises a TypeError.
qa_chain = create_retrieval_chain(
    retriever=compression_retriever,
    combine_docs_chain=combine_chain,
)

# 6. Load the control list and batch them
def load_controls_from_csv(path="iso27001_annexA_controls.csv"):
    """Read the control catalogue from a CSV file.

    NOTE(review): a function with this name is also defined in an earlier cell
    with a different default path; whichever definition ran last is the one in
    effect.

    The file must have a header row containing the columns ``Control ID`` and
    ``Control Title``; any other columns are ignored.

    Args:
        path: Location of the CSV file.

    Returns:
        A list of ``{"id": ..., "title": ...}`` dicts in file order, with
        surrounding whitespace stripped. Records whose ID and title are both
        empty are skipped. An empty file yields an empty list.

    Raises:
        KeyError: If the header row lacks one of the required columns.
        OSError: If the file cannot be opened.
    """
    required = ("Control ID", "Control Title")
    # "utf-8-sig" drops a leading byte-order mark (written by Excel's
    # "CSV UTF-8" export) that would otherwise be glued onto the first header
    # name; files without a BOM are read exactly as with plain "utf-8".
    with open(path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        header = reader.fieldnames or []
        missing = [column for column in required if column not in header]
        if header and missing:
            raise KeyError(f"{path}: missing required column(s): {', '.join(missing)}")

        controls = []
        for row in reader:
            # DictReader fills absent trailing cells with None on short rows.
            control_id = (row["Control ID"] or "").strip()
            title = (row["Control Title"] or "").strip()
            if not control_id and not title:
                continue  # nothing to audit in a blank record
            controls.append({"id": control_id, "title": title})
    return controls

# Read the control catalogue, then cut it into consecutive batches of at most
# 20 controls each (the final batch may be shorter).
controls = load_controls_from_csv()
groups = [
    controls[start:start + 20]
    for start in range(0, len(controls), 20)
]

# 7. Execute the audit
all_results = []
for group in groups:
    controls_list = "\n".join(f"{c['id']} - {c['title']}" for c in group)
    # "input" is the key the retrieval chain queries the retriever with;
    # "controls_list" is passed as well so a prompt placeholder of that name
    # is also satisfied.
    result = qa_chain.invoke({"input": controls_list, "controls_list": controls_list})
    # The retrieval chain returns a dict; the model's reply is under "answer".
    answer = result["answer"].strip()
    if answer.startswith("```"):
        # Chat models often wrap JSON in a Markdown fence (``` or ```json):
        # drop the opening fence line and everything from the closing fence on.
        answer = answer.split("\n", 1)[-1].rsplit("```", 1)[0]
    parsed = json.loads(answer)
    # A lone object instead of an array would make extend() append its keys.
    all_results.extend([parsed] if isinstance(parsed, dict) else parsed)

# 8. Persist the collected findings to audit.json as indented JSON
with open("audit.json", mode="w") as f:
    f.write(json.dumps(all_results, indent=2))

# Report how many control entries ended up in the output.
print(f"✅ Completed audit: {len(all_results)} controls evaluated")