In [15]:
from extractors.pdf_extractor import extract_pdf_as_markdown,extract_docx_as_markdown
from chains.audit_chain import analyze_module
from chains.improvement_chain import get_improvement_chain
from langchain_community.chat_models import ChatOpenAI
import pandas as pd
from dotenv import load_dotenv
import asyncio
import os
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import get_openai_callback

In [16]:
# Load environment variables from a local .env file, if one is present.
load_dotenv()
# NOTE(review): do not print OPENAI_API_KEY here; cell outputs are saved with the notebook.

# Document under audit and the control list it is checked against.
file_path = "./files/Information Security Policy [Internal].docx"
controls = pd.read_json("ISO_27001_2022_Controls_List.json")


In [17]:
# Show the loaded controls table as this cell's output.
controls

Unnamed: 0,Section,Control ID,Control Title
0,A.5 (Organisational Controls),A.5.1,Policies for information security
1,A.5 (Organisational Controls),A.5.2,Information security roles and responsibilities
2,A.5 (Organisational Controls),A.5.3,Segregation of duties
3,A.5 (Organisational Controls),A.5.4,Management responsibilities
4,A.5 (Organisational Controls),A.5.5,Contact with authorities
...,...,...,...
88,A.8 (Technological Controls),A.8.30,Outsourced development
89,A.8 (Technological Controls),A.8.31,"Separation of development, test and production..."
90,A.8 (Technological Controls),A.8.32,Change management
91,A.8 (Technological Controls),A.8.33,Test information


In [5]:
import numpy as np

# Cut the controls table into five consecutive batches whose row counts
# differ by at most one.
n = len(controls)
part_size, remainder = divmod(n, 5)

# The first `remainder` batches each take one extra row.
sizes = [part_size + 1 if i < remainder else part_size for i in range(5)]
indices = np.cumsum([0] + sizes)

# Slice between neighbouring boundaries; every batch gets a fresh 0-based index.
dfs = []
for start, stop in zip(indices[:-1], indices[1:]):
    dfs.append(controls.iloc[start:stop].reset_index(drop=True))
df1, df2, df3, df4, df5 = dfs

In [6]:
def extract_file_as_markdown(file_path):
    """Convert the document at ``file_path`` to markdown.

    The extractor is chosen from the lower-cased file extension:
    ``.pdf`` goes to ``extract_pdf_as_markdown`` and ``.docx`` goes to
    ``extract_docx_as_markdown``.

    Raises:
        ValueError: if the extension is neither ``.pdf`` nor ``.docx``.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == ".docx":
        return extract_docx_as_markdown(file_path)
    if suffix == ".pdf":
        return extract_pdf_as_markdown(file_path)

    raise ValueError(f"Unsupported file type: {suffix}")

In [7]:
# Extract the document at file_path as markdown; the extractor is picked by file extension.
text = extract_file_as_markdown(file_path)


✔️ Tesseract detected: tesseract 4.1.1


In [8]:

# llm = ChatOpenAI(model="gpt-4.1-mini", temperature=0.3, api_key=os.getenv("OPENAI_API_KEY"))
# clauses = [df1,df2,df3,df4,df5]
# audits = []
# results = []
# for clause in clauses:
#     audit_chain = analyze_module(llm)
#     results.append(audit_chain.run(text=text, control_json=clause.to_json(orient='records', indent=2)))



In [9]:
from langchain.callbacks import get_openai_callback
import nest_asyncio
import sys

# Patch asyncio so coroutines can be driven from inside the notebook's
# already-running event loop.
nest_asyncio.apply()

# Go through the extension-aware dispatcher instead of calling the PDF
# extractor directly: file_path points at a .docx, so the PDF-only call
# bypassed the format check and replaced the text extracted earlier.
text = extract_file_as_markdown(file_path)

llm = ChatOpenAI(
    model="gpt-4.1-mini",
    temperature=0,
    streaming=False,
    api_key=os.getenv("OPENAI_API_KEY"),
)

# One batch of controls per concurrent audit call.
clauses = [df1, df2, df3, df4, df5]

# Module-level placeholders only; run_all_clauses() keeps its own local
# accumulators and returns the collected results.
results = []
total_tokens = 0
total_cost = 0.0

async def process_clause_async(clause, text, llm):
    """Audit one batch of controls against the document in a worker thread.

    The blocking chain call runs inside ``get_openai_callback`` so the token
    and cost counters for this batch are read alongside the chain output.

    Args:
        clause: DataFrame of controls; serialised with ``to_json`` for the chain.
        text: Document text passed to the chain as ``text``.
        llm: Model handed to ``analyze_module`` to build the chain.

    Returns:
        dict with keys ``result`` (chain output), ``tokens``, ``cost`` and
        ``controls`` (number of rows in ``clause``).
    """

    def _audit_blocking():
        with get_openai_callback() as usage:
            chain = analyze_module(llm)
            controls_payload = clause.to_json(orient='records', indent=2)
            answer = chain.run(text=text, control_json=controls_payload)
            # Read the counters while the callback context is still open.
            summary = {
                "result": answer,
                "tokens": usage.total_tokens,
                "cost": usage.total_cost,
                "controls": len(clause),
            }
        return summary

    outcome = await asyncio.to_thread(_audit_blocking)
    return outcome

async def run_all_clauses():
    """Audit every batch in ``clauses`` concurrently and report usage.

    Reads the module-level ``clauses``, ``text`` and ``llm``. Prints the
    control count, tokens and cost for each batch, followed by the totals.

    Returns:
        list of the per-batch chain outputs, in the same order as ``clauses``.
    """
    pending = [process_clause_async(batch, text, llm) for batch in clauses]
    outcomes = await asyncio.gather(*pending)

    results = []
    total_tokens = 0
    total_cost = 0.0
    for outcome in outcomes:
        results.append(outcome["result"])
        tokens_used = outcome["tokens"]
        cost_usd = outcome["cost"]
        total_tokens += tokens_used
        total_cost += cost_usd
        print(f"🔎 Clause Processed: {outcome['controls']} controls")
        print(f"🧠 Tokens used: {tokens_used}")
        print(f"💵 Cost: ${cost_usd:.6f}")

    print("\n✅ All clauses processed (async).")
    print(f"🔢 Total tokens used: {total_tokens}")
    print(f"💰 Total cost: ${total_cost:.6f}")

    return results

if __name__ == "__main__":
    if sys.platform == "win32":
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    final_results = await run_all_clauses() if 'google.colab' in sys.modules or 'IPython' in sys.modules else asyncio.run(run_all_clauses())


  llm = ChatOpenAI(model="gpt-4.1-mini", temperature=0,streaming=False, api_key=os.getenv("OPENAI_API_KEY"))
  result = audit_chain.run(
  return LLMChain(llm=llm, prompt=prompt)


🔎 Clause Processed: 10 controls
🧠 Tokens used: 28229
💵 Cost: $0.012674
🔎 Clause Processed: 10 controls
🧠 Tokens used: 28303
💵 Cost: $0.012768
🔎 Clause Processed: 10 controls
🧠 Tokens used: 28407
💵 Cost: $0.012967
🔎 Clause Processed: 10 controls
🧠 Tokens used: 28256
💵 Cost: $0.012763
🔎 Clause Processed: 9 controls
🧠 Tokens used: 28088
💵 Cost: $0.012541

✅ All clauses processed (async).
🔢 Total tokens used: 141283
💰 Total cost: $0.063713


In [10]:
# Rebind the gathered per-batch outputs to `results`, the name the parsing cell iterates over.
results = final_results

In [11]:
import json
import re
import pandas as pd


def _strip_code_fence(raw):
    """Return ``raw`` without a surrounding Markdown code fence, if present."""
    cleaned = raw.strip()
    # Accept "```json", "```JSON" and a bare "```" as the opening fence.
    cleaned = re.sub(r"^```(?:json)?", "", cleaned, flags=re.IGNORECASE)
    return cleaned.removesuffix("```").strip()


# Parse each model response and collect all rows into a single list.
all_items = []
for js in results:
    clean_js = _strip_code_fence(js)
    try:
        parsed = json.loads(clean_js)
    except json.JSONDecodeError:
        print("Failed to parse:", js)
        raise  # bare raise keeps the original traceback

    if isinstance(parsed, list):
        all_items.extend(parsed)
    elif isinstance(parsed, dict):
        # A single object is one row; extend() would have appended its keys.
        all_items.append(parsed)
    else:
        raise ValueError(
            f"Expected a JSON list or object, got {type(parsed).__name__}: {js!r}"
        )

# Convert to DataFrame. The misspelled name is kept on purpose: later cells
# reference `resutls_df`.
resutls_df = pd.DataFrame(all_items)



In [12]:
# Show the parsed audit rows as this cell's output.
resutls_df

Unnamed: 0,Clause,Section,Control Id,Control Title,Compliance,Reference,Gaps Identified,Recommended Action
0,"4, 5",A.5 (Organisational Controls),A.5.1,Policies for information security,✅,Information Security Policy document (Page 1-2...,,
1,"4, 5",A.5 (Organisational Controls),A.5.2,Information security roles and responsibilities,✅,Section 1.5 Roles and Responsibilities (Pages ...,,
2,5,A.5 (Organisational Controls),A.5.3,Segregation of duties,✅,Section 1.5 Roles and Responsibilities (Page 1...,,
3,"5, 6, 9, 10",A.5 (Organisational Controls),A.5.4,Management responsibilities,✅,Section 1.5 Roles and Responsibilities (Pages ...,,
4,,A.5 (Organisational Controls),A.5.5,Contact with authorities,✅,Section 2.1.7 and 2.1.8 (Pages 36-37) describe...,,
5,,A.5 (Organisational Controls),A.5.6,Contact with special interest groups,✅,Section 2.1.7 (Page 36) states that companies ...,,
6,6,A.5 (Organisational Controls),A.5.7,Threat intelligence,✅,Section 6.5 Security incident management (Page...,,
7,7,A.5 (Organisational Controls),A.5.8,Information security in project management,✅,Sections 2.2.1 and 2.3.1 (Pages 37-38) describ...,,
8,"4, 8",A.5 (Organisational Controls),A.5.14,Information transfer,✅,Section 3.2 Labelling and handling of informat...,,
9,"4, 8",A.5 (Organisational Controls),A.5.19,Information security in supplier relationships,✅,Chapter 3 Business partnership and Outsourcing...,,


In [13]:
# Write the audit table to result.xlsx in the working directory (the DataFrame index is written as well).
resutls_df.to_excel("result.xlsx")

In [14]:
# Persist the extracted markdown so it can be inspected outside the notebook.
with open("output.md", "w", encoding="utf-8") as markdown_out:
    markdown_out.write(text)

# The source document is a .docx, so the message no longer claims "PDF".
print("✅ Document content extracted and saved as 'output.md'")

✅ PDF content extracted and saved as 'output.md'
