In [None]:
!pip install faiss-cpu pypdf2

In [1]:
import faiss
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
# pdf_file_path = 'C:/Users/ribhattacharya/Desktop/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/GCP-[FALL-2023] GCP SOC 2..pdf'
# with fitz.open(pdf_file_path) as doc:
#         page_contents = [page.get_text() for page in doc]
pdf_loader = PyPDFLoader("C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/GCP-[FALL-2023] GCP SOC 2..pdf")
raw_documents= pdf_loader.load()
text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
documents = text_splitter.split_documents(raw_documents)

In [3]:
embedding_model = OllamaEmbeddings(model='nomic-embed-text',show_progress=True)

In [4]:
batch_size = 50
texts=[doc.page_content for doc in documents]
#document_texts = [doc.page_content for doc in documents]
documents_embeddings = []

In [5]:
for i in range(0, len(texts),batch_size):
    batch_texts = texts[i:i+batch_size]
    batch_embeddings = embedding_model.embed_documents(batch_texts)
    documents_embeddings.extend(batch_embeddings)

OllamaEmbeddings: 100%|██████████| 50/50 [00:19<00:00,  2.56it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:18<00:00,  2.71it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:17<00:00,  2.80it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:17<00:00,  2.90it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:17<00:00,  2.89it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:16<00:00,  2.97it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:17<00:00,  2.93it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:17<00:00,  2.91it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:17<00:00,  2.86it/s]
OllamaEmbeddings: 100%|██████████| 30/30 [00:12<00:00,  2.48it/s]


In [6]:
import numpy as np

In [7]:
#create FAISS index and add the embeddings
document_embeddings_np = np.array(documents_embeddings)
dimension = document_embeddings_np.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(document_embeddings_np)

In [8]:
from langchain.docstore import InMemoryDocstore

In [9]:
docstore = InMemoryDocstore(dict(enumerate(documents)))
index_to_docstore_id = {i: i for i in range(len(documents))}

In [10]:
vector_store = FAISS(embedding_model, faiss_index, docstore, index_to_docstore_id)

In [11]:
from langchain_community.chat_models import ChatOllama
from langchain.chains import RetrievalQA

In [12]:
local_model = "knoopx/hermes-2-pro-mistral:7b-q8_0" 
llm = ChatOllama(model=local_model)

In [21]:
qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type="stuff",
    retriever = vector_store.as_retriever(search_kwargs={"k":3}),
    return_source_documents = True
)

In [22]:
def get_answer(query):
    result = qa_chain({"query": query})
    response = result['result']

    source_documents = result['source_documents']
    pages = set()
    citations = []
    for doc in source_documents:
        pages.add(doc.metadata['page'])
        citations.append((doc.metadata['page'], doc.page_content))

    return {
        "response": response,
        "pages": list(pages),
        "citations": citations
    }

In [23]:
query = "What access control procedures are in place?"

In [24]:
structured_answer = get_answer(query)

OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.80s/it]


In [27]:
print(structured_answer["response"])

The access control procedures in place include:

1. Periodic assessment of parties' compliance and taking corrective action if necessary (Common Criteria 6.1).
2. Registration and authorization of new internal and external users, with their access administered by the entity (Common Criteria 6.2).
3. User system credentials are removed when user access is no longer authorized (Common Criteria 6.3).
4. Authorizing, modifying, or removing access to data, software, functions, and other protected information assets based on roles, responsibilities, or the system design (Common Criteria 6.3).
5. Customers are responsible for reviewing users' access rights periodically in accordance with their internal access management policies (Common Criteria 6.1).
6. Implementation of logical access security software, infrastructure, and architectures over protected information assets to safeguard them from security events (Common Criteria 6.1).
7. Selection, development, and ongoing evaluation of compone

In [26]:
print(structured_answer["pages"])

[58, 60, 63]


In [28]:
print(structured_answer["citations"])

[(60, "assesses those parties’ compliance \non a periodic and as -needed basis \nand takes corrective action, if \nnecessary.  \nCustomer s are responsible for provisioning, \nmaintaining, monitoring and disabling end users’ \naccess in accordance with their internal access \nmanagement policies.  Common Criteria 6.1: The entity \nimplements logical access security \nsoftware, infrastructure, and \narchi tectures over protected \ninformation assets to protect them \nfrom security events to meet the \nentity's objectives.  \nCommon Criteria 6.2: Prior to \nissuing system credentials and \ngranting system access, the entity \nregisters and authorizes new internal \nand external users whose access is \nadministered by the entity. For those \nusers whose access is administered \nby the entity,  user system rishav.bhattacharya99@gmail.comGoogle Confidental Information"), (58, 'procedures.  Common Criteria 4.1: The entity \nselects, develops, and performs \nongoing and/or separate evaluation

In [29]:
questions = [
"When was the access control policy last reviewed?",
"What is the password management policy in place?",
"What procedures are followed for decommissioning of equipments?",
"What steps are taken to ensure removal of client information prior to decommissioning of equipments?",
"When were information security policies and procedures updated?",
"How are emergency changes performed?",
"What cryptographic controls are in place to encrypt the data at rest and in motion?",
"When was the incident management policy last reviewed?",
"When was the incident response procedure tested?",
"How are incidents classified?",
"What procedures are in place to perform root cause analysis for an incident?",
"What physical perimeter controls are in place?",
"What procedures are in place to deploy patches throughout the IT infrastructure?",
"How is a back up of the data performed?",
"What backgound checks are performed during hiring of an employee?",
"What security incident response procedures are in place?"
]

In [30]:
all_answers = []

In [31]:
for question in questions:
    structured_answer = get_answer(question)
    all_answers.append(structured_answer)

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.02it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.27s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.40it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.46it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 19.65it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 14.16it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 20.40it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 22.53it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 26.70it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 10.20it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  7.07it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 16.15it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  6.02it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 13.65it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [

In [32]:
for answer in all_answers:
    print(answer["response"])
    print(answer["pages"])
    print(answer["citations"])

The provided information states that security and privacy policies are reviewed at least annually. However, it does not explicitly mention when the access control policy was last reviewed.
[144, 159, 119]
[(119, 'security and privacy policies are reviewed at least \nannually, and supporting standards, guidelines, \nand FAQs are created and updated as needed.  No deviations noted.  \nInspected internal documentation and determined \nthat security and privacy policies, supporting \nstandards, guidelines and FAQS were in place.  No deviations noted.  \nInspected internal documentation and determined \nsecurity and privacy policies were reviewed at \nleast annually and authorized before they were \nimplemented.  No deviations noted.  \nInspected the most recent security and privacy \npolicy reviews and determined policies were \napproved by authorized personnel or committee, \nreviewed at least annually, and updated as \nneeded.  No deviations noted.  rishav.bhattacharya99@gmail.comGoogle 