In [1]:
!pip install faiss-cpu pypdf2

Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-win_amd64.whl.metadata (3.8 kB)
Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading faiss_cpu-1.8.0.post1-cp310-cp310-win_amd64.whl (14.6 MB)
   ---------------------------------------- 0.0/14.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/14.6 MB ? eta -:--:--
   ---------------------------------------- 0.2/14.6 MB 1.5 MB/s eta 0:00:10
    --------------------------------------- 0.3/14.6 MB 1.9 MB/s eta 0:00:08
   - -------------------------------------- 0.6/14.6 MB 2.5 MB/s eta 0:00:06
   - -------------------------------------- 0.6/14.6 MB 2.3 MB/s eta 0:00:06
   --- ------------------------------------ 1.1/14.6 MB 3.6 MB/s eta 0:00:04
   --- ------------------------------------ 1.2/14.6 MB 3.4 MB/s eta 0:00:04
   ---- ----------------------------------- 1.8/14.6 MB 4.4 MB/s eta 0:00:03
   ------ --------------------------------- 2.4/14.6 MB 5.0 MB/s eta 0

In [1]:
import faiss
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import fitz
import itertools

In [2]:
# pdf_file_path = 'C:/Users/ribhattacharya/Desktop/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/GCP-[FALL-2023] GCP SOC 2..pdf'
# with fitz.open(pdf_file_path) as doc:
#         page_contents = [page.get_text() for page in doc]
pdf_loader = PyPDFLoader("C:/Users/ribhattacharya/Desktop/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/GCP-[FALL-2023] GCP SOC 2..pdf")
raw_documents= pdf_loader.load()
text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
documents = text_splitter.split_documents(raw_documents)

In [3]:
embedding_model = OllamaEmbeddings(model='nomic-embed-text',show_progress=True)

In [4]:
batch_size = 50
texts=[doc.page_content for doc in documents]
#document_texts = [doc.page_content for doc in documents]
documents_embeddings = []

In [6]:
for i in range(0, len(texts),batch_size):
    batch_texts = texts[i:i+batch_size]
    batch_embeddings = embedding_model.embed_documents(batch_texts)
    documents_embeddings.extend(batch_embeddings)


OllamaEmbeddings:   0%|          | 0/50 [00:00<?, ?it/s][A
OllamaEmbeddings:   2%|▏         | 1/50 [00:04<03:39,  4.48s/it][A
OllamaEmbeddings:   4%|▍         | 2/50 [00:06<02:36,  3.26s/it][A
OllamaEmbeddings:   6%|▌         | 3/50 [00:10<02:48,  3.58s/it][A
OllamaEmbeddings:   8%|▊         | 4/50 [00:11<01:51,  2.43s/it][A
OllamaEmbeddings:  10%|█         | 5/50 [00:11<01:15,  1.67s/it][A
OllamaEmbeddings:  12%|█▏        | 6/50 [00:12<01:04,  1.47s/it][A
OllamaEmbeddings:  14%|█▍        | 7/50 [00:13<00:57,  1.33s/it][A
OllamaEmbeddings:  16%|█▌        | 8/50 [00:15<00:56,  1.34s/it][A
OllamaEmbeddings:  18%|█▊        | 9/50 [00:16<00:47,  1.16s/it][A
OllamaEmbeddings:  20%|██        | 10/50 [00:16<00:37,  1.08it/s][A
OllamaEmbeddings:  22%|██▏       | 11/50 [00:17<00:42,  1.10s/it][A
OllamaEmbeddings:  24%|██▍       | 12/50 [00:19<00:44,  1.16s/it][A
OllamaEmbeddings:  26%|██▌       | 13/50 [00:20<00:40,  1.10s/it][A
OllamaEmbeddings:  28%|██▊       | 14/50 [00:21<00:

In [8]:
import numpy as np

In [9]:
#create FAISS index and add the embeddings
document_embeddings_np = np.array(documents_embeddings)
dimension = document_embeddings_np.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(document_embeddings_np)

In [10]:
from langchain.docstore import InMemoryDocstore
from langchain.docstore.document import Document

In [11]:
docstore = InMemoryDocstore(dict(enumerate(documents)))
index_to_docstore_id = {i: i for i in range(len(documents))}

In [12]:
vector_store = FAISS(embedding_model, faiss_index, docstore, index_to_docstore_id)

In [13]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.output_parsers.json import SimpleJsonOutputParser

In [28]:
# LLM from Ollama
local_model = "knoopx/hermes-2-pro-mistral:7b-q8_0"
llm = ChatOllama(model=local_model,
                 model_kwargs={ "response_format": { "type": "json_object" } },
)

In [29]:
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""Act as an advisor to a security analyst who requires precise technical answers to the provided questions from the context document which could be of the following types: SOC 2 type reports, ISO 27001 certificates, penetration test reports, privacy policies document, security policies document. Based on the given context, your task to phrase the user question in the best possible way in order to get the most accurate answer from the context document.         
    Original question: {question}""",
)

In [30]:
retriever = MultiQueryRetriever.from_llm(
    vector_store.as_retriever(), 
    llm,
    prompt=QUERY_PROMPT
)

# RAG prompt
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
Your response should be concise. You must always output a JSON object with an "answer" key and a "reference" key. 
The "answer" key will contain your response and the "reference" key will contain the original text citation from the document along with the page numbers which you referred to while coming up with your response.
Generate output or responses based on the {question} given in the format of the following examples below:-

        Example document: "Google Cloud Platform SOC type 2 Report"
        Example 1:-
        question:- "What is the password management policy in place?"
        response:- {'answer': 'Google personnel are required to authenticate using valid credentials before resetting their passwords. Passwords are managed according to guidelines that enforce minimum length, complexity, history, and idle time lockout setting. Password configuration requirements are enforced by internal systems. In addition to the security requirements enforced during configuration, internal passwords are subject to cryptographic hashing to mitigate the risk of unauthorized disclosure or modification. Google has also supplemented passwords with a two-factor authentication requirement for internal personnel to access sensitive internal corporate and production services and to access Google Cloud Platform in the production environment from the corporate network',
                    'reference': 'The answer to the given question can be found on page 49 of the document. With respect to the password management policy, the provided document states under the sub-heading "Password Guidelines" the following: "Google personnel are required to authenticate using valid credentials prior to resetting their password. Passwords are managed in accordance with a set of password construction, protection, and management guidelines, which enforce the following:
                                    • Minimum length
                                    • Complexity
                                    • History
                                    • Idle time lockout setting
                                    Password configuration requirements are enforced by internal systems. In addition to the security requirements enforced during configuration, internal passwords are subject to cryptographic hashing to mitigate the risk of unauthorized disclosure or modification.
                                    Google has supplemented passwords with a two-factor authentication requirement for internal personnel to access sensitive internal corporate and production services and to access Google Cloud Platform in the production environment from the corporate network. Two-factor authentication provides additional protection to prevent user account manipulation in case the user’s password is compromised.
                                    Google Cloud Platform end users can also authenticate in one of three ways:
                                    • Using their user ID and a password that is managed by Google
                                    • Using a two-step authentication process that includes their user ID, password, and a security key
                                    • Through the Security Assertion Markup Language (SAML) based Single Sign-On (SSO) process which uses the user entity’s own account management system to authenticate users and a certificate with an embedded public key, which is registered with Google for each customer entity"'}
        
        Example 2:-
        question:- "How are incidents classified?"
        response:- answer key: 'Incidents are classified based on their severity level. Each severity level has been formally defined to capture the importance of each incident/problem type.',
                    reference key: 'The answer to the given question can be found on page 50 & 51 of the document. With respect to incident classification, the provided document states under the sub-heading "Incident Alert and Recording" under "Incident Management" the following: "Log sources are used to generate alerts whenever an anomaly occurs. Production monitoring tools, in response to an anomaly, automatically generate alerts to relevant teams based on the anomaly configurations set by each team. An anomaly may also be manually documented by a Google employee when an issue is identified or in response to a customer service request.Production systems are configured to send system events to monitoring and alerting tools. Google personnel use these tools to respond to potential incidents, including security and privacy incidents.
                                  Alerts capture information necessary for initial response (e.g., origin, service description, impacted area, etc.). Alerts are addressed by relevant teams to identify if the anomaly indicates an issue or potential issue. If necessary, incidents are created for alerts that require additional investigation. Additional details can be added to the incident to supplement the initial alert(s). The incident is assigned an initial severity level to prioritize mitigation efforts to incidents of greatest impact. Each severity level has been formally defined to capture the importance of each incident/problem type. There are established roles and responsibilities for personnel tasked with incident management, including the identification, assignment, managed remediation, and communication of incidents."
        

        Example 3:-
        question:- "What procedures are in place to perform root cause analysis for an incident?"
        response:- answer key: 'Google has an established post mortem process for performing technical analysis of incidents after the fact to identify root cause issues, document lessons learned, and implement fixes to strengthen and improve security controls, and to prevent future incidents.',
                    reference key: 'The answer to the given question can be found on page 51 of the document. With respect to root cause analysis for an incident, the provided document states under the sub-heading "Incident Resolution" under "Incident Management" the following: "After gathering the necessary information about the incident, the incident ticket is assigned to the appropriate support area based on the nature of the problem and/or the root cause. Incidents are usually forwarded to one of the corresponding technical departments:
                                    • System Reliability Engineers / Software Engineers
                                    • Networks
                                    • Database Administration
                                    • System Administration
                                    • Application Administration
                                    • Facilities
                                    • Network Security
                                    • Platform Support
                                    • Legal Team
                                    The incident ticket is closed upon resolution of the incident. Google also has an established post mortem process for performing technical analysis of incidents after the fact to identify root cause issues, document lessons learned, and implement fixes to strengthen and improve security controls, and to prevent future incidents. Processes for notifying customers of data security and privacy incidents that affect their accounts in accordance with disclosure laws or contractual agreements are established and implemented."
   
        """

prompt = ChatPromptTemplate.from_template(template)

In [31]:
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | SimpleJsonOutputParser()
)

In [32]:
chain.invoke("What access control procedures are in place?")

OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.78s/it]


KeyError: 'Input to ChatPromptTemplate is missing variables {"\'answer\'"}.  Expected: ["\'answer\'", \'context\', \'question\'] Received: [\'context\', \'question\']'

In [None]:
chain.invoke("When was the access control policy last reviewed?")

In [None]:
chain.invoke("What is the password management policy in place?")

In [None]:
chain.invoke("What procedures are followed for decommissioning of equipments?")

In [None]:
chain.invoke("What steps are taken to ensure removal of client information prior to decommissioning of equipments?")

In [65]:
chain.invoke("When were information security policies and procedures updated?")


OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:04<00:00,  4.24s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.83it/s][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.15it/s][A


'The information security policies and procedures were updated on \'1 October 20XX\' (year not clearly mentioned). This information can be found on page 35 of the document, which states: "Information Security Policies and Procedures were updated on 1 October 20XX."\n\nOriginal wording from the document: "Information Security Policies and Procedures were updated on 1 October 20XX."'

In [66]:
chain.invoke("How are emergency changes performed?")


OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:11<00:00, 11.35s/it][A

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 15.54it/s]

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.71it/s][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  4.77it/s][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:01<00:00,  1.02s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.55it/s][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.45it/s][A

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 11.36it/s]

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00

'Emergency changes are performed through the Change Management process. According to page 50 of the document, "Emergency Changes: Any change that must be made immediately to avoid a significant impact to service or customer data is considered an emergency change." The process requires documentation and review prior to implementation, as well as post-implementation review and communication to the appropriate stakeholders. It also states that such changes are tracked in the same way as regular changes, with necessary escalation mechanisms in place for urgent issues. \n\nOriginal wording from the document: "Emergency Changes: Any change that must be made immediately to avoid a significant impact to service or customer data is considered an emergency change. Emergency changes follow the same change management process, but are documented and reviewed prior to implementation, implemented, and post-implementation reviewed. The change is communicated to the appropriate stakeholders as per the 

In [67]:
chain.invoke("What cryptographic controls are in place to encrypt the data at rest and in motion?")


OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.17s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.81it/s][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.12it/s][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.23it/s][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.39it/s][A

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 15.86it/s]

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.32it/s][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.76it/s][A

OllamaEmbeddings:   0%|          | 0/1 [00:00

"In order to find information about cryptographic controls for data encryption at rest and in motion, I would need access to the specific audit report or documentation related to Google Cloud's security measures. Please provide the relevant document or report so that I can locate the required information and respond accordingly. Make sure to mention the page number or position in the document along with the original wording from the source."

In [68]:
chain.invoke("When was the incident management policy last reviewed?")


OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:01<00:00,  1.44s/it][A

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 18.18it/s]

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  4.74it/s][A


'I am unable to find information about the incident management policy review date in the provided document excerpts. Please provide the complete audit report or the specific section containing details about the incident management policy for further assistance.'

In [69]:
chain.invoke("When was the incident response procedure tested?")


OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.21s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  4.42it/s][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.53it/s][A


'The incident response procedure was tested during the audit process, specifically during the review of the "Incident Alert and Recording" section on page 51 of the document. The text states, "Log sources are used to generate alerts whenever an anomaly occurs... An anomaly may also be manually documented by a Google employee when an issue is identified or in response to a customer service request." (GCP-[FALL-2023] GCP SOC 2..pdf, p. 51)'

In [None]:
chain.invoke("How are incidents classified?")

In [None]:
chain.invoke("What procedures are in place to perform root cause analysis for an incident?")

In [72]:
chain.invoke("What physical perimeter controls are in place?")


OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:04<00:00,  4.08s/it][A


"I could not find any information about physical perimeter controls in the provided document excerpts. The mentioned Google Cloud services and features are described at various page numbers, but none specifically address physical perimeter controls. Please provide more context or refer to a different section of the document if you're looking for information on this particular topic.\n\nPage numbers mentioned in the request: 18, 22, 73, and 103."

In [73]:
chain.invoke("What procedures are in place to deploy patches throughout the IT infrastructure?")


OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:04<00:00,  4.27s/it][A

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 17.27it/s]

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.53it/s][A

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 10.98it/s]

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  8.77it/s][A


'The document does not specifically mention procedures for deploying patches throughout the IT infrastructure. However, it does discuss security updates and vulnerability management in the context of Google\'s Cloud Security Command Center. This service provides visibility into security and network traffic data across multi-cloud and on-premises environments (page 51). It also offers automated policy enforcement for identifying and responding to threats (page 52). These capabilities help ensure that patches and updates are applied promptly and consistently across the infrastructure, although the specific procedures for patch deployment are not detailed in the provided excerpts.\n\n(Source: "GCP-[FALL-2023] GCP SOC 2..pdf", pages 51-52)'

In [74]:
chain.invoke("How is a back up of the data performed?")


OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:04<00:00,  4.54s/it][A


'A backup of the data is performed as per the data retention and deletion policies. The process involves the use of data deletion tools that verify the backup data is deleted following the configured retention period, as part of the deletion mechanism process. This information can be found on page 77 of the document: "was disposed of as per the data retention and deletion policies. No deviations noted. Inspected a sample product and determined data deletion tools verified that backup data was deleted following the configured retention period, as part of the deletion mechanism process. No deviations noted."'

In [75]:
chain.invoke("What backgound checks are performed during hiring of an employee?")


OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:04<00:00,  4.85s/it][A


"Background checks performed during hiring of an employee include criminal, credit, and/or security checks on all potential employees, temporary workers, and independent contractors, as well as verification of the individual's education, previous employment, and referrals (Source: Page 43, Document metadata {'source': 'C:/Users/ribhattacharya/Desktop/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/GCP-[FALL-2023] GCP SOC 2..pdf'}). The specifics or extent of background checks performed depend on the position and location for which the individual is applying."

In [76]:
chain.invoke("What is done to ensure that periodic penetration tests are performed for the infrastructure, devices, and end-points?")


OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:04<00:00,  4.60s/it][A

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 12.04it/s]

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.06it/s][A

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 13.63it/s]

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.63it/s][A


'To ensure that periodic penetration tests are performed for the infrastructure, devices, and end-points, Google has the following measures in place:\n\n1. Penetration tests are performed at least annually. (CC4.1)\n2. The organization performs penetration tests by qualified internal personnel or an external service provider at least annually. (Inquired of the Program Manager)\n\nNo deviations were noted in these procedures during the testing process, as confirmed by the results shared by EY after their inspection. \n\nReference(s):\n- Page 54: "logs is restricted to authorized personnel. Security event logs are monitored continuously using a Google proprietary Security Event Management (SEM) system to detect intrusion attempts and other security related events."\n- Page 168: "Control Description SOC 2 Criteria, Controls, Tests and Results of Tests \\n...112. Penetration tests are performed at least annually. CC4.1..."'