In [1]:

from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
import faiss
import numpy as np
import tempfile
import os


In [2]:
#Testing applications document extraction method
def _get_page_contents_from_pdf_file_path(pdf_path):
    pdf_loader = PyPDFLoader(pdf_path)
    raw_documents= pdf_loader.load()
    return raw_documents

def _get_page_contents_from_pdf_in_memory(pdf_bytes):
    # Write bytes to a temporary file
    with tempfile.NamedTemporaryFile(delete=False,suffix='.pdf') as temp_pdf_file:
        temp_pdf_file.write(pdf_bytes)
        temp_pdf_file_path = temp_pdf_file.name
    # Load temp file
    pdf_loader = PyPDFLoader(temp_pdf_file_path)
    raw_documents= pdf_loader.load()
    # Delete temporary file
    os.remove(temp_pdf_file_path)
    return raw_documents

In [3]:
def create_vector_store(pdf_file, from_file_path=True):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )

    if from_file_path:
        data = _get_page_contents_from_pdf_file_path(pdf_file)
    else:
        data = _get_page_contents_from_pdf_in_memory(pdf_file)


    documents = text_splitter.split_documents(data)


    # Generate embeddings using OllamaEmbeddings
    embedding_model = OllamaEmbeddings(model ='nomic-embed-text', show_progress=True)
    batch_size = 50
    texts=[doc for doc in documents]
    documents_embeddings = []


    for i in range(0, len(texts),batch_size):
        batch_texts = texts[i:i+batch_size]
        batch_embeddings = embedding_model.embed_documents(batch_texts)
        documents_embeddings.extend(batch_embeddings)
    
    #create FAISS index and add the embeddings
    document_embeddings_np = np.array(documents_embeddings)
    dimension = document_embeddings_np.shape[1]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(document_embeddings_np)

    docstore = InMemoryDocstore(dict(enumerate(documents)))
    index_to_docstore_id = {i: i for i in range(len(documents))}

    vector_store = FAISS(embedding_model, faiss_index, docstore, index_to_docstore_id)
    
    return vector_store

In [4]:
def find_relevant_sections(vector_store, question, top_n=50):
    """Retrieves top-N most relevant sections from the vector store based on a question."""
    # Load pre-trained nomic model and tokenizer
    embedding_model = OllamaEmbeddings(model ='nomic-embed-text', show_progress=True)

    question_embedding = embedding_model.embed_query(question)  # Assuming your embedding model has an embed_query method

    # Perform similarity search
    similar_docs = vector_store.similarity_search_by_vector(question_embedding, k=top_n)

    # Extract relevant sections (assuming each document has a 'page_content' attribute)
    relevant_sections = [doc.page_content for doc in similar_docs]

    return relevant_sections

In [17]:
def semantic_similarity_all(relevant_sections, questions, ai_answers, third_party_answers):
    # Use the same OllamaEmbeddings model
    embedding_model = OllamaEmbeddings(model='nomic-embed-text', show_progress=True)

    similarities = {'ai': [], 'third_party': []}
    for question, rel_sections, ai_answer, third_party_answer in zip(questions, relevant_sections, ai_answers, third_party_answers):
        # Embed the answers
        ai_answer_embedding = embedding_model.embed_query(ai_answer)
        third_party_answer_embedding = embedding_model.embed_query(third_party_answer)

        max_similarities = {'ai': 0, 'third_party': 0}
        for section in rel_sections:
            # Embed the relevant section
            section_embedding = embedding_model.embed_query(section)

            # Calculate semantic similarity
            section_embedding_tensor = tf.constant(section_embedding)
            ai_answer_embedding_tensor = tf.constant(ai_answer_embedding)
            third_party_answer_embedding_tensor = tf.constant(third_party_answer_embedding)

            ai_similarity = tf.reduce_sum(section_embedding_tensor * ai_answer_embedding_tensor) / (tf.norm(section_embedding_tensor) * tf.norm(ai_answer_embedding_tensor))
            tp_similarity = tf.reduce_sum(section_embedding_tensor * third_party_answer_embedding_tensor) / (tf.norm(section_embedding_tensor) * tf.norm(third_party_answer_embedding_tensor))

            max_similarities['ai'] = max(max_similarities['ai'], ai_similarity.numpy())
            max_similarities['third_party'] = max(max_similarities['third_party'], tp_similarity.numpy())

        similarities['ai'].append(max_similarities['ai'])
        similarities['third_party'].append(max_similarities['third_party'])

    return similarities

In [6]:
ai_answers = ["Access control procedures mentioned in the document include:\n\n1. On page 60-62 (GCP SOC 2 doc): Customers are responsible for provisioning, maintaining, monitoring, and disabling end users' access according to their internal access management policies. The entity implements logical access security software, infrastructure, and architectures over protected information assets to protect them from security events that meet the entity's objectives. Prior to issuing system credentials and granting system access, the entity registers and authorizes new internal and external users whose access is administered by the entity. For those users whose access is administered by the entity, user system credentials are removed when access is no longer authorized.\n\n2. On page 67 (GCP SOC 2 doc): The entity authorizes, modifies, or removes access to data, software, functions, and other protected information assets based on roles. This is done within the customer's environment consistent with customer policies.\n\n3. On page 48 (GCP SOC 2 doc): Production system access is granted only to individuals who have completed required security and privacy training and require this level of access to perform tasks. Access to individual production systems via critical access groups is reviewed periodically by the system owners, and inappropriate access is removed for Google personnel who no longer have a business need for it. Access to all corporate and production resources are automatically removed upon submission of a termination request by the manager of any departing employee or by the appropriate Human Resources manager.\n\n4. Password guidelines: Google personnel are required to authenticate using valid credentials prior to resetting their passwords. Passwords are managed in accordance with a set of password construction requirements (cited from page 48, but the exact guidelines are not specified in the given context).",
                "A backup of the data is performed as per the data retention and deletion policies. The process involves the use of data deletion tools that verify the backup data is deleted following the configured retention period, as part of the deletion mechanism process. This information can be found on page 77 of the document: 'was disposed of as per the data retention and deletion policies. No deviations noted. Inspected a sample product and determined data deletion tools verified that backup data was deleted following the configured retention period, as part of the deletion mechanism process. No deviations noted.'",
                "To ensure that periodic penetration tests are performed for the infrastructure, devices, and end-points, Google has the following measures in place:\n\n1. Penetration tests are performed at least annually. (CC4.1)\n2. The organization performs penetration tests by qualified internal personnel or an external service provider at least annually. (Inquired of the Program Manager)\n\nNo deviations were noted in these procedures during the testing process, as confirmed by the results shared by EY after their inspection. \n\nReference(s):\n- Page 54: 'logs is restricted to authorized personnel. Security event logs are monitored continuously using a Google proprietary Security Event Management (SEM) system to detect intrusion attempts and other security related events.'\n- Page 168: 'Control Description SOC 2 Criteria, Controls, Tests and Results of Tests \\n...112. Penetration tests are performed at least annually. CC4.1...'"]

#answers to questions 1, 15, and 17

third_party_answers = ["Google follows a formal process to grant or revoke employee access to Google resources. Access to systems and data is granted only to authorized users. Access requests are reviewed and approved by an authorized second individual prior to being granted and the event is logged. Both user and internal access to customer data are restricted through the use of unique user account IDs and the Google Accounts Bring Your Own Identity (BYOID) system. Access to sensitive systems and applications requires two-factor authentication. Periodic reviews of access lists are implemented to ensure access to customer data is appropriate and authorized. Access to production machines, network devices and support tools is managed via an access group management system. Membership in these groups must be approved by respective group administrators. User group memberships are reviewed on a semiannual basis and any inappropriate access is removed. Access authorization in Google Cloud Platform is enforced at all relevant layers of the system. The granting or modification of access rights is based on the user's job responsibilities or on a need-to-know basis and must be authorized and approved by the user's functional manager or system owners. Access to all corporate and production resources are automatically removed upon submission of a termination request by the manager of any departing employee, or by the appropriate Human Resources manager.",
                "At Google Cloud Platform, data backup is performed through a robust, automated system that ensures data integrity and availability. We use a combination of incremental and full backups to optimize both storage efficiency and data recovery times. These backups are geographically distributed across multiple secure data centers to provide redundancy and high availability. Additionally, our backup procedures are compliant with industry standards and are regularly audited to ensure they meet stringent security and privacy requirements.",
                "External third-party penetration tests are performed on an annual basis for a predetermined subset of the services included in the Google Cloud Platform System. The subset of services included in any given year are determined by the Google Security and the Office of Compliance & Integrity teams. This is based on their understanding of the organization's current risk environment, as well as the organization's current regulatory and compliance requirements. Corrective actions are taken as necessary."]

questions = ["What access control procedures are in place?",
             "How is a back up of the data performed?",
             "What is done to ensure that periodic penetration tests are performed for the infrastructure, devices, and end-points?"]

pdf_path = "C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/GCP-[FALL-2023] GCP SOC 2..pdf"


In [7]:
evidence_text = create_vector_store(pdf_path)

OllamaEmbeddings: 100%|██████████| 50/50 [00:30<00:00,  1.63it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:30<00:00,  1.66it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:30<00:00,  1.67it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:28<00:00,  1.73it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:28<00:00,  1.74it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:28<00:00,  1.76it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:28<00:00,  1.74it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:28<00:00,  1.77it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:27<00:00,  1.80it/s]
OllamaEmbeddings: 100%|██████████| 30/30 [00:18<00:00,  1.60it/s]


In [11]:
relevant_sections = []
for question in questions:
    relevant_section = find_relevant_sections(evidence_text, question)
    relevant_sections.append(relevant_section)

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.07it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 21.46it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 14.44it/s]


In [18]:
# Assuming you have relevant_sections, questions, ai_answers, and third_party_answers defined
semantic_similarities = semantic_similarity_all(relevant_sections, questions, ai_answers, third_party_answers)

# Print the results
for i, question in enumerate(questions):
    print(f"Question: {question}")
    print(f"  AI Similarity: {semantic_similarities['ai'][i]}")
    print(f"  Third-party Similarity: {semantic_similarities['third_party'][i]}")
    print("---")

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.20it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.76it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.82it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.58it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.65it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.47it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.93it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.59it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.49it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.56it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.99it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.05it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.53it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.49it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.48it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [

Question: What access control procedures are in place?
  AI Similarity: 0.8381991982460022
  Third-party Similarity: 0.9081324934959412
---
Question: How is a back up of the data performed?
  AI Similarity: 0.8988619446754456
  Third-party Similarity: 0.8300479650497437
---
Question: What is done to ensure that periodic penetration tests are performed for the infrastructure, devices, and end-points?
  AI Similarity: 0.8731306195259094
  Third-party Similarity: 0.905375599861145
---





In [19]:
for i, question in enumerate(questions):
    print(f"Question: {question}")
    print(f"  AI Similarity: {((semantic_similarities['ai'][i] + 1) / 2) * 100:.2f}%")
    print(f"  Third-Party Similarity: {((semantic_similarities['third_party'][i] + 1) / 2) * 100:.2f}%")

Question: What access control procedures are in place?
  AI Similarity: 91.91%
  Third-Party Similarity: 95.41%
Question: How is a back up of the data performed?
  AI Similarity: 94.94%
  Third-Party Similarity: 91.50%
Question: What is done to ensure that periodic penetration tests are performed for the infrastructure, devices, and end-points?
  AI Similarity: 93.66%
  Third-Party Similarity: 95.27%
