In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS
import faiss  # type: ignore
import numpy as np
import tempfile
import os
from PyPDF2 import PdfReader
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document

In [2]:
def load_multiple_pdf_files(file_paths):
    """
    Read several PDF files with PyPDFLoader and normalise their metadata.

    Every item returned by the loader is re-wrapped in a new Document whose
    metadata holds exactly four keys: "title" and "author" (copied from the
    loader's metadata, or a placeholder when absent), "source" (the path that
    was passed in) and "page_number" (1-based position of the item in the
    list returned by the loader for that file).

    Args:
        file_paths (list): List of file paths to PDF files.

    Returns:
        list: Document objects for all files, in input order.
    """
    collected = []
    for pdf_path in file_paths:
        pages = PyPDFLoader(pdf_path).load()
        collected.extend(
            Document(
                page_content=page.page_content,
                metadata={
                    "title": page.metadata.get("title", "Unknown Title"),
                    "author": page.metadata.get("author", "Unknown Author"),
                    "source": pdf_path,
                    "page_number": position,
                },
            )
            # start=1 so the first item of each file is reported as page 1
            for position, page in enumerate(pages, start=1)
        )
    return collected

In [3]:
def split_documents(raw_documents, chunk_size=1000, chunk_overlap=100):
    """
    Split raw documents into chunks using RecursiveCharacterTextSplitter.

    Args:
        raw_documents (list): List of Document objects.
        chunk_size (int): Maximum chunk length. Length is measured with
            ``len``, i.e. in characters. Defaults to 1000.
        chunk_overlap (int): Overlap between consecutive chunks, measured
            the same way. Defaults to 100.

    Returns:
        list: List of split Document objects.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(raw_documents)


In [None]:
# class Document:
#     def __init__(self, text):
#         self.text = text

In [None]:
# def word_level_splitter(raw_documents, words_per_chunk=200):
#     """
#     Split raw documents into chunks based on a specified number of words.

#     Args:
#         raw_documents (list): List of Document objects.
#         words_per_chunk (int): Number of words per chunk.

#     Returns:
#         list: List of split Document objects.
#     """
#     split_documents = []
#     for doc in raw_documents:
#         # Split the document text into words
#         words = doc.text.split()
#         # Create chunks of specified number of words
#         for i in range(0, len(words), words_per_chunk):
#             chunk = ' '.join(words[i:i + words_per_chunk])
#             split_documents.append(Document(text=chunk))
#     return split_documents

In [4]:
def create_vector_store(documents, batch_size=50):
    """
    Create a FAISS vector store from the documents.

    The text of every document is embedded with the Ollama
    'nomic-embed-text' model in batches, the vectors are added to a flat L2
    faiss index, and the documents are kept in an in-memory docstore.

    Args:
        documents (list): List of Document objects.
        batch_size (int): Number of texts sent to the embedding model per
            call. Defaults to 50.

    Returns:
        FAISS: A FAISS vector store.

    Raises:
        ValueError: If ``documents`` is empty (there is nothing to embed and
            the index dimension cannot be determined).
    """
    documents = list(documents)
    if not documents:
        # Fail early with a clear message instead of an IndexError on shape[1].
        raise ValueError("Cannot create a vector store from an empty list of documents.")

    # Generate embeddings using OllamaEmbeddings
    embedding_model = OllamaEmbeddings(model='nomic-embed-text', show_progress=True)
    texts = [doc.page_content for doc in documents]

    documents_embeddings = []
    for start in range(0, len(texts), batch_size):
        documents_embeddings.extend(
            embedding_model.embed_documents(texts[start:start + batch_size])
        )

    # Create FAISS index and add the embeddings.
    # faiss works on float32 vectors, so convert explicitly rather than
    # relying on the wrapper to cast a float64 array.
    document_embeddings_np = np.asarray(documents_embeddings, dtype=np.float32)
    dimension = document_embeddings_np.shape[1]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(document_embeddings_np)

    # LangChain's FAISS wrapper maps index positions (int) to docstore ids
    # (str), so the ids are stored as strings.
    docstore_ids = [str(position) for position in range(len(documents))]
    docstore = InMemoryDocstore(dict(zip(docstore_ids, documents)))
    index_to_docstore_id = dict(enumerate(docstore_ids))

    return FAISS(embedding_model, faiss_index, docstore, index_to_docstore_id)

In [5]:
# Example usage:
# Folder that holds the SOC 2 evidence PDFs. Set the TPRM_DATA_DIR environment
# variable to point at your own copy instead of editing this cell; when it is
# unset (or empty) the original location is used.
DATA_DIR = (
    os.environ.get("TPRM_DATA_DIR")
    or "C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e"
).rstrip("/\\")  # drop a trailing separator so the joins below stay clean

file_paths = [
    f"{DATA_DIR}/GCP-[FALL-2023] GCP SOC 2..pdf",
    f"{DATA_DIR}/SOC 2 Type 2_Adobe.pdf",
]

In [6]:
# Read every configured PDF into Document objects carrying title/author/source/page metadata
raw_documents = load_multiple_pdf_files(file_paths=file_paths)

In [7]:
# Break the loaded documents into smaller chunks ready for embedding
documents = split_documents(raw_documents=raw_documents)

In [8]:
# Embed the chunks and index them in a FAISS vector store
vector_store = create_vector_store(documents=documents)

OllamaEmbeddings: 100%|██████████| 50/50 [00:23<00:00,  2.12it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:20<00:00,  2.42it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:20<00:00,  2.48it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:19<00:00,  2.62it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:19<00:00,  2.53it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:18<00:00,  2.74it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:18<00:00,  2.64it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:18<00:00,  2.68it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:20<00:00,  2.50it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:22<00:00,  2.18it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:22<00:00,  2.24it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:21<00:00,  2.33it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:20<00:00,  2.46it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:19<00:00,  2.63it/s]
OllamaEmbeddings: 100%|██████████| 50/50 [00:19<00:00,  2.57it/s]
OllamaEmbe

In [9]:
# Show only the first few chunks: printing every chunk floods the cell output
# and bloats the saved notebook.
PREVIEW_COUNT = 5
for i, doc in enumerate(documents[:PREVIEW_COUNT], start=1):
    print(f"Document {i}:")
    print(f"Title: {doc.metadata.get('title', 'No Title')}")
    print(f"Author: {doc.metadata.get('author', 'No Author')}")
    print(f"Source: {doc.metadata.get('source', 'No Source')}")
    print(f"Page Number: {doc.metadata.get('page_number', 'No Page Number')}")
    print(f"Content: {doc.page_content[:150]}...")  # Print the first 150 characters of the content
    print("-" * 40)  # Separator for readability


Document 1:
Title: Unknown Title
Author: Unknown Author
Source: C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/GCP-[FALL-2023] GCP SOC 2..pdf
Page Number: 1
Content: System and Organization Controls (SOC) 2 Type II Report  
Description of the Google Cloud Platform System  
For the Period 1 November 2022 to 31 Octob...
----------------------------------------
Document 2:
Title: Unknown Title
Author: Unknown Author
Source: C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/GCP-[FALL-2023] GCP SOC 2..pdf
Page Number: 2
Content: Table of Contents  
SECTION I - Google's Management Assertion  ................................ ................................ ........ 1 
SECTION I...
----------------------------------------
Document 3:
Title: Unknown Title
Author: Unknown Author
Source: C:/Users/ansutton/Desktop/TPRM/TPRM-

In [10]:
from langchain_community.chat_models import ChatOllama
from langchain.chains import RetrievalQA

In [11]:
# Name of the Ollama chat model used to generate the answers
local_model = "llama3.2"
llm = ChatOllama(
    model=local_model,
)

In [12]:
# Retrieval QA chain over the vector store. The retriever is configured with
# k=3, and return_source_documents=True makes the chain result include
# 'source_documents', which get_answer reads to build its citations.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True,
)

In [13]:
def get_answer(query):
    """
    Ask the retrieval QA chain a question and collect the sources it used.

    Args:
        query (str): Question to send to ``qa_chain``.

    Returns:
        dict: A dictionary with three keys:
            "response": the chain's answer (``result['result']``),
            "pages": list of ``(source, page_number)`` tuples, one per
                returned source document,
            "citations": list of ``(source, page_number, page_content)``
                tuples for the same documents, in the same order.
    """
    # invoke() replaces the deprecated direct call ``qa_chain({...})``.
    result = qa_chain.invoke({"query": query})
    response = result['result']

    # Include source in the citation
    citations = [
        (doc.metadata['source'], doc.metadata['page_number'], doc.page_content)
        for doc in result['source_documents']
    ]
    return {
        "response": response,
        "pages": [(source, page_number) for source, page_number, _ in citations],
        "citations": citations,
    }

In [14]:
# Question passed to get_answer() in the next cell
query = "What access control procedures are in place?"

In [15]:
# Run the question through the QA chain; the result holds the answer plus its sources
structured_answer = get_answer(query=query)

  warn_deprecated(
OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s]

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 10.41it/s]


In [16]:
# Answer text returned by the QA chain
print(structured_answer["response"])

The text does not specifically describe the "access control procedures" that are in place. However, it does mention some guidelines and standards related to access control, such as:

* Common Criteria 6.1: The entity implements logical access security software, infrastructure, and architectures over protected information assets.
* Common Criteria 4.1: The entity selects, develops, and performs ongoing evaluations to ascertain whether the components of internal control are present and functioning.
* Common Criteria 5.3: The entity deploys control activities through policies that establish what is expected and procedures that put policies into action.

It can be inferred that an access control process exists, but the specific details and procedures are not explicitly stated in the text.


In [17]:
# (source, page_number) tuples of the source documents returned with the answer
print(structured_answer["pages"])

[('C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/SOC 2 Type 2_Adobe.pdf', 75), ('C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/GCP-[FALL-2023] GCP SOC 2..pdf', 61), ('C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/GCP-[FALL-2023] GCP SOC 2..pdf', 59)]


In [18]:
# (source, page_number, page_content) tuples; note this prints the full text of every cited chunk
print(structured_answer["citations"])

[('C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/SOC 2 Type 2_Adobe.pdf', 75, 'applicable \n• Access start date \n• Access duration Inspected the physical security system \nworkflow to determine whether \nrequests for physical access required management approval and required documented specification of: \n• Account type (e.g., visitor, vendor, \nor regular) \n• Access privileges granted \n• Intended business purpose \n• Visitor identification method, if \napplicable \n• Temporary badge issued, if \napplicable \n• Access start date \n• Access duration No exceptions noted. \nInspected physical access request \ndocumentation for a selection of new \nphysical access requests to the Adobe-\nowned data center and data rooms to determine whether access is approved. No exceptions noted.'), ('C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-