In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS
import faiss  # type: ignore
import numpy as np
import tempfile
import os
from PyPDF2 import PdfReader

In [2]:
def load_multiple_pdf_files(file_paths):
    """
    Load multiple PDF files using PyPDFLoader and extract metadata.

    Args:
        file_paths (list): List of file paths to PDF files.

    Returns:
        list: List of Document objects with content and metadata.
    """
    documents_with_metadata = []
    for file_path in file_paths:
        reader = PdfReader(file_path)
        
        for page_number, page in enumerate(reader.pages):
            content = page.extract_text()
            metadata = {
                "title": reader.metadata.get("title", "Unknown Title"),
                "author": reader.metadata.get("author", "Unknown Author"),
                "source": file_path,
                "page_number": page_number +1
            }
        documents_with_metadata.append(Document(page_content=content, metadata=metadata))
    
    return documents_with_metadata

In [3]:
def split_documents(raw_documents):
    """
    Split raw documents into chunks using RecursiveCharacterTextSplitter.

    Args:
        raw_documents (list): List of Document objects.

    Returns:
        list: List of split Document objects.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(raw_documents)


In [None]:
# class Document:
#     def __init__(self, text):
#         self.text = text

In [None]:
# def word_level_splitter(raw_documents, words_per_chunk=200):
#     """
#     Split raw documents into chunks based on a specified number of words.

#     Args:
#         raw_documents (list): List of Document objects.
#         words_per_chunk (int): Number of words per chunk.

#     Returns:
#         list: List of split Document objects.
#     """
#     split_documents = []
#     for doc in raw_documents:
#         # Split the document text into words
#         words = doc.text.split()
#         # Create chunks of specified number of words
#         for i in range(0, len(words), words_per_chunk):
#             chunk = ' '.join(words[i:i + words_per_chunk])
#             split_documents.append(Document(text=chunk))
#     return split_documents

In [4]:
def create_vector_store(documents):
    """
    Create a vector store from the documents.

    Args:
        documents (list): List of Document objects.

    Returns:
        FAISS: A FAISS vector store.
    """
    # Generate embeddings using OllamaEmbeddings
    embedding_model = OllamaEmbeddings(model='nomic-embed-text', show_progress=True)
    batch_size = 50
    texts = [doc.page_content for doc in documents]
    documents_embeddings = []


    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        batch_embeddings = embedding_model.embed_documents(batch_texts)
        documents_embeddings.extend(batch_embeddings)


    # Create FAISS index and add the embeddings
    document_embeddings_np = np.array(documents_embeddings)
    dimension = document_embeddings_np.shape[1]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(document_embeddings_np)

    docstore = InMemoryDocstore({i: doc for i, doc in enumerate(documents)})
    index_to_docstore_id = {i: i for i in range(len(documents))}

    vector_store = FAISS(embedding_model, faiss_index, docstore, index_to_docstore_id)

    return vector_store

In [5]:
# Example usage:
file_paths = [
    "C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/GCP-[FALL-2023] GCP SOC 2..pdf",
    "C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/APP_SHEET-[SPR-2023] AppSheet SOC 2..pdf"
]

In [6]:
# Load documents and extract metadata
raw_documents = load_multiple_pdf_files(file_paths)

In [7]:
# Split documents into chunks
documents = split_documents(raw_documents)

In [8]:
# Create a vector store from the documents
vector_store = create_vector_store(documents)

OllamaEmbeddings: 100%|██████████| 7/7 [00:03<00:00,  1.93it/s]


In [9]:
for i, doc in enumerate(documents):
    print(f"Document {i + 1}:")
    print(f"Title: {doc.metadata.get('title', 'No Title')}")
    print(f"Author: {doc.metadata.get('author', 'No Author')}")
    print(f"Source: {doc.metadata.get('source', 'No Source')}")
    print(f"Page Number: {doc.metadata.get('page_number', 'No Page Number')}")
    print(f"Content: {doc.page_content[:150]}...")  # Print the first 150 characters of the content
    print("-" * 40)  # Separator for readability


Document 1:
Title: Unknown Title
Author: Unknown Author
Source: C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/GCP-[FALL-2023] GCP SOC 2..pdf
Page Number: 203
Content: Google LLC | Other Information Provided by Google LLC  201 
Other Information Provided by Google LLC  
 
Internal Google Traffic  
Connections between...
----------------------------------------
Document 2:
Title: Unknown Title
Author: Unknown Author
Source: C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/GCP-[FALL-2023] GCP SOC 2..pdf
Page Number: 203
Content: Service to decrypt it. The encrypted key is not stored alongside the encrypted data.  
The wrapping keys needed to decrypt user data are only known to...
----------------------------------------
Document 3:
Title: Unknown Title
Author: Unknown Author
Source: C:/Users/ansutton/Desktop/TPRM/T

In [10]:
from langchain_community.chat_models import ChatOllama
from langchain.chains import RetrievalQA

In [11]:
local_model = "llama3.2" 
llm = ChatOllama(model=local_model)

In [12]:
qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type="stuff",
    retriever = vector_store.as_retriever(search_kwargs={"k":3}),
    return_source_documents = True
)

In [13]:
def get_answer(query):
    result = qa_chain({"query": query})
    response = result['result']

    source_documents = result['source_documents']
    pages = []
    citations = []
    for doc in source_documents:
        source = doc.metadata['source']
        pages.append((source,doc.metadata['page_number']))
        citations.append((source, doc.metadata['page_number'], doc.page_content))  # Include source in the citation
    return {
        "response": response,
        "pages": pages,
        "citations": citations
    }

In [14]:
query = "What access control procedures are in place?"

In [15]:
structured_answer = get_answer(query)

  warn_deprecated(
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 11.27it/s]


In [16]:
print(structured_answer["response"])

According to the provided context, the following access control procedures are in place:

1. Access is restricted to a limited number of individuals and applications.
2. All access to/from the Key Management Service (KMS) is controlled by Access Control Lists (ACLs).
3. Auditing is enabled to determine whether access is appropriate.

Additionally, it's mentioned that Google uses a proprietary system for key rotations, which implies that access control procedures are also in place to manage and rotate keys securely.


In [17]:
print(structured_answer["pages"])

[('C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/GCP-[FALL-2023] GCP SOC 2..pdf', 203), ('C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/APP_SHEET-[SPR-2023] AppSheet SOC 2..pdf', 116), ('C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/GCP-[FALL-2023] GCP SOC 2..pdf', 203)]


In [18]:
print(structured_answer["citations"])

[('C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/GCP-[FALL-2023] GCP SOC 2..pdf', 203, 'Service to decrypt it. The encrypted key is not stored alongside the encrypted data.  \nThe wrapping keys needed to decrypt user data are only known to the Key Management Service. \nAll access to/from the Key Management Service is controlled by ACLs. Access is restricted to a \nlimited number of individuals and applications, and auditing is enabled to determine whether \nacce ss is appropriate.  \nKey Rotations  \nGoogle uses a proprietary system to periodically generate and rotate an encryption key used to \nprotect user data at rest on average at least every 90 days. New wrapped encryption keys are \ngenerated for each new Google stora ge file (a Google file is defined in Encryption of Data Stored \nat Google above). The system helps ensure that key rotations are managed appropriately, and \nthat customer dat