In [9]:
!pip install  -r ../requirements_pipelines.txt

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [10]:
# Python Imports
import os
import re

# Third party imports
from dotenv import load_dotenv


# Langchain imports
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.documents import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Load the environment variables
load_dotenv(override=True)

True

In [11]:
class ProcessData:
    def __init__(self,
                data_directory: str="../data/APEC_ChromaDB_VectorStore",
                history_file: str="../data/processed_files.txt",
                embedding_model: str="text-embedding-3-large",
                ):
        
        # List to store the documents temporarily
        self.documents = []

        # History file
        self.history_file = history_file

        # Define the Database to store the embeddings
        self.data_directory = data_directory
        # Define the initial text splitter by "\n\n" line break
        self.text_splitter = CharacterTextSplitter(
                                                    separator="\n",  # Use two line breaks as separator
                                                    chunk_size=500,  # Don't limit the chunk size
                                                    chunk_overlap=50,  # No overlap between chunks
                                                    length_function=len,
                                                    is_separator_regex=False  # Indicate that the separator is not a regular expression
                                                    )

        

        # Embedding Model
        self.embedding_openai = OpenAIEmbeddings(model=embedding_model)

    def process_pdf(self, filepath: str):
        """
        This Method processes a PDF file and put the documents in to the self.documents list
        :param filepath: The path to the PDF file
        :return: None
        """
        print("Processing the file: ", filepath)
        # Verify if the file was already processed
        # if not self.verify_file_was_already_processed(filepath):
        #     return 'Document already processed before'
        
        # Load the file
        loader = UnstructuredPDFLoader(filepath)
        data = loader.load()

        # Split the text into chunks
        documents_pdf = self.text_splitter.split_documents(data)
        print("Number of documents: ", len(documents_pdf))
        # Save the processed data
        self.documents.extend(documents_pdf)

        # Verify the number of documents
        number_documents = len(self.documents)

        # If there are many docuemnts save and free the memory
        if number_documents > 50:

            # Save the data
            status_save = self.save_procceced_data_into_vector_store(self.documents)

            # Verify if the process was successful
            if status_save == 'Success':
                self.documents = []
                print("The data was already proccesed and saved")

        # Save the file in the history
        with open(self.history_file, 'a') as file:
            file.write(filepath + "\n")

        print(self.documents)
        return 'Success'
    
    def save_procceced_data_into_vector_store(self, documents_to_save: list):
        """
        This method saves the processed data in to a list
        :param data: The data to be saved
        :return: None
        """
        # Create a Chroma Vector Store
        vectorstore_chroma = Chroma.from_documents(
                                                    documents=documents_to_save,
                                                    embedding=self.embedding_openai,
                                                    persist_directory=self.data_directory
                                                    )
        
        # Persist the data
        vectorstore_chroma.persist()

        return 'Success'
    
    def verify_file_was_already_processed(self, filepath: str):
        """
        This method verifies if the file was already processed
        :param filepath: The path to the file
        :return: None
        """
        # Read the history file
        with open(self.history_file, 'r') as file:
            history_files = file.read().splitlines()

        # Verify if the file is in the history
        if filepath in history_files:
            return True
        
        return False

Processing the data to a Vector Store Database

In [12]:
# Define the base path to process the data
base_path = '/mnt/i'

# Create an Object to process the data
process_data = ProcessData()

# Iterate over all files in a directory
for dirpath, dirnames, filenames in os.walk(base_path):

    # Explore the files
    for filename in filenames:
        
        # Extract the extension
        file_extension = os.path.splitext(filename)[1].lower()

        if file_extension.strip() == '.pdf':
            
            # Process the PDF file
            process_data.process_pdf(os.path.join(dirpath, filename))

Processing the file:  /mnt/i/DX_Promote_Enhanced_Car_Wash_for_AX12_and_Anthem_UX_Rev_02_1_.pdf
Number of documents:  12
[Document(metadata={'source': '/mnt/i/DX_Promote_Enhanced_Car_Wash_for_AX12_and_Anthem_UX_Rev_02_1_.pdf'}, page_content='Feature User Guide DFS DX™ Connected Solutions Platform\nDX Promote® Enhanced Car Wash Configuration for the DFS AX12 Enhanced and Anthem UX® Platforms\nRev. 02 November 2023\nThe default car wash prompting for the DFS Anthem iXPay2 payment platforms appears as shown in the figures below. The prompts and selections sent by the site POS (point-of-sale) system are used to populate fixed screens.'), Document(metadata={'source': '/mnt/i/DX_Promote_Enhanced_Car_Wash_for_AX12_and_Anthem_UX_Rev_02_1_.pdf'}, page_content='When a site is using DX® Promote — and dispensers are equipped with the DFS AX12 Enhanced or Anthem UX® platform — the back- ground and panels can be customized with images downloaded from DX Promote. This functionality is enabled by check

Created a chunk of size 654, which is longer than the specified 500
Created a chunk of size 569, which is longer than the specified 500
Created a chunk of size 577, which is longer than the specified 500
Created a chunk of size 547, which is longer than the specified 500
Created a chunk of size 544, which is longer than the specified 500
Created a chunk of size 648, which is longer than the specified 500
Created a chunk of size 579, which is longer than the specified 500
Created a chunk of size 648, which is longer than the specified 500
Created a chunk of size 579, which is longer than the specified 500
Created a chunk of size 1188, which is longer than the specified 500
Created a chunk of size 767, which is longer than the specified 500
Created a chunk of size 810, which is longer than the specified 500
Created a chunk of size 969, which is longer than the specified 500
Created a chunk of size 561, which is longer than the specified 500


Number of documents:  345


OperationalError: attempt to write a readonly database