In [1]:
# !pip install  -r ../requirements_pipelines.txt

In [2]:
# # Define the base path to process the data
# base_path = '/mnt/i'

# # Create an Object to process the data
# process_data = ProcessData()

# # Iterate over all files in a directory
# for dirpath, dirnames, filenames in os.walk(base_path):

#     # Explore the files
#     for filename in filenames:
        
#         # Extract the extension
#         file_extension = os.path.splitext(filename)[1].lower()

#         if file_extension == '.pdf':
            
#             # Process the PDF file
#             result = process_data.process_pdf(os.path.join(dirpath, filename))
            
#             # If the file was processed successfully
#             if result == 'Success':
#                 # How many files were processed
#                 with open(process_data.history_file, 'r') as file:
#                     number_files = len(file.read().splitlines())
#                     print("Number of files processed: ", number_files)


In [3]:
# Python Imports
import os
import re

# Third party imports
from dotenv import load_dotenv


# Langchain imports
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.documents import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma

# Load the environment variables
load_dotenv(override=True)

True

In [4]:
class ProcessData:
    def __init__(self,
                 data_directory: str =  "/home/apec_ai/AI_Develops/AI_APEC_Develop/data/APEC_ChromaDB_VectorStore",
                 history_file: str = "/home/apec_ai/AI_Develops/AI_APEC_Develop/data/processed_files.txt",
                 error_file: str = "/home/apec_ai/AI_Develops/AI_APEC_Develop/data/error_files.txt",
                 embedding_model: str = "text-embedding-3-large"):
        
        # List to store the documents temporarily
        self.documents = []

        # History file
        os.makedirs(os.path.dirname(history_file), exist_ok=True)
        self.history_file = history_file

        # Error file
        os.makedirs(os.path.dirname(error_file), exist_ok=True)
        self.error_file = error_file

        # Define the Database to store the embeddings
        self.data_directory = data_directory

        # Load already processed files into memory
        self.processed_files = self.load_processed_files()

        # Define the initial text splitter
        self.text_splitter = CharacterTextSplitter(
            separator="\n",  # Use line breaks as separator
            chunk_size=5000,
            chunk_overlap=500,
            length_function=len,
            is_separator_regex=False
        )

        # Embedding Model
        self.embedding_openai = OpenAIEmbeddings(model=embedding_model)

        # Verify the data directory
        os.makedirs(self.data_directory, exist_ok=True)  # Create the directory and any necessary parent directories
        os.chmod(self.data_directory, 0o777) 
        # Vector Store Chroma
        self.vector_store = Chroma(
            collection_name="APEC_collection",
            embedding_function=self.embedding_openai,
            persist_directory=self.data_directory,  
        )

    def load_processed_files(self):
        """
        Loads the processed files from the history file into a list
        """
        try:
            with open(self.history_file, 'r') as file:
                return set(file.read().splitlines())
        except FileNotFoundError:
            return set()

    def process_pdf(self, filepath: str):
        """
        This Method processes a PDF file and puts the documents into the self.documents list
        :param filepath: The path to the PDF file
        :return: 'Success' if the file was processed, 'Skipped' if it was already processed
        """

        try:
            # Verify if the file was already processed
            if filepath in self.processed_files:
                print(f"{filepath} already processed. Skipping.")
                return 'Skipped'

            # Load the file
            loader = UnstructuredPDFLoader(filepath)
            data = loader.load()

            # Split the text into chunks
            documents_pdf = self.text_splitter.split_documents(data)
            print("Number of documents:", len(documents_pdf))

            # Save the processed data
            self.documents.extend(documents_pdf)

            # If there are many documents, save and free the memory
            if len(self.documents) > 50:
                status_save = self.save_procceced_data_into_vector_store()
                if status_save == 'Success':
                    self.documents = []  # Clear documents to free memory
                    print("Data was processed and saved")

            # Save the file in the history after processing
            self.processed_files.add(filepath)  # Add to in-memory history
            with open(self.history_file, 'a') as file:
                file.write(filepath + "\n")

            return 'Success'
        except Exception as e:
            print(f"Error processing file {filepath}: {e}")
            with open(self.error_file, 'a') as file:
                file.write(filepath + str(e) + "\n")
            return 'Error'

    def save_procceced_data_into_vector_store(self):
        """
        Saves the processed data into a vector store
        :param documents_to_save: List of processed documents
        :return: 'Success' if saved successfully
        """
        # Save the data in the vector store
        self.vector_store.add_documents(documents=self.documents)

        return 'Success'


Processing the data to a Vector Store Database

In [5]:
# Define the base path to process the data
base_path = '/mnt/i'

# Create an Object to process the data
process_data = ProcessData()

# Iterate over all files in a directory
for dirpath, dirnames, filenames in os.walk(base_path):

    # Explore the files
    for filename in filenames:
        
        # Extract the extension
        file_extension = os.path.splitext(filename)[1].lower()

        if file_extension == '.pdf':
            
            # Process the PDF file
            result = process_data.process_pdf(os.path.join(dirpath, filename))
            
            # If the file was processed successfully
            if result == 'Success':
                # How many files were processed
                with open(process_data.history_file, 'r') as file:
                    number_files = len(file.read().splitlines())
                    print("Number of files processed: ", number_files)


  self.embedding_openai = OpenAIEmbeddings(model=embedding_model)


/mnt/i/DX_Promote_Enhanced_Car_Wash_for_AX12_and_Anthem_UX_Rev_02_1_.pdf already processed. Skipping.
/mnt/i/ATG/Integra/Complete Integra 100, 500 configuration guide.pdf already processed. Skipping.
/mnt/i/ATG/Integra/My Tank Info/m2021-integra-configuration-guide pg 48.pdf already processed. Skipping.
/mnt/i/ATG/ProGauge/M2051 LX 4 and LX Plus Configuration Manual.pdf already processed. Skipping.
/mnt/i/ATG/Veeder-root tech docs/Tech_Docs.PDF already processed. Skipping.
/mnt/i/ATG/Veeder-root tech docs/AdobeReader6/Help/ENU/Reader.pdf already processed. Skipping.


  from .autonotebook import tqdm as notebook_tqdm


Error processing file /mnt/i/ATG/Veeder-root tech docs/AdobeReader6/Reader/Messages/ENU/RdrMsgENU.pdf: Unable to get page count.
Command Line Error: Incorrect password

Number of documents: 0
Number of files processed:  30
Number of documents: 1
Number of files processed:  31
Number of documents: 0
Number of files processed:  32
Number of documents: 1
Number of files processed:  33
/mnt/i/ATG/Veeder-root tech docs/AdobeReader6/Resource/ENUtxt.pdf already processed. Skipping.


The PDF <_io.BufferedReader name='/mnt/i/ATG/Veeder-root tech docs/Drawings/329839-001.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Number of documents: 13
Number of files processed:  34


The PDF <_io.BufferedReader name='/mnt/i/ATG/Veeder-root tech docs/Drawings/330583-001.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Number of documents: 8
Number of files processed:  35


The PDF <_io.BufferedReader name='/mnt/i/ATG/Veeder-root tech docs/Drawings/331651-001.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Number of documents: 10
Number of files processed:  36


The PDF <_io.BufferedReader name='/mnt/i/ATG/Veeder-root tech docs/Drawings/331793-001.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Number of documents: 7
Number of files processed:  37


The PDF <_io.BufferedReader name='/mnt/i/ATG/Veeder-root tech docs/Drawings/331794-001.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Number of documents: 7
Number of files processed:  38


The PDF <_io.BufferedReader name='/mnt/i/ATG/Veeder-root tech docs/Drawings/331940-001.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
Created a chunk of size 1133, which is longer than the specified 500


Number of documents: 15
Data was processed and saved
Number of files processed:  39


The PDF <_io.BufferedReader name='/mnt/i/ATG/Veeder-root tech docs/Drawings/331940-002.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
Created a chunk of size 2837, which is longer than the specified 500


Number of documents: 11
Number of files processed:  40


The PDF <_io.BufferedReader name='/mnt/i/ATG/Veeder-root tech docs/Drawings/331940-003.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
Created a chunk of size 1552, which is longer than the specified 500


Number of documents: 13
Number of files processed:  41


Created a chunk of size 1902, which is longer than the specified 500


Number of documents: 4
Number of files processed:  42


The PDF <_io.BufferedReader name='/mnt/i/ATG/Veeder-root tech docs/Drawings/331940-005.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Number of documents: 8
Number of files processed:  43


The PDF <_io.BufferedReader name='/mnt/i/ATG/Veeder-root tech docs/Drawings/331940-006.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Number of documents: 15
Data was processed and saved
Number of files processed:  44


The PDF <_io.BufferedReader name='/mnt/i/ATG/Veeder-root tech docs/Drawings/331940-008.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Number of documents: 13
Number of files processed:  45


The PDF <_io.BufferedReader name='/mnt/i/ATG/Veeder-root tech docs/Drawings/332094-001.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Number of documents: 2
Number of files processed:  46


The PDF <_io.BufferedReader name='/mnt/i/ATG/Veeder-root tech docs/Drawings/332280-001.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Number of documents: 3
Number of files processed:  47


The PDF <_io.BufferedReader name='/mnt/i/ATG/Veeder-root tech docs/Drawings/332377-001.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Number of documents: 2
Number of files processed:  48


The PDF <_io.BufferedReader name='/mnt/i/ATG/Veeder-root tech docs/Drawings/332771-001.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Number of documents: 3
Number of files processed:  49
/mnt/i/ATG/Veeder-root tech docs/Manuals/576013-285.pdf already processed. Skipping.
/mnt/i/ATG/Veeder-root tech docs/Manuals/576013-301.pdf already processed. Skipping.
/mnt/i/ATG/Veeder-root tech docs/Manuals/576013-306.pdf already processed. Skipping.
/mnt/i/ATG/Veeder-root tech docs/Manuals/576013-308.pdf already processed. Skipping.
/mnt/i/ATG/Veeder-root tech docs/Manuals/576013-498.pdf already processed. Skipping.
/mnt/i/ATG/Veeder-root tech docs/Manuals/576013-499.pdf already processed. Skipping.


The PDF <_io.BufferedReader name='/mnt/i/ATG/Veeder-root tech docs/Manuals/576013-567.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
Created a chunk of size 506, which is longer than the specified 500
Created a chunk of size 585, which is longer than the specified 500
Created a chunk of size 605, which is longer than the specified 500
Created a chunk of size 509, which is longer than the specified 500
Created a chunk of size 731, which is longer than the specified 500
Created a chunk of size 583, which is longer than the specified 500
Created a chunk of size 784, which is longer than the specified 500
Created a chunk of size 561, which is longer than the specified 500
Created a chunk of size 689, which is longer than the specified 500
Created a chunk of size 610, which is longer than the specified 500
Created a chunk of size 1859, which is longe

Number of documents: 273
Data was processed and saved
Number of files processed:  50


Created a chunk of size 550, which is longer than the specified 500
Created a chunk of size 720, which is longer than the specified 500
Created a chunk of size 536, which is longer than the specified 500
Created a chunk of size 2737, which is longer than the specified 500
Created a chunk of size 536, which is longer than the specified 500
Created a chunk of size 610, which is longer than the specified 500
Created a chunk of size 705, which is longer than the specified 500
Created a chunk of size 699, which is longer than the specified 500
Created a chunk of size 573, which is longer than the specified 500
Created a chunk of size 1096, which is longer than the specified 500
Created a chunk of size 664, which is longer than the specified 500
Created a chunk of size 689, which is longer than the specified 500
Created a chunk of size 536, which is longer than the specified 500
Created a chunk of size 524, which is longer than the specified 500


Number of documents: 279
Data was processed and saved
Number of files processed:  51


Created a chunk of size 707, which is longer than the specified 500
Created a chunk of size 585, which is longer than the specified 500
Created a chunk of size 588, which is longer than the specified 500
Created a chunk of size 608, which is longer than the specified 500
Created a chunk of size 811, which is longer than the specified 500
Created a chunk of size 612, which is longer than the specified 500
Created a chunk of size 578, which is longer than the specified 500
Created a chunk of size 2965, which is longer than the specified 500
Created a chunk of size 2034, which is longer than the specified 500
Created a chunk of size 964, which is longer than the specified 500
Created a chunk of size 691, which is longer than the specified 500
Created a chunk of size 614, which is longer than the specified 500
Created a chunk of size 577, which is longer than the specified 500
Created a chunk of size 593, which is longer than the specified 500
Created a chunk of size 1260, which is longer 

Number of documents: 393
Data was processed and saved
Number of files processed:  52
/mnt/i/ATG/Veeder-root tech docs/Manuals/576013-589.pdf already processed. Skipping.
/mnt/i/ATG/Veeder-root tech docs/Manuals/576013-607.pdf already processed. Skipping.
/mnt/i/ATG/Veeder-root tech docs/Manuals/576013-610.pdf already processed. Skipping.
/mnt/i/ATG/Veeder-root tech docs/Manuals/576013-614.pdf already processed. Skipping.
/mnt/i/ATG/Veeder-root tech docs/Manuals/576013-616.pdf already processed. Skipping.
/mnt/i/ATG/Veeder-root tech docs/Manuals/576013-617.pdf already processed. Skipping.
/mnt/i/ATG/Veeder-root tech docs/Manuals/576013-623.pdf already processed. Skipping.
/mnt/i/ATG/Veeder-root tech docs/Manuals/576013-632.pdf already processed. Skipping.
/mnt/i/ATG/Veeder-root tech docs/Manuals/576013-635.pdf already processed. Skipping.
/mnt/i/ATG/Veeder-root tech docs/Manuals/576013-637.pdf already processed. Skipping.


KeyboardInterrupt: 

In [12]:
results = process_data.vector_store.similarity_search_with_score(
    "How to create a car wash media set", k=4
)
for res, score in results:
    print(f"* [SIM={score:3f}] \n{res.page_content} \n[{res.metadata}]")

print(results)

* [SIM=0.753500] 
NOTE!
You cannot reuse the same car wash panel media for multiple sites if they do not have the same softkey number. Only one tag can be entered; you must upload a copy of the image file and uniquely tag it for each site with different softkey numbers.
When the image files have been uploaded and tagged, create a Carwash media set per the user guide. 
[{'source': '/mnt/i/DX_Promote_Enhanced_Car_Wash_for_AX12_and_Anthem_UX_Rev_02_1_.pdf'}]
* [SIM=0.813229] 
NOTE!
The car wash option panels can only be used for sites with three (3) or fewer car wash programs. On the point-of-sale (POS) system, the car wash programs must be assigned to softkey 1 (left), softkey 2 (center) and softkey 3 (right) to be associated with the corresponding tagged car wash panel in DX Promote. See Uploading and Tagging Images on page 3 to associate wash programs with option panels via tagging.
Media Specifications 
[{'source': '/mnt/i/DX_Promote_Enhanced_Car_Wash_for_AX12_and_Anthem_UX_Rev_02_1_.