Started once text preprocessing and vector database setup was finished :)

In [12]:
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Chroma
from langchain.llms import openai

class VectorDatabaseHandler:
    def __init__(self, model_name='all-MiniLM-L6-v2', persist_directory='vector_db'):
        # Initialize the SentenceTransformer model
        self.model = SentenceTransformer(model_name)

        # Set up Chroma vector store
        self.embedding_function = SentenceTransformerEmbeddingFunction(self.model)
        self.vector_store = Chroma(
            collection_name="document_collection",
            embedding_function=self.embedding_function,
            persist_directory=persist_directory  # Directory for persistent Chroma storage
        )

    def load_vector_database(self, filename='vector_database.pkl'):

        """Load the saved vector database from a pickle file."""
        with open(filename, 'rb') as file:
            vector_database = pickle.load(file)
        print(f"Vector database loaded from {filename}")

        return vector_database['embeddings'], vector_database['metadata']

    def save_vector_database(self, embeddings_list, metadata_list, filename='vector_database.pkl'):
        """Save embeddings and metadata to a pickle file."""
        vector_database = {
            'embeddings': embeddings_list,
            'metadata': metadata_list
        }
        with open(filename, 'wb') as file:
            pickle.dump(vector_database, file)
        print(f"Vector database saved to {filename}")

    def generate_embeddings(self, texts):
        """Generate embeddings for a list of texts."""
        return self.model.encode(texts, convert_to_numpy=True)

    def populate_vector_store_from_saved_data(self, filename='vector_database.pkl'):
        """Populate the vector store with data loaded from the saved vector database."""

        try: 
            # Load data from the saved pickle file
            embeddings, metadata = self.load_vector_database(filename)
            embeddings = [
                np.nan_to_num(embedding, nan=0.0, posinf=0.0, neginf=0.0)
                for embedding in embeddings
            ]

            # Ensure embeddings are in the correct format (list of lists)
            if isinstance(embeddings, np.ndarray):
                embeddings = embeddings.tolist()  # Convert to list if it's a numpy array
            elif not isinstance(embeddings, list):
                raise ValueError("Embeddings should be a list or numpy array.")

            # Ensure embeddings are not empty
            if isinstance(embeddings, np.ndarray):  # If embeddings is a NumPy array
                if len(embeddings)== 0:
                    raise ValueError("Embeddings are empty.")
            elif isinstance(embeddings, list):  # If embeddings is a list
                if len(embeddings) == 0:
                    raise ValueError("Embeddings are empty.")
            else:
                raise ValueError("Unknown type for embeddings. Expected list or numpy.ndarray.")

            # old version of flattening
            # embeddings = [embedding if isinstance(embedding, list) else embedding.tolist() for embedding in embeddings]
            # new version of flattering : ensure each embedding is a list (flattening if necessary)
            embeddings = [embedding.flatten().tolist() if isinstance(embedding, np.ndarray) else embedding for embedding in embeddings]

            # Ensure embeddings is not empty
            if len(embeddings)==0:
                raise ValueError("Embeddings are empty.")

            # Ensure metadata is correctly structured
            if not metadata:
                raise ValueError("Metadata is empty.")

            # Ensures original text exists
            texts = [meta['original_text'] for meta in metadata if 'original_text' in meta]
            if not texts:
                raise ValueError("No 'original_text' key found in metadata.")

            # Ensures embeddings are numeric
            for idx, embedding in enumerate(embeddings):
                if not all(isinstance(val, (int, float, np.floating, np.integer)) for val in embedding):
                    raise ValueError(f"Embedding at index {idx} contains non-numeric values: {embedding}")

            # Add embeddings and metadata to the vector store
            self.vector_store.add_texts(
                texts=[meta['original_text'] for meta in metadata],  # Assuming original_text is part of the metadata
                metadatas=metadata,
                embeddings=embeddings  # Now we directly pass embeddings
            )

            # Add texts and embeddings one by one to debug
            # for i in range(len(metadata)):
            #     try:
            #         self.vector_store.add_texts(
            #             texts=[metadata[i]['original_text']],  # Add a single text at a time
            #             metadatas=[metadata[i]],  # Add the corresponding metadata
            #             embeddings=[embeddings[i]]  # Add a single embedding
            #         )
            #     except Exception as e:
            #         print(f"Error adding item {i}: {e}")

            self.vector_store.persist()
            print("Vector store populated with data from saved file.")

            # debugging the data structure
            print(f"Embeddings type: {type(embeddings)}, length: {len(embeddings)}")
            print(f"Metadata type: {type(metadata[:3])}, length: {len(metadata)}")
        except ValueError as e:
            print(f"ValueError occured: {e}")
        except Exception as e:
            print(f"An error occured: {e}")

        # test the data after loading
        print(f"Loaded embeddings: {embeddings[:3]}")  # Preview first 3 embeddings
        print(f"Loaded metadata: {metadata[:3]}")    # Preview first 3 metadata entries
        print(f"Embeddings preview: {embeddings[:3]}")  # Check first 3 embeddings
        # does not print so len of embeddings and metadata match so not the problem
        if len(embeddings) != len(metadata):
            raise ValueError(f"Number of embeddings ({len(embeddings)}) does not match number of metadata ({len(metadata)}).")
        # both print class list so that's not the problem
        print(f"Type of embeddings: {type(embeddings)}")
        print(f"Type of metadata: {type(metadata)}")

    def handle_query(self, user_query, k=2):
        """Handles a user query by retrieving relevant documents and generating a response."""
        # Perform similarity search in the vector database
        results = self.vector_store.similarity_search(
            query=user_query,
            k=k  # Number of relevant documents to retrieve
        )

        # Extract relevant documents and metadata
        retrieved_docs = [result['text'] for result in results]
        retrieved_metadata = [result['metadata'] for result in results]

        # Construct a simple prompt without using PromptTemplate
        prompt = f"""
        You are an AI assistant. Below are some relevant documents retrieved based on a user's query.
        Use this information to generate a concise and helpful response.

        Relevant Documents:
        {retrieved_docs}

        User Query:
        {user_query}

        Your Response:
        """

        # Initialize an LLM (e.g., OpenAI's GPT) for response generation
        llm = openai(model_name="gpt-4")  # Replace with your preferred LLM model or API key setup

        # Generate the response
        response = llm(prompt)
        return response

class SentenceTransformerEmbeddingFunction:
    def __init__(self, model):
        self.model = model

    # Embedding method for documents
    def embed_documents(self, texts):
        embeddings = self.model.encode(texts, convert_to_numpy=True)
        return embeddings

    # Embedding method for queries
    def embed_query(self, query):
        embedding = self.model.encode([query], convert_to_numpy=True)
        return embedding


# Example usage
if __name__ == "__main__":
    # Initialize VectorDatabaseHandler
    vector_db_handler = VectorDatabaseHandler()

    # Step 1: Populate the vector store with the data from the saved vector database
    vector_db_handler.populate_vector_store_from_saved_data()

    # Step 2: Define a function to process a user query and generate a response
    user_query = "how to make chicken"
    response = vector_db_handler.handle_query(user_query)

    # Step 3: Display the response
    print(f"User Query: {user_query}")
    print(f"AI Response:\n{response}")

Vector database loaded from vector_database.pkl
ValueError occured: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Loaded embeddings: [[-0.028531987220048904, -0.017205100506544113, -0.01807147078216076, -0.025295797735452652, -0.02556699886918068, -0.03505215793848038, -0.005073211621493101, 0.037991516292095184, 0.05599883571267128, 0.05340452119708061, 0.02292126603424549, 0.01862766221165657, -0.08795951306819916, -0.012520243413746357, 0.012860649265348911, -0.021059958264231682, 0.018680723384022713, 0.05720271170139313, -0.09307244420051575, -0.004500418435782194, 0.1516454517841339, -0.01776779256761074, -0.12361284345388412, 0.027725908905267715, 0.04199016094207764, -0.030380191281437874, -0.03875354677438736, 0.0005685559008270502, -0.05026441812515259, 0.08586353063583374, 0.02625224180519581, 0.014997834339737892, -0.022484201937913895, 0.04712257534265518, 0.024279026314616203, 0.047875892370939255, 0.013904040679335594, 0.0032

ValueError: Expected each value in the embedding to be a int or float, got an embedding with ['ndarray'] - [[-6.07468039e-02 -4.96362261e-02 -3.19795497e-02 -2.53695971e-03
  -3.09494585e-02 -3.39189311e-03  2.99431756e-02 -3.21766213e-02
   1.97168719e-02 -1.28607675e-02 -1.46663059e-02 -4.06498201e-02
  -3.48833241e-02 -3.47304493e-02 -3.92895080e-02 -6.10014647e-02
   2.16313563e-02 -7.56658334e-03 -2.43046340e-02 -8.01153854e-02
  -2.52891816e-02 -8.94731283e-02  6.37236312e-02 -2.95828450e-02
   1.13018928e-02  4.55256412e-03  5.39371595e-02  3.66967432e-02
   2.76301056e-02 -1.12386756e-02  8.90882611e-02 -3.67554352e-02
  -5.01450012e-03 -1.47742992e-02 -4.10877839e-02 -5.58128767e-03
   6.59301784e-03 -2.65122368e-03  7.13664368e-02  3.71642932e-02
   4.88988310e-02 -6.09838292e-02  9.86229926e-02 -3.71611193e-02
   9.10603032e-02  4.72435318e-02  2.93108169e-02  4.37863022e-02
   4.92542349e-02 -1.15918107e-01 -5.89001328e-02 -2.56316252e-02
  -5.35212606e-02 -7.29585811e-02  1.77037250e-02  1.15508027e-01
  -1.45816401e-01 -3.91526073e-02 -5.88883311e-02 -1.93833783e-02
  -2.38132440e-02  4.56814952e-02 -2.59309299e-02 -4.32808930e-03
   9.85720474e-03 -6.76927045e-02  2.80272588e-03  5.43777831e-02
   4.89798933e-03  6.56978488e-02  1.97341163e-02  2.59065954e-03
  -9.68443006e-02  5.02866916e-02 -5.97373620e-02  4.04034890e-02
   9.73214433e-02 -4.80916239e-02  2.49177087e-02  4.62183617e-02
  -1.03004545e-01 -3.11266948e-02 -4.01595607e-02  5.53395599e-02
  -2.14626510e-02  6.57580197e-02  5.68868369e-02 -1.09659322e-02
  -1.57159101e-02 -1.85073949e-02  1.36852220e-01  5.57755381e-02
  -6.42761365e-02 -4.17265855e-03  2.13188268e-02  2.78739780e-02
   3.04390080e-02 -5.92880473e-02  2.14639530e-02  4.59595546e-02
  -3.68463621e-02 -5.55031747e-02  1.82604380e-02 -8.35875049e-02
   8.08737427e-03  1.09882951e-02  6.18798956e-02  5.60033508e-02
   5.58578745e-02  2.36243866e-02 -1.12473473e-01  1.40656075e-02
  -2.54402198e-02 -1.65362358e-02 -1.72278460e-03 -7.39192544e-03
   5.28704897e-02 -3.90432477e-02 -6.48268759e-02 -2.72328779e-02
   5.65361232e-02 -3.95386331e-02 -1.68947391e-02  6.28310964e-02
  -4.20579538e-02  5.40958419e-02 -1.79190692e-02 -4.19297182e-33
   4.77216654e-02  5.83389355e-03  7.15545639e-02  3.53285111e-02
   8.22971687e-02  6.60515353e-02  2.45957319e-02 -5.75612001e-02
   1.53749837e-02  6.47922009e-02  3.25948745e-02 -1.08789608e-01
  -3.92969921e-02  1.13123804e-02  7.16809407e-02 -8.02825019e-02
   3.99284624e-03 -2.93326899e-02  3.50708850e-02 -9.63119790e-03
  -3.61054987e-02  1.54529354e-02  8.10517222e-02  7.22448621e-03
   2.67434549e-02  5.89251472e-03  6.53354963e-03 -9.82497782e-02
  -6.26112800e-03 -1.13165868e-03  1.01165557e-02  2.37634778e-02
  -8.54219273e-02 -3.92655376e-03 -4.57816981e-02 -1.29701542e-02
  -7.58085167e-03 -1.87320784e-02  5.41326664e-02  9.13322344e-02
   1.06568500e-01 -1.03285186e-01  1.79458819e-02  2.50588395e-02
   1.96392089e-02 -9.67319030e-03  2.15503629e-02  2.04022657e-02
  -6.66481629e-03  4.85118059e-03  6.21492006e-02  9.39984620e-03
   3.15813161e-02 -6.58064634e-02 -7.37468433e-03 -1.57435574e-02
   1.36217475e-02 -6.83898330e-02  6.89710751e-02  1.90121084e-02
  -9.61529687e-02  5.72712049e-02 -3.46478522e-02  7.55792111e-02
  -3.31610590e-02 -5.57503812e-02 -4.34450209e-02 -3.66966054e-02
   3.77061515e-04 -3.89919057e-02  5.29671125e-02 -9.17219967e-02
   1.54140331e-02 -1.28890857e-01 -6.80391043e-02  2.79585831e-02
  -8.25636461e-02  2.74405777e-02  7.18056485e-02 -4.72764075e-02
   2.82813627e-02  4.33321744e-02 -6.64566308e-02  5.92343626e-04
   1.07612263e-03 -1.69535894e-02 -8.87385681e-02  3.16514298e-02
   1.30779982e-01 -1.72719695e-02 -3.94071080e-02 -4.48481515e-02
   9.26359296e-02 -1.00752026e-01 -4.46815938e-02  2.03678310e-33
  -7.01271817e-02  1.77483466e-02 -7.75242224e-02  6.13280721e-02
  -2.08666530e-02 -8.67476091e-02 -1.14128608e-02 -1.95971541e-02
   3.71343866e-02 -1.77502795e-03 -3.99317890e-02 -5.13824895e-02
   3.29449140e-02 -5.21093681e-02 -5.33130132e-02  1.36906570e-02
   6.32445291e-02  1.47469357e-01  2.08395664e-02 -1.44533124e-02
  -3.83652304e-03  4.49902471e-03  1.60082821e-02  5.56847490e-02
  -1.19373191e-03  2.04020664e-02  8.63016397e-02  4.57356274e-02
   2.82740314e-03 -1.69448424e-02  1.62887108e-02 -1.09908618e-01
   1.84120834e-02 -1.37236426e-02 -7.22369552e-02  7.86645152e-03
   6.11488558e-02  1.78439040e-02  8.39602202e-02 -6.88061640e-02
   1.69641022e-02  5.65474562e-04 -1.43876793e-02  2.73795892e-02
  -2.60449778e-02  1.13080963e-02  2.10202672e-02 -5.86355925e-02
   6.65383041e-03  2.18503252e-02  5.42301163e-02 -1.07885614e-01
   5.23861460e-02 -7.51687735e-02  2.06369776e-02 -3.16794850e-02
   9.94829368e-03  1.62049886e-02  8.14796463e-02  3.24025042e-02
  -4.45142165e-02  2.05299761e-02  9.18493047e-03 -2.48337965e-02
   5.32786846e-02  2.17113905e-02 -5.44015737e-03  7.08642676e-02
   1.13499518e-02 -4.19614604e-03 -8.90089273e-02  6.46590367e-02
   3.19755524e-02  3.59646119e-02 -2.87028844e-03  1.14975329e-02
   1.45604033e-02  7.99964182e-03  3.81086357e-02  8.09740648e-03
   6.88227592e-03 -1.01566017e-01 -3.66732851e-02 -1.82638410e-02
  -2.63672136e-02 -4.18882398e-03  3.35844271e-02  5.02904728e-02
   3.25172432e-02  2.34288182e-02 -3.56808193e-02 -3.22638638e-02
   5.43617532e-02  1.33890316e-01  5.34578972e-02 -1.21088499e-08
  -2.26724986e-02 -1.03489928e-01  3.30106146e-03  3.63118649e-02
   5.28239086e-02  5.43956459e-02  2.26909854e-02 -9.42807198e-02
   1.30358502e-01 -8.86205807e-02 -6.46435395e-02  1.91010777e-02
   4.12633978e-02  3.58879901e-02 -1.10789528e-03 -2.13368814e-02
   3.10920738e-03  6.41010627e-02 -2.00082064e-02 -2.16573980e-02
   1.57821216e-02  3.16915847e-02 -3.53541300e-02 -3.02170292e-02
   8.36992711e-02 -4.27409001e-02 -1.77578181e-02  3.58883478e-02
  -1.17353536e-02  4.99735139e-02 -3.28164771e-02 -2.54536420e-03
   9.50534735e-03  4.32854556e-02 -2.47525405e-02 -1.02178492e-01
  -2.60708556e-02 -2.55090464e-03  7.50308973e-04 -6.23754337e-02
  -4.57558967e-02  9.10225287e-02 -1.34046078e-02 -4.20006812e-02
  -1.51762217e-02  1.35623747e-02 -8.67472142e-02 -4.36085463e-02
  -5.28549440e-02  7.01193735e-02  5.60413152e-02  2.98580417e-04
   1.31975159e-01 -3.10015623e-02  9.64637194e-03  1.54512031e-02
   1.73766259e-02 -7.89805315e-03  7.68832192e-02 -3.33809294e-02
  -5.88042028e-02  9.44017842e-02  3.17369513e-02  2.02115569e-02]] in query.