Started once text preprocessing and vector database setup was finished :)

In [14]:
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Chroma
from langchain.llms import openai

class VectorDatabaseHandler:
    def __init__(self, model_name='all-MiniLM-L6-v2', persist_directory='vector_db'):
        # Initialize the SentenceTransformer model
        self.model = SentenceTransformer(model_name)

        # Set up Chroma vector store
        self.embedding_function = SentenceTransformerEmbeddingFunction(self.model)
        self.vector_store = Chroma(
            collection_name="document_collection",
            embedding_function=self.embedding_function,
            persist_directory=persist_directory  # Directory for persistent Chroma storage
        )

    def load_vector_database(self, filename='vector_database.pkl'):
        """Load the saved vector database from a pickle file."""
        with open(filename, 'rb') as file:
            vector_database = pickle.load(file)
        print(f"Vector database loaded from {filename}")
        return vector_database['embeddings'], vector_database['metadata']

    def save_vector_database(self, embeddings_list, metadata_list, filename='vector_database.pkl'):
        """Save embeddings and metadata to a pickle file."""
        vector_database = {
            'embeddings': embeddings_list,
            'metadata': metadata_list
        }
        with open(filename, 'wb') as file:
            pickle.dump(vector_database, file)
        print(f"Vector database saved to {filename}")

    def generate_embeddings(self, texts):
        """Generate embeddings for a list of texts."""
        return self.model.encode(texts, convert_to_numpy=True)

    def populate_vector_store_from_saved_data(self, filename='vector_database.pkl'):
        """Populate the vector store with data loaded from the saved vector database."""
        # Load data from the saved pickle file
        embeddings, metadata = self.load_vector_database(filename)
        
        # Ensure embeddings are in the correct format (list of lists)
        if isinstance(embeddings, np.ndarray):
            embeddings = embeddings.tolist()  # Convert to list if it's a numpy array
        elif not isinstance(embeddings, list):
            raise ValueError("Embeddings should be a list or numpy array.")
        
        # Ensure each embedding is a list (flattening if necessary)
        embeddings = [embedding if isinstance(embedding, list) else embedding.tolist() for embedding in embeddings]
        
        # Ensure embeddings is not empty
        if len(embeddings) == 0:
            raise ValueError("Embeddings are empty.")
        
        # Ensure metadata is correctly structured
        if not metadata:
            raise ValueError("Metadata is empty.")
        
        # Add embeddings and metadata to the vector store
        self.vector_store.add_texts(
            texts=[meta['original_text'] for meta in metadata],  # Assuming original_text is part of the metadata
            metadatas=metadata,
            embeddings=embeddings  # Now we directly pass embeddings
        )
        self.vector_store.persist()
        print("Vector store populated with data from saved file.")

    def handle_query(self, user_query, k=2):
        """Handles a user query by retrieving relevant documents and generating a response."""
        # Perform similarity search in the vector database
        results = self.vector_store.similarity_search(
            query=user_query,
            k=k  # Number of relevant documents to retrieve
        )

        # Extract relevant documents and metadata
        retrieved_docs = [result['text'] for result in results]
        retrieved_metadata = [result['metadata'] for result in results]

        # Construct a simple prompt without using PromptTemplate
        prompt = f"""
        You are an AI assistant. Below are some relevant documents retrieved based on a user's query.
        Use this information to generate a concise and helpful response.

        Relevant Documents:
        {retrieved_docs}

        User Query:
        {user_query}

        Your Response:
        """

        # Initialize an LLM (e.g., OpenAI's GPT) for response generation
        llm = openai(model_name="gpt-4")  # Replace with your preferred LLM model or API key setup

        # Generate the response
        response = llm(prompt)
        return response

class SentenceTransformerEmbeddingFunction:
    def __init__(self, model):
        self.model = model

    # Embedding method for documents
    def embed_documents(self, texts):
        return self.model.encode(texts)

    # Embedding method for queries
    def embed_query(self, query):
        return self.model.encode([query])


# Example usage
if __name__ == "__main__":
    # Initialize VectorDatabaseHandler
    vector_db_handler = VectorDatabaseHandler()

    # Step 1: Populate the vector store with the data from the saved vector database
    vector_db_handler.populate_vector_store_from_saved_data()

    # Step 2: Define a function to process a user query and generate a response
    user_query = "how to make chicken"
    response = vector_db_handler.handle_query(user_query)

    # Step 3: Display the response
    print(f"User Query: {user_query}")
    print(f"AI Response:\n{response}")


Vector database loaded from vector_database.pkl


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()