This notebook is for managing the RAG vector database. It loads the database, lets you inspect its nodes, and lets you delete them if necessary.

In [1]:
from typing import Tuple

import chromadb


from llama_index.core import VectorStoreIndex, Settings, StorageContext
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.vector_stores.chroma import ChromaVectorStore



# Directory passed to chromadb.PersistentClient when the vector database is opened.
# NOTE(review): relative path — presumably resolved against the kernel's working directory; confirm.
chromadb_path = "chroma_db"


def init_vector_storage_retriever(
        model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
        top_k: int = 1,
        collection_name: str = "quickstart",
        ) -> Tuple[VectorStoreIndex, VectorIndexRetriever, ChromaVectorStore]:
    """
    Open the persistent Chroma collection and build an index and retriever on top of it.

    Side effect: sets the global ``Settings.llm`` to ``None`` so that llama_index
    does not try to use an OpenAI API key.

    Parameters:
        model_name (str): Name of the local embedding model, passed to llama_index as
            ``local:<model_name>``. Default is 'sentence-transformers/all-MiniLM-L6-v2'.
            Model names can be found at the Hugging Face model hub: https://huggingface.co/models
        top_k (int): Value passed to the retriever as ``similarity_top_k``
            (how many of the most similar nodes to return per query). Default is 1.
        collection_name (str): Name of the Chroma collection to open. It is obtained with
            ``get_or_create_collection``, so a missing collection is created rather than
            reported as an error. Default is 'quickstart'.

    Returns:
        Tuple[VectorStoreIndex, VectorIndexRetriever, ChromaVectorStore]:
            the index loaded from the stored vectors, a retriever bound to that index,
            and the underlying Chroma vector store.

    Raises:
        ValueError: If the model name is empty.

    Any exception raised by chromadb or llama_index (for example while loading the
    embedding model) propagates unchanged.
    """

    # Validate before touching global settings or the database on disk.
    if not isinstance(model_name, str) or not model_name.strip():
        raise ValueError("model_name must be a non-empty string")

    # Explicitly set LLM to None to prevent using OpenAI API key
    Settings.llm = None

    # initialize chromadb client
    db = chromadb.PersistentClient(path=chromadb_path)

    # get collection
    chroma_collection = db.get_or_create_collection(collection_name)

    # assign chroma as the vector_store to the context
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # load your index from stored vectors
    index = VectorStoreIndex.from_vector_store(
        vector_store,
        storage_context=storage_context,
        embed_model=f"local:{model_name}"
    )

    retriever = VectorIndexRetriever(index=index, similarity_top_k=top_k, use_metadata=False)

    return index, retriever, vector_store

# Open the persisted database once; the cells below reuse these three handles.
index, retriever, vector_store = init_vector_storage_retriever(
    top_k=1,
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/alex/anaconda3/envs/dwh/lib/python3.11/site-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Package punkt_tab is already up-to-date!


LLM is explicitly disabled. Using MockLLM.


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the stored nodes once so that `ids` and `nodes` come from the same result
# (the store was previously queried twice with identical arguments).
# Only the first 100 nodes are requested.
# NOTE(review): `_get` is an underscore-prefixed (non-public) method of the vector store
# and may change between library versions.
stored = index.storage_context.vector_store._get(limit=100, where={})
ids = stored.ids
nodes = stored.nodes

# Text of each node (the stored query).
queries = [node.text for node in nodes]
# `.get` yields None for a node without an "answer" entry instead of raising KeyError,
# so `answers` stays the same length as `nodes` and inspection is not blocked.
answers = [node.metadata.get("answer") for node in nodes]

In [3]:
# Display the ids of the fetched nodes; these are the values used to select nodes for deletion.
ids

['1634b47f-3a76-4aa2-93bb-5e235e100984',
 '7974a6fa-7eed-4149-9e4d-b911954532fc',
 'b9fdb5d6-031c-4d16-b0a6-ec02f60db106',
 'c5681639-86ef-4c6c-84e1-94f4841c83ae',
 'c9eaa81e-40d2-4c56-ba3b-4f32fad83f4a',
 'e336be11-7501-48e3-8ede-5078566de53e',
 'e537e8b3-dd83-4f68-89d6-775c27eeb370']

In [5]:
# Display the text of each fetched node.
queries

['Каков средний рейтинг продуктов по каждой категории',
 'How many orders were totally made',
 'How many unique items were totally sold?',
 'Выведи все страны и количество клиентов в каждой из них',
 'Show average product rating for each category',
 'How many unique customers bought items?',
 'Calculate average product price']

In [6]:
# Display the "answer" metadata value of each fetched node.
answers

['\nSELECT AVG(p.rating) as avg_rating, c.name as category_name\nFROM products p\nJOIN categories c ON p.category_id = c.id\nGROUP BY c.name;\n',
 'SELECT COUNT(id) AS total_orders FROM orders;',
 ' \nSELECT SUM(quantity) AS total_sold_items FROM order_items;\n',
 ' \nSELECT c.name as country_name, COUNT(DISTINCT u.id) as customer_count \nFROM countries c \nLEFT JOIN users u ON c.id = u.country_code \nGROUP BY c.name; \n',
 '\nSELECT AVG(p.rating) as avg_rating, c.name as category_name\nFROM products p\nJOIN categories c ON p.category_id = c.id\nGROUP BY c.name;\n',
 'SELECT COUNT(DISTINCT user_id) AS unique_customers FROM orders;',
 '\nSELECT AVG(price) AS avg_price FROM products;\n']

In [7]:
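# Destructive: uncomment to delete the selected nodes (here, the first two ids) from the vector store.
# index.storage_context.vector_store.delete_nodes(ids[:2])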