In [107]:
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import QueryBundle, NodeWithScore, TextNode
from typing import List
import numpy as np

In [108]:
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

In [109]:
import chromadb
CHROMA_DB_PORT = 8010
def get_chroma_db_client(host="localhost", port=None):
    """Create an HTTP client for a Chroma server.

    Args:
        host: Hostname of the Chroma server. Defaults to ``"localhost"``.
        port: Port of the Chroma server. When ``None`` (the default), the
            module-level ``CHROMA_DB_PORT`` is read at call time.

    Returns:
        The client object returned by ``chromadb.HttpClient``.
    """
    # Resolve the default at call time (not at definition time) so that
    # re-assigning CHROMA_DB_PORT in a later cell still takes effect.
    resolved_port = CHROMA_DB_PORT if port is None else port
    return chromadb.HttpClient(host=host, port=int(resolved_port))

In [110]:
import os
# Resolve the project root only once per kernel session. The root is derived
# from the current working directory, which this cell itself changes, so
# recomputing it on every re-run would climb one more directory each time.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
os.chdir(PROJECT_ROOT)

In [111]:
from src.embedding_client import RemoteEmbedding

In [112]:
# Port used to build the embedding server URL below (host is localhost).
EMBEDDING_SERVER_PORT = 8020
# Embedding client constructed with http://localhost:<EMBEDDING_SERVER_PORT>.
embed_model = RemoteEmbedding(f"http://localhost:{EMBEDDING_SERVER_PORT}")

In [113]:
# Create the Chroma HTTP client through the helper defined earlier.
client = get_chroma_db_client()

In [114]:
# Print every collection known to the server together with its record count.
for i in client.list_collections():
    print(i, i.count())
    # Destructive cleanup, intentionally left disabled:
    #client.delete_collection(i.name)

Collection(name=qna) 130319
Collection(name=context) 19029


In [115]:
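# Destructive and intentionally disabled: `i` is the leftover loop variable from
# the collection-listing cell, so uncommenting this deletes the last collection listed.
#client.delete_collection(i.name)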

In [116]:
# Handle to the 'qna' collection; the cells below read from it.
collection = client.get_collection('qna')

In [117]:
# Fetch two records (documents + metadata only) to see what a stored entry looks like.
results = collection.get(include=["documents", "metadatas"], limit=2)

In [118]:
# Display the two-record sample fetched in the previous cell.
results

{'ids': ['56be85543aeaaa14008c9063', '56be85543aeaaa14008c9065'],
 'embeddings': None,
 'metadatas': [{'title': 'Beyoncé',
   'answer': 'in the late 1990s',
   'database': 'chatbot_ui_v3',
   'collection': 'qna',
   'context_hash': '5566f9c0998385b8a8a2c94aa64aa980'},
  {'answer': 'singing and dancing',
   'context_hash': '5566f9c0998385b8a8a2c94aa64aa980',
   'collection': 'qna',
   'database': 'chatbot_ui_v3',
   'title': 'Beyoncé'}],
 'documents': ['When did Beyonce start becoming popular?',
  'What areas did Beyonce compete in when she was growing up?'],
 'data': None,
 'uris': None,
 'included': ['documents', 'metadatas']}

In [119]:
# Wrap the 'qna' Chroma collection in the LlamaIndex ChromaVectorStore adapter.
vec_store_qna = ChromaVectorStore(chroma_collection = collection)

In [121]:
# Fetch a small preview of stored documents and metadata from Chroma.
# Without `limit`, `get` returns every record in the collection even though
# only the first few are printed below.
PREVIEW_COUNT = 4
results = collection.get(
    include=["metadatas", "documents"],
    limit=PREVIEW_COUNT,
)

# Inspect what's stored. Besides 'ids', 'metadatas' and 'documents', the
# response also carries keys for fields that were not requested.
print(results.keys())

# Example: print text and metadata for each previewed record
for idx, (doc, metadata) in enumerate(zip(results["documents"], results["metadatas"])):
    print(f"Document {idx}: {doc}")
    print(f"Metadata: {metadata}\n")


dict_keys(['ids', 'embeddings', 'metadatas', 'documents', 'data', 'uris', 'included'])
Document 0: When did Beyonce start becoming popular?
Metadata: {'title': 'Beyoncé', 'context_hash': '5566f9c0998385b8a8a2c94aa64aa980', 'database': 'chatbot_ui_v3', 'collection': 'qna', 'answer': 'in the late 1990s'}

Document 1: What areas did Beyonce compete in when she was growing up?
Metadata: {'collection': 'qna', 'title': 'Beyoncé', 'context_hash': '5566f9c0998385b8a8a2c94aa64aa980', 'answer': 'singing and dancing', 'database': 'chatbot_ui_v3'}

Document 2: When did Beyonce leave Destiny's Child and become a solo singer?
Metadata: {'title': 'Beyoncé', 'database': 'chatbot_ui_v3', 'context_hash': '5566f9c0998385b8a8a2c94aa64aa980', 'collection': 'qna', 'answer': '2003'}

Document 3: In what city and state did Beyonce  grow up? 
Metadata: {'title': 'Beyoncé', 'database': 'chatbot_ui_v3', 'context_hash': '5566f9c0998385b8a8a2c94aa64aa980', 'answer': 'Houston, Texas', 'collection': 'qna'}



In [122]:
query = 'Where did beyonce grow up?'
# Embed the query through the public `get_query_embedding` entry point instead
# of the private `_get_text_embedding` hook, which is meant for document text.
# NOTE(review): assumes RemoteEmbedding follows the LlamaIndex BaseEmbedding
# interface (it is passed as `embed_model` to VectorStoreIndex elsewhere) — confirm.
query_embedding = embed_model.get_query_embedding(query)
print(len(query_embedding), type(query_embedding))

384 <class 'list'>


In [None]:
# We can't pass query_embedding to the LlamaIndex vector store directly; it has to be wrapped in a VectorStoreQuery first.

In [123]:
from llama_index.core.vector_stores.types import VectorStoreQuery

In [124]:
# Package the raw embedding into a VectorStoreQuery, asking for the top 3 matches.
store_query = VectorStoreQuery(query_embedding=query_embedding, similarity_top_k=3)

In [125]:
# Run the prepared query directly against the vector store; rebinds `results`.
results  = vec_store_qna.query(store_query)

In [126]:
# Display the raw result object returned by the vector store query.
results

VectorStoreQueryResult(nodes=[TextNode(id_='56bf6b0f3aeaaa14008c9601', embedding=None, metadata={'context_hash': '5566f9c0998385b8a8a2c94aa64aa980', 'collection': 'qna', 'title': 'Beyoncé', 'database': 'chatbot_ui_v3', 'answer': 'Houston, Texas'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text='In what city and state did Beyonce  grow up? ', mimetype='text/plain', start_char_idx=None, end_char_idx=None, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), TextNode(id_='56be85543aeaaa14008c9065', embedding=None, metadata={'answer': 'singing and dancing', 'database': 'chatbot_ui_v3', 'title': 'Beyoncé', 'collection': 'qna', 'context_hash': '5566f9c0998385b8a8a2c94aa64aa980'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text='What areas did Beyonce compete in when she was gr

In [40]:
# `nodes` is optional on the query result, so fall back to an empty list.
nodes = results.nodes or []
for node in nodes:
    print(node.__dict__)
    print(5*'=')

# Show the printable form of the last hit. Guarded so that an empty result no
# longer raises NameError or prints a stale `node` left over from an earlier run.
if nodes:
    print(nodes[-1])

{'id_': '56bf6b0f3aeaaa14008c9601', 'embedding': None, 'metadata': {'answer': 'Houston, Texas', 'context_hash': '5566f9c0998385b8a8a2c94aa64aa980', 'title': 'Beyoncé'}, 'excluded_embed_metadata_keys': [], 'excluded_llm_metadata_keys': [], 'relationships': {}, 'metadata_template': '{key}: {value}', 'metadata_separator': '\n', 'text': 'In what city and state did Beyonce  grow up? ', 'mimetype': 'text/plain', 'start_char_idx': None, 'end_char_idx': None, 'metadata_seperator': '\n', 'text_template': '{metadata_str}\n\n{content}'}
=====
{'id_': '56be85543aeaaa14008c9065', 'embedding': None, 'metadata': {'answer': 'singing and dancing', 'context_hash': '5566f9c0998385b8a8a2c94aa64aa980', 'title': 'Beyoncé'}, 'excluded_embed_metadata_keys': [], 'excluded_llm_metadata_keys': [], 'relationships': {}, 'metadata_template': '{key}: {value}', 'metadata_separator': '\n', 'text': 'What areas did Beyonce compete in when she was growing up?', 'mimetype': 'text/plain', 'start_char_idx': None, 'end_char_

In [20]:
# Show the signature and docstring of the vector store's `query` method.
help(vec_store_qna.query)

Help on method query in module llama_index.vector_stores.chroma.base:

query(query: llama_index.core.vector_stores.types.VectorStoreQuery, **kwargs: Any) -> llama_index.core.vector_stores.types.VectorStoreQueryResult method of llama_index.vector_stores.chroma.base.ChromaVectorStore instance
    Query index for top k most similar nodes.
    
    Args:
        query_embedding (List[float]): query embedding
        similarity_top_k (int): top k most similar nodes



In [42]:
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import VectorStoreIndex

In [45]:
# Show the signature of `VectorStoreIndex.from_vector_store` (note the `embed_model` argument).
help(VectorStoreIndex.from_vector_store)

Help on method from_vector_store in module llama_index.core.indices.vector_store.base:

from_vector_store(vector_store: llama_index.core.vector_stores.types.BasePydanticVectorStore, embed_model: Union[llama_index.core.base.embeddings.base.BaseEmbedding, ForwardRef('LCEmbeddings'), str, NoneType] = None, **kwargs: Any) -> 'VectorStoreIndex' method of abc.ABCMeta instance



In [47]:
# Build a VectorStoreIndex on top of the existing Chroma-backed store, passing our embedding model.
index = VectorStoreIndex.from_vector_store(vec_store_qna, embed_model=embed_model)

In [49]:
# Retriever over the index linked to the ChromaVectorStore, configured for the top 5 matches.
retriever = VectorIndexRetriever(index=index, similarity_top_k=5)

In [55]:
query = 'Where did beyonce grow up?'
# Dump the raw attributes of each retrieved hit, followed by a separator line.
for i in retriever.retrieve(query):
    print(i.__dict__, 5*'=', sep='\n')

{'node': TextNode(id_='56bf6b0f3aeaaa14008c9601', embedding=None, metadata={'answer': 'Houston, Texas', 'title': 'Beyoncé', 'context_hash': '5566f9c0998385b8a8a2c94aa64aa980'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text='In what city and state did Beyonce  grow up? ', mimetype='text/plain', start_char_idx=None, end_char_idx=None, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), 'score': 0.8588517725815735}
=====
{'node': TextNode(id_='56be85543aeaaa14008c9065', embedding=None, metadata={'title': 'Beyoncé', 'context_hash': '5566f9c0998385b8a8a2c94aa64aa980', 'answer': 'singing and dancing'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text='What areas did Beyonce compete in when she was growing up?', mimetype='text/plain', start_char_idx=None, end_char_idx=None, me

In [59]:
from llama_index.vector_stores.chroma import ChromaVectorStore

from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import VectorStoreIndex

def get_retriever(chroma_collection, embed_model, similarity_top_k=3):
    """Build a vector retriever on top of an existing Chroma collection.

    Args:
        chroma_collection: Chroma collection to wrap in a ``ChromaVectorStore``.
        embed_model: Embedding model handed to
            ``VectorStoreIndex.from_vector_store``.
        similarity_top_k: Value passed to the retriever as its
            ``similarity_top_k`` (default 3).

    Returns:
        A ``VectorIndexRetriever`` bound to the index built over the collection.
    """
    store_index = VectorStoreIndex.from_vector_store(
        ChromaVectorStore(chroma_collection=chroma_collection),
        embed_model=embed_model,
    )
    return VectorIndexRetriever(index=store_index, similarity_top_k=similarity_top_k)

In [60]:
# Chroma collections (raw storage handles).
context_collection = client.get_collection('context')
qna_chroma_collection = client.get_collection('qna')

# One retriever per collection, each configured for the top 3 matches.
context_retriever = get_retriever(chroma_collection=context_collection, embed_model=embed_model, similarity_top_k=3)
qna_retriever = get_retriever(chroma_collection=qna_chroma_collection, embed_model=embed_model, similarity_top_k=3)

# Backward-compatible alias: other cells call `qna_collection.retrieve(...)`.
# Despite its name this is the QnA retriever, not a Chroma collection;
# prefer `qna_retriever` in new code.
qna_collection = qna_retriever

In [63]:
query = 'Where did beyonce grow up?'
# Run the same query against both retrievers.
context_node = context_retriever.retrieve(query)
# NOTE: despite its name, `qna_collection` holds the QnA retriever here
# (the name is bound to the result of get_retriever in the previous cell).
qna_nodes = qna_collection.retrieve(query)

In [65]:
# Pool the hits from the context and QnA retrievers into one candidate list.
combined_nodes = context_node + qna_nodes

In [68]:
# Order the pooled hits from highest to lowest score (copy first, then sort in place).
reranked_results = list(combined_nodes)
reranked_results.sort(key=lambda node: node.score, reverse=True)

In [72]:
# For each hit print its readable form, a dashed rule, its raw attribute dict,
# and a closing rule.
for i in reranked_results:
    print(i, 5*'-', i.__dict__, 5*'=', sep='\n')

Node ID: 56bf6b0f3aeaaa14008c9601
Text: In what city and state did Beyonce  grow up?
Score:  0.859

-----
{'node': TextNode(id_='56bf6b0f3aeaaa14008c9601', embedding=None, metadata={'title': 'Beyoncé', 'answer': 'Houston, Texas', 'context_hash': '5566f9c0998385b8a8a2c94aa64aa980'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text='In what city and state did Beyonce  grow up? ', mimetype='text/plain', start_char_idx=None, end_char_idx=None, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), 'score': 0.8588517725815735}
=====
Node ID: 56be85543aeaaa14008c9065
Text: What areas did Beyonce compete in when she was growing up?
Score:  0.728

-----
{'node': TextNode(id_='56be85543aeaaa14008c9065', embedding=None, metadata={'context_hash': '5566f9c0998385b8a8a2c94aa64aa980', 'answer': 'singing and dancing', 'title': 'Beyoncé'}, excluded_embed_metadata_keys=[], excluded_llm_met