In [1]:
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

In [4]:
import sys

# Make the sibling ../src directory importable so the project helpers below
# can be loaded. The membership check keeps the cell idempotent: re-running
# it no longer piles duplicate "../src" entries onto sys.path.
# NOTE: the path is relative, so it resolves against the kernel's working
# directory at import time.
if "../src" not in sys.path:
    sys.path.append("../src")
from rag_build_index_chroma import load_medquad_txt, chunk_documents, build_chroma_index


In [6]:
import torch

# Step 2: load the cleaned MedQuAD Q/A corpus from its text file.
# load_medquad_txt is imported from the project's rag_build_index_chroma
# module; the only thing relied on here is that its result supports len().
txt_path = "../docs/medquad_qa_corpus.txt"
docs = load_medquad_txt(txt_path)
print(f"Loaded {len(docs)} MedQuAD entries")

✅ Loaded 16407 Q/A documents from ../docs/medquad_qa_corpus.txt
Loaded 16407 MedQuAD entries


In [7]:
# Splitter settings handed to chunk_documents. The unit of these numbers
# (characters vs. tokens) is decided inside that helper -- TODO confirm.
chunk_size = 600
chunk_overlap = 80

chunks = chunk_documents(docs, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Report how many chunks came out and preview the first one. The trailing
# "..." is printed unconditionally, even when the chunk is shorter than the
# 400-character preview window.
print(f"Total chunks: {len(chunks)}")
print(f"Chunk sample:\n{'-'*60}\n{chunks[0].page_content[:400]}...")

Splitting..: 100%|██████████| 16407/16407 [00:01<00:00, 11982.28it/s]

✅ Total chunks created: 59334
Total chunks: 59334
Chunk sample:
------------------------------------------------------------
Q: What is (are) Adult Acute Lymphoblastic Leukemia ?...





In [8]:
# Where the Chroma store is written and which embedding model encodes the chunks.
persist_dir = "../chroma_store"
embedding_model = "sentence-transformers/all-mpnet-base-v2"

# Use the GPU when torch reports one, otherwise run the embedding on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# NOTE(review): whether build_chroma_index replaces or appends to an existing
# store in persist_dir is not visible from this notebook; re-running this
# cell may duplicate entries -- confirm against the helper.
vectordb = build_chroma_index(
    chunks,
    persist_directory=persist_dir,
    embedding_model_name=embedding_model,
    device=device,
)

print("Chroma index built and persisted! ")

 Loading embeddings: sentence-transformers/all-mpnet-base-v2
 Embedding on device: cuda
 Building Chroma vector index...
 Chroma store created and saved in: ../chroma_store
Chroma index built and persisted! 


In [13]:
from langchain_chroma import Chroma

def load_chroma_vectorstore(persist_directory="../chroma_store", embedding_model="sentence-transformers/all-mpnet-base-v2", device="cuda"):
    """Open the Chroma store in ``persist_directory`` with a HuggingFace embedding model.

    Args:
        persist_directory: Directory handed to ``Chroma`` as its persistence location.
        embedding_model: Model name passed to ``HuggingFaceEmbeddings``.
            Presumably this must be the same model the index was built
            with -- verify against the build step.
        device: Device string forwarded to the embedding model through
            ``model_kwargs``. A ``"cuda"``/``"cuda:N"`` request is downgraded
            to ``"cpu"`` (with a ``RuntimeWarning``) when torch reports that
            CUDA is unavailable, so the default works on CPU-only machines.

    Returns:
        The ``Chroma`` instance wired to the embedding function.

    NOTE(review): no check is made that ``persist_directory`` already holds an
    index; what ``Chroma`` does with a missing/empty directory should be
    confirmed against its documentation.
    """
    if isinstance(device, str) and device.startswith("cuda"):
        try:
            import torch  # local import: keeps the function usable on its own

            cuda_available = torch.cuda.is_available()
        except ImportError:
            # Cannot probe for CUDA; pass the caller's request through unchanged.
            cuda_available = True
        if not cuda_available:
            import warnings

            warnings.warn(
                f"CUDA device {device!r} requested but not available; using 'cpu' instead.",
                RuntimeWarning,
                stacklevel=2,
            )
            device = "cpu"

    embedding = HuggingFaceEmbeddings(model_name=embedding_model, model_kwargs={"device": device})
    return Chroma(persist_directory=persist_directory, embedding_function=embedding)

# Reopen the persisted store with the same settings used above and wrap it
# as a retriever that is asked for k=3 results per query.
retriever = load_chroma_vectorstore(
    persist_directory=persist_dir,
    embedding_model=embedding_model,
    device=device,
).as_retriever(search_kwargs={"k": 3})

# Smoke-test retrieval with a single example question.
query = "What are the symptoms of liver cancer?"
results = retriever.invoke(query)

# Show up to the first 1000 characters of each hit; the "..." suffix is
# printed regardless of whether the content was actually cut.
for i, doc in enumerate(results, start=1):
    print(f"\nüîπ Result {i}:\n{'-'*60}\n{doc.page_content[:1000]}...\n")



üîπ Result 1:
------------------------------------------------------------
Q: What are the symptoms of Adult Primary Liver Cancer ?...


üîπ Result 2:
------------------------------------------------------------
Q: What are the symptoms of Adult Primary Liver Cancer ?...


üîπ Result 3:
------------------------------------------------------------
Q: What are the symptoms of Hepatoblastoma ?...

