In [None]:
import os
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

from dotenv import load_dotenv

load_dotenv()

# ---------------------------------------------------------
# Resolve project-root-level Chroma path
# ---------------------------------------------------------
try:
    # Works when running as a .py script
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # Fallback for notebooks / interactive mode
    BASE_DIR = os.getcwd()

PROJECT_ROOT = os.path.dirname(BASE_DIR)                     # go one level up
CHROMA_PATH = os.path.join(PROJECT_ROOT, "chroma_db")         # <project_root>/chroma_db
COLLECTION_NAME = "global_rag_collection"

# ---------------------------------------------------------
# Initialize Chroma client and embeddings
# ---------------------------------------------------------
client = chromadb.PersistentClient(path=CHROMA_PATH)

openai_emb = OpenAIEmbeddingFunction(
    api_key=os.getenv("OPENAI_API_KEY"),
    model_name="text-embedding-3-small"
)
# ---------------------------------------------------------
# Indexing function
# ---------------------------------------------------------
def index_chunks(chunks, source_name, job_id):
    collection = client.get_or_create_collection(name=COLLECTION_NAME, embedding_function=openai_emb)
    ids = [f"{job_id}_{i}" for i in range(len(chunks))]
    metas = [
        {"source": source_name, "job_id": job_id, "chunk_id": i}
        for i in range(len(chunks))
    ]
    collection.add(documents=chunks, ids=ids, metadatas=metas)
    print(f"✅ Indexed {len(chunks)} chunks in '{COLLECTION_NAME}' at {CHROMA_PATH}")
    return len(chunks)

# ---------------------------------------------------------
# Test block
# ---------------------------------------------------------
if __name__ == "__main__":
    sample_chunks = [
        "This is the first sample chunk.",
        "This is the second sample chunk."
    ]
    index_chunks(sample_chunks, source_name="test_source", job_id="job123")

    # Verify
    coll = client.get_collection(name=COLLECTION_NAME)
    print(f"📦 Total docs in collection: {coll.count()}")
    print("🔎 Peek at stored docs:", coll.peek())


✅ Indexed 2 chunks in 'global_rag_collection' at c:\Users\aniln\Desktop\github_celery_redis\realtime-rag-pipeline\chroma_db
📦 Total docs in collection: 2
🔎 Peek at stored docs: {'ids': ['job123_0', 'job123_1'], 'embeddings': array([[ 0.0149151 , -0.01345615,  0.00756176, ..., -0.02635218,
        -0.01427681, -0.02069877],
       [ 0.01427765, -0.01042295, -0.015034  , ..., -0.02466744,
        -0.01419804, -0.00069746]], shape=(2, 1536)), 'documents': ['This is the first sample chunk.', 'This is the second sample chunk.'], 'uris': None, 'included': ['metadatas', 'documents', 'embeddings'], 'data': None, 'metadatas': [{'source': 'test_source', 'chunk_id': 0, 'job_id': 'job123'}, {'source': 'test_source', 'chunk_id': 1, 'job_id': 'job123'}]}


In [3]:
import os
from dotenv import load_dotenv

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

# ---------------------------------------------------------
# Load environment variables
# ---------------------------------------------------------
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# ---------------------------------------------------------
# Resolve project-root-level paths
# ---------------------------------------------------------
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    BASE_DIR = os.getcwd()

PROJECT_ROOT = os.path.dirname(BASE_DIR)
VECTOR_STORE_PATH = os.path.join(PROJECT_ROOT, "chroma_db")
COLLECTION_NAME = "global_rag_collection"

# ---------------------------------------------------------
# Initialize LangChain embeddings
# ---------------------------------------------------------
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# ---------------------------------------------------------
# Initialize Chroma vector store
# ---------------------------------------------------------
vector_store = Chroma(
    persist_directory=VECTOR_STORE_PATH,
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME
)

# ---------------------------------------------------------
# Index chunks function
# ---------------------------------------------------------
def index_chunks(chunks, source_name, job_id):
    ids = [f"{job_id}_{i}" for i in range(len(chunks))]
    metadatas = [{"source": source_name, "job_id": job_id, "chunk_id": i} for i in range(len(chunks))]
    
    vector_store.add_texts(
        texts=chunks,
        metadatas=metadatas,
        ids=ids
    )
    vector_store.persist()
    print(f"✅ Indexed {len(chunks)} chunks to '{COLLECTION_NAME}' at {VECTOR_STORE_PATH}")
    return len(chunks)

# ---------------------------------------------------------
# Retriever
# ---------------------------------------------------------
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# ---------------------------------------------------------
# Prompt template
# ---------------------------------------------------------
system_prompt = (
    "Use the given context to answer the question. "
    "If you don't know the answer, say you don't know. "
    "Use three sentences maximum and keep the answer concise.\n"
    "Context: {context}"
)

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}")
])

# ---------------------------------------------------------
# LLM
# ---------------------------------------------------------
llm = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)

# ---------------------------------------------------------
# Chains
# ---------------------------------------------------------
question_answer_chain = create_stuff_documents_chain(llm, prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)

# ---------------------------------------------------------
# Ask question function
# ---------------------------------------------------------
def ask_question(query: str) -> str:
    result = chain.invoke({"input": query})
    print("*" * 100)
    print("result:", result)
    return result.get("answer", "No answer found.")

# ---------------------------------------------------------
# Test: index and query
# ---------------------------------------------------------
if __name__ == "__main__":
    # Sample chunks to index
    sample_chunks = [
        "Graph RAG combines retrieval augmented generation with knowledge graphs.",
        "LangChain simplifies building RAG pipelines using Chroma or other vector stores.",
        "Chroma DB stores embeddings persistently so queries can be fast and scalable."
    ]
    
    index_chunks(sample_chunks, source_name="sample_doc", job_id="job1")
    
    # Ask question
    query = "What is Graph RAG?"
    answer = ask_question(query)
    print("Answer:", answer)
# 

  embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
  vector_store.persist()


✅ Indexed 3 chunks to 'global_rag_collection' at c:\Users\aniln\Desktop\github_celery_redis\realtime-rag-pipeline\chroma_db
****************************************************************************************************
result: {'input': 'What is Graph RAG?', 'context': [Document(metadata={'chunk_id': 0, 'source': 'sample_doc', 'job_id': 'job1'}, page_content='Graph RAG combines retrieval augmented generation with knowledge graphs.'), Document(metadata={'job_id': 'job1', 'chunk_id': 1, 'source': 'sample_doc'}, page_content='LangChain simplifies building RAG pipelines using Chroma or other vector stores.'), Document(metadata={'source': 'sample_doc', 'job_id': 'job1', 'chunk_id': 2}, page_content='Chroma DB stores embeddings persistently so queries can be fast and scalable.')], 'answer': 'Graph RAG combines retrieval augmented generation with knowledge graphs to enhance the process of information retrieval and generation. It leverages knowledge graphs to improve the quality and rele

In [1]:
from indexing import index_chunks
chunks = ["Graph RAG combines retrieval with knowledge graphs.", "LangChain simplifies RAG pipelines."]
index_chunks(chunks, source_name="test_doc", job_id="manual_test")


  embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
  vector_store = Chroma(


✅ Indexed 2 chunks to 'global_rag_collection' at c:\Users\aniln\Desktop\github_celery_redis\realtime-rag-pipeline\chroma_db


  vector_store.persist()


2

In [None]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
import os

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
VECTOR_STORE_PATH = os.path.join(os.getcwd(), "chroma_db")

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Create / load Chroma store without specifying collection_name
vs = Chroma(
    persist_directory=VECTOR_STORE_PATH,
    embedding_function=embeddings
)

# Add texts
chunks = [
    "Graph RAG combines retrieval with knowledge graphs.",
    "LangChain simplifies RAG pipelines."
]
vs.add_texts(texts=chunks)
# Persist (optional, automatic in latest versions)
vs.persist()

# Reload in another instance
vs_reload = Chroma(
    persist_directory=VECTOR_STORE_PATH,
    embedding_function=embeddings
)

print("Total docs after reload:", vs_reload._collection.count())


✅ Indexed 2 chunks to 'global_rag_collection' at c:\Users\aniln\Desktop\github_celery_redis\realtime-rag-pipeline\chroma_db
Total docs after indexing: 0


In [None]:
import os
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

load_dotenv()

# ---------------------------------------------------------
# Paths
# ---------------------------------------------------------
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    BASE_DIR = os.getcwd()

PROJECT_ROOT = os.path.dirname(BASE_DIR)
VECTOR_STORE_PATH = os.path.join(PROJECT_ROOT, "chroma_db", "39359794-2f90-4588-a5b4-6e7ac5f24012")

# ---------------------------------------------------------
# OpenAI embeddings
# ---------------------------------------------------------
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# ---------------------------------------------------------
# Initialize Chroma vector store (default collection)
# ---------------------------------------------------------
vector_store = Chroma(
    persist_directory=VECTOR_STORE_PATH,
    embedding_function=embeddings
)

# ---------------------------------------------------------
# Index chunks function
# ---------------------------------------------------------
def index_chunks(chunks, source_name, job_id):
    """
    Index a list of text chunks into Chroma.
    Each chunk is stored with metadata: source_name, job_id, chunk_id
    """
    ids = [f"{job_id}_{i}" for i in range(len(chunks))]
    metadatas = [
        {"source": source_name, "job_id": job_id, "chunk_id": i}
        for i in range(len(chunks))
    ]

    vector_store.add_texts(texts=chunks, metadatas=metadatas, ids=ids)
    print(f"✅ Indexed {len(chunks)} chunks at {VECTOR_STORE_PATH}")
    return len(chunks)


# ------------------------
# Quick test
# ------------------------
if __name__ == "__main__":
    sample_chunks = [
        "Graph RAG combines retrieval with knowledge graphs.",
        "LangChain simplifies RAG pipelines."
    ]
    index_chunks(sample_chunks, source_name="test_doc1", job_id="manual_test")


  vector_store = Chroma(


NameError: name 'chunks' is not defined

In [6]:
vs.persist()

# Reload in another instance
vs_reload = Chroma(
    persist_directory=VECTOR_STORE_PATH,
    embedding_function=embeddings
)

print("Total docs after reload:", vs_reload._collection.count())


Total docs after reload: 2


  vs.persist()


In [9]:
import os
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

load_dotenv()

# ---------------------------------------------------------
# Paths
# ---------------------------------------------------------
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    BASE_DIR = os.getcwd()

PROJECT_ROOT = os.path.dirname(BASE_DIR)
VECTOR_STORE_PATH = os.path.join(PROJECT_ROOT, "chroma_db")

# ---------------------------------------------------------
# Embeddings
# ---------------------------------------------------------
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# ---------------------------------------------------------
# Load Chroma vector store (default collection)
# ---------------------------------------------------------
vector_store = Chroma(
    persist_directory=VECTOR_STORE_PATH,
    embedding_function=embeddings
)

retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# ---------------------------------------------------------
# LLM and prompt
# ---------------------------------------------------------
system_prompt = (
    "You are an assistant that answers questions **only** using the provided context. "
    "Do not make up answers. If the answer is not in the context, reply exactly: 'I don't know.' "
    "Use a maximum of three sentences.\n"
    "Context: {context}"
)

prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", "{input}")])

llm = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)

qa_chain = create_stuff_documents_chain(llm, prompt)
chain = create_retrieval_chain(retriever, qa_chain)

# ---------------------------------------------------------
# Ask question function
# ---------------------------------------------------------
def ask_question(query: str) -> str:
    result = chain.invoke({"input": query})
    return result.get("answer", "No answer found.")


# ------------------------
# Quick test
# ------------------------
if __name__ == "__main__":
    query = "What is Grpah RAG?"
    print("Question:", query)
    print("Answer:", ask_question(query))


Question: What is Grpah RAG?
Answer: Graph RAG combines retrieval with knowledge graphs.


In [5]:
import os
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

load_dotenv()

# Path to your Chroma DB
VECTOR_STORE_PATH = "./chroma_db"

# Embeddings
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

VECTOR_STORE_COLLECTION = "default_collection"

vector_store = Chroma(
    persist_directory=VECTOR_STORE_PATH,
    collection_name=VECTOR_STORE_COLLECTION,
    embedding_function=embeddings,
)

# Get total number of documents
total_docs = vector_store._collection.count()
print(f"Total documents in Chroma DB: {total_docs}")

# Optional: Inspect first few documents
results = vector_store._collection.get(include=["documents", "metadatas"])
for doc, meta in zip(results["documents"], results["metadatas"]):
    print(meta, doc[:100])  # print first 100 chars


Total documents in Chroma DB: 0


In [2]:
print(os.getcwd())

c:\Users\aniln\Desktop\github_celery_redis\realtime-rag-pipeline\backend


In [None]:
import getpass
import os
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
import chromadb
from uuid import uuid4


if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")


embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",
)

client = chromadb.PersistentClient(path="./chroma_langchain_db")

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
    id=2,
)
\

documents = [
    document_1,
    document_2,
]
uuids = [str(uuid4()) for _ in range(len(documents))]

vector_store.add_documents(documents=documents, ids=uuids)