<a href="https://colab.research.google.com/github/Yugsolanki/chromadb_haystack_rag/blob/main/ChromaDB_Haystack_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install chromadb chroma-haystack -q

In [2]:
# Download the text-document-classification dataset archive with the Kaggle CLI.
# NOTE(review): assumes Kaggle API credentials are already configured in this runtime — confirm.
!kaggle datasets download jensenbaxter/10dataset-text-document-classification

Dataset URL: https://www.kaggle.com/datasets/jensenbaxter/10dataset-text-document-classification
License(s): DbCL-1.0
Downloading 10dataset-text-document-classification.zip to /content
 75% 1.00M/1.33M [00:00<00:00, 2.96MB/s]
100% 1.33M/1.33M [00:00<00:00, 3.90MB/s]


In [None]:
!unzip /content/10dataset-text-document-classification.zip -d /content/data

# Writing Documents to ChromaDocumentStore

In [4]:
import os
from pathlib import Path

from haystack import Pipeline
from haystack.components.converters import TextFileToDocument
from haystack.components.writers import DocumentWriter
from haystack_integrations.document_stores.chroma import ChromaDocumentStore



In [5]:
import os
from pathlib import Path

# Recursively gather every file below the "data" directory as a Path object,
# in the order os.walk yields them.
file_paths = [
    Path(folder) / filename
    for folder, _subdirs, filenames in os.walk("data")
    for filename in filenames
]

In [6]:
# Sanity check: show the first five collected paths.
file_paths[:5]

[PosixPath('data/business/business_78.txt'),
 PosixPath('data/business/business_35.txt'),
 PosixPath('data/business/business_55.txt'),
 PosixPath('data/business/business_71.txt'),
 PosixPath('data/business/business_3.txt')]

In [7]:
# Create the Chroma-backed document store with default settings (no arguments are passed).
document_store = ChromaDocumentStore()

In [8]:
# Indexing pipeline: a converter that turns text files into Documents,
# followed by a writer that stores them in `document_store`.
converter = TextFileToDocument()
writer = DocumentWriter(document_store)

indexing = Pipeline()
indexing.add_component("converter", converter)
indexing.add_component("writer", writer)

In [9]:
# Wire the converter's output into the writer.
indexing.connect("converter", "writer")

# Feed every collected path to the converter; the run result (shown below the cell)
# reports how many documents the writer stored.
indexing_inputs = {"converter": {"sources": file_paths}}
indexing.run(indexing_inputs)

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:07<00:00, 11.5MiB/s]


{'writer': {'documents_written': 1000}}

# Build RAG on top of Chroma

In [10]:
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
from haystack.components.generators import HuggingFaceLocalGenerator
from haystack.components.builders import PromptBuilder

In [11]:
# Jinja-style prompt template for the generator.
# Template variables: `documents` (each item's `.content` is inlined as context)
# and `query` (the user question).
prompt = """
Answer the query based on the provided context.
If the context does not contain the answer, say 'Answer not found'.
Context:
{% for doc in documents %}
  {{ doc.content }}
{% endfor %}
query: {{query}}
Answer:
"""

In [12]:
# Wrap the template in a PromptBuilder component so it can be added to the query pipeline.
prompt_builder = PromptBuilder(template=prompt)

In [13]:
# Hugging Face model id used by the local generator.
LLM_MODEL_NAME = "bigscience/bloomz-560m"

# Retriever that queries the Chroma document store populated above.
retriever = ChromaQueryTextRetriever(document_store)

# Local text generator; warm_up() is called up front so the model is loaded
# (the download progress bars appear in this cell's output).
llm = HuggingFaceLocalGenerator(model=LLM_MODEL_NAME)
llm.warm_up()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [14]:
# Query pipeline: retriever -> prompt builder -> generator.
# Components are registered in this order under the names used by the connect/run cells.
querying = Pipeline()
query_components = {
    "retriever": retriever,
    "prompt_builder": prompt_builder,
    "llm": llm,
}
for component_name, component in query_components.items():
    querying.add_component(component_name, component)

In [15]:
# Retrieved documents fill the template's `documents` variable...
querying.connect("retriever.documents", "prompt_builder.documents")
# ...and the rendered prompt is handed to the generator.
# The last expression's value (the pipeline) is displayed as the cell output.
querying.connect("prompt_builder.prompt", "llm.prompt")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7e87cc855420>
🚅 Components
  - retriever: ChromaQueryTextRetriever
  - prompt_builder: PromptBuilder
  - llm: HuggingFaceLocalGenerator
🛤️ Connections
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)

In [19]:
# Example question to send through the retrieval + generation pipeline.
query = "Was the sixth and final Star Wars movie suitable for childrens if not then tell why it was not?"

In [20]:
# The same question goes to the retriever (to fetch the top 3 documents)
# and to the prompt builder (to fill the `query` slot of the template).
rag_inputs = {
    "retriever": {"query": query, "top_k": 3},
    "prompt_builder": {"query": query},
}
results = querying.run(rag_inputs)

In [21]:
# Display the pipeline output (the generated replies are under the "llm" key).
results

{'llm': {'replies': [' not for children']}}

In [22]:
# Second example: a question from a different topic, run through the same pipeline.
query = "How to make Mexican BBQ Chicken"

# Retriever and prompt builder both receive the question; the retriever returns 3 documents.
rag_inputs = {
    "retriever": {"query": query, "top_k": 3},
    "prompt_builder": {"query": query},
}
results = querying.run(rag_inputs)

print(results)

{'llm': {'replies': [' marinade ingredients']}}
