In [1]:
# pip install chromadb langchain sentence-transformers
import os

# Opt out of Chroma's anonymized usage telemetry; set here, ahead of the
# cell that creates the Chroma vector store.
# The original flag is kept for backward compatibility, but the variable that
# Chroma's telemetry documentation describes is ANONYMIZED_TELEMETRY, so set
# that one as well.
os.environ["CHROMA_TELEMETRY"] = "0"
os.environ["ANONYMIZED_TELEMETRY"] = "False"

import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from sentence_transformers import SentenceTransformer

# Data Ingestion and Storage

This notebook implements the core data ingestion and chunking pipeline for building a Retrieval-Augmented Generation (RAG) system on the Rockfish platform. Key steps include:

**Loading documents** (e.g., from text files)  
**Chunking** them into smaller passages for better searchability  
**Generating embeddings** using `sentence-transformers/all-MiniLM-L6-v2`  
**Storing** these chunks in a Chroma vector database (or other vector DB)  
**Validating** that embeddings and chunks were successfully stored

This sets up the **retrieval layer** of your RAG pipeline—ready for LLM integration later!

In [2]:
# Load data
# Reads every *.txt file found directly inside DATA_FOLDER with TextLoader and
# accumulates the resulting documents in `docs`.
DATA_FOLDER = 'Data'
docs = []
# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and therefore the chunk order downstream) the same on every run.
for file_name in sorted(os.listdir(DATA_FOLDER)):
    if file_name.endswith('.txt'):
        # Explicit encoding so decoding does not depend on the platform's
        # locale default. NOTE(review): assumes the corpus is UTF-8 - confirm.
        loader = TextLoader(os.path.join(DATA_FOLDER, file_name), encoding="utf-8")
        loaded_docs = loader.load()
        docs.extend(loaded_docs)
print(f"Loaded {len(docs)} documents")

Loaded 88 documents


In [3]:
# Split into chunks
# Chunk size and overlap handed to the splitter, named so they are easy to tune.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
# Every loaded document is split; the pieces are collected in `chunks`.
chunks = text_splitter.split_documents(docs)
print(f"Generated {len(chunks)} chunks")

Generated 1612 chunks


In [4]:
# Initialize Chroma vectorstore
# The store lives on disk in `persist_directory`; embeddings come from the
# all-MiniLM-L6-v2 sentence-transformers model.
persist_directory = "chroma_db"
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Re-running Chroma.from_documents() against an existing persist directory adds
# every chunk again, so the index would grow with duplicates on each re-run.
# Reuse a store that is already on disk and only ingest when there is none.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    chroma_db = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding_model,
    )
    print(
        f"Reusing existing Chroma DB in '{persist_directory}' "
        "(delete that folder to rebuild the index)"
    )
else:
    chroma_db = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_model,
        persist_directory=persist_directory
    )
    chroma_db.persist()
    print("Chunks stored in Chroma DB")

python(99373) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Chunks stored in Chroma DB


  warn_deprecated(


In [5]:
# Query / retrieval function
def retrieve_top_k_chunks(question: str, top_k: int = 5) -> list:
    """Return the `top_k` stored chunks most similar to `question`.

    Args:
        question: Natural-language query to search the vector store with.
        top_k: Number of chunks to return; must be at least 1.

    Returns:
        The list of documents returned by the module-level `chroma_db` store.

    Raises:
        ValueError: If `top_k` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k}")
    # Query the store directly rather than building a throwaway retriever on
    # every call and going through the deprecated get_relevant_documents().
    return chroma_db.similarity_search(question, k=top_k)

# Example query
# Run one sample question through the retrieval helper and print each hit,
# numbered from 1, followed by its text.
question = "How does Rockfish integrate with existing data pipelines?"
top_chunks = retrieve_top_k_chunks(question, top_k=5)

print("\nTop Chunks:")
for rank, chunk in enumerate(top_chunks, start=1):
    print(f"\n Chunk {rank}:\n{chunk.page_content}")

  warn_deprecated(



Top Chunks:

 Chunk 1:
(Optional) Rockfish Integration

Follow this tutorial to understand how to integrate the Rockfish platform into your pipeline.

Load a dataset

 Chunk 2:
Now that you have followed our tutorial to understand the basics of how to integrate Rockfish in your Ops pipeline, lets focus on your use cases for Synthetic data.

 Chunk 3:
Now that you have followed our tutorial to understand the basics of how to integrate Rockfish in your Ops pipeline, lets focus on your use cases for Synthetic data.
Follow these use case tutorials to familiarize yourself with how to use Rockfish platform to solve your use case.
These examples provide a starting point for common use cases which you can modify to suit your specific needs.

 Chunk 4:
Now that you have followed our tutorial to understand the basics of how to integrate Rockfish in your Ops pipeline, lets focus on your use cases for Synthetic data.
Follow these use case tutorials to familiarize yourself with how to use Rockfish