In [None]:
!pip install -U -q tqdm json_repair ddgs requests beautifulsoup4 SQLAlchemy wikipedia-api

In [None]:
# Hugging Face + datasets + transformers
!pip install -U -q datasets transformers accelerate

# LangChain + community integrations
!pip install -U -q langchain langchain-community chromadb

# Embeddings dependencies
!pip install -U -q sentence-transformers

!pip install -U -q langchain-text-splitters

In [None]:
import requests
import zipfile
import io
import sys
import os
import shutil

# 1. The URL for the source code ZIP
url = "https://github.com/anpc849/kagentic/archive/refs/heads/main.zip"

# The archive unpacks into 'kagentic-main'; that folder is renamed to
# 'kagentic' so the directory itself can be imported under that name.
extracted_dir = os.path.abspath("kagentic-main")
package_dir = os.path.abspath("kagentic")

if os.path.isdir(package_dir):
    # Re-running the cell: shutil.move() onto an existing directory would nest
    # 'kagentic-main' *inside* 'kagentic', so skip the download entirely.
    print("kagentic is already present; skipping download.")
else:
    # 2. Download the file using requests (with a timeout so the cell cannot hang)
    print("Downloading...")
    response = requests.get(url, timeout=60)
    if response.status_code != 200:
        # Stop here instead of failing later in shutil.move() with a confusing error.
        raise RuntimeError(f"Failed to download. Status code: {response.status_code}")

    # 3. Unzip the content directly from memory
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        z.extractall(".")
    print("Unzipped successfully!")

    shutil.move(extracted_dir, package_dir)

# 4. Add the folder that contains 'kagentic' to the Python path.
# This is the directory the archive was extracted into (the current working
# directory), not a hard-coded location.
module_path = os.path.dirname(package_dir)
if module_path not in sys.path:
    sys.path.append(module_path)

# 5. Verify the import
try:
    import kagentic
    print("Import successful! You can now use kagentic.")
except ImportError as exc:
    print("Import failed. Try checking !ls kagentic to see the folder structure.")
    print(f"  Reason: {exc}")

In [1]:
import os
from tqdm import tqdm

import kaggle_benchmarks as kbench

from datasets import load_dataset
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from transformers import AutoTokenizer

from kagentic import CodeAgent, Document, Tool, ToolInput

In [2]:
# ---------------------------------------------------------------------------
# 1. Load source documents from HuggingFace Hub
# ---------------------------------------------------------------------------
knowledge_base = load_dataset("m-ric/huggingface_doc", split="train")

# Number of dataset records to index. Only this many leading records end up in
# the knowledge base; everything else in the dataset is ignored.
NUM_SOURCE_DOCS = 5

source_docs = [
    Document(
        page_content=doc["text"],
        # Keep only the second "/"-separated segment of the source path.
        metadata={"source": doc["source"].split("/")[1]},
    )
    # min() keeps select() in range if the dataset has fewer records than requested.
    for doc in knowledge_base.select(range(min(NUM_SOURCE_DOCS, len(knowledge_base))))
]

# ---------------------------------------------------------------------------
# 2. Split documents (deduplicated)
# ---------------------------------------------------------------------------
print("Splitting documents...")

# Chunk sizes are measured in tokens of the tokenizer loaded here.
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    AutoTokenizer.from_pretrained("thenlper/gte-small"),
    chunk_size=200,
    chunk_overlap=20,
    add_start_index=True,
    strip_whitespace=True,
    separators=["\n\n", "\n", ".", " ", ""],
)

docs_processed = []
unique_texts = set()  # chunk texts already kept, for exact-duplicate removal
for doc in tqdm(source_docs):
    # source_docs are kagentic.Document; text_splitter expects objects with .page_content
    new_docs = text_splitter.split_documents([doc])
    for new_doc in new_docs:
        if new_doc.page_content not in unique_texts:
            unique_texts.add(new_doc.page_content)
            docs_processed.append(new_doc)

print(f"  → {len(docs_processed)} unique chunks produced.")



Splitting documents...


100%|██████████| 5/5 [00:00<00:00, 36.35it/s]

  → 70 unique chunks produced.





In [3]:
# ---------------------------------------------------------------------------
# 3. Embed + store in ChromaDB
# ---------------------------------------------------------------------------
db_path = "./chroma_db"

print("Preparing vector store...")

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Reuse an existing on-disk store when one is present.
# NOTE: the store is NOT rebuilt when `docs_processed` changes; delete the
# `db_path` directory to force re-embedding.
# isdir() (rather than exists()) avoids os.listdir() raising if the path is a file.
if os.path.isdir(db_path) and os.listdir(db_path):
    print("Loading existing ChromaDB...")
    vector_store = Chroma(
        persist_directory=db_path,
        embedding_function=embeddings,
    )
else:
    print("Embedding documents... (may take a few minutes)")
    # With `persist_directory` set, Chroma >= 0.4 writes to disk automatically,
    # so the deprecated manual `persist()` call is not needed.
    vector_store = Chroma.from_documents(
        docs_processed,
        embedding=embeddings,
        persist_directory=db_path,
    )

print("  → Vector store ready.")

Preparing vector store...


  embeddings = HuggingFaceEmbeddings(


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading existing ChromaDB...


  vector_store = Chroma(


  → Vector store ready.


In [4]:
# ---------------------------------------------------------------------------
# 4. Define RetrieverTool
# ---------------------------------------------------------------------------
class RetrieverTool(Tool):
    """
    Tool that runs a similarity search against a vector store.

    The store is supplied at construction time; each call returns the text of
    the `topk` best-matching chunks for `query`, formatted as one string.
    """

    # --- Tool metadata --------------------------------------------------------
    name = "retriever"
    description = (
        "Uses semantic search to retrieve the parts of documentation that could be "
        "most relevant to answer your query."
    )
    inputs = {
        # Free-text search string.
        "query": ToolInput(
            type="string",
            description=(
                "The query to perform. This should be semantically close to your target "
                "documents. Use the affirmative form rather than a question."
            ),
            required=True,
        ),
        # How many chunks to return.
        "topk": ToolInput(
            type="integer",
            description="The number of documents to retrieve.",
            required=True,
        ),
    }
    output_type = "string"

    def __init__(self, vector_store, **kwargs):
        """Initialise the base tool with `kwargs`, then keep a reference to `vector_store`."""
        super().__init__(**kwargs)
        self.vector_store = vector_store

    def forward(self, query: str, topk: int) -> str:
        """
        Search the vector store and render the hits as text.

        Args:
            query: Search string; must be a `str`.
            topk: Passed to `similarity_search` as `k`.

        Returns:
            A header line followed by one numbered section (starting at 0) per
            hit, each containing that hit's `page_content`.

        Raises:
            AssertionError: If `query` is not a string.
        """
        assert isinstance(query, str), "Your search query must be a string"

        hits = self.vector_store.similarity_search(query, k=topk)

        sections = []
        for index, hit in enumerate(hits):
            sections.append(f"\n\n===== Document {index} =====\n{hit.page_content}")

        return "\nRetrieved documents:\n" + "".join(sections)

In [5]:
@kbench.task(
    name="rag_retrieval_answering",
    description="Evaluate whether the agent uses a retrieval tool to obtain context and answer a question."
)
def rag_retrieval_answering(llm):
    """
    Benchmark task: the agent must abstain when retrieval yields nothing useful.

    Builds a `CodeAgent` whose only tool is a `RetrieverTool` over the
    module-level `vector_store`, asks it a fixed question, and asserts that the
    final answer contains the abstention phrase "I dont know".

    Args:
        llm: Model handle forwarded to `CodeAgent` as `model`.
    """
    retriever_tool = RetrieverTool(vector_store)

    agent = CodeAgent(
        tools=[retriever_tool],
        model=llm,
        max_steps=10,
        verbosity_level=2,
        additional_instructions=(
            "Your answer must be based only on the knowledge base. "
            "If there is no useful information, respond exactly with: 'I dont know.', no yapping."
        ),
    )

    question = "How can I push a model to the Hub?"
    agent_output = agent.run(question)

    # Normalise the result to text before the substring check: a None or other
    # non-string result would otherwise make `in` raise TypeError, instead of
    # being recorded as a failed assertion.
    answer_text = "" if agent_output is None else str(agent_output)

    # Assert the agent correctly abstains
    kbench.assertions.assert_true(
        "I dont know" in answer_text,
        expectation="The agent should abstain and respond 'I dont know' when retrieval has no relevant information."
    )

In [6]:
# Run the task defined above, passing `kbench.llm` as its argument.
# NOTE(review): presumably `.run` forwards this value as the task's `llm`
# parameter, which the task hands to CodeAgent as `model` — confirm against
# the kaggle_benchmarks API.
rag_retrieval_answering.run(kbench.llm)

[agent] 
[agent] 🤖 kagentic starting — model: google/gemini-2.5-flash
[agent] 📋 Task: How can I push a model to the Hub?


[agent] --- Step 1 ---
[agent]   🎯 Action:  retriever({"query": "push model to Hub", "topk": 3})
[agent]   🔧 Calling tool: retriever({"query": "push model to Hub", "topk": 3})
[agent]   📤 Tool result: 
Retrieved documents:


===== Document 0 =====
</Tip>

### How do I actually train my model on TPU?

Once your training is XLA-compatible and (if you’re using TPU Node / Colab) your dataset has been p...

[agent] --- Step 2 ---
[agent]   💭 Thought: The retrieved documents do not contain any information about pushing a model to the Hub. They are all related to TPU training and inference endpoints. Therefore, I cannot answer the question based on the provided knowledge base.
[agent]   🎯 Action:  final_answer({"answer": "I dont know."})
[agent] 
✅ Final answer after 2 steps.


BokehModel(combine_events=True, render_bundle={'docs_json': {'9e1f3af4-24bc-4fa8-9751-081e7d6f236d': {'version…