In [1]:
#%pip install openai
#%pip install chromadb

In [1]:
from __future__ import annotations
from dataclasses import dataclass, field, asdict
from typing import List, Dict, Any, Optional, Tuple
from time import time
import json
import hashlib
import os

In [2]:
# Make the folder that contains hybrid_rk_merged.py the current working directory.
# The path below is a placeholder and must be replaced with a real directory before running.
# NOTE(review): presumably this is what lets the later `from hybrid_rk_merged import ...`
# cell resolve the module, and relative paths used later (e.g. the default ".chroma"
# persist directory) would then be resolved against this folder — confirm.
os.chdir(r"<folder where hybrid_rk_merged.py is >")

In [3]:
# Never paste a real API key into the notebook: it would be saved with the file and
# leak through version control or sharing. Keep a key that is already exported in the
# environment (the previous code overwrote it with a placeholder), and otherwise ask
# for it interactively without echoing it.
if not os.environ.get("OPENAI_API_KEY"):
    import getpass  # local import: only needed when the key has to be prompted for

    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [4]:
from hybrid_rk_merged import OpenAIResponsesAdapter, RAgent, KAgent, HybridController, OpenAIEmbedder, ChromaRetriever , OpenAIChatAdapter

In [5]:

# ==========================================
# ==== Simple In-Memory KB + Retriever   ===
# ==========================================

class InMemoryKB:
    """A toy in-memory knowledge base ranked by naive token overlap.

    Replace with your RAG/vector DB/web search.

    Each row is a dict; the ``"text"`` and ``"meta"`` entries (both optional)
    are the only fields consulted by :meth:`search`.
    """

    def __init__(self, rows: List[Dict[str, Any]]):
        """Store ``rows`` by reference (no copy is made)."""
        self.rows = rows

    @staticmethod
    def _as_text(value: Any) -> str:
        """Flatten a row field into a plain string suitable for tokenising.

        ``None`` becomes an empty string, a dict contributes its values, a
        list/tuple contributes its items, anything else goes through ``str``.
        """
        if value is None:
            return ""
        if isinstance(value, dict):
            return " ".join(str(v) for v in value.values())
        if isinstance(value, (list, tuple)):
            return " ".join(str(v) for v in value)
        return str(value)

    def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
        """Return up to ``k`` rows sharing at least one token with ``query``.

        Tokens are lower-cased, whitespace-separated words. A row's score is
        the number of distinct query tokens found in its text + meta. Rows are
        returned by descending score; ties keep their order in ``self.rows``.

        Args:
            query: Free-text query.
            k: Maximum number of rows to return; values <= 0 yield ``[]``.

        Returns:
            The matching row dicts (the same objects held in ``self.rows``).
        """
        # Guard explicitly: a negative k would otherwise slice as scored[:-1].
        if k <= 0:
            return []

        q_tokens = set(query.lower().split())
        if not q_tokens:
            return []

        scored = []
        for r in self.rows:
            # Coerce both fields to text so dict-shaped metadata or missing/None
            # values do not raise TypeError on concatenation.
            text = (self._as_text(r.get("text")) + " " + self._as_text(r.get("meta"))).lower()
            score = len(q_tokens.intersection(text.split()))
            if score > 0:
                scored.append((score, r))

        # list.sort is stable (also with reverse=True), so equal scores keep row order.
        scored.sort(key=lambda x: x[0], reverse=True)
        return [r for _, r in scored[:k]]



In [7]:

if __name__ == "__main__":
    # Demo driver: build an LLM adapter, a retriever-backed K agent and an R agent,
    # run the hybrid controller on one question, and print plan / answer / checks.

    # Choose stack via env: RK_STACK in {"chroma","pgvector"}
    # Any value other than "pgvector" falls through to the Chroma branch.
    stack = os.environ.get("RK_STACK", "chroma").lower()

    # Choose LLM
    # Default to OpenAI Responses adapter. Swap to Azure by instantiating AzureOpenAIChatAdapter().
    llm = OpenAIResponsesAdapter()             # OpenAIResponsesAdapter(model=os.environ.get("OPENAI_MODEL", "gpt-4o-mini"))

    # Build retriever + embedder
    if stack == "pgvector":
        # PGVectorRetriever is not among the names imported at the top of the notebook,
        # so this branch used to fail with NameError. Import it here so the default
        # Chroma path never depends on it.
        # NOTE(review): assumes hybrid_rk_merged exports PGVectorRetriever — confirm.
        from hybrid_rk_merged import PGVectorRetriever

        dsn = os.environ.get("PG_DSN", "postgresql://postgres:postgres@localhost:5432/postgres")
        # The former `Embedder` annotation was dropped: that name is never imported here.
        embedder = OpenAIEmbedder()
        # NOTE(review): dim=3072 presumably matches the embedder's vector size — confirm.
        retr = PGVectorRetriever(dsn=dsn, dim=3072)
        # Seed a couple docs idempotently
        # NOTE(review): idempotence relies on retr.add upserting by id — TODO confirm.
        # Tuple layout: (id, text, source, date).
        docs = [
            ("pa", "Model A typical on-device latency is ~80ms for 256 tokens on Snapdragon X.", "kb://models#A_latency", "2025-01-15"),
            ("pb", "Model B typical on-device latency is ~120ms for 256 tokens on Snapdragon X.", "kb://models#B_latency", "2025-02-10"),
            ("pm", "Model A memory footprint: ~1.8GB int8 quantized; Model B: ~2.3GB int8.", "kb://models#mem", "2025-02-12"),
        ]
        embs = embedder.embed([d[1] for d in docs])
        # Each record handed to add() is (id, text, source, date, embedding).
        retr.add(((d[0], d[1], d[2], d[3], e) for d, e in zip(docs, embs)))
        k_agent = KAgent(retriever=retr, embedder=embedder)
    else:
        # Chroma
        embedder = OpenAIEmbedder()
        retr = ChromaRetriever(collection="rk_demo", persist_dir=os.environ.get("CHROMA_DIR", ".chroma"), embedder=embedder)
        # Only seed when the collection is empty, so re-running the cell does not re-add the docs.
        if retr.count() == 0:
            texts = [
                "Model A typical on-device latency is ~80ms for 256 tokens on Snapdragon X.",
                "Model B typical on-device latency is ~120ms for 256 tokens on Snapdragon X.",
                "Model A memory footprint: ~1.8GB int8 quantized; Model B: ~2.3GB int8.",
                "Model A license permits on-device commercial use with attribution.",
                "Model B license restricts redistribution of weights; on-device inference allowed.",
            ]
            # metas[i] describes texts[i]; the two lists must stay the same length.
            metas = [
                {"source": "kb://models#A_latency", "date": "2025-01-15"},
                {"source": "kb://models#B_latency", "date": "2025-02-10"},
                {"source": "kb://models#mem", "date": "2025-02-12"},
                {"source": "kb://models#licenseA", "date": "2024-11-02"},
                {"source": "kb://models#licenseB", "date": "2025-03-01"},
            ]
            retr.add(ids=[f"r{i}" for i in range(len(texts))], texts=texts, metadatas=metas)
        # The retriever was constructed with the embedder, so KAgent is not given one here.
        k_agent = KAgent(retriever=retr)

    r_agent = RAgent(llm=llm)
    controller = HybridController(r=r_agent, k=k_agent, enable_critic_pass=True, use_k_based_checker=True)

    question = os.environ.get("RK_QUESTION", "Compare Model A vs Model B for on-device summarization.")
    # solve() returns a mapping; the "plan", "final" and "checks" entries are printed below.
    result = controller.solve(question)

    print("\n=== PLAN ===")
    print(json.dumps(result["plan"], indent=2))
    print("\n=== FINAL ===")
    print(result["final"])
    print("\n=== CHECKS ===")
    print(json.dumps(result["checks"], indent=2))



=== PLAN ===
{
  "subquestions": [
    {
      "q": "Compare Model A vs Model B for on-device summarization.",
      "needs_facts": true
    }
  ],
  "assumptions": []
}

=== FINAL ===
Answer:
- Model A memory footprint: ~1.8GB int8 quantized; Model B: ~2.3GB int8. (source: kb://models#mem)
- Model B license restricts redistribution of weights; on-device inference allowed. (source: kb://models#licenseB)
- Model B typical on-device latency is ~120ms for 256 tokens on Snapdragon X. (source: kb://models#B_latency)
- Model A license permits on-device commercial use with attribution. (source: kb://models#licenseA)
- Model A typical on-device latency is ~80ms for 256 tokens on Snapdragon X. (source: kb://models#A_latency)

Groundedness audit:
? Answer:  (sources: —)
✔ - Model A memory footprint: ~1.8GB int8 quantized; Model B: ~2.3GB int8. (source: kb://models#mem)  (sources: kb://models#mem)
✔ - Model B license restricts redistribution of weights; on-device inference allowed. (source: kb:/