In [1]:
import json

from openai import OpenAI
from minsearch import Index
from gitsource import GithubRepositoryDataReader, chunk_documents

# OpenAI client used by llm_structured. No API key is passed explicitly here;
# presumably the SDK reads it from the environment (e.g. OPENAI_API_KEY) — verify.
openai_client = OpenAI()

# Source corpus: .md/.mdx files from the evidentlyai/docs GitHub repository.
reader = GithubRepositoryDataReader(
    repo_owner="evidentlyai",
    repo_name="docs",
    allowed_extensions={"md", "mdx"},
)

# Read the files, parse each one, then split the parsed documents into chunks.
# NOTE(review): size/step look like a 3000-unit window advanced by 1500 (i.e. 50%
# overlap) — confirm the units and semantics against gitsource.
files = reader.read()
parsed_docs = [doc.parse() for doc in files]
chunked_docs = chunk_documents(parsed_docs, size=3000, step=1500)

# Text-search index over the chunks; search() queries it.
# NOTE(review): presumably text_fields are matched on content and keyword_fields
# are exact-match filters — confirm against the minsearch documentation.
index = Index(
    text_fields=["title", "description", "content"],
    keyword_fields=["filename"]
)
index.fit(chunked_docs)

# System message for the assistant: restricts answers to the supplied CONTEXT.
# Unlike prompt_template, this string is not stripped, so it keeps its leading
# and trailing newline.
instructions = """
You're a documentation assistant. Answer the QUESTION based on the CONTEXT.

Use only facts from the CONTEXT when answering.
If the answer isn't in the CONTEXT, say so.
"""

# User-message template; build_prompt fills {question} and {context} via str.format.
prompt_template = """
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(question, search_results):
    """Render the user message for the LLM.

    Args:
        question: The user's question, inserted verbatim into the QUESTION section.
        search_results: JSON-serializable retrieval results; they are dumped
            as indented JSON into the CONTEXT section.

    Returns:
        The filled-in ``prompt_template`` string.
    """
    return prompt_template.format(
        context=json.dumps(search_results, indent=2),
        question=question,
    )

def search(query, num_results=5):
    """Run a text search over the chunk index.

    Args:
        query: Free-text query string.
        num_results: Maximum number of results to request from the index.
            Defaults to 5, matching the signature of ``vector_search``.

    Returns:
        Whatever ``index.search`` returns for the query (a list of matching
        chunk records in this notebook).
    """
    return index.search(query=query, num_results=num_results)

def llm_structured(user_prompt, instructions, output_format, model="gpt-4o-mini"):
    """Call the LLM and return its reply parsed into ``output_format``.

    Args:
        user_prompt: Content of the user message.
        instructions: Content of the system message.
        output_format: Type passed as ``text_format`` to describe the expected
            structured output (a pydantic model in this notebook).
        model: Name of the OpenAI model to use. Defaults to "gpt-4o-mini",
            the value that was previously hard-coded.

    Returns:
        ``response.output_parsed`` from the Responses API call.
        NOTE(review): this may be ``None`` if the model returns no parsable
        output (e.g. a refusal) — confirm against the SDK docs before relying
        on attribute access downstream.
    """
    messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": user_prompt},
    ]

    response = openai_client.responses.parse(
        model=model,
        input=messages,
        text_format=output_format,
    )

    return response.output_parsed

# Sanity check: report how many chunks were indexed and from how many source files.
print(f"Indexed {len(chunked_docs)} chunks from {len(files)} documents")

Indexed 385 chunks from 95 documents


In [2]:
from typing import Literal
from pydantic import BaseModel, Field


class RAGResponse(BaseModel):
    # Structured-output schema used as the default `output_format` of rag() and
    # rag_vector(). The description strings are runtime text attached to each
    # field, so they are kept exactly as written.

    # Free-text answer shown to the user.
    answer: str = Field(
        description="The main answer to the user's question",
    )
    # Whether the retrieved context actually contained the answer.
    found_answer: bool = Field(
        description="True if relevant information was found",
    )
    # Model-reported confidence; the range is only stated in the description,
    # it is not enforced by a validator here.
    confidence: float = Field(
        description="Confidence score from 0.0 to 1.0",
    )
    # Closed set of answer categories.
    answer_type: Literal[
        "how-to",
        "explanation",
        "troubleshooting",
        "comparison",
        "reference",
    ] = Field(
        description="Category of the answer",
    )
    # Suggestions the user could ask next.
    followup_questions: list[str] = Field(
        description="Suggested follow-up questions",
    )

In [4]:
def rag(query, output_format=RAGResponse):
    """Answer ``query`` using text search for retrieval.

    Retrieves context with ``search``, renders the prompt with
    ``build_prompt`` and returns the structured reply from ``llm_structured``.

    Args:
        query: The user's question; used both for retrieval and in the prompt.
        output_format: Structured-output type for the reply.

    Returns:
        The parsed reply produced by ``llm_structured``.
    """
    retrieved = search(query)
    return llm_structured(
        build_prompt(query, retrieved),
        instructions,
        output_format,
    )

In [6]:
# Try the text-search RAG pipeline on a sample query and show the key fields
# (the answer is truncated to its first 100 characters).
query = 'llm as a judge'
answer = rag(query)

print(
    f"found_answer: {answer.found_answer}",
    f"confidence: {answer.confidence}",
    f"answer_type: {answer.answer_type}",
    f"answer: {answer.answer[:100]}...",
    sep="\n",
)

found_answer: True
confidence: 0.95
answer_type: explanation
answer: Using an LLM as a judge involves evaluating text responses based on custom criteria. There are two m...


In [7]:
# One-time environment setup: adds sentence-transformers to the uv-managed project.
# NOTE(review): this is unpinned and sits mid-notebook; consider moving it to a
# setup cell at the top (or the project's dependency file) so a fresh run works.
!uv add sentence-transformers

[2mResolved [1m155 packages[0m [2min 3.74s[0m[0m
[2mInstalled [1m16 packages[0m [2min 28.45s[0m[0m
 [32m+[39m [1mclick[0m[2m==8.3.1[0m
 [32m+[39m [1mfilelock[0m[2m==3.20.3[0m
 [32m+[39m [1mfsspec[0m[2m==2026.1.0[0m
 [32m+[39m [1mhf-xet[0m[2m==1.2.0[0m
 [32m+[39m [1mhuggingface-hub[0m[2m==1.3.4[0m
 [32m+[39m [1mmpmath[0m[2m==1.3.0[0m
 [32m+[39m [1mnetworkx[0m[2m==3.6.1[0m
 [32m+[39m [1mregex[0m[2m==2026.1.15[0m
 [32m+[39m [1msafetensors[0m[2m==0.7.0[0m
 [32m+[39m [1msentence-transformers[0m[2m==5.2.2[0m
 [32m+[39m [1mshellingham[0m[2m==1.5.4[0m
 [32m+[39m [1msympy[0m[2m==1.14.0[0m
 [32m+[39m [1mtokenizers[0m[2m==0.22.2[0m
 [32m+[39m [1mtorch[0m[2m==2.10.0[0m
 [32m+[39m [1mtransformers[0m[2m==5.0.0[0m
 [32m+[39m [1mtyper-slim[0m[2m==0.21.1[0m


In [8]:
from sentence_transformers import SentenceTransformer

In [11]:
# Load the sentence-embedding model used for all query and chunk embeddings below.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/multi-qa-MiniLM-L6-cos-v1
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [12]:
# Two phrasings of the same intent with very little word overlap; their embeddings
# are compared with a dot product in the following cells.
# NOTE(review): the trailing '# -' / '# -> [1000+]' markers are unexplained —
# clarify what they record or remove them.
q1 = 'install Evidently locally' # -
q2 = 'how can I set up evidently in my projects?' # -> [1000+]

In [16]:
# Embed q1 and peek at the first 20 components of the resulting vector.
v1 = model.encode(q1)
v1[:20]

array([ 0.07184873, -0.02879701,  0.0612349 , -0.0181076 ,  0.12533106,
       -0.04134636, -0.06354983, -0.08986848, -0.04142817, -0.0446535 ,
        0.06421643,  0.02077088,  0.00219164,  0.00508833, -0.00833959,
       -0.03770021,  0.03059419, -0.0484462 ,  0.06848747, -0.01030121],
      dtype=float32)

In [17]:
# Dimensionality of a single query embedding.
v1.shape

(384,)

In [18]:
# Embed the second phrasing so it can be compared with v1.
v2 = model.encode(q2)

In [19]:
# Dot product used as the similarity score between the two phrasings.
# NOTE(review): this equals cosine similarity only if the model returns
# unit-length vectors — presumably true for this "-cos-" model; confirm.
v1.dot(v2)

np.float32(0.43477958)

In [20]:
# Off-topic control question, for contrast with the q1/q2 pair.
q3 = 'how do i bake pretzels?'

In [21]:
# Embed the off-topic control question.
v3 = model.encode(q3)

In [22]:
# Similarity of q1 to the off-topic question, to compare against v1.dot(v2).
v1.dot(v3)

np.float32(0.022272678)

In [23]:
# Text to embed for each chunk: title, description and content joined by single
# spaces, then stripped. `doc.get(field) or ""` covers both a missing key and a
# key that is present with a None value (a plain .get(field, "") default would
# let None through and break the string concatenation).
texts = [
    " ".join(
        doc.get(field) or ""
        for field in ("title", "description", "content")
    ).strip()
    for doc in chunked_docs
]


In [26]:
# Embed all chunk texts in a single call, with a progress bar over the batches.
embeddings = model.encode(texts, show_progress_bar=True)

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

In [27]:
# Expect (number of chunks, embedding dimension).
embeddings.shape

(385, 384)

In [28]:
# Show the embedding matrix (the array repr is abbreviated for large arrays).
embeddings

array([[-0.04849773,  0.06601194,  0.01094063, ...,  0.03090537,
         0.06879068,  0.05070846],
       [-0.0323205 , -0.01947777, -0.01677329, ...,  0.01519906,
         0.0237962 , -0.00774648],
       [-0.02108493, -0.05657953,  0.00483409, ..., -0.04806887,
         0.07104319,  0.04317407],
       ...,
       [-0.00160594, -0.04139439, -0.05660466, ...,  0.05727464,
         0.0903789 , -0.0273216 ],
       [-0.02243875, -0.0011336 , -0.05355769, ...,  0.04080837,
         0.0889126 , -0.00143775],
       [-0.06048306, -0.05174244,  0.01422591, ...,  0.00431385,
         0.09889203,  0.00393771]], shape=(385, 384), dtype=float32)

In [32]:
# One matrix-vector product gives q1's dot-product score against every chunk;
# display the first 15 scores.
scores = embeddings.dot(v1)
scores[:15]

array([ 0.0795968 ,  0.12935531,  0.18664983,  0.260543  ,  0.08233505,
       -0.02807797, -0.07214396, -0.05210198, -0.01589224,  0.00971274,
       -0.0527408 , -0.10643847,  0.04062283,  0.0634449 , -0.04217617],
      dtype=float32)

In [33]:
from minsearch import VectorSearch

In [34]:
# Build the vector index. Row i of `embeddings` was computed from chunked_docs[i]
# (texts was built by iterating chunked_docs in order), so the two stay aligned.
# NOTE(review): 'filename' is presumably kept as an exact-match filter field —
# confirm against the minsearch documentation.
vindex = VectorSearch(keyword_fields=['filename'])
vindex.fit(embeddings, chunked_docs)

<minsearch.vector.VectorSearch at 0x1f3f7e7e7b0>

In [44]:
def vector_search(query, num_results=5):
    """Run an embedding-based search over the vector index.

    Args:
        query: Query text; it is embedded with ``model.encode`` before searching.
        num_results: Maximum number of results to request from ``vindex``.

    Returns:
        Whatever ``vindex.search`` returns for the query embedding.
    """
    return vindex.search(model.encode(query), num_results=num_results)

In [41]:
def rag_vector(query, output_format=RAGResponse):
    """Answer ``query`` using vector search for retrieval.

    Same pipeline as ``rag``, except that context comes from
    ``vector_search`` instead of the text index.

    Args:
        query: The user's question; used both for retrieval and in the prompt.
        output_format: Structured-output type for the reply.

    Returns:
        The parsed reply produced by ``llm_structured``.
    """
    retrieved = vector_search(query)
    return llm_structured(
        build_prompt(query, retrieved),
        instructions,
        output_format,
    )

In [43]:
# Same sample query as before (`query` is defined in an earlier cell), now
# answered through vector retrieval; the answer is truncated to 200 characters.
answer = rag_vector(query)

print(
    f"Query: {query}",
    f"found_answer: {answer.found_answer}",
    f"confidence: {answer.confidence}",
    f"answer: {answer.answer[:200]}...",
    sep="\n",
)

Query: llm as a judge
found_answer: True
confidence: 0.95
answer: Using an LLM as a judge involves evaluating text responses against custom criteria or comparing them to reference responses. The primary applications include:

1. **Reference-based Evaluation**: This ...


In [55]:
def hybrid_search(query, deduplicate=False):
    """Combine vector-search and text-search results for ``query``.

    Args:
        query: Query text passed unchanged to both ``vector_search`` and
            ``search``.
        deduplicate: When True, drop any result equal to one already kept, so
            a chunk returned by both searches appears only once (first
            occurrence wins, order preserved). Defaults to False, which
            returns the plain concatenation and may contain repeated chunks.

    Returns:
        A list with the vector-search results first, followed by the
        text-search results.
    """
    combined = vector_search(query) + search(query)
    if not deduplicate:
        return combined

    # Results are compared by equality rather than hashed: they are dict-like
    # records, which are not hashable. The lists are small, so the quadratic
    # membership check is negligible.
    unique_results = []
    for result in combined:
        if result not in unique_results:
            unique_results.append(result)
    return unique_results

In [53]:
# Run the combined search for the sample query and count the returned results
# (duplicates, if any, are included in the count).
results = hybrid_search(query)
len(results)

10