In [8]:
import sys

# Put the parent directory first on the module search path — presumably so the
# project-local `docs` module imported below can be found from this notebook's folder.
sys.path.insert(0, '..')

import docs

In [1]:
import pandas as pd

In [6]:
# Load the evaluation set from a CSV next to the notebook. The `question` and
# `filename` columns are the ones the evaluation code below relies on.
df_eval = pd.read_csv("./ground_truth_evidently.csv")

In [19]:
# One plain dict per CSV row (column name -> value), which is easier to loop over.
ground_truth = df_eval.to_dict(orient='records')

In [11]:
import docs

# Three-step pipeline from the project-local `docs` module: read -> parse -> chunk.
# The helpers are defined outside this notebook; `chunks` is what gets indexed below.
raw_documents = docs.read_github_data()
documents = docs.parse_data(raw_documents)
chunks = docs.chunk_documents(documents)


In [13]:
from minsearch import Index
from typing import Any, Dict, List, TypedDict

# Text index over four fields of every chunk.
index = Index(
    text_fields=["content", "filename", "title", "description"],
)

# fit() is the last expression in the cell, so its return value (the index
# itself) is what the notebook displays as the cell output.
index.fit(chunks)

<minsearch.minsearch.Index at 0x10d0ef1a0>

In [14]:
class SearchResult(TypedDict):
    """Shape of one entry in the list returned by the search functions."""
    # Where the hit comes from.
    filename: str     # path or name of the source file
    start: int        # starting position or offset within the source file
    # What the hit says.
    title: str        # title of the matched document
    description: str  # short description of the document
    content: str      # text excerpt containing the match


def search(query: str, num_results: int = 5) -> List[SearchResult]:
    """
    Search the text index for documents matching the given query.

    Args:
        query (str): The search query string.
        num_results (int): Maximum number of results to request from the index.
            Defaults to 5, the value that used to be hard-coded, so existing
            single-argument callers behave exactly as before.

    Returns:
        List[SearchResult]: A list of search results. Each result dictionary contains:
            - start (int): The starting position or offset within the source file.
            - content (str): A text excerpt or snippet containing the match.
            - title (str): The title of the matched document.
            - description (str): A short description of the document.
            - filename (str): The path or name of the source file.
    """
    # `index` is the module-level minsearch Index fitted on the chunks.
    return index.search(
        query=query,
        num_results=num_results,
    )


In [20]:

# One list of booleans per ground-truth question: position i is True when the
# i-th search result comes from the file the question was written for.
all_relevancies = []

for rec in ground_truth:
    filename = rec['filename']          # the expected source file for this question
    sr = search(rec['question'])        # ranked results from the text index
    relevancy = [r['filename'] == filename for r in sr]
    all_relevancies.append(relevancy)

In [23]:
def hit_rate(relevance_total):
    """
    Compute the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: One entry per query; each entry is a sequence of
            booleans where True marks a relevant result at that rank.

    Returns:
        float: Number of queries with at least one True flag divided by the
        number of queries. Returns 0.0 for an empty `relevance_total` instead
        of raising ZeroDivisionError.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing was evaluated; report 0.0 rather than dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))
    return hits / total

In [24]:
def mrr(relevance_total):
    """
    Compute the mean reciprocal rank (MRR) over all queries.

    For each query, the score is 1 / rank of the first relevant result
    (ranks start at 1), or 0 when no result is relevant. The scores are then
    averaged over all queries.

    Args:
        relevance_total: One entry per query; each entry is a sequence of
            booleans where True marks a relevant result at that rank.

    Returns:
        float: The mean reciprocal rank. Returns 0.0 for an empty
        `relevance_total` instead of raising ZeroDivisionError.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing was evaluated; report 0.0 rather than dividing by zero.
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                break  # only the first relevant result counts

    return total_score / total


In [26]:
# (hit rate, MRR) for the text-index search, displayed as a tuple.
hit_rate(all_relevancies), mrr(all_relevancies)

(0.43610547667342797, 0.34367816091954)

In [28]:
# Peek at a single ground-truth record to see which fields are available.
ground_truth[1]

{'question': 'create Dataset object Evidently',
 'summary_answer': 'To create a `Dataset` object in Evidently, you can use the `Dataset.from_pandas()` method with a data definition to specify the roles and types of your data columns.',
 'difficulty': 'beginner',
 'intent': 'code',
 'filename': 'docs/library/data_definition.mdx'}

In [33]:
from tqdm.auto import tqdm

def evaluate(
        ground_truth,
        search_function,
        question_column='question',
        id_column='filename'
):
    """
    Run a search function over every ground-truth record and score the results.

    Args:
        ground_truth: Iterable of records (dict-like) holding at least the
            question and the identifier of the document that should be found.
        search_function: Callable that takes the question and returns a list
            of result dicts; each result must contain `id_column`.
        question_column: Key of the question inside a ground-truth record.
        id_column: Key used to match a result against the expected document,
            looked up in both the ground-truth record and each result.

    Returns:
        dict: `{'hit_rate': ..., 'mrr': ...}` computed over all records.
    """
    def relevance_flags(record):
        # True at position i when the i-th result is the expected document.
        expected_id = record[id_column]
        hits = search_function(record[question_column])
        return [hit[id_column] == expected_id for hit in hits]

    # tqdm wraps the iterable so progress is shown while the searches run.
    relevance_total = [relevance_flags(record) for record in tqdm(ground_truth)]

    hit_rate_score = hit_rate(relevance_total)
    mrr_score = mrr(relevance_total)
    return {'hit_rate': hit_rate_score, 'mrr': mrr_score}


In [34]:
# Text-index search scored through evaluate(); the output matches the manual loop above.
evaluate(ground_truth, search)

  0%|          | 0/493 [00:00<?, ?it/s]

{'hit_rate': 0.43610547667342797, 'mrr': 0.34367816091954}

In [36]:
## vector search

from sentence_transformers import SentenceTransformer

In [37]:

# Sentence-transformers model used to embed both the chunks and the queries.
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')


In [38]:
import numpy as np
from tqdm.auto import tqdm

# Collect one embedding per chunk, in the same order as `chunks`.
embeddings = []

for d in tqdm(chunks):
    # Embed title + description + content together; a missing key falls back to ''.
    text = d.get('title', '') + ' ' + d.get('description', '') + ' ' + d.get('content', '')
    # Drop the leading/trailing whitespace left behind when a field is empty.
    text = text.strip()
    v = embedding_model.encode(text)
    embeddings.append(v)

# Rebind the name from a Python list to a single NumPy array for the vector index.
embeddings = np.array(embeddings)

  0%|          | 0/575 [00:00<?, ?it/s]

In [39]:
from minsearch import VectorSearch

# Vector index: fitted on the embeddings together with the chunks they were computed from
# (presumably so a search returns the chunk dicts — the evaluation reads `filename` from them).
vindex = VectorSearch()
vindex.fit(embeddings, chunks)

<minsearch.vector.VectorSearch at 0x14cd12180>

In [40]:
def v_search(query: str, num_results: int = 5) -> List[SearchResult]:
    """
    Search the vector index for documents similar to the given query.

    The query is embedded with the same `embedding_model` that was used for
    the chunks, and the resulting vector is looked up in `vindex`.

    Args:
        query (str): The search query string.
        num_results (int): Maximum number of results to request from the index.
            Defaults to 5, the value that used to be hard-coded, so existing
            single-argument callers behave exactly as before.

    Returns:
        List[SearchResult]: A list of search results. Each result dictionary contains:
            - start (int): The starting position or offset within the source file.
            - content (str): A text excerpt or snippet containing the match.
            - title (str): The title of the matched document.
            - description (str): A short description of the document.
            - filename (str): The path or name of the source file.
    """

    q = embedding_model.encode(query)

    return vindex.search(
        q,
        num_results=num_results,
    )


In [41]:
# Same evaluation, now with the embedding-based vector search.
evaluate(ground_truth, v_search)

  0%|          | 0/493 [00:00<?, ?it/s]

{'hit_rate': 0.7302231237322515, 'mrr': 0.5679513184584176}

In [42]:
def h_search(query: str) -> List[SearchResult]:
    """
    Hybrid search: vector-search results followed by text-search results.

    The two result lists are simply concatenated. Nothing is de-duplicated,
    re-ranked or truncated, so the same chunk can appear twice and the list
    can be as long as both individual result lists combined.

    Args:
        query (str): The search query string.

    Returns:
        List[SearchResult]: All results from `v_search(query)` followed by all
        results from `search(query)`. Each result dictionary contains:
            - start (int): The starting position or offset within the source file.
            - content (str): A text excerpt or snippet containing the match.
            - title (str): The title of the matched document.
            - description (str): A short description of the document.
            - filename (str): The path or name of the source file.
    """

    vector_hits = v_search(query)
    text_hits = search(query)
    return vector_hits + text_hits


In [None]:
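# Here the hybrid search returns up to 10 results (5 vector + 5 text), so its scores might be better for that reason alone.
# For a fair comparison with the other searches, it would be better to return only 5.
# The hybrid search needs to be limited to 5 results as well.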

In [43]:
# Same evaluation for the hybrid search (vector results followed by text results).
evaluate(ground_truth, h_search)

  0%|          | 0/493 [00:00<?, ?it/s]

{'hit_rate': 0.7748478701825557, 'mrr': 0.5746804468914001}