In [1]:
import minsearch
import pandas as pd
from dotenv import load_dotenv
import os
from openai import OpenAI
from tqdm.auto import tqdm
import math
from qdrant_client.http.models import ScoredPoint 
load_dotenv()
import optuna
from sklearn.model_selection import train_test_split

from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# docker run -p 6333:6333 -p 6334:6334 -v "${PWD}\qdrant_storage:/qdrant/storage" qdrant/qdrant
from qdrant_client import QdrantClient, models
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

poi_data = pd.read_csv('../data/krakow_pois_selected.csv')
documents = poi_data.to_dict(orient='records')

df_question = pd.read_csv("../data/ground-truth-retrieval.csv")
ground_truth_test, ground_truth_valid = train_test_split(
    df_question,
    test_size=0.3, 
    random_state=123
)


ground_truth_test = ground_truth_test.to_dict(orient='records')
ground_truth_valid = ground_truth_valid.to_dict(orient='records')

In [3]:
text_columns = ['name','amenity','leisure','natural','tourism','historic','wiki_summary_en']

In [4]:
def hit_rate(relevance_total):
    """
    Calculate the Hit Rate for a set of ranked results.

    The Hit Rate measures the proportion of queries for which at least one 
    relevant item is present in the returned results, regardless of its rank.

    Args:
        relevance_total (list of list of bool): 
            A list where each sublist corresponds to a single query's ranked 
            results. Each boolean in a sublist indicates whether the result at 
            that rank is relevant (True) or not (False).

    Returns:
        float: Hit Rate value between 0 and 1.
    """
    cnt = 0
    for line in relevance_total:
        if True in line:
            cnt += 1
    return cnt / len(relevance_total)


In [5]:

def mrr(relevance_total):
    """
    Calculate the Mean Reciprocal Rank (MRR) for a set of ranked results.

    MRR measures the average reciprocal rank of the first relevant result 
    across queries. If a query has multiple relevant results, only the first 
    relevant result contributes to the score.

    Args:
        relevance_total (list of list of bool): 
            A list where each sublist corresponds to a single query's ranked 
            results. Each boolean in a sublist indicates whether the result at 
            that rank is relevant (True) or not (False).

    Returns:
        float: MRR value between 0 and 1.
    """
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)
                break
    return total_score / len(relevance_total)


In [6]:

def recall_at_k(relevance_total, k=1):
    """
    Calculate Recall@k for a set of ranked results.

    Args:
        relevance_total (list of list of int or bool):
            A list where each sublist corresponds to a single query's ranked results.
            Each element indicates relevance of a result (1/True = relevant, 0/False = not relevant).
        k (int):
            Cutoff rank.

    Returns:
        float: Average Recall@k score across all queries (0–1).
    """
    scores = []
    for relevances in relevance_total:
        total_relevant = sum(relevances)
        if total_relevant == 0:
            scores.append(0.0)  # brak dokumentów istotnych → recall = 0
            continue

        retrieved_relevant = sum(relevances[:k])
        scores.append(retrieved_relevant / total_relevant)

    return sum(scores) / len(scores) if scores else 0.0




In [7]:

def ndcg(relevance_total, k=None):
    """
    Calculate the Normalized Discounted Cumulative Gain (nDCG) at rank k.

    Args:
        relevance_total (list of list of int):
            A list where each sublist corresponds to a single query's ranked results.
            Each value in a sublist indicates the relevance score of the result at that rank.
            (0 = not relevant, 1 = relevant, or graded relevance if available).
        k (int, optional):
            The rank cutoff. If None, considers the full list.

    Returns:
        float: Average nDCG score across all queries (0–1).
    """
    def dcg(relevances, k):
        """Compute DCG at rank k"""
        return sum(
            (rel / math.log2(idx + 2))  # idx + 2 because log2(1) is undefined
            for idx, rel in enumerate(relevances[:k])
        )

    scores = []
    for relevances in relevance_total:
        # if k not provided, take all
        k_val = k if k is not None else len(relevances)

        ideal_relevances = sorted(relevances, reverse=True)
        idcg = dcg(ideal_relevances, k_val)
        actual_dcg = dcg(relevances, k_val)

        score = (actual_dcg / idcg) if idcg > 0 else 0.0
        scores.append(score)

    return sum(scores) / len(scores) if scores else 0.0



# DataFrame with metrics
df_metrics = pd.DataFrame({"method" : [], "hit_rate": [],  "mrr": [],"recall_at_k": [],'ndcg': []})

In [8]:
def is_qdrant_query_response(obj):
    return 'QueryResponse' in str(type(obj))

In [9]:
def evaluate(ground_truth, search_function):
    """
    Run queries through a search function and record relevance results.

    For each query in `ground_truth`, this calls `search_function` and
    produces a list of booleans indicating whether each retrieved document's
    ID matches the query's correct ID. Supports both Qdrant results and plain
    list/dict results.

    Args:
        ground_truth (list of dict): Queries with a correct 'id'.
        search_function (callable): Function returning ranked search results.

    Returns:
        list[list[bool]]: Per-query relevance lists for metric calculations.
    """
    
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)

        if is_qdrant_query_response(results):
            relevance = [doc_id == d.id for d in results.points]
        elif isinstance(results, list) and len(results) > 0 and isinstance(results[0], ScoredPoint): 
            relevance = [doc_id == d.id for d in results]
        else: 
            relevance = [d['id'] == doc_id for d in results]

        relevance_total.append(relevance)


    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
        'recall_at_k': recall_at_k(relevance_total),
        'ndcg': ndcg(relevance_total)
    }


## Minsearch

In [10]:
index = minsearch.Index(
    text_fields=text_columns,keyword_fields=['id'])
index.fit(documents)

<minsearch.minsearch.Index at 0x1c87ad93cb0>

In [11]:
# Recall@k and nDCG are complementary metrics for evaluating retrieval.
# - Recall@k checks whether all relevant documents are included in the top-k results
#   (coverage of relevant items).
# - nDCG measures how well the retrieved documents are ranked,
#   giving higher scores to relevant items appearing earlier in the list.
# Using both together ensures that the system not only finds the right documents,
# but also presents them in a useful order.

In [12]:
def minsearch_search(query,filter_dict={},num_results=10,boost={}):

    results = index.search(
        query=query,
        filter_dict=filter_dict,
        boost_dict=boost,
        num_results=num_results
    )

    return results

def objective(trial):
    boost = {col: trial.suggest_int(col, 0, 3) for col in text_columns}
    metrics = evaluate(ground_truth_valid, lambda q: minsearch_search(q['question'], boost=boost))
    # To optimize both recall_at_k and ndcg as a multi-objective problem:
    return metrics['recall_at_k'], metrics['ndcg']

study = optuna.create_study(directions=["maximize", "maximize"])
study.optimize(objective, n_trials=20)



[I 2025-09-10 13:39:27,344] A new study created in memory with name: no-name-59ec14b4-a767-4974-9003-f3ac1b2813e8
100%|██████████| 1002/1002 [00:03<00:00, 308.08it/s]
[I 2025-09-10 13:39:30,606] Trial 0 finished with values: [0.4720558882235529, 0.5977333501461435] and parameters: {'name': 3, 'amenity': 1, 'leisure': 3, 'natural': 3, 'tourism': 1, 'historic': 3, 'wiki_summary_en': 2}.
100%|██████████| 1002/1002 [00:03<00:00, 306.00it/s]
[I 2025-09-10 13:39:33,886] Trial 1 finished with values: [0.5449101796407185, 0.6404467208801998] and parameters: {'name': 3, 'amenity': 1, 'leisure': 1, 'natural': 2, 'tourism': 2, 'historic': 0, 'wiki_summary_en': 3}.
100%|██████████| 1002/1002 [00:03<00:00, 308.17it/s]
[I 2025-09-10 13:39:37,144] Trial 2 finished with values: [0.2714570858283433, 0.3631365014165338] and parameters: {'name': 0, 'amenity': 3, 'leisure': 1, 'natural': 1, 'tourism': 3, 'historic': 1, 'wiki_summary_en': 3}.
100%|██████████| 1002/1002 [00:03<00:00, 309.79it/s]
[I 2025-09-

In [13]:
# Show metrics and parameters for each trial
trails_results = {}

for trial in study.trials:
    trails_results[trial.number] = {
        "recall_at_k": trial.values[0],
        "ndcg": trial.values[1],
        "params": trial.params
    }

trails_results = pd.DataFrame(trails_results).T  

In [14]:
trails_results.sort_values(by=['recall_at_k', 'ndcg'], ascending=False).reset_index(drop=True)

Unnamed: 0,recall_at_k,ndcg,params
0,0.54491,0.640447,"{'name': 3, 'amenity': 1, 'leisure': 1, 'natur..."
1,0.527944,0.619368,"{'name': 1, 'amenity': 0, 'leisure': 2, 'natur..."
2,0.523952,0.620341,"{'name': 2, 'amenity': 3, 'leisure': 3, 'natur..."
3,0.51996,0.618447,"{'name': 1, 'amenity': 1, 'leisure': 3, 'natur..."
4,0.50998,0.639238,"{'name': 2, 'amenity': 1, 'leisure': 0, 'natur..."
5,0.506986,0.621058,"{'name': 2, 'amenity': 3, 'leisure': 3, 'natur..."
6,0.494012,0.610547,"{'name': 1, 'amenity': 3, 'leisure': 3, 'natur..."
7,0.472056,0.597733,"{'name': 3, 'amenity': 1, 'leisure': 3, 'natur..."
8,0.46507,0.593971,"{'name': 1, 'amenity': 2, 'leisure': 3, 'natur..."
9,0.464072,0.591127,"{'name': 2, 'amenity': 3, 'leisure': 1, 'natur..."


In [15]:
boost = trails_results.iloc[0]['params']

In [16]:

minsearch_metrics = evaluate(ground_truth_test, lambda q: minsearch_search(q['question'],boost=boost))
minsearch_metrics['function'] = 'minsearch_metrics'

100%|██████████| 2338/2338 [00:07<00:00, 313.08it/s]


## Vector search

In [17]:
texts = []


for doc in documents:
    t = doc['name'] + ' ' + doc['amenity'] + ' ' + doc['leisure'] + ' ' + doc['natural'] + ' ' + doc['tourism'] + ' ' + doc['historic'] + ' ' + doc['wiki_summary_en']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

vindex = VectorSearch(keyword_fields={'id'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x1c87c0bb380>

In [18]:

def vector_search(query):

    query = pipeline.transform([query])
    results = vindex.search(
        query_vector=query,
        num_results=5
    )

    return results

In [19]:

vector_search_metrics = evaluate(ground_truth_test, lambda q: vector_search(q['question']))
vector_search_metrics['function'] = 'vector_search_tfidf_svd'


100%|██████████| 2338/2338 [00:03<00:00, 739.20it/s]


docker pull qdrant/qdrant

docker run -p 6333:6333 -p 6334:6334 \
   -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
   qdrant/qdrant

In [20]:

qdrant_client = QdrantClient("http://localhost:6333")

In [21]:

EMBEDDING_DIMENSIONALITY = 512
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [22]:

collection_name = "vector_search_collection"

qdrant_client.delete_collection(collection_name)


True

In [23]:

# Create the collection with specified vector parameters
qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
        distance=models.Distance.COSINE  # Distance metric for similarity search
    )
)

True

In [24]:
points = []

for doc in documents:

    point = models.PointStruct(
        id=doc['id'],
        vector=models.Document(text= doc['name'] + ' ' + doc['amenity'] + ' ' + doc['leisure'] + ' ' + doc['natural'] + ' ' + doc['tourism'] + ' ' + doc['historic'] + ' ' + doc['wiki_summary_en'], model=model_handle), #embed text locally with "jinaai/jina-embeddings-v2-small-en" from FastEmbed
        payload={
            "name": doc['name'],
            "wiki_summary_en": doc['wiki_summary_en'],
            'id'    : doc['id'],
        } #save all needed metadata fields
    )
    points.append(point)

In [25]:

qdrant_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [26]:

def qdrant_vector_search(query, limit=5):

    results = qdrant_client.query_points(
        collection_name=collection_name,
        query=models.Document( #embed the query text locally with "jinaai/jina-embeddings-v2-small-en"
            text=query,
            model=model_handle 
        ),
        limit=limit, # top closest matches
        with_payload=True #to get metadata in the results
    )

    return results

In [27]:
qdrant_vector_search_metrics = evaluate(ground_truth_test, lambda q: qdrant_vector_search(q['question']))
qdrant_vector_search_metrics['function'] = 'qdrant_vector_search'

100%|██████████| 2338/2338 [00:37<00:00, 61.55it/s]


Hybrid search

In [28]:
qdrant_client.delete_collection('hybrid_search')


True

In [29]:

qdrant_client.create_collection(
    collection_name="hybrid_search",
    vectors_config={
        # Named dense vector for jinaai/jina-embeddings-v2-small-en
        "jina-small": models.VectorParams(
            size=512,
            distance=models.Distance.COSINE,
        ),
    },
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    }
)

True

In [30]:
qdrant_client.upsert(
    collection_name="hybrid_search",
    points=[
        models.PointStruct(
            id=doc['id'],
            vector={
                "jina-small": models.Document(
                    text=doc['name'] + ' ' + doc['amenity'] + ' ' + doc['leisure'] + ' ' + doc['natural'] + ' ' + doc['tourism'] + ' ' + doc['historic'] + ' ' + doc['wiki_summary_en'],
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                "bm25": models.Document(
                    text=doc['name'] + ' ' + doc['amenity'] + ' ' + doc['leisure'] + ' ' + doc['natural'] + ' ' + doc['tourism'] + ' ' + doc['historic'] + ' ' + doc['wiki_summary_en'],
                    model="Qdrant/bm25",
                ),
            },
            payload={
                "name": doc['name'],
            "wiki_summary_en": doc['wiki_summary_en'],
            'id'    : doc['id'],
            }
        )
        for doc in documents
    ]
)


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [31]:
def multi_stage_search(query: str, limit: int = 1) -> list[models.ScoredPoint]:
    results = qdrant_client.query_points(
        collection_name="hybrid_search",
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                using="jina-small",
                # Prefetch ten times more results, then
                # expected to return, so we can really rerank
                limit=(10 * limit),
            ),
        ],
        query=models.Document(
            text=query,
            model="Qdrant/bm25", 
        ),
        using="bm25",
        limit=limit,
        with_payload=True,
    )

    return results.points

In [32]:
multi_stage_search_metrics = evaluate(ground_truth_test, lambda q: multi_stage_search(q['question']))
multi_stage_search_metrics['function'] = 'multi_stage_search'

100%|██████████| 2338/2338 [00:43<00:00, 53.74it/s]


In [33]:

def rrf_search(query: str, limit: int = 1) -> list[models.ScoredPoint]:
    results = qdrant_client.query_points(
        collection_name="hybrid_search",
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                using="jina-small",
                limit=(5 * limit),
            ),
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="Qdrant/bm25",
                ),
                using="bm25",
                limit=(5 * limit),
            ),
        ],
        # Fusion query enables fusion on the prefetched results
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        with_payload=True,
    )

    return results.points

In [34]:
rrf_search_metrics = evaluate(ground_truth_test, lambda q: rrf_search(q['question']))
rrf_search_metrics['function'] = 'rrf_search'

100%|██████████| 2338/2338 [00:58<00:00, 39.95it/s]


In [35]:
df_results = pd.DataFrame([
    minsearch_metrics,
    vector_search_metrics,
    qdrant_vector_search_metrics,
    multi_stage_search_metrics,
    rrf_search_metrics
])

In [None]:
df_results.sort_values(by=['recall_at_k', 'ndcg'], ascending=False).reset_index(drop=True)

Unnamed: 0,hit_rate,mrr,recall_at_k,ndcg,function
0,0.791702,0.691267,0.628743,0.716113,rrf_search
1,0.611206,0.611206,0.611206,0.611206,multi_stage_search
2,0.694183,0.622277,0.573995,0.640374,qdrant_vector_search
3,0.69846,0.542688,0.461078,0.580345,minsearch_metrics
4,0.434559,0.321607,0.25278,0.349857,vector_search_tfidf_svd


In [37]:
df_results.to_csv('../data/retrieval_evaluation_results.csv', index=False)