# Set up and imports

In [12]:
# Add project root to path
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

print("Project root:", PROJECT_ROOT)

Project root: /home/al/Documents/rag-google-io


In [13]:
import json
import time
import numpy as np
from tqdm import tqdm

import optuna
import mlflow
import mlflow.optuna

from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

from retrieval.rerankers.cross_encoder import CrossEncoderReranker

In [14]:
from retrieval.evaluation.pipelines import (
    dense_retrieve,
    evaluate_reranking,
)

from retrieval.rerankers.cross_encoder import CrossEncoderReranker

# MLflow Setup

In [15]:
mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("dense_rerank_hpo")

<Experiment: artifact_location='file:///home/al/Documents/rag-google-io/notebooks/../mlruns/624004126080311294', creation_time=1769018106611, experiment_id='624004126080311294', last_update_time=1769018106611, lifecycle_stage='active', name='dense_rerank_hpo', tags={}>

In [16]:
mlflow.get_tracking_uri() #check

'file:../mlruns'

# Load Data & Models

In [17]:
# Load ground truth
with open("../data/eval/ground_truth_gpt5nano.json", "r") as f:
    ground_truth = json.load(f)

print(f"Loaded {len(ground_truth)} evaluation queries")

Loaded 10 evaluation queries


In [19]:
# Qdrant
q_client = QdrantClient(url="http://localhost:6333")
q_client = QdrantClient(url="http://localhost:6333")
print(q_client.get_collections()) #check
COLLECTION = "hybrid_collection"

# Embedding model (fixed for HPO)
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def embed_query(text: str):
    return embedding_model.encode(text).tolist()

collections=[CollectionDescription(name='sparse_collection'), CollectionDescription(name='google-io-transcripts'), CollectionDescription(name='hybrid_collection')]


# Evaluation Wrapper (Key for HPO)

In [20]:
def evaluate_config(
    retrieve_k: int,
    rerank_k: int,
    rerank_model: str,
):
    reranker = CrossEncoderReranker(rerank_model)

    recalls = []
    mrrs = []
    latencies = []

    for item in ground_truth:
        start = time.time()

        results = retrieve_and_rerank(
            query=item["query"],
            retrieve_k=retrieve_k,
            rerank_k=rerank_k,
            reranker=reranker,
        )

        latencies.append(time.time() - start)

        relevant_ids = item["relevant_doc_ids"]
        recalls.append(recall_at_k(results, relevant_ids, 5))
        mrrs.append(mrr(results, relevant_ids))

    return {
        "recall@5": float(np.mean(recalls)),
        "mrr": float(np.mean(mrrs)),
        "latency_avg": float(np.mean(latencies)),
    }


# Optuna Objective (with MLflow logging)

In [21]:
def objective(trial):

    retrieve_k = trial.suggest_categorical("retrieve_k", [20, 30, 50, 75, 100])
    rerank_k = trial.suggest_categorical("rerank_k", [5, 10])
    rerank_model = trial.suggest_categorical(
        "rerank_model",
        [
            "cross-encoder/ms-marco-TinyBERT-L-2-v2",
            "cross-encoder/ms-marco-MiniLM-L-6-v2",
            "BAAI/bge-reranker-base",
        ],
    )

    reranker = CrossEncoderReranker(rerank_model)

    metrics = evaluate_reranking(
        ground_truth=ground_truth,
        retrieve_fn=lambda q, k: dense_retrieve(
            q_client, embed_query, COLLECTION, q, k
        ),
        reranker=reranker,
        retrieve_k=retrieve_k,
        rerank_k=rerank_k,
    )

    cost_penalty = (retrieve_k / 100) * 0.1
    return metrics["recall@5"] - cost_penalty


# Run the Study

In [22]:
study = optuna.create_study(
    direction="maximize",
    study_name="dense_rerank_hpo",
)

study.optimize(objective, n_trials=20)

[32m[I 2026-01-21 18:07:07,770][0m A new study created in memory with name: dense_rerank_hpo[0m


Loading reranker model: cross-encoder/ms-marco-MiniLM-L-6-v2


[32m[I 2026-01-21 18:07:25,153][0m Trial 0 finished with value: 0.88 and parameters: {'retrieve_k': 20, 'rerank_k': 5, 'rerank_model': 'cross-encoder/ms-marco-MiniLM-L-6-v2'}. Best is trial 0 with value: 0.88.[0m


Loading reranker model: BAAI/bge-reranker-base


config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

[32m[I 2026-01-21 18:09:48,304][0m Trial 1 finished with value: 0.97 and parameters: {'retrieve_k': 30, 'rerank_k': 10, 'rerank_model': 'BAAI/bge-reranker-base'}. Best is trial 1 with value: 0.97.[0m


Loading reranker model: cross-encoder/ms-marco-TinyBERT-L-2-v2


[32m[I 2026-01-21 18:09:57,467][0m Trial 2 finished with value: 0.9 and parameters: {'retrieve_k': 100, 'rerank_k': 5, 'rerank_model': 'cross-encoder/ms-marco-TinyBERT-L-2-v2'}. Best is trial 1 with value: 0.97.[0m


Loading reranker model: BAAI/bge-reranker-base


[32m[I 2026-01-21 18:16:00,438][0m Trial 3 finished with value: 0.9 and parameters: {'retrieve_k': 100, 'rerank_k': 5, 'rerank_model': 'BAAI/bge-reranker-base'}. Best is trial 1 with value: 0.97.[0m


Loading reranker model: cross-encoder/ms-marco-MiniLM-L-6-v2


[32m[I 2026-01-21 18:16:21,759][0m Trial 4 finished with value: 0.97 and parameters: {'retrieve_k': 30, 'rerank_k': 5, 'rerank_model': 'cross-encoder/ms-marco-MiniLM-L-6-v2'}. Best is trial 1 with value: 0.97.[0m


Loading reranker model: cross-encoder/ms-marco-MiniLM-L-6-v2


[32m[I 2026-01-21 18:16:40,405][0m Trial 5 finished with value: 0.97 and parameters: {'retrieve_k': 30, 'rerank_k': 10, 'rerank_model': 'cross-encoder/ms-marco-MiniLM-L-6-v2'}. Best is trial 1 with value: 0.97.[0m


Loading reranker model: BAAI/bge-reranker-base


[32m[I 2026-01-21 18:17:51,957][0m Trial 6 finished with value: 0.88 and parameters: {'retrieve_k': 20, 'rerank_k': 5, 'rerank_model': 'BAAI/bge-reranker-base'}. Best is trial 1 with value: 0.97.[0m


Loading reranker model: cross-encoder/ms-marco-TinyBERT-L-2-v2


[32m[I 2026-01-21 18:17:58,590][0m Trial 7 finished with value: 0.9 and parameters: {'retrieve_k': 100, 'rerank_k': 10, 'rerank_model': 'cross-encoder/ms-marco-TinyBERT-L-2-v2'}. Best is trial 1 with value: 0.97.[0m


Loading reranker model: cross-encoder/ms-marco-TinyBERT-L-2-v2


[32m[I 2026-01-21 18:18:01,651][0m Trial 8 finished with value: 0.97 and parameters: {'retrieve_k': 30, 'rerank_k': 10, 'rerank_model': 'cross-encoder/ms-marco-TinyBERT-L-2-v2'}. Best is trial 1 with value: 0.97.[0m


Loading reranker model: BAAI/bge-reranker-base


[32m[I 2026-01-21 18:25:55,409][0m Trial 9 finished with value: 0.925 and parameters: {'retrieve_k': 75, 'rerank_k': 10, 'rerank_model': 'BAAI/bge-reranker-base'}. Best is trial 1 with value: 0.97.[0m


Loading reranker model: BAAI/bge-reranker-base


[32m[I 2026-01-21 18:31:18,472][0m Trial 10 finished with value: 0.95 and parameters: {'retrieve_k': 50, 'rerank_k': 10, 'rerank_model': 'BAAI/bge-reranker-base'}. Best is trial 1 with value: 0.97.[0m


Loading reranker model: cross-encoder/ms-marco-MiniLM-L-6-v2


[32m[I 2026-01-21 18:31:37,620][0m Trial 11 finished with value: 0.97 and parameters: {'retrieve_k': 30, 'rerank_k': 5, 'rerank_model': 'cross-encoder/ms-marco-MiniLM-L-6-v2'}. Best is trial 1 with value: 0.97.[0m


Loading reranker model: cross-encoder/ms-marco-MiniLM-L-6-v2


[32m[I 2026-01-21 18:31:56,377][0m Trial 12 finished with value: 0.97 and parameters: {'retrieve_k': 30, 'rerank_k': 5, 'rerank_model': 'cross-encoder/ms-marco-MiniLM-L-6-v2'}. Best is trial 1 with value: 0.97.[0m


Loading reranker model: BAAI/bge-reranker-base


[32m[I 2026-01-21 18:33:43,414][0m Trial 13 finished with value: 0.97 and parameters: {'retrieve_k': 30, 'rerank_k': 10, 'rerank_model': 'BAAI/bge-reranker-base'}. Best is trial 1 with value: 0.97.[0m


Loading reranker model: cross-encoder/ms-marco-MiniLM-L-6-v2


[32m[I 2026-01-21 18:34:13,647][0m Trial 14 finished with value: 0.95 and parameters: {'retrieve_k': 50, 'rerank_k': 10, 'rerank_model': 'cross-encoder/ms-marco-MiniLM-L-6-v2'}. Best is trial 1 with value: 0.97.[0m


Loading reranker model: BAAI/bge-reranker-base


[32m[I 2026-01-21 18:38:38,920][0m Trial 15 finished with value: 0.925 and parameters: {'retrieve_k': 75, 'rerank_k': 5, 'rerank_model': 'BAAI/bge-reranker-base'}. Best is trial 1 with value: 0.97.[0m


Loading reranker model: cross-encoder/ms-marco-MiniLM-L-6-v2


[32m[I 2026-01-21 18:38:57,529][0m Trial 16 finished with value: 0.97 and parameters: {'retrieve_k': 30, 'rerank_k': 5, 'rerank_model': 'cross-encoder/ms-marco-MiniLM-L-6-v2'}. Best is trial 1 with value: 0.97.[0m


Loading reranker model: BAAI/bge-reranker-base


[32m[I 2026-01-21 18:40:42,963][0m Trial 17 finished with value: 0.97 and parameters: {'retrieve_k': 30, 'rerank_k': 10, 'rerank_model': 'BAAI/bge-reranker-base'}. Best is trial 1 with value: 0.97.[0m


Loading reranker model: cross-encoder/ms-marco-MiniLM-L-6-v2


[32m[I 2026-01-21 18:41:01,417][0m Trial 18 finished with value: 0.97 and parameters: {'retrieve_k': 30, 'rerank_k': 10, 'rerank_model': 'cross-encoder/ms-marco-MiniLM-L-6-v2'}. Best is trial 1 with value: 0.97.[0m


Loading reranker model: cross-encoder/ms-marco-TinyBERT-L-2-v2


[32m[I 2026-01-21 18:41:05,376][0m Trial 19 finished with value: 0.95 and parameters: {'retrieve_k': 50, 'rerank_k': 5, 'rerank_model': 'cross-encoder/ms-marco-TinyBERT-L-2-v2'}. Best is trial 1 with value: 0.97.[0m


# Results Summary

In [23]:
print("Best trial:")
trial = study.best_trial

print("  Value:", trial.value)
print("  Params:")
for k, v in trial.params.items():
    print(f"    {k}: {v}")

Best trial:
  Value: 0.97
  Params:
    retrieve_k: 30
    rerank_k: 10
    rerank_model: BAAI/bge-reranker-base
