# Chunking Experiments with MLflow Tracking

This notebook explores different chunking strategies for RAG:
- Multiple chunk sizes (no chunking, 128, 256, 512 tokens)
- Multiple overlap values (0, 25, 50, 100 tokens)
- Multiple embedders
- Token-based chunking using each embedder's tokenizer

All experiments are tracked in MLflow for comparison.

In [1]:
import ast
import gc
import traceback
from time import time

import numpy as np
import faiss
import torch
from datasets import load_from_disk, disable_caching, Dataset, load_dataset

from rag.config import PROJECT_ROOT
from rag.config import settings
from rag.embeddings import LocalEmbedder
from rag.ingestion.chunker import RecursiveChunker
from rag.tracking import ExperimentTracker
from rag.utils import embed_dataset, get_metrics

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load datasets
doc_ds = load_dataset("rag-datasets/rag-mini-bioasq", "text-corpus", split="passages")
# Drop rows whose passage is the literal string 'nan'.
doc_ds = doc_ds.filter(lambda row: row['passage'] != 'nan')
query_ds = load_dataset("rag-datasets/rag-mini-bioasq", "question-answer-passages", split="test")

# Precompute
# Lookup table: document id -> passage text.
doc_id_to_text = doc_ds.select_columns(['id', 'passage']).to_pandas().set_index('id')['passage'].to_dict()
queries = np.array(query_ds['question'])
# Each 'relevant_passage_ids' entry is parsed from text. ast.literal_eval only
# accepts Python literals, so content coming from the downloaded dataset cannot
# execute arbitrary code the way eval() would.
# NOTE(review): assumes every entry is a plain literal (e.g. a list of ids) — confirm.
qrels = [np.array(ast.literal_eval(gold)) for gold in query_ds['relevant_passage_ids']]
# Number of relevant passages per query.
qrels_counts = [len(s) for s in qrels]

disable_caching()

In [3]:
# Experiment configuration
# Model identifiers handed to LocalEmbedder (and to RecursiveChunker as embedder_model).
embedder_models = [
    'sentence-transformers/all-MiniLM-L6-v2',
    'sentence-transformers/all-MiniLM-L12-v2',
    'BAAI/bge-small-en-v1.5',
    'BAAI/bge-base-en-v1.5',
    'BAAI/bge-large-en-v1.5',
]

# Chunk sizes in tokens; None = no chunking.
chunk_sizes = [None, 128, 256, 512]
# Chunk overlaps in tokens.
chunk_overlaps = [0, 25, 50, 100]

# Recorded as run parameters for every experiment.
rerank_model = None
faiss_metric = "IP"

# Initialize tracker
tracker = ExperimentTracker("chunking-experiments-bioasq")


def deduplicate_retrieved_docs(retrieved_ids_all, k, pad_value=0):
    """
    Deduplicate document IDs per query, keeping only the first occurrence.

    For chunked retrieval, multiple chunks from the same document may be
    retrieved. This keeps only the highest-ranked occurrence of each unique
    document and truncates/pads every row to exactly ``k`` entries.

    Args:
        retrieved_ids_all: (n_queries, n_retrieved) array (or iterable of
            iterables) of hashable document IDs, ordered by rank.
        k: Number of unique documents to keep per query. Must be >= 0.
        pad_value: Value appended when a query has fewer than ``k`` unique
            documents. Defaults to 0 for backward compatibility; callers must
            make sure it cannot collide with a real document ID.

    Returns:
        (n_queries, k) array of unique document IDs (padded with
        ``pad_value`` where needed). An empty input yields shape (0, k).

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    deduped = []
    for query_results in retrieved_ids_all:
        seen = set()
        unique_docs = []
        for doc_id in query_results:
            # Check the quota *before* appending so that k == 0 yields an
            # empty row instead of collecting every unique ID.
            if len(unique_docs) == k:
                break
            if doc_id not in seen:
                seen.add(doc_id)
                unique_docs.append(doc_id)
        # Pad short rows so every row has exactly k entries.
        unique_docs.extend([pad_value] * (k - len(unique_docs)))
        deduped.append(unique_docs)

    # The reshape pins the 2-D shape even when there are no queries or k == 0.
    return np.array(deduped).reshape(len(deduped), k)

[2025-10-26 10:12:58] [rag.tracking] [INFO] Tracking to: http://localhost:5000
[2025-10-26 10:12:58] [rag.tracking] [INFO] Experiment: chunking-experiments-bioasq


In [4]:
# Run experiments: one tracked run per (embedder, chunk_size, chunk_overlap) combination.
RETRIEVE_K = 100           # hits fetched per query before parent-document deduplication
METRIC_KS = (1, 3, 5, 10)  # cut-offs at which get_metrics is evaluated

for embedder_name in embedder_models:
    embedder_name_short = embedder_name.split('/')[-1]

    for chunk_size in chunk_sizes:
        # Skip overlap experiments when not chunking
        overlaps_to_test = [0] if chunk_size is None else chunk_overlaps

        for chunk_overlap in overlaps_to_test:
            # Skip invalid configurations (overlap must be smaller than the chunk)
            if chunk_size is not None and chunk_overlap >= chunk_size:
                continue

            print(f"\n{'='*80}")
            print(f"Testing: {embedder_name_short} | chunk_size={chunk_size} | overlap={chunk_overlap}")
            print(f"{'='*80}")

            # Bind the names up front so the `finally` clause can always release them,
            # no matter where inside the `try` a failure happens.
            embedder = None
            current_doc_ds = None
            current_query_ds = None

            try:
                # Create chunked dataset using token-based chunking
                if chunk_size is None:
                    current_doc_ds = doc_ds
                else:
                    chunker = RecursiveChunker(
                        chunk_size=chunk_size,
                        chunk_overlap=chunk_overlap,
                        embedder_model=embedder_name,
                    )
                    current_doc_ds = chunker.chunk_dataset(doc_ds, text_col='passage', id_col='id')
                    # Rename columns to match expected format
                    current_doc_ds = current_doc_ds.rename_column('doc_id', 'parent_id')
                    current_doc_ds = current_doc_ds.rename_column('text', 'passage')
                    print(f"Created {len(current_doc_ds)} chunks from {len(doc_ds)} documents")

                # Initialize embedder
                embedder = LocalEmbedder(embedder_name, device="cuda")

                # Embed documents and queries (elapsed_time covers both)
                start_time = time()
                current_doc_ds = embed_dataset(current_doc_ds, embedder, column="passage")
                current_query_ds = embed_dataset(query_ds, embedder, column="question")
                elapsed_time = time() - start_time

                # Build FAISS index.
                # NOTE(review): the metric is hard-coded to inner product here, while the
                # logged `faiss_metric` parameter is a separate variable — keep them in sync.
                current_doc_ds.add_faiss_index(
                    column='embedding',
                    string_factory='Flat',
                    metric_type=faiss.METRIC_INNER_PRODUCT,
                    batch_size=128,
                )

                # Retrieve top-RETRIEVE_K chunks/documents
                res = current_doc_ds.get_index('embedding').search_batch(
                    np.array(current_query_ds['embedding']),
                    k=RETRIEVE_K,
                )

                # Map chunk rows back to parent document IDs if chunked
                id_column = 'id' if chunk_size is None else 'parent_id'
                index_to_doc_id = np.array(current_doc_ds[id_column])
                retrieved_ids_all = index_to_doc_id[res.total_indices]

                # Deduplicate documents (several chunks may share a parent document)
                retrieved_ids_all = deduplicate_retrieved_docs(retrieved_ids_all, RETRIEVE_K)

                # Calculate metrics at different k values
                metrics = {}
                for k in METRIC_KS:
                    metrics.update(get_metrics(retrieved_ids_all[:, :k], query_ds, k))

                metrics = {
                    **{name: round(value, 4) for name, value in metrics.items()},
                    "elapsed_time": round(elapsed_time, 1),
                    "num_chunks": len(current_doc_ds),
                }

                # Parameters to track
                params = {
                    'embed_model': embedder_name,
                    'rerank_model': rerank_model,
                    'chunked': chunk_size is not None,
                    'chunk_size': chunk_size if chunk_size is not None else 'none',
                    'chunk_overlap': chunk_overlap if chunk_size is not None else 'none',
                    'faiss_metric': faiss_metric,
                }

                # Create run name
                if chunk_size is None:
                    run_name = f"{embedder_name_short}_no-chunking"
                else:
                    run_name = f"{embedder_name_short}_cs{chunk_size}_ov{chunk_overlap}"

                # Tags for filtering
                tags = {
                    'experiment_type': 'chunking',
                    'phase': 'exploration',
                    'dataset': 'bioasq-mini',
                    'embedder': embedder_name_short,
                    'chunked': str(chunk_size is not None),
                }

                # Log to MLflow
                with tracker.start_run(run_name=run_name, tags=tags):
                    tracker.log_params(params)
                    tracker.log_metrics(metrics)

                print("\nResults:")
                print(f"  P@10: {metrics.get('P@10', 0):.4f}")
                print(f"  R@10: {metrics.get('R@10', 0):.4f}")
                print(f"  MRR@10: {metrics.get('MRR@10', 0):.4f}")
                print(f"  Time: {elapsed_time:.1f}s")

                # Drop the index explicitly on the success path
                current_doc_ds.drop_index('embedding')

            except Exception as e:
                # Best effort: report the failure and move on to the next configuration.
                print(f"\nFailed: {e}")
                traceback.print_exc()

            finally:
                # Cleanup runs on success *and* failure: drop the references first,
                # otherwise empty_cache() cannot reclaim memory that is still referenced.
                del embedder
                del current_doc_ds
                del current_query_ds
                gc.collect()
                torch.cuda.empty_cache()

print("\n" + "="*80)
print("All experiments completed!")
print("="*80)


Testing: all-MiniLM-L6-v2 | chunk_size=None | overlap=0


Map: 100%|██████████| 28001/28001 [00:18<00:00, 1505.49 examples/s]
Map: 100%|██████████| 4719/4719 [00:01<00:00, 3126.43 examples/s]
100%|██████████| 219/219 [00:00<00:00, 4905.49it/s]


🏃 View run all-MiniLM-L6-v2_no-chunking at: http://localhost:5000/#/experiments/354692744483317738/runs/82816706e00a45318dbaacf717d0c83f
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2844
  R@10: 0.3734
  MRR@10: 0.6314
  Time: 35.3s

Testing: all-MiniLM-L6-v2 | chunk_size=128 | overlap=0


Chunking: 100%|██████████| 28001/28001 [00:39<00:00, 705.38it/s]


Created 109865 chunks from 28001 documents


Map: 100%|██████████| 109865/109865 [00:46<00:00, 2375.70 examples/s]
Map: 100%|██████████| 4719/4719 [00:01<00:00, 3116.75 examples/s]
100%|██████████| 859/859 [00:00<00:00, 4658.70it/s]


🏃 View run all-MiniLM-L6-v2_cs128_ov0 at: http://localhost:5000/#/experiments/354692744483317738/runs/31a78fd5de754c03bc9fde8432359c42
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3039
  R@10: 0.4021
  MRR@10: 0.6519
  Time: 63.5s

Testing: all-MiniLM-L6-v2 | chunk_size=128 | overlap=25


Chunking: 100%|██████████| 28001/28001 [00:39<00:00, 704.81it/s]


Created 121866 chunks from 28001 documents


Map: 100%|██████████| 121866/121866 [00:51<00:00, 2375.26 examples/s]
Map: 100%|██████████| 4719/4719 [00:01<00:00, 3123.39 examples/s]
100%|██████████| 953/953 [00:00<00:00, 4913.20it/s]


🏃 View run all-MiniLM-L6-v2_cs128_ov25 at: http://localhost:5000/#/experiments/354692744483317738/runs/55cd982cac7e4afc9d7aa0e619794677
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3123
  R@10: 0.4144
  MRR@10: 0.6838
  Time: 68.2s

Testing: all-MiniLM-L6-v2 | chunk_size=128 | overlap=50


Chunking: 100%|██████████| 28001/28001 [00:37<00:00, 754.33it/s]


Created 146055 chunks from 28001 documents


Map: 100%|██████████| 146055/146055 [01:01<00:00, 2362.95 examples/s]
Map: 100%|██████████| 4719/4719 [00:01<00:00, 3239.36 examples/s]
100%|██████████| 1142/1142 [00:00<00:00, 4351.06it/s]


🏃 View run all-MiniLM-L6-v2_cs128_ov50 at: http://localhost:5000/#/experiments/354692744483317738/runs/9d5b1cad6c4e49e5b2413b1ba001b0ac
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3157
  R@10: 0.4203
  MRR@10: 0.6882
  Time: 78.7s

Testing: all-MiniLM-L6-v2 | chunk_size=128 | overlap=100


Chunking: 100%|██████████| 28001/28001 [00:39<00:00, 708.96it/s]


Created 328219 chunks from 28001 documents


Map: 100%|██████████| 328219/328219 [02:18<00:00, 2362.44 examples/s]
Map: 100%|██████████| 4719/4719 [00:01<00:00, 3160.88 examples/s]
100%|██████████| 2565/2565 [00:00<00:00, 4461.08it/s]


🏃 View run all-MiniLM-L6-v2_cs128_ov100 at: http://localhost:5000/#/experiments/354692744483317738/runs/8f34b13ac47c4d399b3a204ebdad407f
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3220
  R@10: 0.4294
  MRR@10: 0.6946
  Time: 155.9s

Testing: all-MiniLM-L6-v2 | chunk_size=256 | overlap=0


Chunking: 100%|██████████| 28001/28001 [00:36<00:00, 757.08it/s]


Created 60178 chunks from 28001 documents


Map: 100%|██████████| 60178/60178 [00:34<00:00, 1726.82 examples/s]
Map: 100%|██████████| 4719/4719 [00:01<00:00, 3145.45 examples/s]
100%|██████████| 471/471 [00:00<00:00, 5332.92it/s]


🏃 View run all-MiniLM-L6-v2_cs256_ov0 at: http://localhost:5000/#/experiments/354692744483317738/runs/9a018bdc449d4efcb85a8f316244bac3
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2847
  R@10: 0.3753
  MRR@10: 0.6215
  Time: 52.0s

Testing: all-MiniLM-L6-v2 | chunk_size=256 | overlap=25


Chunking: 100%|██████████| 28001/28001 [00:37<00:00, 743.39it/s]


Created 61682 chunks from 28001 documents


Map: 100%|██████████| 61682/61682 [00:35<00:00, 1732.57 examples/s]
Map: 100%|██████████| 4719/4719 [00:01<00:00, 3084.62 examples/s]
100%|██████████| 482/482 [00:00<00:00, 5399.80it/s]


🏃 View run all-MiniLM-L6-v2_cs256_ov25 at: http://localhost:5000/#/experiments/354692744483317738/runs/d7f780d3a1194e22b643b04621ed2eaf
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2923
  R@10: 0.3854
  MRR@10: 0.6459
  Time: 52.6s

Testing: all-MiniLM-L6-v2 | chunk_size=256 | overlap=50


Chunking: 100%|██████████| 28001/28001 [00:35<00:00, 786.51it/s]


Created 64042 chunks from 28001 documents


Map: 100%|██████████| 64042/64042 [00:36<00:00, 1735.67 examples/s]
Map: 100%|██████████| 4719/4719 [00:01<00:00, 3109.52 examples/s]
100%|██████████| 501/501 [00:00<00:00, 5518.91it/s]


🏃 View run all-MiniLM-L6-v2_cs256_ov50 at: http://localhost:5000/#/experiments/354692744483317738/runs/9520a33fb47444e4acc7864784c16682
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2957
  R@10: 0.3907
  MRR@10: 0.6543
  Time: 53.9s

Testing: all-MiniLM-L6-v2 | chunk_size=256 | overlap=100


Chunking: 100%|██████████| 28001/28001 [00:35<00:00, 786.78it/s]


Created 71128 chunks from 28001 documents


Map: 100%|██████████| 71128/71128 [00:41<00:00, 1714.75 examples/s]
Map: 100%|██████████| 4719/4719 [00:01<00:00, 3193.57 examples/s]
100%|██████████| 556/556 [00:00<00:00, 4539.49it/s]


🏃 View run all-MiniLM-L6-v2_cs256_ov100 at: http://localhost:5000/#/experiments/354692744483317738/runs/4f4b491c224848eb8bae170b83a305df
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2976
  R@10: 0.3924
  MRR@10: 0.6568
  Time: 58.1s

Testing: all-MiniLM-L6-v2 | chunk_size=512 | overlap=0


Chunking: 100%|██████████| 28001/28001 [00:32<00:00, 869.58it/s]


Created 34787 chunks from 28001 documents


Map: 100%|██████████| 34787/34787 [00:22<00:00, 1541.38 examples/s]
Map: 100%|██████████| 4719/4719 [00:01<00:00, 3141.21 examples/s]
100%|██████████| 272/272 [00:00<00:00, 4940.27it/s]


🏃 View run all-MiniLM-L6-v2_cs512_ov0 at: http://localhost:5000/#/experiments/354692744483317738/runs/b6e3e6774ba14b478fcec38526fa43cf
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2768
  R@10: 0.3632
  MRR@10: 0.6057
  Time: 39.6s

Testing: all-MiniLM-L6-v2 | chunk_size=512 | overlap=25


Chunking: 100%|██████████| 28001/28001 [00:32<00:00, 865.53it/s]


Created 34803 chunks from 28001 documents


Map: 100%|██████████| 34803/34803 [00:22<00:00, 1562.08 examples/s]
Map: 100%|██████████| 4719/4719 [00:01<00:00, 3175.51 examples/s]
100%|██████████| 272/272 [00:00<00:00, 4812.90it/s]


🏃 View run all-MiniLM-L6-v2_cs512_ov25 at: http://localhost:5000/#/experiments/354692744483317738/runs/0491d52e014b4a9f9614a1e6e2706b79
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2837
  R@10: 0.3706
  MRR@10: 0.6240
  Time: 39.3s

Testing: all-MiniLM-L6-v2 | chunk_size=512 | overlap=50


Chunking: 100%|██████████| 28001/28001 [00:31<00:00, 898.14it/s]


Created 34841 chunks from 28001 documents


Map: 100%|██████████| 34841/34841 [00:22<00:00, 1557.71 examples/s]
Map: 100%|██████████| 4719/4719 [00:01<00:00, 3202.59 examples/s]
100%|██████████| 273/273 [00:00<00:00, 4927.72it/s]


🏃 View run all-MiniLM-L6-v2_cs512_ov50 at: http://localhost:5000/#/experiments/354692744483317738/runs/2cf3d355a43c4ab38032cc5b2b5a0d32
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2857
  R@10: 0.3736
  MRR@10: 0.6277
  Time: 39.1s

Testing: all-MiniLM-L6-v2 | chunk_size=512 | overlap=100


Chunking: 100%|██████████| 28001/28001 [00:32<00:00, 871.39it/s]


Created 34965 chunks from 28001 documents


Map: 100%|██████████| 34965/34965 [00:22<00:00, 1556.37 examples/s]
Map: 100%|██████████| 4719/4719 [00:01<00:00, 3105.58 examples/s]
100%|██████████| 274/274 [00:00<00:00, 4977.54it/s]


🏃 View run all-MiniLM-L6-v2_cs512_ov100 at: http://localhost:5000/#/experiments/354692744483317738/runs/2e59b4e6e22d4a439daea8eac5b17519
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2866
  R@10: 0.3738
  MRR@10: 0.6326
  Time: 39.4s

Testing: all-MiniLM-L12-v2 | chunk_size=None | overlap=0


Map: 100%|██████████| 28001/28001 [00:21<00:00, 1279.92 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2143.30 examples/s]
100%|██████████| 219/219 [00:00<00:00, 6018.88it/s]


🏃 View run all-MiniLM-L12-v2_no-chunking at: http://localhost:5000/#/experiments/354692744483317738/runs/f99c8e18fec3464fbceb8f138fc7d04c
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2696
  R@10: 0.3507
  MRR@10: 0.6049
  Time: 40.3s

Testing: all-MiniLM-L12-v2 | chunk_size=128 | overlap=0


Chunking: 100%|██████████| 28001/28001 [00:38<00:00, 728.80it/s]


Created 109865 chunks from 28001 documents


Map: 100%|██████████| 109865/109865 [01:11<00:00, 1528.46 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2104.92 examples/s]
100%|██████████| 859/859 [00:00<00:00, 4595.37it/s]


🏃 View run all-MiniLM-L12-v2_cs128_ov0 at: http://localhost:5000/#/experiments/354692744483317738/runs/e623e79452954b7094340d5d6919059c
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2818
  R@10: 0.3768
  MRR@10: 0.6169
  Time: 89.7s

Testing: all-MiniLM-L12-v2 | chunk_size=128 | overlap=25


Chunking: 100%|██████████| 28001/28001 [00:39<00:00, 702.72it/s]


Created 121866 chunks from 28001 documents


Map: 100%|██████████| 121866/121866 [01:19<00:00, 1525.90 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2137.02 examples/s]
100%|██████████| 953/953 [00:00<00:00, 4856.22it/s]


🏃 View run all-MiniLM-L12-v2_cs128_ov25 at: http://localhost:5000/#/experiments/354692744483317738/runs/33fbb046253a4611a4fc06e7159c99be
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2923
  R@10: 0.3910
  MRR@10: 0.6463
  Time: 98.0s

Testing: all-MiniLM-L12-v2 | chunk_size=128 | overlap=50


Chunking: 100%|██████████| 28001/28001 [00:38<00:00, 719.63it/s]


Created 146055 chunks from 28001 documents


Map: 100%|██████████| 146055/146055 [01:35<00:00, 1531.18 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2110.46 examples/s]
100%|██████████| 1142/1142 [00:00<00:00, 4343.97it/s]


🏃 View run all-MiniLM-L12-v2_cs128_ov50 at: http://localhost:5000/#/experiments/354692744483317738/runs/97a3b27aef814108b63f7e56614ed912
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2990
  R@10: 0.4038
  MRR@10: 0.6626
  Time: 113.2s

Testing: all-MiniLM-L12-v2 | chunk_size=128 | overlap=100


Chunking: 100%|██████████| 28001/28001 [00:38<00:00, 720.68it/s]


Created 328219 chunks from 28001 documents


Map: 100%|██████████| 328219/328219 [03:36<00:00, 1518.60 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2125.59 examples/s]
100%|██████████| 2565/2565 [00:00<00:00, 4539.15it/s]


🏃 View run all-MiniLM-L12-v2_cs128_ov100 at: http://localhost:5000/#/experiments/354692744483317738/runs/ef791783b4ad4d8ab2c5b3265e498150
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3040
  R@10: 0.4093
  MRR@10: 0.6721
  Time: 234.4s

Testing: all-MiniLM-L12-v2 | chunk_size=256 | overlap=0


Chunking: 100%|██████████| 28001/28001 [00:36<00:00, 763.81it/s]


Created 60178 chunks from 28001 documents


Map: 100%|██████████| 60178/60178 [00:42<00:00, 1404.36 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2131.10 examples/s]
100%|██████████| 471/471 [00:00<00:00, 5261.38it/s]


🏃 View run all-MiniLM-L12-v2_cs256_ov0 at: http://localhost:5000/#/experiments/354692744483317738/runs/9c95db18bcf641808d3f5b7a39104fa1
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2666
  R@10: 0.3539
  MRR@10: 0.5882
  Time: 60.7s

Testing: all-MiniLM-L12-v2 | chunk_size=256 | overlap=25


Chunking: 100%|██████████| 28001/28001 [00:36<00:00, 762.94it/s]


Created 61682 chunks from 28001 documents


Map: 100%|██████████| 61682/61682 [00:44<00:00, 1388.30 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2158.76 examples/s]
100%|██████████| 482/482 [00:00<00:00, 5278.28it/s]


🏃 View run all-MiniLM-L12-v2_cs256_ov25 at: http://localhost:5000/#/experiments/354692744483317738/runs/77e913e87a2f47649737330b5c613161
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2757
  R@10: 0.3629
  MRR@10: 0.6168
  Time: 62.6s

Testing: all-MiniLM-L12-v2 | chunk_size=256 | overlap=50


Chunking: 100%|██████████| 28001/28001 [00:36<00:00, 759.01it/s]


Created 64042 chunks from 28001 documents


Map: 100%|██████████| 64042/64042 [00:45<00:00, 1394.04 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2110.06 examples/s]
100%|██████████| 501/501 [00:00<00:00, 5285.25it/s]


🏃 View run all-MiniLM-L12-v2_cs256_ov50 at: http://localhost:5000/#/experiments/354692744483317738/runs/a0b9a5b5875b4ad38cf7453b80ba2f4f
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2809
  R@10: 0.3735
  MRR@10: 0.6287
  Time: 64.1s

Testing: all-MiniLM-L12-v2 | chunk_size=256 | overlap=100


Chunking: 100%|██████████| 28001/28001 [00:35<00:00, 783.18it/s]


Created 71128 chunks from 28001 documents


Map: 100%|██████████| 71128/71128 [00:51<00:00, 1388.43 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2078.01 examples/s]
100%|██████████| 556/556 [00:00<00:00, 4582.61it/s]


🏃 View run all-MiniLM-L12-v2_cs256_ov100 at: http://localhost:5000/#/experiments/354692744483317738/runs/cfa377354c3841acb1f78f398f64b8d4
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2867
  R@10: 0.3818
  MRR@10: 0.6365
  Time: 69.5s

Testing: all-MiniLM-L12-v2 | chunk_size=512 | overlap=0


Chunking: 100%|██████████| 28001/28001 [00:32<00:00, 864.96it/s]


Created 34787 chunks from 28001 documents


Map: 100%|██████████| 34787/34787 [00:26<00:00, 1314.03 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2090.65 examples/s]
100%|██████████| 272/272 [00:00<00:00, 4497.72it/s]


🏃 View run all-MiniLM-L12-v2_cs512_ov0 at: http://localhost:5000/#/experiments/354692744483317738/runs/5bd5bbf9867c4f73add9d4bba7cbff1d
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2590
  R@10: 0.3382
  MRR@10: 0.5727
  Time: 45.1s

Testing: all-MiniLM-L12-v2 | chunk_size=512 | overlap=25


Chunking: 100%|██████████| 28001/28001 [00:30<00:00, 914.04it/s]


Created 34803 chunks from 28001 documents


Map: 100%|██████████| 34803/34803 [00:26<00:00, 1312.91 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2118.51 examples/s]
100%|██████████| 272/272 [00:00<00:00, 4465.82it/s]


🏃 View run all-MiniLM-L12-v2_cs512_ov25 at: http://localhost:5000/#/experiments/354692744483317738/runs/a94278c7521241d78c26b7a8d8e8eb44
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2675
  R@10: 0.3481
  MRR@10: 0.5969
  Time: 44.4s

Testing: all-MiniLM-L12-v2 | chunk_size=512 | overlap=50


Chunking: 100%|██████████| 28001/28001 [00:30<00:00, 910.60it/s]


Created 34841 chunks from 28001 documents


Map: 100%|██████████| 34841/34841 [00:26<00:00, 1308.88 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2106.23 examples/s]
100%|██████████| 273/273 [00:00<00:00, 4491.96it/s]


🏃 View run all-MiniLM-L12-v2_cs512_ov50 at: http://localhost:5000/#/experiments/354692744483317738/runs/b1612df8fbb3423f9c11810be4aab73e
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2693
  R@10: 0.3495
  MRR@10: 0.6045
  Time: 44.4s

Testing: all-MiniLM-L12-v2 | chunk_size=512 | overlap=100


Chunking: 100%|██████████| 28001/28001 [00:30<00:00, 918.92it/s]


Created 34965 chunks from 28001 documents


Map: 100%|██████████| 34965/34965 [00:26<00:00, 1296.24 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2136.46 examples/s]
100%|██████████| 274/274 [00:00<00:00, 4707.60it/s]


🏃 View run all-MiniLM-L12-v2_cs512_ov100 at: http://localhost:5000/#/experiments/354692744483317738/runs/2e627c8290e04e9d983473cfb05f4d68
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.2704
  R@10: 0.3507
  MRR@10: 0.6066
  Time: 45.1s

Testing: bge-small-en-v1.5 | chunk_size=None | overlap=0


Map: 100%|██████████| 28001/28001 [00:37<00:00, 741.56 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2133.77 examples/s]
100%|██████████| 219/219 [00:00<00:00, 5246.74it/s]


🏃 View run bge-small-en-v1.5_no-chunking at: http://localhost:5000/#/experiments/354692744483317738/runs/819e79e397124873b4c0decb140b23a9
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3399
  R@10: 0.4504
  MRR@10: 0.7380
  Time: 55.7s

Testing: bge-small-en-v1.5 | chunk_size=128 | overlap=0


Chunking: 100%|██████████| 28001/28001 [00:38<00:00, 733.91it/s]


Created 109865 chunks from 28001 documents


Map: 100%|██████████| 109865/109865 [01:11<00:00, 1529.63 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2140.55 examples/s]
100%|██████████| 859/859 [00:00<00:00, 5026.01it/s]


🏃 View run bge-small-en-v1.5_cs128_ov0 at: http://localhost:5000/#/experiments/354692744483317738/runs/872d724ea12d44149578e6f46c5f6961
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3524
  R@10: 0.4707
  MRR@10: 0.7488
  Time: 90.2s

Testing: bge-small-en-v1.5 | chunk_size=128 | overlap=25


Chunking: 100%|██████████| 28001/28001 [00:38<00:00, 723.72it/s]


Created 121866 chunks from 28001 documents


Map: 100%|██████████| 121866/121866 [01:19<00:00, 1538.08 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2175.98 examples/s]
100%|██████████| 953/953 [00:00<00:00, 5188.19it/s]


🏃 View run bge-small-en-v1.5_cs128_ov25 at: http://localhost:5000/#/experiments/354692744483317738/runs/dd38a99a04a04574aa3f0a9abe8bb603
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3570
  R@10: 0.4766
  MRR@10: 0.7598
  Time: 97.3s

Testing: bge-small-en-v1.5 | chunk_size=128 | overlap=50


Chunking: 100%|██████████| 28001/28001 [00:39<00:00, 706.62it/s]


Created 146055 chunks from 28001 documents


Map: 100%|██████████| 146055/146055 [01:35<00:00, 1530.84 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2189.84 examples/s]
100%|██████████| 1142/1142 [00:00<00:00, 4553.66it/s]


🏃 View run bge-small-en-v1.5_cs128_ov50 at: http://localhost:5000/#/experiments/354692744483317738/runs/1fb91b3df74b4e9d97bfa70272cbc4c4
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3600
  R@10: 0.4815
  MRR@10: 0.7615
  Time: 113.9s

Testing: bge-small-en-v1.5 | chunk_size=128 | overlap=100


Chunking: 100%|██████████| 28001/28001 [00:38<00:00, 719.09it/s]


Created 328219 chunks from 28001 documents


Map: 100%|██████████| 328219/328219 [03:34<00:00, 1528.13 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2131.55 examples/s]
100%|██████████| 2565/2565 [00:00<00:00, 4365.48it/s]


🏃 View run bge-small-en-v1.5_cs128_ov100 at: http://localhost:5000/#/experiments/354692744483317738/runs/4090814a784040469f9d964517de730c
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3659
  R@10: 0.4897
  MRR@10: 0.7667
  Time: 233.0s

Testing: bge-small-en-v1.5 | chunk_size=256 | overlap=0


Chunking: 100%|██████████| 28001/28001 [00:37<00:00, 739.01it/s]


Created 60178 chunks from 28001 documents


Map: 100%|██████████| 60178/60178 [00:54<00:00, 1106.73 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2164.95 examples/s]
100%|██████████| 471/471 [00:00<00:00, 5090.54it/s]


🏃 View run bge-small-en-v1.5_cs256_ov0 at: http://localhost:5000/#/experiments/354692744483317738/runs/4e50e9f7166442029761ed17138720e8
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3451
  R@10: 0.4588
  MRR@10: 0.7444
  Time: 72.7s

Testing: bge-small-en-v1.5 | chunk_size=256 | overlap=25


Chunking: 100%|██████████| 28001/28001 [00:35<00:00, 779.41it/s]


Created 61682 chunks from 28001 documents


Map: 100%|██████████| 61682/61682 [00:56<00:00, 1100.47 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2170.04 examples/s]
100%|██████████| 482/482 [00:00<00:00, 5393.80it/s]


🏃 View run bge-small-en-v1.5_cs256_ov25 at: http://localhost:5000/#/experiments/354692744483317738/runs/866e9b0250874bb6a0f5fb07330751f8
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3478
  R@10: 0.4634
  MRR@10: 0.7509
  Time: 73.9s

Testing: bge-small-en-v1.5 | chunk_size=256 | overlap=50


Chunking: 100%|██████████| 28001/28001 [00:36<00:00, 761.68it/s]


Created 64042 chunks from 28001 documents


Map: 100%|██████████| 64042/64042 [00:58<00:00, 1101.85 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2138.48 examples/s]
100%|██████████| 501/501 [00:00<00:00, 5338.49it/s]


🏃 View run bge-small-en-v1.5_cs256_ov50 at: http://localhost:5000/#/experiments/354692744483317738/runs/9f244ffc9cc04002b500ac8ec017465a
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3491
  R@10: 0.4649
  MRR@10: 0.7530
  Time: 76.3s

Testing: bge-small-en-v1.5 | chunk_size=256 | overlap=100


Chunking: 100%|██████████| 28001/28001 [00:34<00:00, 803.15it/s]


Created 71128 chunks from 28001 documents


Map: 100%|██████████| 71128/71128 [01:04<00:00, 1096.74 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2126.57 examples/s]
100%|██████████| 556/556 [00:00<00:00, 4468.52it/s]


🏃 View run bge-small-en-v1.5_cs256_ov100 at: http://localhost:5000/#/experiments/354692744483317738/runs/33450d4d60ee4d32a1a9ac9adc7a629c
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3492
  R@10: 0.4648
  MRR@10: 0.7525
  Time: 82.7s

Testing: bge-small-en-v1.5 | chunk_size=512 | overlap=0


Chunking: 100%|██████████| 28001/28001 [00:32<00:00, 859.42it/s]


Created 34787 chunks from 28001 documents


Map: 100%|██████████| 34787/34787 [00:40<00:00, 860.33 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2123.90 examples/s]
100%|██████████| 272/272 [00:00<00:00, 4439.76it/s]


🏃 View run bge-small-en-v1.5_cs512_ov0 at: http://localhost:5000/#/experiments/354692744483317738/runs/71e01618c7e74a30a28fb87753b34054
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3400
  R@10: 0.4492
  MRR@10: 0.7362
  Time: 58.4s

Testing: bge-small-en-v1.5 | chunk_size=512 | overlap=25


Chunking: 100%|██████████| 28001/28001 [00:30<00:00, 911.84it/s]


Created 34803 chunks from 28001 documents


Map: 100%|██████████| 34803/34803 [00:41<00:00, 846.84 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2114.18 examples/s]
100%|██████████| 272/272 [00:00<00:00, 4664.93it/s]


🏃 View run bge-small-en-v1.5_cs512_ov25 at: http://localhost:5000/#/experiments/354692744483317738/runs/17301cdd400b4a1089bbebc4e494b6f9
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3413
  R@10: 0.4517
  MRR@10: 0.7399
  Time: 59.6s

Testing: bge-small-en-v1.5 | chunk_size=512 | overlap=50


Chunking: 100%|██████████| 28001/28001 [00:31<00:00, 891.51it/s]


Created 34841 chunks from 28001 documents


Map: 100%|██████████| 34841/34841 [00:41<00:00, 849.22 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2149.55 examples/s]
100%|██████████| 273/273 [00:00<00:00, 4579.21it/s]


🏃 View run bge-small-en-v1.5_cs512_ov50 at: http://localhost:5000/#/experiments/354692744483317738/runs/b18ca3b23d704f76bc624d88c79c9803
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3414
  R@10: 0.4519
  MRR@10: 0.7394
  Time: 59.4s

Testing: bge-small-en-v1.5 | chunk_size=512 | overlap=100


Chunking: 100%|██████████| 28001/28001 [00:30<00:00, 916.40it/s]


Created 34965 chunks from 28001 documents


Map: 100%|██████████| 34965/34965 [00:40<00:00, 854.12 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 2148.20 examples/s]
100%|██████████| 274/274 [00:00<00:00, 4637.26it/s]


🏃 View run bge-small-en-v1.5_cs512_ov100 at: http://localhost:5000/#/experiments/354692744483317738/runs/8097b642e298438e907f69c940527901
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3413
  R@10: 0.4515
  MRR@10: 0.7372
  Time: 59.1s

Testing: bge-base-en-v1.5 | chunk_size=None | overlap=0


Map: 100%|██████████| 28001/28001 [01:25<00:00, 328.43 examples/s]
Map: 100%|██████████| 4719/4719 [00:03<00:00, 1550.63 examples/s]
100%|██████████| 219/219 [00:00<00:00, 3187.36it/s]


🏃 View run bge-base-en-v1.5_no-chunking at: http://localhost:5000/#/experiments/354692744483317738/runs/65c8d4dd2e4a477dbb1f57aea00c6af1
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3502
  R@10: 0.4627
  MRR@10: 0.7494
  Time: 105.4s

Testing: bge-base-en-v1.5 | chunk_size=128 | overlap=0


Chunking: 100%|██████████| 28001/28001 [00:40<00:00, 694.54it/s]


Created 109865 chunks from 28001 documents


Map: 100%|██████████| 109865/109865 [01:53<00:00, 964.11 examples/s]
Map: 100%|██████████| 4719/4719 [00:03<00:00, 1544.20 examples/s]
100%|██████████| 859/859 [00:00<00:00, 3423.11it/s]


🏃 View run bge-base-en-v1.5_cs128_ov0 at: http://localhost:5000/#/experiments/354692744483317738/runs/8e561d5c77954317819ec3adc2337370
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3640
  R@10: 0.4844
  MRR@10: 0.7568
  Time: 134.2s

Testing: bge-base-en-v1.5 | chunk_size=128 | overlap=25


Chunking: 100%|██████████| 28001/28001 [00:37<00:00, 737.78it/s]


Created 121866 chunks from 28001 documents


Map: 100%|██████████| 121866/121866 [02:06<00:00, 966.57 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 1575.24 examples/s]
100%|██████████| 953/953 [00:00<00:00, 3342.65it/s]


🏃 View run bge-base-en-v1.5_cs128_ov25 at: http://localhost:5000/#/experiments/354692744483317738/runs/c26f4c51b91f4d519e1fa279218e47da
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3678
  R@10: 0.4910
  MRR@10: 0.7683
  Time: 145.9s

Testing: bge-base-en-v1.5 | chunk_size=128 | overlap=50


Chunking: 100%|██████████| 28001/28001 [00:39<00:00, 715.50it/s]


Created 146055 chunks from 28001 documents


Map: 100%|██████████| 146055/146055 [02:31<00:00, 964.73 examples/s]
Map: 100%|██████████| 4719/4719 [00:03<00:00, 1569.75 examples/s]
100%|██████████| 1142/1142 [00:00<00:00, 3051.94it/s]


🏃 View run bge-base-en-v1.5_cs128_ov50 at: http://localhost:5000/#/experiments/354692744483317738/runs/034fb0024f9a483482b46829385a4f49
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3708
  R@10: 0.4948
  MRR@10: 0.7692
  Time: 171.7s

Testing: bge-base-en-v1.5 | chunk_size=128 | overlap=100


Chunking: 100%|██████████| 28001/28001 [00:38<00:00, 728.27it/s]


Created 328219 chunks from 28001 documents


Map: 100%|██████████| 328219/328219 [05:41<00:00, 961.51 examples/s]
Map: 100%|██████████| 4719/4719 [00:03<00:00, 1560.73 examples/s]
100%|██████████| 2565/2565 [00:00<00:00, 3056.37it/s]


🏃 View run bge-base-en-v1.5_cs128_ov100 at: http://localhost:5000/#/experiments/354692744483317738/runs/01b2701dff4a4bb3937ea5b1409f07b1
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3750
  R@10: 0.5008
  MRR@10: 0.7745
  Time: 361.4s

Testing: bge-base-en-v1.5 | chunk_size=256 | overlap=0


Chunking: 100%|██████████| 28001/28001 [00:37<00:00, 748.42it/s]


Created 60178 chunks from 28001 documents


Map: 100%|██████████| 60178/60178 [01:27<00:00, 691.11 examples/s]
Map: 100%|██████████| 4719/4719 [00:03<00:00, 1568.54 examples/s]
100%|██████████| 471/471 [00:00<00:00, 3415.73it/s]


🏃 View run bge-base-en-v1.5_cs256_ov0 at: http://localhost:5000/#/experiments/354692744483317738/runs/b89921a4ec564a52a83a1606720b0015
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3561
  R@10: 0.4712
  MRR@10: 0.7497
  Time: 107.6s

Testing: bge-base-en-v1.5 | chunk_size=256 | overlap=25


Chunking: 100%|██████████| 28001/28001 [00:37<00:00, 753.71it/s]


Created 61682 chunks from 28001 documents


Map: 100%|██████████| 61682/61682 [01:29<00:00, 690.32 examples/s]
Map: 100%|██████████| 4719/4719 [00:03<00:00, 1565.99 examples/s]
100%|██████████| 482/482 [00:00<00:00, 3578.68it/s]


🏃 View run bge-base-en-v1.5_cs256_ov25 at: http://localhost:5000/#/experiments/354692744483317738/runs/ee19fb7f1f234c6d95b84cc54c11a2bb
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3579
  R@10: 0.4737
  MRR@10: 0.7564
  Time: 109.6s

Testing: bge-base-en-v1.5 | chunk_size=256 | overlap=50


Chunking: 100%|██████████| 28001/28001 [00:36<00:00, 761.20it/s]


Created 64042 chunks from 28001 documents


Map: 100%|██████████| 64042/64042 [01:32<00:00, 688.88 examples/s]
Map: 100%|██████████| 4719/4719 [00:03<00:00, 1568.66 examples/s]
100%|██████████| 501/501 [00:00<00:00, 3578.73it/s]


🏃 View run bge-base-en-v1.5_cs256_ov50 at: http://localhost:5000/#/experiments/354692744483317738/runs/ec525f10334f460c9769ea4a76e80cbe
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3591
  R@10: 0.4760
  MRR@10: 0.7596
  Time: 112.7s

Testing: bge-base-en-v1.5 | chunk_size=256 | overlap=100


Chunking: 100%|██████████| 28001/28001 [00:36<00:00, 776.20it/s]


Created 71128 chunks from 28001 documents


Map: 100%|██████████| 71128/71128 [01:43<00:00, 687.48 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 1581.98 examples/s]
100%|██████████| 556/556 [00:00<00:00, 2941.56it/s]


🏃 View run bge-base-en-v1.5_cs256_ov100 at: http://localhost:5000/#/experiments/354692744483317738/runs/c88ddd5cc38c45478e1a70e7c36692c6
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3609
  R@10: 0.4786
  MRR@10: 0.7584
  Time: 123.5s

Testing: bge-base-en-v1.5 | chunk_size=512 | overlap=0


Chunking: 100%|██████████| 28001/28001 [00:31<00:00, 876.90it/s]


Created 34787 chunks from 28001 documents


Map: 100%|██████████| 34787/34787 [01:30<00:00, 382.39 examples/s]
Map: 100%|██████████| 4719/4719 [00:03<00:00, 1569.26 examples/s]
100%|██████████| 272/272 [00:00<00:00, 3081.82it/s]


🏃 View run bge-base-en-v1.5_cs512_ov0 at: http://localhost:5000/#/experiments/354692744483317738/runs/b3966b06714a4645a8caa2113e2349b6
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3504
  R@10: 0.4630
  MRR@10: 0.7438
  Time: 111.1s

Testing: bge-base-en-v1.5 | chunk_size=512 | overlap=25


Chunking: 100%|██████████| 28001/28001 [00:32<00:00, 873.36it/s]


Created 34803 chunks from 28001 documents


Map: 100%|██████████| 34803/34803 [01:31<00:00, 382.29 examples/s]
Map: 100%|██████████| 4719/4719 [00:02<00:00, 1575.22 examples/s]
100%|██████████| 272/272 [00:00<00:00, 2892.83it/s]


🏃 View run bge-base-en-v1.5_cs512_ov25 at: http://localhost:5000/#/experiments/354692744483317738/runs/d4201facbc14446b922c68ef505a1488
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3513
  R@10: 0.4637
  MRR@10: 0.7487
  Time: 110.8s

Testing: bge-base-en-v1.5 | chunk_size=512 | overlap=50


Chunking: 100%|██████████| 28001/28001 [00:32<00:00, 850.42it/s]


Created 34841 chunks from 28001 documents


Map: 100%|██████████| 34841/34841 [01:31<00:00, 381.83 examples/s]
Map: 100%|██████████| 4719/4719 [00:03<00:00, 1570.78 examples/s]
100%|██████████| 273/273 [00:00<00:00, 2729.21it/s]


🏃 View run bge-base-en-v1.5_cs512_ov50 at: http://localhost:5000/#/experiments/354692744483317738/runs/20f46cfb5c974e7a94a36df66b00de63
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3515
  R@10: 0.4640
  MRR@10: 0.7491
  Time: 111.7s

Testing: bge-base-en-v1.5 | chunk_size=512 | overlap=100


Chunking: 100%|██████████| 28001/28001 [00:31<00:00, 885.90it/s]


Created 34965 chunks from 28001 documents


Map: 100%|██████████| 34965/34965 [01:31<00:00, 381.66 examples/s]
Map: 100%|██████████| 4719/4719 [00:03<00:00, 1563.58 examples/s]
100%|██████████| 274/274 [00:00<00:00, 3079.47it/s]


🏃 View run bge-base-en-v1.5_cs512_ov100 at: http://localhost:5000/#/experiments/354692744483317738/runs/be384c60df3f47f9acab81151b0609b6
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3517
  R@10: 0.4648
  MRR@10: 0.7494
  Time: 111.8s

Testing: bge-large-en-v1.5 | chunk_size=None | overlap=0


Map: 100%|██████████| 28001/28001 [03:30<00:00, 133.14 examples/s]
Map: 100%|██████████| 4719/4719 [00:06<00:00, 750.80 examples/s]
100%|██████████| 219/219 [00:00<00:00, 3239.18it/s]


🏃 View run bge-large-en-v1.5_no-chunking at: http://localhost:5000/#/experiments/354692744483317738/runs/e460e9188a49458fa35b8ab8663d178b
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3556
  R@10: 0.4733
  MRR@10: 0.7552
  Time: 234.1s

Testing: bge-large-en-v1.5 | chunk_size=128 | overlap=0


Chunking: 100%|██████████| 28001/28001 [00:39<00:00, 703.26it/s]


Created 109865 chunks from 28001 documents


Map: 100%|██████████| 109865/109865 [04:24<00:00, 415.30 examples/s]
Map: 100%|██████████| 4719/4719 [00:06<00:00, 749.33 examples/s]
100%|██████████| 859/859 [00:00<00:00, 2814.21it/s]


🏃 View run bge-large-en-v1.5_cs128_ov0 at: http://localhost:5000/#/experiments/354692744483317738/runs/9d688ec493144775bac9a0d46b061f52
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3667
  R@10: 0.4895
  MRR@10: 0.7669
  Time: 288.6s

Testing: bge-large-en-v1.5 | chunk_size=128 | overlap=25


Chunking: 100%|██████████| 28001/28001 [00:39<00:00, 707.53it/s]


Created 121866 chunks from 28001 documents


Map: 100%|██████████| 121866/121866 [04:53<00:00, 414.89 examples/s]
Map: 100%|██████████| 4719/4719 [00:06<00:00, 749.93 examples/s]
100%|██████████| 953/953 [00:00<00:00, 3065.29it/s]


🏃 View run bge-large-en-v1.5_cs128_ov25 at: http://localhost:5000/#/experiments/354692744483317738/runs/4546834d7bce474894cd9aeff210f639
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3709
  R@10: 0.4953
  MRR@10: 0.7749
  Time: 317.3s

Testing: bge-large-en-v1.5 | chunk_size=128 | overlap=50


Chunking: 100%|██████████| 28001/28001 [00:39<00:00, 712.02it/s]


Created 146055 chunks from 28001 documents


Map: 100%|██████████| 146055/146055 [05:52<00:00, 414.63 examples/s]
Map: 100%|██████████| 4719/4719 [00:06<00:00, 749.72 examples/s]
100%|██████████| 1142/1142 [00:00<00:00, 2538.86it/s]


🏃 View run bge-large-en-v1.5_cs128_ov50 at: http://localhost:5000/#/experiments/354692744483317738/runs/aa24fd7d06da475d992d696a6e8610d2
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3724
  R@10: 0.4977
  MRR@10: 0.7767
  Time: 375.9s

Testing: bge-large-en-v1.5 | chunk_size=128 | overlap=100


Chunking: 100%|██████████| 28001/28001 [00:38<00:00, 721.23it/s]


Created 328219 chunks from 28001 documents


Map: 100%|██████████| 328219/328219 [13:11<00:00, 414.77 examples/s]
Map: 100%|██████████| 4719/4719 [00:06<00:00, 752.48 examples/s]
100%|██████████| 2565/2565 [00:01<00:00, 2507.03it/s]


🏃 View run bge-large-en-v1.5_cs128_ov100 at: http://localhost:5000/#/experiments/354692744483317738/runs/f52bd7fcab7b41dfafb1ca19ca7700da
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3778
  R@10: 0.5050
  MRR@10: 0.7835
  Time: 815.6s

Testing: bge-large-en-v1.5 | chunk_size=256 | overlap=0


Chunking: 100%|██████████| 28001/28001 [00:36<00:00, 775.97it/s]


Created 60178 chunks from 28001 documents


Map: 100%|██████████| 60178/60178 [03:36<00:00, 277.97 examples/s]
Map: 100%|██████████| 4719/4719 [00:06<00:00, 745.47 examples/s]
100%|██████████| 471/471 [00:00<00:00, 2887.93it/s]


🏃 View run bge-large-en-v1.5_cs256_ov0 at: http://localhost:5000/#/experiments/354692744483317738/runs/5053eb65fb49482f893e921124874384
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3601
  R@10: 0.4800
  MRR@10: 0.7588
  Time: 240.2s

Testing: bge-large-en-v1.5 | chunk_size=256 | overlap=25


Chunking: 100%|██████████| 28001/28001 [00:36<00:00, 774.34it/s]


Created 61682 chunks from 28001 documents


Map: 100%|██████████| 61682/61682 [03:41<00:00, 277.98 examples/s]
Map: 100%|██████████| 4719/4719 [00:06<00:00, 748.44 examples/s]
100%|██████████| 482/482 [00:00<00:00, 3194.27it/s]


🏃 View run bge-large-en-v1.5_cs256_ov25 at: http://localhost:5000/#/experiments/354692744483317738/runs/2e5b8744ad4c46cba9f58be2788460f0
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3619
  R@10: 0.4823
  MRR@10: 0.7620
  Time: 245.6s

Testing: bge-large-en-v1.5 | chunk_size=256 | overlap=50


Chunking: 100%|██████████| 28001/28001 [00:36<00:00, 760.89it/s]


Created 64042 chunks from 28001 documents


Map: 100%|██████████| 64042/64042 [03:50<00:00, 277.40 examples/s]
Map: 100%|██████████| 4719/4719 [00:06<00:00, 751.62 examples/s]
100%|██████████| 501/501 [00:00<00:00, 2732.24it/s]


🏃 View run bge-large-en-v1.5_cs256_ov50 at: http://localhost:5000/#/experiments/354692744483317738/runs/577e8e44bbb64b9da29c47e819661e04
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3628
  R@10: 0.4833
  MRR@10: 0.7629
  Time: 254.5s

Testing: bge-large-en-v1.5 | chunk_size=256 | overlap=100


Chunking: 100%|██████████| 28001/28001 [00:36<00:00, 769.23it/s]


Created 71128 chunks from 28001 documents


Map: 100%|██████████| 71128/71128 [04:16<00:00, 276.81 examples/s]
Map: 100%|██████████| 4719/4719 [00:06<00:00, 745.33 examples/s]
100%|██████████| 556/556 [00:00<00:00, 2483.71it/s]


🏃 View run bge-large-en-v1.5_cs256_ov100 at: http://localhost:5000/#/experiments/354692744483317738/runs/d9f1ed5c341e46cfb88f084ed37a658e
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3634
  R@10: 0.4849
  MRR@10: 0.7646
  Time: 281.1s

Testing: bge-large-en-v1.5 | chunk_size=512 | overlap=0


Chunking: 100%|██████████| 28001/28001 [00:31<00:00, 898.60it/s]


Created 34787 chunks from 28001 documents


Map: 100%|██████████| 34787/34787 [04:00<00:00, 144.70 examples/s]
Map: 100%|██████████| 4719/4719 [00:06<00:00, 749.58 examples/s]
100%|██████████| 272/272 [00:00<00:00, 2009.25it/s]


🏃 View run bge-large-en-v1.5_cs512_ov0 at: http://localhost:5000/#/experiments/354692744483317738/runs/922fd80554c44979ad1fee2d707e6394
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3550
  R@10: 0.4734
  MRR@10: 0.7523
  Time: 263.8s

Testing: bge-large-en-v1.5 | chunk_size=512 | overlap=25


Chunking: 100%|██████████| 28001/28001 [00:31<00:00, 892.47it/s]


Created 34803 chunks from 28001 documents


Map: 100%|██████████| 34803/34803 [04:00<00:00, 144.62 examples/s]
Map: 100%|██████████| 4719/4719 [00:06<00:00, 753.56 examples/s]
100%|██████████| 272/272 [00:00<00:00, 2566.35it/s]


🏃 View run bge-large-en-v1.5_cs512_ov25 at: http://localhost:5000/#/experiments/354692744483317738/runs/5617833ecbfa4a1aa9bdd05077b8409f
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3554
  R@10: 0.4741
  MRR@10: 0.7529
  Time: 264.6s

Testing: bge-large-en-v1.5 | chunk_size=512 | overlap=50


Chunking: 100%|██████████| 28001/28001 [00:31<00:00, 901.91it/s]


Created 34841 chunks from 28001 documents


Map: 100%|██████████| 34841/34841 [04:00<00:00, 144.73 examples/s]
Map: 100%|██████████| 4719/4719 [00:06<00:00, 744.33 examples/s]
100%|██████████| 273/273 [00:00<00:00, 1875.58it/s]


🏃 View run bge-large-en-v1.5_cs512_ov50 at: http://localhost:5000/#/experiments/354692744483317738/runs/4191efa517c448ad94dc19dbe2f7c87f
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3556
  R@10: 0.4745
  MRR@10: 0.7539
  Time: 264.8s

Testing: bge-large-en-v1.5 | chunk_size=512 | overlap=100


Chunking: 100%|██████████| 28001/28001 [00:31<00:00, 900.95it/s]


Created 34965 chunks from 28001 documents


Map: 100%|██████████| 34965/34965 [04:01<00:00, 144.65 examples/s]
Map: 100%|██████████| 4719/4719 [00:06<00:00, 746.69 examples/s]
100%|██████████| 274/274 [00:00<00:00, 2100.45it/s]


🏃 View run bge-large-en-v1.5_cs512_ov100 at: http://localhost:5000/#/experiments/354692744483317738/runs/092c99f1073945ca8a0a67d3d16ac7bc
🧪 View experiment at: http://localhost:5000/#/experiments/354692744483317738

Results:
  P@10: 0.3561
  R@10: 0.4748
  MRR@10: 0.7533
  Time: 265.8s

All experiments completed!


In [5]:
# Optional: pointers for analysing the tracked runs.
# Nothing is computed in this cell; it only prints the MLflow UI address and
# the questions to keep in mind when comparing runs there.
# A single print() with sep="\n" emits each string on its own line; the
# leading "\n" on the two headings produces the blank line above each of them.
print(
    "\nView all results in MLflow UI at: http://localhost:5000",
    "\nKey questions to explore:",
    "1. Does chunking improve retrieval performance?",
    "2. What is the optimal chunk size for each embedder?",
    "3. Does overlap help? What's the optimal overlap?",
    "4. How does chunking affect embedding time?",
    sep="\n",
)


View all results in MLflow UI at: http://localhost:5000

Key questions to explore:
1. Does chunking improve retrieval performance?
2. What is the optimal chunk size for each embedder?
3. Does overlap help? What's the optimal overlap?
4. How does chunking affect embedding time?
