Chunk the documents

In [None]:
from src.preprocess.sem_chunker import SemanticChunker
import json
from tqdm import tqdm
import os
# --- Configuration ---
INPUT_FILE_PATH = "../assets/datagems/language_documents.json"
OUTPUT_FILE_PATH = "../assets/datagems/language_documents_chunked.json"
CONTENT_FIELD = "contents"

def chunk_json_file(input_path: str, output_path: str, content_field: str):
    """Reads, chunks, and writes JSON data."""        
    chunker = SemanticChunker() 

    with open(input_path, 'r', encoding='utf-8') as f_in:
        data = json.load(f_in)

    output_records = []
    for record in tqdm(data, desc="Chunking records", unit="record", leave=False):
        if isinstance(record, dict) and content_field in record:
            text_to_chunk = record[content_field]
            if isinstance(text_to_chunk, str) and text_to_chunk.strip():
                chunks = chunker.pre_process(text_to_chunk)
                for chunk_text in chunks:
                    if chunk_text.strip(): # Ensure chunk itself isn't just whitespace
                        new_record = record.copy()
                        new_record[content_field] = chunk_text
                        output_records.append(new_record)
    # Ensure output directory exists
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f_out:
        json.dump(output_records, f_out, indent=2, ensure_ascii=False)

chunk_json_file(INPUT_FILE_PATH, OUTPUT_FILE_PATH, CONTENT_FIELD)


Convert to jsonl for compatibility with the retrievers

In [2]:
import json

# Paths
input_path = "../assets/datagems/language_documents.json"
output_path = "../assets/datagems/language_documents.jsonl"

with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

with open(output_path, "w", encoding="utf-8") as f:
    if isinstance(data, list):
        for item in data:
            f.write(json.dumps(item) + "\n")
    else:
        f.write(json.dumps(data) + "\n")

input_path = "../assets/datagems/language_documents_semchunk.json"
output_path = "../assets/datagems/language_documents_semchunk.jsonl"

with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

with open(output_path, "w", encoding="utf-8") as f:
    if isinstance(data, list):
        for item in data:
            f.write(json.dumps(item) + "\n")
    else:
        f.write(json.dumps(data) + "\n")

KeyboardInterrupt: 

Defone the list of retrievers

In [None]:
import os
#disable tqdm at all
os.environ["DISABLE_TQDM"] = "1"
import tqdm  # or other modules that internally import tqdm

from src.retrieval.base import BaseRetriever, RetrievalResult
from src.retrieval.bm25 import PyseriniBM25Retriever
from src.retrieval.dense import FaissDenseRetriever
from src.retrieval.dense_rerank import DenseRetrieverWithReranker
from src.retrieval.dense_decomp import DenseRetrieverWithDecomposition
from src.retrieval.dense_decomp_rerank import DenseRetrieverWithDecompositionAndReranker
from src.feverous.feverous_evaluator import FeverousEvaluation
from src.retrieval.react import ReActRetriever
from src.retrieval.qdrant_dense import QdrantBGEDenseRetriever
from src.retrieval.qdrant_hybrid import QdrantBGEHybridRetriever
from src.retrieval.qdrant_sparse_dense import QdrantBGEDenseSparseRetriever
from typing import List
MODEL_NAME = "BAAI/bge-m3"
retriever_instances = {
            #"BM25": lambda: PyseriniBM25Retriever(),
            #"Dense": lambda: FaissDenseRetriever(model_name_or_path=MODEL_NAME),
            #"Dense+Rerank": lambda: DenseRetrieverWithReranker(embedding_model_name=MODEL_NAME),
            #"Dense+Decomp": lambda: DenseRetrieverWithDecomposition(decomposition_cache_folder="../assets/datagems/language_decomp",embedding_model_name=MODEL_NAME,ollama_model="llama3.3:70b"),
            #"Dense+Decomp+Rerank": lambda: DenseRetrieverWithDecompositionAndReranker(decomposition_cache_folder="../assets/datagems/language_decomp",embedding_model_name=MODEL_NAME),
            #"ReAct": lambda: ReActRetriever(model_path="../assets/cache/Qwen2.5-32B-Instruct-Q4_K_M.gguf",dense_model_name_or_path=MODEL_NAME)
            #"QdrantDenseOriginal": lambda: QdrantBGEDenseRetriever(collection_name="datagems_language_original"),
            #"QdrantDenseChunks": lambda: QdrantBGEDenseRetriever(collection_name="datagems_language_chunked"),
            "QdrantHybridOriginal": lambda: QdrantBGEHybridRetriever(
                collection_name="datagems_language_hybrid_chunked", # Specific collection for hybrid original data
            )
            #"QdrantHybridOriginal": lambda: QdrantBGEHybridRetriever(
            #    collection_name="datagems_language_hybrid_original", # Specific collection for hybrid original data
            #)
        }
retrievers: List[BaseRetriever] = []
for name, init_func in retriever_instances.items():
    retriever_instance = init_func()
    retrievers.append(retriever_instance)



Loading BGE model: BAAI/bge-m3 (fp16: True) for Hybrid Search...


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

BGE model loaded successfully.
Initializing Qdrant client for HYBRID search: 195.251.63.238:6334 (gRPC: True)
Qdrant client configured for collection 'datagems_language_hybrid_chunked'.


  self.client = QdrantClient(


: 

Perform the indexing with and without chunking

In [None]:
from pathlib import Path

INDEX_BASE_DIR = Path("../assets/datagems/indexes/language")
SERIALIZATION_FILENAMES = {
    #"original": "../assets/datagems/language_documents.jsonl",
    "chnked": "../assets/datagems/language_documents_semchunk.jsonl"
}
METADATA_FIELDS_TO_INDEX = ["id", "source","language"]
FIELD_TO_INDEX = "contents"

def get_output_folder(base_dir: Path, retriever_instance: BaseRetriever,serialization_name:str) -> Path:
    if isinstance(retriever_instance, (FaissDenseRetriever)):
        return base_dir / f"faiss_indexes_{MODEL_NAME}" / f"dense_{serialization_name}"
    elif isinstance(retriever_instance, PyseriniBM25Retriever):
        return base_dir / "pyserini_indexes" / f"bm25_{serialization_name}"
    else:
        return ""
    
#iterate over serialization filenames
for serialization_name, serialization_filename in SERIALIZATION_FILENAMES.items():
    for retriever_instance in retrievers:
        retriever_name = getattr(retriever_instance, '_retriever_name_id', retriever_instance.__class__.__name__)
        is_qdrant = isinstance(retriever_instance, QdrantBGEDenseRetriever)
        is_qdrant_original = is_qdrant and retriever_instance.collection_name == "datagems_language_original"
        is_qdrant_chunks = is_qdrant and retriever_instance.collection_name == "datagems_language_chunked"
        is_qdrant_hybrid_original = is_qdrant and retriever_instance.collection_name == "datagems_language_hybrid_original"
        is_qdrant_hybrid_chunks = is_qdrant and retriever_instance.collection_name == "datagems_language_hybrid_chunked"
        if serialization_name == "original" and (is_qdrant_chunks or is_qdrant_hybrid_chunks):
            print(f"-- Skipping: Indexing '{serialization_name}' data with Qdrant Chunks retriever ({retriever_name})")
            continue
        if serialization_name == "chnked" and (is_qdrant_original or is_qdrant_hybrid_original):
            print(f"-- Skipping: Indexing '{serialization_name}' data with Qdrant Original retriever ({retriever_name})")
            continue
        
        print(f"Indexing {serialization_name} with {retriever_instance.__class__.__name__}")
        output_folder = get_output_folder(INDEX_BASE_DIR, retriever_instance,serialization_name)
        retriever_instance.index(
            input_jsonl_path=serialization_filename,
            output_folder=output_folder,
            metadata_fields=METADATA_FIELDS_TO_INDEX,
            field_to_index=FIELD_TO_INDEX
        )

Indexing chnked with QdrantBGEHybridRetriever
Collection 'datagems_language_hybrid_chunked' found.
Collection 'datagems_language_hybrid_chunked' created/recreated successfully for hybrid search.
Reading data from ../assets/datagems/language_documents_semchunk.jsonl...


Processing JSONL: 0it [00:00, ?it/s]

Generating BGE-M3 hybrid embeddings for 3846805 documents...


Encoding Batches:   0%|          | 0/470 [00:00<?, ?it/s]

initial target device: 100%|██████████| 2/2 [00:08<00:00,  4.17s/it]
pre tokenize: 100%|██████████| 1/1 [00:00<00:00,  3.76it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
pre tokenize: 100%|██████████| 1/1 [00:00<00:00,  3.57it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings: 100%|██████████| 1/1 [00:06<00:00,  6.54s/it]
Inference Embeddings: 100%|██████████| 1/1 [00:06<00:00,  6.74s/it]
Chunks: 100%|██████████| 2/2 [00:19<00:00,  9.77s/it]
pre tokenize: 100%|██████████| 1/1 [00:00<00:00,  3.10it/s]
pre tokenize: 100%|██████████| 1/1 [00:00<00:00,  3.36it/s]
Inference Embeddings: 100%|██████████

: 

In [3]:
from src.datagems.language_evaluator import LanguageEvaluator
import json
import time
K = 50
evaluator = LanguageEvaluator(n_values=[1, 5, 10,20])
BENCHMARK_FILE_PATH = "../assets/datagems/language_benchmark.json"
WANDB_PROJECT = "datagems"
WANDB_ENTITY = "darelab"

with open(BENCHMARK_FILE_PATH, 'r', encoding='utf-8') as f:
    benchmark_data = json.load(f)
all_nlqs = [record.get('question') for record in benchmark_data if record.get('question')]
all_nlqs = all_nlqs[:1]  # Limit to the first 100 queries for testing
for serialization_name, serialization_filename in SERIALIZATION_FILENAMES.items():
    for retriever_instance in retrievers:
        retriever_name = getattr(retriever_instance, '_retriever_name_id', retriever_instance.__class__.__name__)
        is_qdrant = isinstance(retriever_instance, QdrantBGEDenseRetriever)
        is_qdrant_original = is_qdrant and retriever_instance.collection_name == "datagems_language_original"
        is_qdrant_chunks = is_qdrant and retriever_instance.collection_name == "datagems_language_chunked"
        if serialization_name == "original" and is_qdrant_chunks:
            print(f"-- Skipping: Indexing '{serialization_name}' data with Qdrant Chunks retriever ({retriever_name})")
            continue
        if serialization_name == "chnked" and is_qdrant_original:
            print(f"-- Skipping: Indexing '{serialization_name}' data with Qdrant Original retriever ({retriever_name})")
            continue
        print(f"Retrieving for {serialization_name} with {retriever_instance.__class__.__name__}")
        output_folder = get_output_folder(INDEX_BASE_DIR, retriever_instance,serialization_name)
        start = time.time()
        retrieved_results: List[List[RetrievalResult]] = retriever_instance.retrieve(
            nlqs=all_nlqs,
            output_folder=str(output_folder),
            k=K)
        print(retrieved_results[0])
        end = time.time()
        wandb_group = f"language_sample_{serialization_name}"
        wandb_name = retriever_instance.__class__.__name__
        evaluator.evaluate(
            BENCHMARK_FILE_PATH,
            retrieved_results,
            end-start,
            enable_wandb=True,
            project_wandb=WANDB_PROJECT,
            entity_wandb=WANDB_ENTITY,
            group_wandb=wandb_group,
            name_wandb=wandb_name,
            verbose=False
        )

Retrieving for chnked with ReActRetriever


Processing Queries (ReAct with Guidance):   0%|          | 0/1 [00:00<?, ?it/s]

[RetrievalResult(score=0.5995, object='de minéralogie et de géologie« (1841, 17. Aufl. 1886; deutsch, Stuttg. 1858)....', metadata={id: ger_de_271349, source: german_encyclopedia, language: de}), RetrievalResult(score=0.6661, object='Landshut, a city of the province of the Isar, situated  on that river, in the kingdom of Bavaria. It is in a picturesque situation,  overlooked by an ancient castle on an eminence. It contains about 600 houses,  and 8200 inhabitants. It is the seat of a university, in which are from 600 to  700 pupils, and which possesses a library of 100,000 volumes, and several  appropriate institutions for instruction in law, medicine, surgery, midwifery,...', metadata={id: eb_en_8524, source: britannica, language: en}), RetrievalResult(score=0.6029, object='Mohs, Friedrich, Mineralog, geb. 1774 zu Gernrode am Harz, studierte zu Halle, sodann auf der Bergakademie zu Freiberg und kam 1802 nach Wien, wo er die Beschreibung der Mineraliensammlung des Bankiers van der Null 

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33ma-koukouvinis[0m ([33mlakhs[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



2025-04-29 20:31:28,601 - root - ERROR - An error occurred during evaluation.
Traceback (most recent call last):
  File "/data/hdd1/users/akouk/ARM/ARM/src/datagems/language_evaluator.py", line 164, in evaluate
    gt_by_scenario, pred_by_scenario = self._load_and_prepare_data(
                                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/hdd1/users/akouk/ARM/ARM/src/datagems/language_evaluator.py", line 79, in _load_and_prepare_data
    raise ValueError(
ValueError: Mismatch between number of benchmark queries (1626) and number of retrieved result sets (1).


ValueError: Mismatch between number of benchmark queries (1626) and number of retrieved result sets (1).

In [None]:
from src.datagems.language_evaluator import LanguageEvaluator
import json
import time
K = 50
evaluator = LanguageEvaluator(n_values=[1, 5, 10,20])
BENCHMARK_FILE_PATH = "../assets/datagems/language_benchmark_sample.json"
WANDB_PROJECT = "datagems"
WANDB_ENTITY = "darelab"


with open(BENCHMARK_FILE_PATH, 'r', encoding='utf-8') as f:
    benchmark_data = json.load(f)
all_nlqs = [record.get('question') for record in benchmark_data if record.get('question')]
for serialization_name, serialization_filename in SERIALIZATION_FILENAMES.items():
    for retriever_instance in retrievers:
        retriever_name = getattr(retriever_instance, '_retriever_name_id', retriever_instance.__class__.__name__)
        is_qdrant = isinstance(retriever_instance, QdrantBGEDenseRetriever)
        is_qdrant_original = is_qdrant and retriever_instance.collection_name == "datagems_language_original"
        is_qdrant_chunks = is_qdrant and retriever_instance.collection_name == "datagems_language_chunked"
        if serialization_name == "original" and is_qdrant_chunks:
            print(f"-- Skipping: Indexing '{serialization_name}' data with Qdrant Chunks retriever ({retriever_name})")
            continue
        if serialization_name == "chnked" and is_qdrant_original:
            print(f"-- Skipping: Indexing '{serialization_name}' data with Qdrant Original retriever ({retriever_name})")
            continue
        print(f"Retrieving for {serialization_name} with {retriever_instance.__class__.__name__}")
        output_folder = get_output_folder(INDEX_BASE_DIR, retriever_instance,serialization_name)
        start = time.time()
        retrieved_results: List[List[RetrievalResult]] = retriever_instance.retrieve(
            nlqs=all_nlqs,
            output_folder=str(output_folder),
            k=K)
        end = time.time()
        wandb_group = f"language_sample_{serialization_name}"
        wandb_name = retriever_instance.__class__.__name__
        evaluator.evaluate(
            BENCHMARK_FILE_PATH,
            retrieved_results,
            end-start,
            enable_wandb=True,
            project_wandb=WANDB_PROJECT,
            entity_wandb=WANDB_ENTITY,
            group_wandb=wandb_group,
            name_wandb=wandb_name,
            verbose=False

        )   

Retrieving for chnked with ReActRetriever
Failed to load FAISS index to GPU: Error in virtual void* faiss::gpu::StandardGpuResourcesImpl::allocMemory(const faiss::gpu::AllocRequest&) at /project/faiss/faiss/gpu/StandardGpuResources.cpp:577: Error: 'err == cudaSuccess' failed: StandardGpuResources: alloc fail type FlatData dev 0 space Device stream 0x55ba3def1770 size 15756513280 bytes (cudaMalloc error out of memory [2])
. Using CPU index.


Processing Queries (ReAct with Guidance):   0%|          | 0/100 [00:00<?, ?it/s]

StitchWidget(initial_height='auto', initial_width='100%', srcdoc='<!doctype html>\n<html lang="en">\n<head>\n …

In [1]:
from src.retrieval.qdrant_dense import QdrantBGEDenseRetriever
qdrant_instance = QdrantBGEDenseRetriever(collection_name="datagems_language_chunked")
qdrant_instance.index_from_faiss("../assets/datagems/indexes/language/faiss_indexes_BAAI/bge-m3/dense_chnked")

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

Initializing Qdrant client for 195.251.63.238:6334 (gRPC: True)
Qdrant client configured.


  self.client = QdrantClient(


Loading data from Faiss folder: ../assets/datagems/indexes/language/faiss_indexes_BAAI/bge-m3/dense_chnked
Loading metadata...
Loaded 3846805 metadata entries.
Loading Faiss index...
Loaded Faiss index with 3846805 vectors of dimension 1024.
Collection 'datagems_language_chunked' not found. Creating...
Collection 'datagems_language_chunked' created.
Reconstructing vectors from Faiss index...
Reconstructed 3846805 vectors.
Upserting 3846805 points from Faiss to Qdrant collection 'datagems_language_chunked'...


Upserting Faiss data to Qdrant:   0%|          | 0/15027 [00:00<?, ?it/s]

Successfully indexed 3846805 points from Faiss folder '../assets/datagems/indexes/language/faiss_indexes_BAAI/bge-m3/dense_chnked' into Qdrant collection 'datagems_language_chunked'.


In [None]:
from src.retrieval.base import BaseRetriever, RetrievalResult

BENCHMARK_FILE_PATH = "../assets/datagems/language_benchmark.json"
import json
from typing import List
with open(BENCHMARK_FILE_PATH, 'r', encoding='utf-8') as f:
    benchmark_data = json.load(f)
all_nlqs = [record.get('question') for record in benchmark_data if record.get('question')]

retrieved_results: List[List[RetrievalResult]] = qdrant_instance.retrieve(nlqs=all_nlqs,output_folder="", k=5)

In [None]:
retrieved_results[0]