In [115]:
%load_ext autoreload
%autoreload 2

from langchain_text_splitters import RecursiveCharacterTextSplitter

from rag.embeddings import LocalEmbedder
from datasets import load_dataset, load_from_disk
from rag.config import PROJECT_ROOT
from dotenv import load_dotenv

# Load the .env file that sits at the project root (PROJECT_ROOT / '.env');
# the cell displays load_dotenv's return value.
load_dotenv(PROJECT_ROOT / '.env')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


True

In [23]:
# Load the passage dataset saved on disk, then attach the FAISS index file to
# it under the index name 'embedding'.
# NOTE(review): both paths are relative, so this depends on the notebook's
# working directory — consider building them from a configurable data dir.
ds = load_from_disk('bioasq-mini-arrow.docs')
ds.load_faiss_index('embedding', 'bioasq-mini-arrow.index')
ds

Dataset({
    features: ['passage', 'id', 'embedding'],
    num_rows: 40221
})

In [17]:
# Load the dataset saved at 'bioasq-mini-arrow.qrels' (relative path) and
# display its summary.
test_ds = load_from_disk('bioasq-mini-arrow.qrels')
test_ds

Dataset({
    features: ['question', 'answer', 'relevant_passage_ids', 'id', 'embedding'],
    num_rows: 4719
})

In [5]:
# Build the project's LocalEmbedder for the "Qwen/Qwen3-Embedding-4B" checkpoint.
embedder = LocalEmbedder("Qwen/Qwen3-Embedding-4B")

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.21it/s]


7

In [47]:
def _passage_char_length(row):
    """Return a one-field dict holding the character length of the row's passage."""
    return {"len": len(row['passage'])}


# Add a 'len' column with each passage's length in characters.
ds = ds.map(_passage_char_length)
ds

Map: 100%|██████████| 40221/40221 [00:00<00:00, 56231.20 examples/s]


Dataset({
    features: ['passage', 'id', 'embedding', 'len'],
    num_rows: 40221
})

In [90]:
import numpy as np

# argsort yields indices in ascending order of passage length; reversing that
# ordering puts the longest passages first.
ascending_order = np.argsort(ds['len'])
argids = ascending_order[::-1]
argids

array([29813, 30252, 36595, ..., 36327, 36326,  9334], shape=(40221,))

In [112]:
# Print the passage at the first index of the reversed length ordering,
# i.e. the longest passage.
print(ds[argids[0]]['passage'])

Author information:
(1)Imaging Genetics Center, Institute for Neuroimaging &Informatics, Keck School 
of Medicine of the University of Southern California, Los Angeles, California 
90292, USA.
(2)1] Imaging Genetics Center, Institute for Neuroimaging &Informatics, Keck 
School of Medicine of the University of Southern California, Los Angeles, 
California 90292, USA. [2] Neurogenetics Program, Department of Neurology, UCLA 
School of Medicine, Los Angeles, California 90095, USA.
(3)QIMR Berghofer Medical Research Institute, Brisbane 4006, Australia.
(4)1] Department of Human Genetics, Radboud university medical center, Nijmegen 
6500 HB, The Netherlands. [2] Department of Psychiatry, Radboud university 
medical center, Nijmegen 6500 HB, The Netherlands. [3] Department of Cognitive 
Neuroscience, Radboud university medical center, Nijmegen 6500 HB, The 
Netherlands. [4] Donders Institute for Brain, Cognition and Behaviour, Radboud 
University, Nijmegen 6500 GL, The Netherlands.
(5)MRC-SG

In [58]:
model = embedder.model
tokenizer = model.tokenizer
text = "Marry had a little lamb"


def count_tokens(text: str) -> int:
    """Return how many tokens `text` encodes to, excluding special tokens.

    Intended as the ``length_function`` of a text splitter. A splitter that
    measures pieces and separators one by one and adds the results up would
    count any special tokens added per ``encode`` call once per piece, which
    makes chunks look longer than they are. Counting content tokens only
    keeps the chunk-size budget meaningful.

    NOTE(review): assumes ``tokenizer`` is a Hugging Face tokenizer whose
    ``encode`` accepts ``add_special_tokens`` — confirm against LocalEmbedder.

    Args:
        text: The string to measure.

    Returns:
        Number of token ids produced for `text` without special tokens.
    """
    return len(tokenizer.encode(text, add_special_tokens=False))


count_tokens(text)

7

In [71]:
# Recursive splitter whose lengths are measured by count_tokens instead of
# characters: chunk size 256 and overlap 50, in count_tokens units.
# The separators are listed from coarsest (blank line) to finest (empty string).
chunker = RecursiveCharacterTextSplitter(
    chunk_size=256,
    chunk_overlap=50,
    length_function=count_tokens,
    separators=["\n\n", "\n", ". ", " ", ""],
)

In [114]:
# Number of chunks the splitter produces for the longest passage.
len(chunker.split_text(ds[argids[0]]['passage']))

53

In [118]:
from datasets import Dataset
from tqdm import tqdm


def chunk_documents(dataset, chunker, text_col='passage', id_col='id'):
    """Split every document in `dataset` into chunks and return them as a Dataset.

    Args:
        dataset: Sized iterable of row dicts (e.g. a `datasets.Dataset`);
            each row must contain the `text_col` and `id_col` fields.
        chunker: Object exposing `split_text(text)` that returns the chunks
            of one document in order.
        text_col: Name of the field holding the text to split.
        id_col: Name of the field holding the document identifier.

    Returns:
        A `Dataset` with one row per chunk and three columns: `text` (the
        chunk), `parent_id` (the source row's `id_col` value) and `chunk_id`
        (the chunk's 0-based position within its document).
    """
    # Build the result column-wise so it always carries the same three
    # columns, even when `dataset` is empty or no document yields a chunk.
    columns = {'text': [], 'parent_id': [], 'chunk_id': []}
    # The context manager closes the progress bar even if split_text raises.
    with tqdm(dataset, total=len(dataset), desc='Chunking') as progress:
        for doc in progress:
            text = doc[text_col]
            parent_id = doc[id_col]
            for i, chunk in enumerate(chunker.split_text(text)):
                columns['text'].append(chunk)
                columns['parent_id'].append(parent_id)
                columns['chunk_id'].append(i)
    return Dataset.from_dict(columns)

# Reuse the `chunker` already configured earlier in the notebook rather than
# building an identical second copy here, so the chunking parameters are
# defined in exactly one place.
chunked_ds = chunk_documents(ds, chunker)

100%|██████████| 40221/40221 [00:40<00:00, 985.04it/s] 


In [None]:
# NOTE(review): unexecuted scratch cell — it imports TextSplitter only to
# display it, and nothing else in the visible notebook uses the name.
# Consider removing it before sharing.
from langchain_text_splitters import TextSplitter

TextSplitter