In [1]:
# Load IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

from dotenv import load_dotenv
from tqdm import tqdm

from rag.config import settings, PROJECT_ROOT
from rag.ingestion import create_chunker
from rag.embeddings import create_embedder
from rag.retrieval import create_reranker
from rag.generation import create_llm
from rag.storage import (
    BaseDocumentStore,
    BaseVectorStore,
    Document,
    SearchResult,
    make_chunk_id,
    parse_chunk_id,
    InMemoryDocumentStore,
    FAISSVectorStore, PostgresDocumentStore,
    PgvectorVectorStore,
)

from datasets import load_dataset, Dataset

# Load variables from a .env file into the process environment (python-dotenv).
# As the last expression in the cell, the call's return value is what gets displayed.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# Build the four pipeline components from the shared `settings` object.
# The names below are reused by the later chunking, embedding, reranking and
# generation cells, so this cell has to run before any of them.
embedder = create_embedder(settings)
chunker = create_chunker(settings)
reranker = create_reranker(settings)
llm = create_llm(settings)

Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 305.69it/s]


In [3]:
# The passage corpus and the question/answer set are two configurations of the
# same dataset repository, so the repository id is named once.
BIOASQ_REPO = "rag-datasets/rag-mini-bioasq"

corpus_ds = load_dataset(BIOASQ_REPO, "text-corpus")['passages']
queries_ds = load_dataset(BIOASQ_REPO, "question-answer-passages")['test']

In [4]:
def _is_real_passage(row):
    """Return True unless the row's passage is the literal string 'nan'."""
    return row['passage'] != 'nan'


# Drop placeholder rows first, then split the remaining passages into chunks.
corpus_ds = corpus_ds.filter(_is_real_passage)
chunked_ds = chunker.chunk_dataset(
    corpus_ds,
    id_col='id',
    text_col='passage',
)

In [5]:
# Display the chunked dataset summary (feature names and row count).
chunked_ds

Dataset({
    features: ['text', 'doc_id', 'chunk_id'],
    num_rows: 64042
})

In [6]:
from rag.utils import batched

# Create the Postgres-backed document store from `settings`; the ingestion and
# retrieval cells below read and write through this object.
doc_store = PostgresDocumentStore(settings)


[2025-10-25 14:32:18] [rag.storage.document_stores.postgres] [INFO] PostgresDocumentStore initialized
[2025-10-25 14:32:18] [rag.storage.document_stores.postgres] [INFO] PostgresDocumentStore initialized


In [7]:
# Add parent documents: each full corpus passage is stored with doc_type='parent' by the ingestion loop below.

In [8]:
# Display the corpus dataset summary (feature names and row count).
corpus_ds

Dataset({
    features: ['passage', 'id'],
    num_rows: 28001
})

100%|██████████| 4719/4719 [00:00<00:00, 53785110.26it/s]


In [41]:
# Write every corpus passage to the document store as a 'parent' document,
# one batch at a time, advancing the progress bar by the batch size.
with tqdm(total=len(corpus_ds)) as progress:
    for rows in batched(corpus_ds):
        parent_docs = []
        for row in rows:
            parent_docs.append(
                Document(id=row['id'], text=row['passage'], doc_type='parent')
            )
        doc_store.add_documents(parent_docs)
        progress.update(len(rows))

28001it [00:04, 6108.98it/s]                         


In [42]:
# Write every chunk to the document store with doc_type='chunk', one batch at a
# time, advancing the progress bar by the batch size.
# NOTE(review): no parent_id is placed in `meta`; presumably the parent is
# recoverable from the chunk id itself (ids look like '17097409#0') — confirm.
with tqdm(total=len(chunked_ds)) as progress:
    for rows in batched(chunked_ds):
        chunk_docs = []
        for row in rows:
            chunk_docs.append(
                Document(id=row['chunk_id'], text=row['text'], doc_type='chunk')
            )
        doc_store.add_chunks(chunk_docs)
        progress.update(len(rows))

100%|██████████| 64042/64042 [00:10<00:00, 5909.81it/s]


In [9]:
# Embed every stored chunk and add the vectors to the pgvector store.
# `embedder` is reused from the component-setup cell near the top of the
# notebook, so create_embedder(settings) is not called a second time here.
vec_store = PgvectorVectorStore(
    settings,
)

# iter_chunks() yields batches of stored chunks; each batch is embedded in one
# call and added under the chunks' own ids so search hits map back to them.
with tqdm(total=doc_store.count_chunks()) as pbar:
    for batch in doc_store.iter_chunks():
        texts = [x.text for x in batch]
        ids = [x.id for x in batch]
        embs = embedder.embed_batch(texts)
        vec_store.add(embs, ids)
        pbar.update(len(batch))

[2025-10-25 14:32:29] [rag.storage.vector_stores.pgvector] [INFO] PgvectorVectorStore initialized (cosine distance)
[2025-10-25 14:32:29] [rag.storage.vector_stores.pgvector] [INFO] PgvectorVectorStore initialized (cosine distance)


In [10]:
# Preview one example from the query set: its question and its answer.
i = 13
example = queries_ds[i]
question = example['question']

print('Question:', question)
print('Answer:', example['answer'])

Question: Which are the major characteristics of cellular senescence?
Answer: The defining characteristics of cellular senescence are altered morphology, arrested cell-cycle progression, development of aberrant gene expression with proinflammatory behavior, and telomere shortening.


In [11]:
# Pick the query to run through the pipeline; `question` is reused by the
# reranking and generation cells below.
i = 10
question = queries_ds[i]['question']
rule = '=' * 80

print('Question:', question)
print(rule)

# Embed the question, fetch the k nearest chunks, and print the text of the
# first five hits separated by a rule.
q_emb = embedder.embed_text(question)
initial_results = vec_store.search(q_emb, k=settings.k)
for hit in initial_results[:5]:
    chunk = doc_store.get_chunk(hit.chunk_id)
    print(chunk.text)
    print(rule)

Question: Name synonym of Acrokeratosis paraneoplastica.
removed but recurred several times, with acrokeratosis paraneoplastica showing a 
parallel development. We, therefore, add liposarcoma to the growing list of 
malignant neoplasms associated with acrokeratosis paraneoplastica.
PURPOSE: Obligatory cutaneous paraneoplastic disorders comprising acanthosis 
nigricans maligna, erythema gyratum repens, paraneoplastic pemphigus, 
hypertrichosis lanuginosa acquisita, erythema necrolyticum migrans and 
acrokeratosis paraneoplastica are rare. However, as markers of an underlying 
internal malignancy they are of utmost importance for the patient. Acrokeratosis 
paraneoplastica (first described by Gougerot and Rupp in 1922) was named after 
Bazex who had then reported several cases in a French dermatological journal 
since 1965 (Bazex et al. in Bull Soc Fr Dermatol Syphiligr 72:182, 1965; Bazex 
and Griffiths in Br J Dermatol 102:301-306, 1980).
METHOD: The study is a clinical case of a patie

In [12]:
# Resolve each vector-search hit to its stored chunk, in hit order, so the
# reranker receives the chunk documents rather than bare ids.
retrived_docs = []
for hit in initial_results:
    retrived_docs.append(doc_store.get_chunk(hit.chunk_id))

# Rerank against the question and keep the first `settings.top_k` documents;
# the trailing expression displays them.
ranked = reranker.rerank(question, retrived_docs)
reranked_docs = ranked[:settings.top_k]
reranked_docs

[Document(id='17097409#0', text="Acrokeratosis paraneoplastica (Bazex's syndrome) is a rare obligate \nparaneoplastic dermatosis characterized by erythematosquamous lesions localized \nsymmetrically at the acral sites. The condition almost exclusively affects \nCaucasian men older than 40 years. It is usually associated with primary \nmalignant neoplasms of the upper aerodigestive tract. In most cases, the skin \nchanges precede the clinical manifestation of the underlying neoplasm. The \ndermatosis can be cured only by removal of the underlying carcinoma. We describe \na case of acrokeratosis paraneoplastica associated with a retroperitoneal \nliposarcoma in a 71-year-old Caucasian man. The liposarcoma was surgically \nremoved but recurred several times, with acrokeratosis paraneoplastica showing a \nparallel development. We, therefore, add liposarcoma to the growing list of", score=6.13846492767334, doc_type='chunk', meta={}),
 Document(id='6225397#0', text="Acrokeratosis paraneoplas

In [53]:
# System prompt for the passage-grounded answering step.
# It uses one vocabulary throughout: each CONTEXT entry is described by the
# keys `text`, `score` (higher = more relevant) and `id`, and citations always
# take the form [id=...]. Earlier wording mixed `distance`/`index`/`idx`/`doc_id`
# with these keys, which gave the model contradictory ranking and citation rules.
system_prompt = """You are a domain-careful, passage-bound assistant. You will be given:

CONTEXT: a list of dictionaries, each with keys:

text (string) — the passage content (use this only).

score (float) — Higher means more relevant. Use for tie-breaking, not as ground truth.

id (int or str) — unique identifier for citation (cite this).

QUESTION: the user’s query.

Ground Rules

Use Only the Provided Passages

All factual claims must come solely from text fields within CONTEXT.

Ignore any prior knowledge and external facts.

Cite by id

After each factual claim or at the end of a sentence/ bullet, cite like [id=2025].

If multiple passages support a claim, cite the strongest 1–3 (prefer higher score).

Be Concise, Direct, and Structured

Lead with a 1–3 sentence answer.

Use bullets for lists, mechanisms, pros/cons, steps, etc.

Include brief definitions only if needed to answer.

Rank & Filter Passages Sensibly

Prefer passages with higher score, high topical match, and specific details.

De-duplicate overlapping content; don’t over-cite.

If passages conflict and can’t be resolved, state the disagreement and present both sides with citations.

No Fabrication

Do not invent numbers, dates, mechanisms, or terminology not explicitly present in text.

When Information Is Insufficient

Say: “I don’t have enough information in the provided passages to answer.”

Optionally list what’s missing (e.g., “mechanism”, “dates”, “definitions”).

Biomedical/Technical Care (if applicable)

Distinguish hypotheses vs. established findings when the wording is tentative.

Avoid over-generalization beyond what’s stated.

If species/setting (rodent vs. human, in vitro vs. in vivo) isn’t specified in the passages, don’t assume.

Working Steps (internal)

Parse CONTEXT; extract only text and note each item’s id and score.

Identify passages most relevant to the QUESTION (favor higher score).

Synthesize the answer strictly from the chosen passages.

Add minimal, targeted citations using [id=…].

If conflicts remain unresolved, present both views briefly.

Input Format (dictionaries may carry extra keys; ignore them)
======================== CONTEXT ================================
[{'id': <int or str>, 'text': '<passage 1 text>', 'score': <float>},
 {'id': <int or str>, 'text': '<passage 2 text>', 'score': <float>},
 ...
]
====================== QUESTION: <user question> ================

Output Format (default)
<Concise answer (1–3 sentences).>

- <Key point 1>. [id=33378]
- <Key point 2>. [id=37076, id=33378]

Citations: [id=33378], [id=37076]

Insufficient Information
I don’t have enough information in the provided passages to answer. I would need <briefly state what is missing>.
Citations: —

Optional (if you want a quick audit trail)

After the answer (keep it short), you may append:

Relevance notes (brief):
- Used id=37076 (higher score, direct on H3 inverse agonism).
- Used id=33378 (mechanistic distribution & function).
- Skipped id=5234/2025/16517 (off-topic for QUESTION).
"""

# Serialise the reranked documents and embed them, together with the question,
# in the CONTEXT / QUESTION layout used by the user prompt.
context_payload = [doc.to_dict() for doc in reranked_docs]

prompt = f"""
======================== CONTEXT ================================
{context_payload}

====================== QUESTION: {question} ================"""

# Near-zero temperature with a 1000-token cap on the generated answer.
generation_kwargs = dict(max_tokens=1000, temperature=0.01)
answer = llm.generate(
    prompt=prompt,
    system_prompt=system_prompt,
    **generation_kwargs,
)

print(answer)

Papilin is a secreted protein. [idx=3320045#0]

- Papilin was isolated from the culture media of Drosophila Kc cells, indicating it is secreted. [idx=3320045#0]

- The core protein of papilin is formed by some cell lines and by other cells on incubation with 1 mM 4-methylumbelliferyl xyloside, which suggests it can be secreted. [idx=3320045#0]

Citations: [idx=3320045#0]
