In [1]:
import requests
from loguru import logger
from pathlib import Path
import os
import uuid
from typing import Generator
import numpy as np
import simplejson as json
import torch
from justatom.tooling.dataset import DatasetRecordAdapter
from justatom.etc.schema import Document
from more_itertools import chunked
import json_repair
import polars as pl

from justatom.storing.weaviate import Finder as WeaviateApi

from tqdm import tqdm

### ✔️ ANN document store backed by <a href="https://github.com/weaviate/weaviate">Weaviate</a>

 > First, make sure Docker is up and running. From the project root, run:
```bash
docker-compose up -d
```

❗️ By default, Weaviate runs on port `2211`.

In [2]:
# Name of the Weaviate collection this notebook works with.
collection_name = "JUSTATOM_COLLECTION_v2"
# Connection settings for the Weaviate instance: host, port and gRPC port
# (passed to the finder as WEAVIATE_HOST / WEAVIATE_PORT / WEAVIATE_GRPC_PORT).
weaviate_host, weaviate_port, weaviate_grpc_port = "localhost", 2211, 50051

In [3]:
# Obtain the document-store handle for `collection_name` on the configured Weaviate instance.
# NOTE: top-level `await` only works inside a notebook/IPython session with a running event loop.
store = await WeaviateApi.find(collection_name, WEAVIATE_HOST=weaviate_host, WEAVIATE_PORT=weaviate_port, WEAVIATE_GRPC_PORT=weaviate_grpc_port)

[32m2026-02-16 06:51:33.522[0m | [1mINFO    [0m | [36mjustatom.storing.weaviate[0m:[36mfind[0m:[36m799[0m - [1mFINDER | collection_name=[JUSTATOM_COLLECTION_v2][0m
[32m2026-02-16 06:51:33.524[0m | [1mINFO    [0m | [36mjustatom.storing.weaviate[0m:[36mconnect[0m:[36m223[0m - [1mFINDER | collection_schema_name=[JUSTATOM_COLLECTION_v2][0m
[32m2026-02-16 06:51:33.524[0m | [1mINFO    [0m | [36mjustatom.storing.weaviate[0m:[36mconnect[0m:[36m223[0m - [1mFINDER | collection_schema_name=[JUSTATOM_COLLECTION_v2][0m
            Consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


In [4]:
n_docs = await store.count_documents()

In [5]:
logger.info(f"For the collection=[{collection_name}] you have N=[{n_docs}] documents")

[32m2026-02-16 06:51:36.107[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mFor the collection=[JUSTATOM_COLLECTION_v2] you have N=[1184] documents[0m


In [6]:
# List the collections known to the Weaviate instance (used below to log their names).
# NOTE(review): this reaches into the store's private `_client` attribute; prefer a public
# accessor if the store exposes one — TODO confirm.
all_collections = await store._client.collections.list_all(simple=True)

In [7]:
logger.info(f"COLLECTION | [{', '.join(all_collections)}]")

[32m2026-02-16 06:51:37.463[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mCOLLECTION | [Evalvanillalarge, Justatom_collection_v2, Polaroidsaibase, Justatom_collection][0m


### ✔️ Prepare dataset

 > For this tutorial we use the built-in dataset `polaroids.ai` — snippets from books, movies, and games.

 > ✅ For large datasets, prefer streaming (`lazy=True`) and avoid materializing the whole file in memory.

In [16]:
dataset_name_or_path = Path(os.getcwd()) / ".data" / "polaroids.ai.data.json"

In [17]:
pl_docs = pl.read_json(dataset_name_or_path)

In [18]:
present_columns = pl_docs.columns

In [19]:
present_columns

['title',
 'author',
 'type',
 'has_image',
 'img_path',
 'speaker',
 'keywords_or_phrases',
 'chunk_id',
 'content',
 'queries',
 'answers',
 'are_contexts_present',
 'dialogue_en',
 'dialogue_description',
 'dialogue_speakers_en']

> ❗️ `content` is required. `id` is optional and auto-generated if missing.

 > In this tutorial we also map `chunk_id` to `id` and keep metadata lean for stable Weaviate writes.

> ❗️ Filter out rows with `null` in required fields before indexing; otherwise the pipeline may fail at write time.

In [21]:
content_col = "content"
chunk_id_col = "chunk_id"
title_col = "title"
labels_col = "queries"

In [22]:
# Keep only rows where both required fields (`content` and `chunk_id`) are non-null.
pl_docs = pl_docs.filter((pl.col(content_col).is_not_null()) & (pl.col(chunk_id_col).is_not_null()))

In [23]:
logger.info(f"There are D=[{pl_docs.shape[0]}] unique documents")

[32m2026-02-16 06:55:20.690[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mThere are D=[4992] unique documents[0m


> ❗️ We keep `keywords_or_phrases`, relevant `queries`, and original `chunk_id` per chunk.

 > ✅ For robust indexing, use `preserve_all_fields=False` so noisy source fields are not blindly propagated to `meta`.

<small>

|  queries (list[str])  |     content: str     |   chunk_id: str   |
|:---------------------:|:--------------------:|:-----------------:|
| 1. ...thinking about 'The Hunger Games' mechanics, if you were in the same shoes as Gale, entering your name forty-two times to feed your fam, how would you strategize your game in the actual Arena? Would you team up or go solo based on these high stakes? <br><br>2.In the universe of 'The Hunger Games', what are tesserae and what do they offer to the participants in the Harvest?    | And here's where the real interest begins. Suppose you're poor and starving. Then you can ask to be included in the Harvest more times than you're entitled to, and in return you'd get tesserae. They give you grain and oil for a whole year for one tessera per person. You won't be full, but it's better than nothing. You can take tesserae for the whole family. When I was twelve, my name was entered four times. Once by law, and once more for tesserae for Prim, my mother, and myself. The next years had to do the same. And since the price of a tessera increases by one entry each year, now that I've turned sixteen, my name will be on twenty cards. Gale is eighteen, and he's been feeding a family of five for seven years. His name will be entered forty two times! It's clear that people like Madge, who has never had to risk because of tesserae, annoy Gale. Next to us, the inhabitants of the slag heap, she simply has no chance of getting into the games. Well, almost no chance. Of course, the rules are set by the Capitol, not the districts, let alone Madge's relatives, and it's still hard to sympathize with those who, like you, don't have to trade their own skin for a piece of bread.  | 80504cd8-9b21-514c-b001-4761d8c71044         |
|-----------------------|----------------------|-------------------|
| 1.In 'Harry Potter and the Philosopher's Stone', what misconception had Harry and Hermione initially had about Snape's intentions before learning the truth? <br><br>2. Hey peeps, why is Harry all jittery and pacing around the room even after telling Hermione about the whole Snape and Voldemort situation?        | Ron was asleep in the common room - apparently, he had been waiting for their return and had dozed off unnoticed. When Harry roughly shook him, Ron began to yell something about breaking the rules of a game, as if he were dreaming about a Quidditch match. However, after a few seconds, Ron completely woke up and, with his eyes wide open, listened to the story of Hermione and Harry. Harry was so excited that he could not sit still and paced back and forth across the room, trying to stay as close to the fireplace as possible. He was still shaking with cold. 'Snape wants to steal the stone for Voldemort. And Voldemort is waiting in the forest... And all this time we thought Snape wanted to steal the stone to become rich... And Voldemort...'  | 5ad25a92-28d9-5971-a81b-4f795898eeab         |
|-----------------------|----------------------|-------------------|
| 1. Hey fellow gamers, in The Hunger Games universe, if you were in a match where your ally was taken down first like Rue, how would you strategize your next move to survive against top opponents like Cato?<br><br> 2. In the 'Hunger Games' novel, why does Cato decide to spare Katniss's life after their encounter?    | What was she babbling about? You're Rue's ally? - I... I... we teamed up. We blew up the food of the Pros. I wanted to save her. Really did. But he found her first, the guy from District One - I say. Perhaps if Cato knows I helped Rue, he will kill me quickly and painlessly. - Did you kill him? - he asks grimly. - Yes. I killed him. And I covered her body with flowers. I sang to her till she fell asleep. Tears well up in my eyes. Will and strength are leaving me. There's only Rue, the pain in my head, fear of Cato and the moan of the dying girl. - Fell asleep? - mocks Cato. - Died. I sang to her till she died - I say. - Your district... sent me bread. I raise my hand - not for an arrow; I won't have time anyway. I just blow my nose. - Cato, make it quick, okay? His face shows conflicting emotions. Cato puts down the rock and says with almost a reproach: - This time, only this time, I'm letting you go. For the girl. We are even. No one owes anything to anyone anymore, understand? I nod, because I do understand. Understand about debts. About how bad it is to have them. Understand that if Cato wins, he will return to a district that has forgotten the rules to thank me. And Cato is neglecting them, too. Right now, he's not going to crack my head with a stone.  | b317200c-7fd3-5804-bbe4-bff33432ad0e         |
|-----------------------|----------------------|-------------------|

</small>

In [46]:
# Build the adapter that turns raw dataset rows into the document records used for indexing.
ds_adapter_for_index = DatasetRecordAdapter.from_source(
    dataset_name_or_path=dataset_name_or_path,
    lazy=True,  # stream records instead of materializing the whole file in memory
    content_col=content_col,
    queries_col=labels_col,  # queries appear under `meta["labels"]` in the emitted record
    chunk_id_col=chunk_id_col,  # original chunk id; per the tutorial notes it is mapped to `id`
    dataframe_col=title_col,  # the title is emitted as the record's `dataframe` field
    keywords_col="keywords_or_phrases",
    preserve_all_fields=False,  # do not blindly propagate remaining source fields to `meta`
    filter_fields=[content_col, chunk_id_col],  # presumably rows with nulls in these fields are skipped — TODO confirm
)

In [47]:
next(ds_adapter_for_index.iterator())

{'content': "Inside was the first beautiful thing I'd seen in District 13: a replica of a meadow, filled with real trees and blooming plants, and plenty of hummingbirds fluttering about. Beetee was sitting motionless in a wheelchair in the middle of the meadow, watching a tender green bird hovering in the air and drinking nectar from a large blossom of an orange tree.",
 'content_type': 'text',
 'dataframe': '–°–æ–π–∫–∞-–ø–µ—Ä–µ—Å–º–µ—à–Ω–∏—Ü–∞',
 'keywords': None,
 'score': None,
 'meta': {'labels': ["What elements of nature were present in the artificial meadow created in District 13 in the 'Mockingjay' universe?",
   "In the 'Mockingjay' novel, who is observed sitting in a wheelchair in the middle of the recreated meadow in District 13, and what are they watching?",
   'Hey fellow tributes! üèπ Did you notice Beetee in a wheelchair in that peaceful meadow in District 13? What do you think caused him to be in that condition? #Mockingjay',
   "What's up, book lovers? üìö Just read a

### Modeling

 > See <a href="https://huggingface.co/intfloat/multilingual-e5-large">E5 large</a>, <a href="https://huggingface.co/intfloat/multilingual-e5-base">E5 base</a>, and <a href="https://huggingface.co/intfloat/multilingual-e5-small">E5 small</a>.

 > 📎 <a href="https://arxiv.org/abs/2212.03533">Paper</a>

 > ❗️ For this tutorial we use `intfloat/multilingual-e5-base` as a trade-off between quality and speed.

In [27]:
model_name_or_path = "intfloat/multilingual-e5-base"

from justatom.modeling.mask import ILanguageModel
from justatom.running.encoders import EncoderRunner
from justatom.processing import RuntimeProcessor, ITokenizer

lm_model = ILanguageModel.load(model_name_or_path)

  warn(
[32m2026-02-16 06:57:12.003[0m | [1mINFO    [0m | [36mjustatom.modeling.mask[0m:[36mload[0m:[36m149[0m - [1mLoading from huggingface hub via "intfloat/multilingual-e5-base"[0m


In [28]:
def maybe_cuda_or_mps():
    """Pick the torch device string to run on.

    Returns:
        "mps" when the MPS backend is usable on this machine, otherwise "cuda:0"
        when CUDA is available, otherwise "cpu".
    """
    # `is_built()` only reports that PyTorch was compiled with MPS support;
    # `is_available()` also checks that the current machine can actually use it.
    if torch.backends.mps.is_available():
        return "mps"
    if torch.cuda.is_available():
        return "cuda:0"
    return "cpu"

In [29]:
device = maybe_cuda_or_mps()

In [62]:
runner = EncoderRunner(model=lm_model, prediction_heads=[], device=device)

[32m2026-02-16 07:16:52.840[0m | [1mINFO    [0m | [36mjustatom.running.encoders[0m:[36mto[0m:[36m36[0m - [1mMoving to device cuda:0[0m


In [31]:
processor = RuntimeProcessor(ITokenizer.from_pretrained(model_name_or_path))



❗️ According to the <a href="https://arxiv.org/abs/2212.03533">paper</a>, the E5 family is trained in an asymmetric way, meaning:

 > Use `"query: "` and `"passage: "` respectively for asymmetric tasks such as passage retrieval in open QA, ad-hoc information retrieval.

> Use `"query: "` prefix for symmetric tasks such as semantic similarity, bitext mining, paraphrase retrieval.

> Use `"query: "` prefix if you want to use embeddings as features, such as linear probing classification, clustering.

In [35]:
processor.prefix = "passage: "

> Let's put everything together in one abstraction — `Indexer`.

In [34]:
from justatom.running.indexer import API as IndexerAPI

# Arguments of the indexer:
# 1. "embedding" selects how the ANN store (Weaviate) is indexed.
# 2. `runner` is responsible for mapping docs to embeddings.
# 3. `processor` is responsible for tokenizing the given chunks.
# 4. `device` is where everything is computed.

ix_runner = IndexerAPI.named("embedding", runner=runner, store=store, processor=processor, device=device)

[32m2026-02-16 06:57:30.193[0m | [1mINFO    [0m | [36mjustatom.running.encoders[0m:[36mto[0m:[36m36[0m - [1mMoving to device cuda:0[0m


In [48]:
# Index the streamed records into the store.
# NOTE(review): presumably `batch_size` is the encoder batch size and `batch_size_per_request`
# the number of documents sent per store request — TODO confirm against the indexer API.
n_added_docs = await ix_runner.index(ds_adapter_for_index.iterator(), batch_size=32, batch_size_per_request=32)

0it [00:00, ?it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

1it [00:00,  2.58it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

2it [00:00,  3.57it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

3it [00:00,  4.10it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

4it [00:00,  4.40it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

5it [00:01,  4.56it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

6it [00:01,  4.70it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

7it [00:01,  4.79it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

8it [00:01,  4.83it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

9it [00:02,  4.88it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

10it [00:02,  4.91it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

11it [00:02,  4.94it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

12it [00:02,  4.92it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

13it [00:02,  4.95it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

14it [00:03,  4.92it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

15it [00:03,  4.95it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

16it [00:03,  4.93it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

17it [00:03,  4.94it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

18it [00:03,  4.92it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

19it [00:04,  4.95it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

20it [00:04,  4.95it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

21it [00:04,  4.92it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

22it [00:04,  4.94it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

23it [00:04,  4.95it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

24it [00:05,  4.96it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

25it [00:05,  4.95it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

27it [00:05,  6.61it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

28it [00:05,  6.15it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

29it [00:05,  5.88it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

30it [00:06,  5.63it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

31it [00:06,  5.43it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

32it [00:06,  5.32it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

33it [00:06,  5.20it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

34it [00:06,  5.16it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

35it [00:07,  5.12it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

36it [00:07,  5.09it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

37it [00:07,  5.08it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

38it [00:07,  5.00it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

39it [00:07,  4.98it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

40it [00:08,  4.99it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

41it [00:08,  5.01it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

42it [00:08,  5.02it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

43it [00:08,  5.04it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

44it [00:08,  5.04it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

45it [00:09,  4.96it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

46it [00:09,  4.94it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

47it [00:09,  4.92it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

48it [00:09,  4.87it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

49it [00:09,  4.84it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

50it [00:10,  4.85it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

51it [00:10,  4.81it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

52it [00:10,  4.78it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

53it [00:10,  4.77it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

54it [00:10,  4.76it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

55it [00:11,  4.75it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

56it [00:11,  4.80it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

57it [00:11,  4.80it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

58it [00:11,  4.86it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

59it [00:11,  4.85it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

60it [00:12,  4.82it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

61it [00:12,  4.85it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

62it [00:12,  4.84it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

63it [00:12,  4.80it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

64it [00:12,  4.78it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

65it [00:13,  4.74it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

66it [00:13,  4.70it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

67it [00:13,  4.70it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

68it [00:13,  4.59it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

69it [00:14,  4.62it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

70it [00:14,  4.59it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

71it [00:14,  4.64it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

72it [00:14,  4.64it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

73it [00:14,  4.63it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

74it [00:15,  4.60it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

75it [00:15,  4.57it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

76it [00:15,  4.59it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

77it [00:15,  4.61it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

78it [00:16,  4.55it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

79it [00:16,  4.55it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

80it [00:16,  4.57it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

81it [00:16,  4.57it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

82it [00:16,  4.57it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

83it [00:17,  4.59it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

84it [00:17,  4.63it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

85it [00:17,  4.55it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

86it [00:17,  4.57it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

87it [00:17,  4.60it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

88it [00:18,  4.64it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

89it [00:18,  4.61it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

90it [00:18,  4.61it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

91it [00:18,  4.53it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

92it [00:19,  4.53it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

93it [00:19,  4.54it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

94it [00:19,  4.50it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

95it [00:19,  4.48it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

96it [00:19,  4.53it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

97it [00:20,  4.42it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

98it [00:20,  4.37it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

99it [00:20,  4.42it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

100it [00:20,  4.46it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

101it [00:21,  4.45it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

102it [00:21,  4.46it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

103it [00:21,  4.51it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

104it [00:21,  4.51it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

105it [00:21,  4.53it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

106it [00:22,  4.51it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

107it [00:22,  4.51it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

108it [00:22,  4.49it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

109it [00:22,  4.48it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

110it [00:23,  4.45it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

111it [00:23,  4.38it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

112it [00:23,  4.38it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

113it [00:23,  4.38it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

114it [00:24,  4.44it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

115it [00:24,  4.44it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

116it [00:24,  4.48it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

117it [00:24,  4.43it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

118it [00:24,  4.47it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

119it [00:25,  4.39it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

120it [00:25,  4.44it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

121it [00:25,  4.38it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

122it [00:25,  4.43it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

123it [00:26,  4.43it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

124it [00:26,  4.41it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

125it [00:26,  4.41it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

126it [00:26,  4.41it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

127it [00:26,  4.41it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

128it [00:27,  4.36it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

129it [00:27,  4.40it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

130it [00:27,  4.39it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

131it [00:27,  4.42it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

132it [00:28,  4.41it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

133it [00:28,  4.39it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

134it [00:28,  4.36it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

135it [00:28,  4.34it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

136it [00:29,  4.34it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

137it [00:29,  4.34it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

138it [00:29,  4.37it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

139it [00:29,  4.39it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

140it [00:29,  4.27it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

141it [00:30,  4.28it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

142it [00:30,  4.26it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

143it [00:30,  4.25it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

144it [00:30,  4.25it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

145it [00:31,  4.25it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

146it [00:31,  4.25it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

147it [00:31,  4.17it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

148it [00:31,  4.19it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

149it [00:32,  4.19it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

150it [00:32,  4.18it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

151it [00:32,  4.20it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

152it [00:32,  4.20it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

153it [00:33,  4.17it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

154it [00:33,  4.21it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

155it [00:33,  4.23it/s]

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

156it [00:33,  4.62it/s]
156it [00:33,  4.62it/s]


❗️ According to the <a href="https://arxiv.org/abs/2212.03533">paper</a>, E5 is trained asymmetrically, so for retrieval we switch the prefix from `passage: ` to `query: `.

In [49]:
processor.prefix = "query: "

In [50]:
# Sample queries; they correspond to the queries shown for the three example chunks in the table above.
# `* 1` is a repetition factor that leaves the list unchanged
# (presumably kept so the query load can be scaled up — TODO confirm).
queries = [
    "thinking about 'The Hunger Games' mechanics, if you were in the same shoes as Gale, entering your name forty-two times to feed your fam, how would you strategize your game in the actual Arena? Would you team up or go solo based on these high stakes?",
    "In the universe of 'The Hunger Games', what are tesserae and what do they offer to the participants in the Harvest?",
    "In 'Harry Potter and the Philosopher's Stone', what misconception had Harry and Hermione initially had about Snape's intentions before learning the truth?",
    "Hey peeps, why is Harry all jittery and pacing around the room even after telling Hermione about the whole Snape and Voldemort situation?",
    "Hey fellow gamers, in The Hunger Games universe, if you were in a match where your ally was taken down first like Rue, how would you strategize your next move to survive against top opponents like Cato?",
    "In the 'Hunger Games' novel, why does Cato decide to spare Katniss's life after their encounter?"
] * 1

In [51]:
from justatom.running.retriever import API as RetrieverApi


def _first_document(response):
    if not isinstance(response, list) or len(response) == 0:
        return None
    first = response[0]
    if isinstance(first, list):
        return first[0] if len(first) > 0 else None
    return first

#### Pure keyword search

In [52]:
# Keyword retriever backed by the connected `store`.
# The name `retriever` is rebound by the embedding and hybrid sections further down.
retriever = RetrieverApi.named("keywords", store=store)

In [53]:
# Metadata filter for the keyword search: keep only documents whose
# `dataframe` field equals the Russian title "Сойка-пересмешница" ("Mockingjay").
# The literal was previously stored as mojibake (UTF-8 bytes decoded with a
# legacy single-byte codec), which cannot match a correctly encoded value.
# NOTE(review): presumably `dataframe` holds the source-book title -- confirm
# against the indexing step.
filters = {
    "operator": "AND",
    "conditions": [
        {
            "field": "dataframe",
            "operator": "==",
            "value": "Сойка-пересмешница",
        }
    ],
}

In [42]:
# Sanity check: document count reported by the store (called without any filter).
await store.count_documents()

4992

In [54]:
for pos, query in enumerate(queries):
    response = await retriever.retrieve_topk(query, top_k=1, filters=filters)
    doc = _first_document(response)
    print(doc.content if doc is not None else "<EMPTY>")
    if pos < len(queries) - 1:
        print("\n")

[32m2026-02-16 07:15:06.876[0m | [1mINFO    [0m | [36mjustatom.storing.weaviate[0m:[36msearch_by_keywords[0m:[36m635[0m - [1mSEARCH | algo=[BM25] | collection_name=[Justatom_collection_v2][0m
[32m2026-02-16 07:15:06.887[0m | [1mINFO    [0m | [36mjustatom.storing.weaviate[0m:[36msearch_by_keywords[0m:[36m635[0m - [1mSEARCH | algo=[BM25] | collection_name=[Justatom_collection_v2][0m
[32m2026-02-16 07:15:06.894[0m | [1mINFO    [0m | [36mjustatom.storing.weaviate[0m:[36msearch_by_keywords[0m:[36m635[0m - [1mSEARCH | algo=[BM25] | collection_name=[Justatom_collection_v2][0m
[32m2026-02-16 07:15:06.901[0m | [1mINFO    [0m | [36mjustatom.storing.weaviate[0m:[36msearch_by_keywords[0m:[36m635[0m - [1mSEARCH | algo=[BM25] | collection_name=[Justatom_collection_v2][0m
[32m2026-02-16 07:15:06.909[0m | [1mINFO    [0m | [36mjustatom.storing.weaviate[0m:[36msearch_by_keywords[0m:[36m635[0m - [1mSEARCH | algo=[BM25] | collection_name=[Justatom

Wearing headphones, I heard Gale's voice telling me to come back. However, the Hunger Games backpack reminded me of something else. Hooking the bag's strap over the back of the chair, I sprinted up the steps to my bedroom. Inside the closet, there was my father's hunting jacket. Before the Suppression, I had brought it here from our old home, thinking that its presence would calm my mother and sister when I die.


Now, after all the hustle and bustle is over and we've reached our goal, I realize I have no idea what I'm going to face in District 8. In fact, I know nothing about the state of the war or what victory will cost. Or, what will happen if we win. Plutarch tries to explain everything to me in simple words. First of all, every district is now at war with the Capitol, except for the second one, which has always been under the patronage of our enemies, despite their participation in the Hunger Games. They received more food and better living conditions. After the Dark Days and the

#### Embedding search

In [55]:
# Embedding retriever: besides the store it receives the model `runner`,
# the text `processor` and the target `device`.
retriever = RetrieverApi.named(
    "embedding",
    store=store,
    runner=runner,
    processor=processor,
    device=device,
)

In [56]:
# Number of queries the loops below iterate over.
len(queries)

6

In [57]:
for pos, query in enumerate(queries):
    response = await retriever.retrieve_topk(query, top_k=1)
    doc = _first_document(response)
    print(doc.content if doc is not None else "<EMPTY>")
    if pos < len(queries) - 1:
        print("\n")

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

And here's where the real interest begins. Suppose you're poor and starving. Then you can ask to be included in the Harvest more times than you're entitled to, and in return you'd get tesserae. They give you grain and oil for a whole year for one tessera per person. You won't be full, but it's better than nothing. You can take tesserae for the whole family. When I was twelve, my name was entered four times. Once by law, and once more for tesserae for Prim, my mother, and myself. The next years had to do the same. And since the price of a tessera increases by one entry each year, now that I've turned sixteen, my name will be on twenty cards. Gale is eighteen, and he's been feeding a family of five for seven years. His name will be entered forty two times! It's clear that people like Madge, who has never had to risk because of tesserae, annoy Gale. Next to us, the inhabitants of the slag heap, she simply has no chance of getting into the games. Well, almost no chance. Of course, the rule

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

And here's where the real interest begins. Suppose you're poor and starving. Then you can ask to be included in the Harvest more times than you're entitled to, and in return you'd get tesserae. They give you grain and oil for a whole year for one tessera per person. You won't be full, but it's better than nothing. You can take tesserae for the whole family. When I was twelve, my name was entered four times. Once by law, and once more for tesserae for Prim, my mother, and myself. The next years had to do the same. And since the price of a tessera increases by one entry each year, now that I've turned sixteen, my name will be on twenty cards. Gale is eighteen, and he's been feeding a family of five for seven years. His name will be entered forty two times! It's clear that people like Madge, who has never had to risk because of tesserae, annoy Gale. Next to us, the inhabitants of the slag heap, she simply has no chance of getting into the games. Well, almost no chance. Of course, the rule

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

There have been many rumors over the centuries that the Philosopher's Stone has already been created, but the only existing stone today belongs to Mr. Nicholas Flamel, a distinguished alchemist and opera fanatic. Mr. Flamel, who celebrated his six hundred and sixty-fifth birthday last year, enjoys the peace and solitude in Devon with his wife Perenelle (six hundred and fifty-eight years old). 
'Understood?' Hermione asked when Harry and Ron finished reading. 'It must be, the dog safeguards Flamel's philosopher's stone! I have no doubt that he asked Dumbledore to do this, because they are friends and also because Flamel knew that someone was hunting for his stone. That's why he wanted the stone to be withdrawn from Gringotts! 
'The stone that turns everything into gold and guarantees you immortality!' Harry exclaimed. 'No wonder Snape wants to steal it. Anyone would want such a stone.




Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

Ron was asleep in the common room - apparently, he had been waiting for their return and had dozed off unnoticed. When Harry roughly shook him, Ron began to yell something about breaking the rules of a game, as if he were dreaming about a Quidditch match. However, after a few seconds, Ron completely woke up and, with his eyes wide open, listened to the story of Hermione and Harry. Harry was so excited that he could not sit still and paced back and forth across the room, trying to stay as close to the fireplace as possible. He was still shaking with cold. 'Snape wants to steal the stone for Voldemort. And Voldemort is waiting in the forest... And all this time we thought Snape wanted to steal the stone to become rich... And Voldemort...'




Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

What was she babbling about? You're Rue's ally? - I... I... we teamed up. We blew up the food of the Pros. I wanted to save her. Really did. But he found her first, the guy from District One - I say. Perhaps if Cato knows I helped Rue, he will kill me quickly and painlessly. - Did you kill him? - he asks grimly. - Yes. I killed him. And I covered her body with flowers. I sang to her till she fell asleep. Tears well up in my eyes. Will and strength are leaving me. There's only Rue, the pain in my head, fear of Cato and the moan of the dying girl. - Fell asleep? - mocks Cato. - Died. I sang to her till she died - I say. - Your district... sent me bread. I raise my hand - not for an arrow; I won't have time anyway. I just blow my nose. - Cato, make it quick, okay? His face shows conflicting emotions. Cato puts down the rock and says with almost a reproach: - This time, only this time, I'm letting you go. For the girl. We are even. No one owes anything to anyone anymore, understand? I nod,

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

What was she babbling about? You're Rue's ally? - I... I... we teamed up. We blew up the food of the Pros. I wanted to save her. Really did. But he found her first, the guy from District One - I say. Perhaps if Cato knows I helped Rue, he will kill me quickly and painlessly. - Did you kill him? - he asks grimly. - Yes. I killed him. And I covered her body with flowers. I sang to her till she fell asleep. Tears well up in my eyes. Will and strength are leaving me. There's only Rue, the pain in my head, fear of Cato and the moan of the dying girl. - Fell asleep? - mocks Cato. - Died. I sang to her till she died - I say. - Your district... sent me bread. I raise my hand - not for an arrow; I won't have time anyway. I just blow my nose. - Cato, make it quick, okay? His face shows conflicting emotions. Cato puts down the rock and says with almost a reproach: - This time, only this time, I'm letting you go. For the girl. We are even. No one owes anything to anyone anymore, understand? I nod,

#### Hybrid search (embeddings + keywords)
 > ❓ How do we combine them? We use a weight `alpha` in `[0.0, 1.0]`.

 > `alpha = 0.0` → pure keyword search.
 > `alpha = 1.0` → pure embedding search.

 > ⚠️ Pass `device=device` when creating the hybrid retriever to avoid an implicit fallback to CPU and device-mismatch errors.

In [63]:
# Hybrid retriever (keywords + embeddings); `device` is passed explicitly,
# as the note above recommends.
retriever = RetrieverApi.named(
    "hybrid",
    store=store,
    processor=processor,
    runner=runner,
    device=device,
)

In [64]:
for pos, query in enumerate(queries):
    response = await retriever.retrieve_topk(query, top_k=1, alpha=0.78)
    doc = _first_document(response)
    print(doc.content if doc is not None else "<EMPTY>")
    if pos < len(queries) - 1:
        print("\n")

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

And here's where the real interest begins. Suppose you're poor and starving. Then you can ask to be included in the Harvest more times than you're entitled to, and in return you'd get tesserae. They give you grain and oil for a whole year for one tessera per person. You won't be full, but it's better than nothing. You can take tesserae for the whole family. When I was twelve, my name was entered four times. Once by law, and once more for tesserae for Prim, my mother, and myself. The next years had to do the same. And since the price of a tessera increases by one entry each year, now that I've turned sixteen, my name will be on twenty cards. Gale is eighteen, and he's been feeding a family of five for seven years. His name will be entered forty two times! It's clear that people like Madge, who has never had to risk because of tesserae, annoy Gale. Next to us, the inhabitants of the slag heap, she simply has no chance of getting into the games. Well, almost no chance. Of course, the rule

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

And here's where the real interest begins. Suppose you're poor and starving. Then you can ask to be included in the Harvest more times than you're entitled to, and in return you'd get tesserae. They give you grain and oil for a whole year for one tessera per person. You won't be full, but it's better than nothing. You can take tesserae for the whole family. When I was twelve, my name was entered four times. Once by law, and once more for tesserae for Prim, my mother, and myself. The next years had to do the same. And since the price of a tessera increases by one entry each year, now that I've turned sixteen, my name will be on twenty cards. Gale is eighteen, and he's been feeding a family of five for seven years. His name will be entered forty two times! It's clear that people like Madge, who has never had to risk because of tesserae, annoy Gale. Next to us, the inhabitants of the slag heap, she simply has no chance of getting into the games. Well, almost no chance. Of course, the rule

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

There have been many rumors over the centuries that the Philosopher's Stone has already been created, but the only existing stone today belongs to Mr. Nicholas Flamel, a distinguished alchemist and opera fanatic. Mr. Flamel, who celebrated his six hundred and sixty-fifth birthday last year, enjoys the peace and solitude in Devon with his wife Perenelle (six hundred and fifty-eight years old). 
'Understood?' Hermione asked when Harry and Ron finished reading. 'It must be, the dog safeguards Flamel's philosopher's stone! I have no doubt that he asked Dumbledore to do this, because they are friends and also because Flamel knew that someone was hunting for his stone. That's why he wanted the stone to be withdrawn from Gringotts! 
'The stone that turns everything into gold and guarantees you immortality!' Harry exclaimed. 'No wonder Snape wants to steal it. Anyone would want such a stone.




Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

Ron was asleep in the common room - apparently, he had been waiting for their return and had dozed off unnoticed. When Harry roughly shook him, Ron began to yell something about breaking the rules of a game, as if he were dreaming about a Quidditch match. However, after a few seconds, Ron completely woke up and, with his eyes wide open, listened to the story of Hermione and Harry. Harry was so excited that he could not sit still and paced back and forth across the room, trying to stay as close to the fireplace as possible. He was still shaking with cold. 'Snape wants to steal the stone for Voldemort. And Voldemort is waiting in the forest... And all this time we thought Snape wanted to steal the stone to become rich... And Voldemort...'




Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

What was she babbling about? You're Rue's ally? - I... I... we teamed up. We blew up the food of the Pros. I wanted to save her. Really did. But he found her first, the guy from District One - I say. Perhaps if Cato knows I helped Rue, he will kill me quickly and painlessly. - Did you kill him? - he asks grimly. - Yes. I killed him. And I covered her body with flowers. I sang to her till she fell asleep. Tears well up in my eyes. Will and strength are leaving me. There's only Rue, the pain in my head, fear of Cato and the moan of the dying girl. - Fell asleep? - mocks Cato. - Died. I sang to her till she died - I say. - Your district... sent me bread. I raise my hand - not for an arrow; I won't have time anyway. I just blow my nose. - Cato, make it quick, okay? His face shows conflicting emotions. Cato puts down the rock and says with almost a reproach: - This time, only this time, I'm letting you go. For the girl. We are even. No one owes anything to anyone anymore, understand? I nod,

Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

What was she babbling about? You're Rue's ally? - I... I... we teamed up. We blew up the food of the Pros. I wanted to save her. Really did. But he found her first, the guy from District One - I say. Perhaps if Cato knows I helped Rue, he will kill me quickly and painlessly. - Did you kill him? - he asks grimly. - Yes. I killed him. And I covered her body with flowers. I sang to her till she fell asleep. Tears well up in my eyes. Will and strength are leaving me. There's only Rue, the pain in my head, fear of Cato and the moan of the dying girl. - Fell asleep? - mocks Cato. - Died. I sang to her till she died - I say. - Your district... sent me bread. I raise my hand - not for an arrow; I won't have time anyway. I just blow my nose. - Cato, make it quick, okay? His face shows conflicting emotions. Cato puts down the rock and says with almost a reproach: - This time, only this time, I'm letting you go. For the girl. We are even. No one owes anything to anyone anymore, understand? I nod,