In [None]:
import requests
from loguru import logger
from pathlib import Path
import os
import uuid
from typing import Generator
import numpy as np
import simplejson as json
import torch
from justatom.tooling.dataset import source_from_dataset
from justatom.etc.schema import Document
from more_itertools import chunked
import json_repair
import polars as pl

from justatom.storing.weaviate import Finder as WeaviateApi

from tqdm import tqdm

### ✔️ANN document store backed by <a href="https://github.com/weaviate/weaviate">weaviate</a>

> First, let's make sure you have Docker up and running. From the root directory, run:
```bash
docker-compose up -d
```

❗️ By default Weaviate will run on port `2211`

In [None]:
# Connection settings for the Weaviate instance used in this tutorial.
collection_name = "HelloWorld"
weaviate_host = "localhost"
weaviate_port = 2211

In [None]:
# Obtain a document-store handle for `collection_name` on the configured host/port.
# NOTE(review): whether `find` creates a missing collection is not visible here - confirm.
store = WeaviateApi.find(
    collection_name,
    WEAVIATE_HOST=weaviate_host,
    WEAVIATE_PORT=weaviate_port,
)

In [None]:
# Log how many documents the store currently reports for this collection.
logger.info(f"For the collection=[{collection_name}] you have N=[{store.count_documents()}] documents")

### ✔️ Prepare datasets

> For this tutorial we will use the built-in dataset `polaroids.ai`. It contains paragraphs describing various moments from movies, games and books.

In [None]:
# The dataset file is looked up under `.data/` relative to the current working directory.
dataset_name_or_path = Path.cwd() / ".data" / "polaroids.ai.data.all.json"

In [None]:
# Load the dataset; the result is used below as a polars DataFrame
# (`.columns`, `.with_columns`, `.filter`, `.shape`).
pl_docs = source_from_dataset(dataset_name_or_path)

In [None]:
# Log the available column names, separated by " | ".
logger.info(f"Columns=[{' | '.join(pl_docs.columns)}]")

>❗️Please note that the `content` and `id` columns are mandatory: they describe each "chunk". All other fields are optional and will be added to `meta`.

In [None]:
# The dataset provides `chunk_id` but no `id` column, so expose `chunk_id`
# under the name `id` as well (the original `chunk_id` column is kept).
pl_docs = pl_docs.with_columns(pl.col("chunk_id").alias("id"))

> ❗️Let's filter out the chunks that have `null` in any of the "must-have" columns; otherwise the pipeline will fail.

In [None]:
# Keep only the rows where both must-have columns are present.
# Use `is_not_null()` rather than `!= None`: a comparison against None propagates
# null in polars, and `filter` discards every row whose predicate is null.
pl_docs = pl_docs.filter(
    pl.col("content").is_not_null() & pl.col("id").is_not_null()
)

In [None]:
# Report the number of rows left after filtering.
# NOTE(review): the message says "unique", but no de-duplication happens in this cell.
logger.info(f"There are D=[{pl_docs.height}] unique documents")

>❗️We would like to keep `keywords_or_phrases` and the relevant `queries` for each chunk. Let's declare those, as well as the original `chunk_id`, so that the structure is preserved independently of the UUIDs that weaviate generates internally.

> ❗️❗️ Each chunk is associated with an array of relevant queries to describe it

<small>

|  queries (list[str])  |     content: str     |   chunk_id: str   |
|:---------------------:|:--------------------:|:-----------------:|
| 1. ...thinking about 'The Hunger Games' mechanics, if you were in the same shoes as Gale, entering your name forty-two times to feed your fam, how would you strategize your game in the actual Arena? Would you team up or go solo based on these high stakes? <br><br>2.In the universe of 'The Hunger Games', what are tesserae and what do they offer to the participants in the Harvest?    | And here's where the real interest begins. Suppose you're poor and starving. Then you can ask to be included in the Harvest more times than you're entitled to, and in return you'd get tesserae. They give you grain and oil for a whole year for one tessera per person. You won't be full, but it's better than nothing. You can take tesserae for the whole family. When I was twelve, my name was entered four times. Once by law, and once more for tesserae for Prim, my mother, and myself. The next years had to do the same. And since the price of a tessera increases by one entry each year, now that I've turned sixteen, my name will be on twenty cards. Gale is eighteen, and he's been feeding a family of five for seven years. His name will be entered forty two times! It's clear that people like Madge, who has never had to risk because of tesserae, annoy Gale. Next to us, the inhabitants of the slag heap, she simply has no chance of getting into the games. Well, almost no chance. Of course, the rules are set by the Capitol, not the districts, let alone Madge's relatives, and it's still hard to sympathize with those who, like you, don't have to trade their own skin for a piece of bread.  | 80504cd8-9b21-514c-b001-4761d8c71044         |
|-----------------------|----------------------|-------------------|
| 1.In 'Harry Potter and the Philosopher's Stone', what misconception had Harry and Hermione initially had about Snape's intentions before learning the truth? <br><br>2. Hey peeps, why is Harry all jittery and pacing around the room even after telling Hermione about the whole Snape and Voldemort situation?        | Ron was asleep in the common room - apparently, he had been waiting for their return and had dozed off unnoticed. When Harry roughly shook him, Ron began to yell something about breaking the rules of a game, as if he were dreaming about a Quidditch match. However, after a few seconds, Ron completely woke up and, with his eyes wide open, listened to the story of Hermione and Harry. Harry was so excited that he could not sit still and paced back and forth across the room, trying to stay as close to the fireplace as possible. He was still shaking with cold. 'Snape wants to steal the stone for Voldemort. And Voldemort is waiting in the forest... And all this time we thought Snape wanted to steal the stone to become rich... And Voldemort...'  | 5ad25a92-28d9-5971-a81b-4f795898eeab         |
|-----------------------|----------------------|-------------------|
| 1. Hey fellow gamers, in The Hunger Games universe, if you were in a match where your ally was taken down first like Rue, how would you strategize your next move to survive against top opponents like Cato?<br><br> 2. In the 'Hunger Games' novel, why does Cato decide to spare Katniss's life after their encounter?    | What was she babbling about? You're Rue's ally? - I... I... we teamed up. We blew up the food of the Pros. I wanted to save her. Really did. But he found her first, the guy from District One - I say. Perhaps if Cato knows I helped Rue, he will kill me quickly and painlessly. - Did you kill him? - he asks grimly. - Yes. I killed him. And I covered her body with flowers. I sang to her till she fell asleep. Tears well up in my eyes. Will and strength are leaving me. There's only Rue, the pain in my head, fear of Cato and the moan of the dying girl. - Fell asleep? - mocks Cato. - Died. I sang to her till she died - I say. - Your district... sent me bread. I raise my hand - not for an arrow; I won't have time anyway. I just blow my nose. - Cato, make it quick, okay? His face shows conflicting emotions. Cato puts down the rock and says with almost a reproach: - This time, only this time, I'm letting you go. For the girl. We are even. No one owes anything to anyone anymore, understand? I nod, because I do understand. Understand about debts. About how bad it is to have them. Understand that if Cato wins, he will return to a district that has forgotten the rules to thank me. And Cato is neglecting them, too. Right now, he's not going to crack my head with a stone.  | b317200c-7fd3-5804-bbe4-bff33432ad0e         |
|-----------------------|----------------------|-------------------|

</small>

In [None]:
# Columns copied into each document's `meta` in addition to its content.
columns_to_include = ["keywords_or_phrases", "chunk_id", "queries"]

In [None]:
def wrapper_for_docs(
    pl_data: pl.DataFrame,
    content_field: str,
    keywords_or_phrases_field: str | None = None,
    batch_size: int = 128,
    columns_to_include: list[str] | None = None,
    filters: dict | None = None,
) -> Generator[dict, None, None]:
    """Yield one ``{"content": ..., "meta": {...}}`` dict per row of ``pl_data``.

    Args:
        pl_data: Frame holding one chunk per row.
        content_field: Name of the column whose value becomes ``content``.
        keywords_or_phrases_field: Currently unused; kept so existing callers keep working.
        batch_size: Currently unused; kept so existing callers keep working.
        columns_to_include: Names of the columns copied into ``meta``.
            ``None`` (the default) produces an empty ``meta`` dict.
        filters: Currently unused; kept so existing callers keep working.

    Yields:
        A dict with the keys ``content`` and ``meta`` for every row, in row order.

    Raises:
        KeyError: If ``content_field`` or one of ``columns_to_include`` is not a column
            of ``pl_data``.
    """
    # Iterating over the default `None` would raise a TypeError, so fall back to "no meta".
    meta_columns = list(columns_to_include) if columns_to_include is not None else []
    for js_chunk in tqdm(pl_data.to_dicts()):
        js_meta = {k: js_chunk[k] for k in meta_columns}
        yield dict(content=js_chunk[content_field], meta=js_meta)

In [None]:
# Materialise the generator into a list of {"content", "meta"} dicts.
js_docs = list(
    wrapper_for_docs(
        pl_docs,
        content_field="content",
        columns_to_include=columns_to_include,
    )
)

### Modeling

> See <a href="https://huggingface.co/intfloat/multilingual-e5-large">E5 large</a> , <a href="https://huggingface.co/intfloat/multilingual-e5-base">E5 base</a>, <a href="https://huggingface.co/intfloat/multilingual-e5-small">E5 small</a> family of encoder models. More coming soon
 
> 📎 <a href="https://arxiv.org/abs/2212.03533">paper</a>

> ❗️For this tutorial we pick the base one `intfloat/multilingual-e5-base` as a trade-off between performance and precision

In [None]:
from justatom.modeling.mask import ILanguageModel
from justatom.running.m1 import M1LMRunner
from justatom.processing import INFERProcessor, ITokenizer

# Identifier of the encoder checkpoint; reused below when loading the tokenizer.
model_name_or_path = "intfloat/multilingual-e5-base"

lm_model = ILanguageModel.load(model_name_or_path)

In [None]:
def maybe_cuda_or_mps() -> str:
    """Pick the torch device string to run on.

    Returns:
        ``"mps"`` when the MPS backend is usable on this machine, otherwise
        ``"cuda:0"`` when CUDA is available, otherwise ``"cpu"``.
    """
    # `is_built()` only tells whether this torch binary was compiled with MPS support;
    # `is_available()` also requires a usable MPS device, which is what matters here.
    if torch.backends.mps.is_available():
        return "mps"
    if torch.cuda.is_available():
        return "cuda:0"
    return "cpu"

In [None]:
# Device string ("mps", "cuda:0" or "cpu") passed to the runner and the indexer below.
device = maybe_cuda_or_mps()

In [None]:
# Wrap the language model in a runner with no prediction heads, passing the selected device.
runner = M1LMRunner(model=lm_model, prediction_heads=[], device=device)

In [None]:
# Build the inference processor around the tokenizer loaded for the same `model_name_or_path`.
processor = INFERProcessor(ITokenizer.from_pretrained(model_name_or_path))

❗️According to the <a href="https://arxiv.org/abs/2212.03533">paper</a>, the E5 family is trained in an asymmetric way, meaning:

> Use `"query: "` and `"passage: "` correspondingly for asymmetric tasks such as passage retrieval in open QA, ad-hoc information retrieval.

> Use `"query: "` prefix for symmetric tasks such as semantic similarity, bitext mining, paraphrase retrieval.

> Use `"query: "` prefix if you want to use embeddings as features, such as linear probing classification, clustering.

In [None]:
# Documents are indexed next, so switch the processor to the passage prefix.
processor.prefix = "passage: "

> Let's put everything together in one simple abstraction - `Indexer`

In [None]:
from justatom.running.indexer import API as IndexerAPI

# What each argument of the indexer stands for:
#   "embedding" - the way to index the given ANN store (weaviate)
#   runner      - responsible for mapping docs to embeddings
#   processor   - responsible for tokenizing the given chunks
#   device      - everything is computed on the selected `device`
ix_runner = IndexerAPI.named(
    "embedding",
    runner=runner,
    store=store,
    processor=processor,
    device=device,
)

In [None]:
# Drive the indexer's async generator to completion; the yielded batches are not
# used here, hence `pass`. (`n=32` is presumably the batch size - TODO confirm.)
# NOTE: a top-level `async for` only works where an event loop is already running,
# e.g. in a notebook cell.
async for js_batch_docs in ix_runner.index(js_docs, n=32):
    pass

❗️According to the <a href="https://arxiv.org/abs/2212.03533">paper</a>, the E5 family is trained in an asymmetric way, meaning we have to set `prefix` back to `query: `

In [None]:
# Queries are embedded from here on, so switch the processor to the query prefix.
processor.prefix = "query: "

In [None]:
# Sample queries used to exercise each retriever below.
queries = [
    "thinking about 'The Hunger Games' mechanics, if you were in the same shoes as Gale, entering your name forty-two times to feed your fam, how would you strategize your game in the actual Arena? Would you team up or go solo based on these high stakes?",
    "In the universe of 'The Hunger Games', what are tesserae and what do they offer to the participants in the Harvest?",
    "In 'Harry Potter and the Philosopher's Stone', what misconception had Harry and Hermione initially had about Snape's intentions before learning the truth?",
    "Hey peeps, why is Harry all jittery and pacing around the room even after telling Hermione about the whole Snape and Voldemort situation?",
    "Hey fellow gamers, in The Hunger Games universe, if you were in a match where your ally was taken down first like Rue, how would you strategize your next move to survive against top opponents like Cato?",
    "In the 'Hunger Games' novel, why does Cato decide to spare Katniss's life after their encounter?"
]

In [None]:
from justatom.running.retriever import API as RetrieverApi

#### Pure keywords search

In [None]:
# Keyword-only retriever: it is given just the store, no runner or processor.
retriever = RetrieverApi.named("keywords", store=store)

In [None]:
# Print the top-1 hit for every query; the loop index was never used, so iterate directly.
for query in queries:
    print(retriever.retrieve_topk(query, top_k=1)[0])
    print("\n")

#### Search by embedding

In [None]:
# Embedding retriever: besides the store it receives the runner and the processor.
retriever = RetrieverApi.named("embedding", store=store, runner=runner, processor=processor)

In [None]:
# Print the top-1 hit for every query; the loop index was never used, so iterate directly.
for query in queries:
    print(retriever.retrieve_topk(query, top_k=1)[0])
    print("\n")

#### Search by embedding AND keywords
> ❓How do we combine them? First, introduce a parameter called `alpha`, which can be any value from 0.0 to 1.0. 

> When `alpha = 0.0`, the search relies entirely on keywords (pure keyword search). 

> When `alpha = 1.0`, it uses only semantic embeddings.

In [None]:
# Hybrid retriever: receives the store together with the processor and the runner.
retriever = RetrieverApi.named("hybrid", store=store, processor=processor, runner=runner)

In [None]:
# Print the top-1 hybrid hit for every query; the loop index was never used, so iterate directly.
# alpha=0.78 sits closer to 1.0 (embeddings only) than to 0.0 (keywords only).
for query in queries:
    print(retriever.retrieve_topk(query, top_k=1, alpha=0.78)[0])
    print("\n")