In [129]:
from llma.etc.schema import Document
import numpy as np
from loguru import logger
from tqdm.autonotebook import tqdm
import uuid
from typing import Optional, List, Dict, Any
import polars as pl
from weaviate.classes.query import MetadataQuery

In [2]:
!pip freeze | grep weaviate

weaviate-client==4.5.4


In [4]:
from transformers import AutoModel, AutoTokenizer
from torch import Tensor
import torch
from torch.functional import F
from functools import partial
from more_itertools import chunked
# from llma.processing import loader, igniset

In [5]:
model_name_or_path = "intfloat/multilingual-e5-base"

In [6]:
model = AutoModel.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [7]:
def prefix(x, pref:str):
    return pref.strip() + " " + x

In [8]:
eval_docs = [(
    "What Peeta Meelark told about Katniss ability to shoot  at the dinner before the 74 hunger games begun?",
    "I have to think about it. I’ve been putting food on the table for four years. That’s no small task. I’m not as good as my father was, but he’d had more practice. I’ve better aim than Gale, but I’ve had more practice. He’s a genius with traps and snares. \"I’m all right,\" I say.\n\"She’s excellent,\" says Peeta.\n\"My father buys her squirrels. He always comments on how the arrows never pierce the body. She hits every one in the eye. It’s the same with the rabbits she sells the butcher. She can even bring down deer.\"",
    "\"She’s excellent,\" says Peeta.\n\"My father buys her squirrels. He always comments on how the arrows never pierce the body. She hits every one in the eye. It’s the same with the rabbits she sells the butcher. She can even bring down deer.\""
),
(
    "Why Katniss Everdeen didn’t want to talk about girl whose tongue had been cut she had seen back in the district twelve with Gale?",
    "I realize I do want to talk to someone about the girl. Someone who might be able to help me figure out her story. Gale would be my first choice, but it’s unlikely I’ll ever see Gale again. I try to think if telling Peeta could give him any possible advantage over me, but I don’t see how. Maybe sharing a confidence will actually make him believe I see him as a friend.\nBesides, the idea of the girl with her maimed tongue frightens me. She has reminded me why I’m here. Not to model flashy costumes and eat delicacies. But to die a bloody death while the crowds urge on my killer.",
    "She has reminded me why I’m here. Not to model flashy costumes and eat delicacies. But to die a bloody death while the crowds urge on my killer"
),
(
    "Who is an Avox in the book \"Hunger games\" by S.Collins?",
    "When I look back, the four adults are watching me like hawks.\"Don’t be ridiculous, Katniss. How could you possibly know an Avox?\" snaps Effie. \n\"The very thought.\"\n\"What’s an Avox?\" I ask stupidly.\n\"Someone who committed a crime. They cut her tongue so she can’t speak,\" says Haymitch. \n\"She’s probably a traitor of some sort. Not likely you’d know her.\"\n\"And even if you did, you’re not to speak to one of them unless it’s to give an order\", says Effie. \"Of course, you don’t really know her.\"",
    "\"Someone who committed a crime. They cut her tongue so she can’t speak,\" says Haymitch. \n\"She’s probably a traitor of some sort. Not likely you’d know her.\"\n\"And even if you did, you’re not to speak to one of them unless it’s to give an order,\" says Effie."
),
(
    "Why tributes cannot commit a suicide before the games begin by jumping from the roof at the Training center where u can see the whole city?",
    "Peeta and I walk to a railing at the edge of the roof. I look straight down the side of the building to the street, which is buzzing with people. You can hear their cars, an occasional shout, and a strange metallic tinkling. In District 12, we’d all be thinking about bed right now.\n\"I asked Cinna why they let us up here. Weren’t they worried that some of the tributes might decide to jump right over the side?\" says Peeta.\n\"What’d he say?\" I ask.\n\"You can’t,\" says Peeta. He holds out his hand into seemingly empty space. There’s a sharp zap and he jerks it back. \"Some kind of electric field throws you back on the roof.\"\n\n\"Always worried about our safety,\" I say. Even though Cinna has shown Peeta the roof, I wonder if we’re supposed to be up here now, so late and alone.",
    "\"Some kind of electric field throws you back on the roof.\""
),
(
    "What are the rules of thu hunger games?",
    "The rules of the Hunger Games are simple. In punishment for the uprising, each of the twelve districts must provide one girl and one boy, called tributes, to participate. The twenty-four tributes will be imprisoned in a vast outdoor arena that could hold anything from a burning desert to a frozen wasteland. Over a period of several weeks, the competitors must fight to the death. The last tribute standing wins.",
    "In punishment for the uprising, each of the twelve districts must provide one girl and one boy, called tributes, to participate. The twenty-four tributes will be imprisoned in a vast outdoor arena that could hold anything from a burning desert to a frozen wasteland. Over a period of several weeks, the competitors must fight to the death. The last tribute standing wins."
),
(
    "Why reaping system in \"HG\" is unfair if it's applied to all citizens?",
    "But here's the catch. Say you are poor and starving, as we were. You can opt to add your name more times in exchange for tesserae. Each tessera is worth a meagre year's supply of grain and oil for one person. You may do this for each of your family members as well. So, at the age of twelve, I had my name entered four times. Once because I had to, and three times for tesserae for grain and oil for myself, Prim and my mother. In fact, every year I have needed to do this. And the entries are cumulative. So now, at the age of sixteen, my name will be in the reaping twenty times. Gale, who is eighteen and has been either helping or single-handedly feeding a family of five for seven years, will have his name in forty-two times.",
    "Say you are poor and starving, as we were. You can opt to add your name more times in exchange for tesserae. Each tessera is worth a meagre year's supply of grain and oil for one person. You may do this for each of your family members as well."
)
]

In [9]:
queries, documents, answers = [d[0] for d in eval_docs], [d[1] for d in eval_docs], [d[2] for d in eval_docs]

In [10]:
def tokenize(input_texts, tokenizer):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
    return batch_dict

In [11]:
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

In [12]:
tokenize(documents, tokenizer=tokenizer)["input_ids"].shape

torch.Size([6, 214])

In [13]:
def text_to_rethinkdb_uuid(text, namespace_uuid='91461c99-f89d-49d2-af96-d8e2e14e9b58'):
    """
    
    Args:
    text (str): Textual content for generating fixed UUID.
    namespace_uuid (str): Namespace used by https://rethinkdb.com.
    
    Returns:
    str: Строковое представление детерминированного UUID.
    """
    namespace = uuid.UUID(namespace_uuid)
    
    deterministic_uuid = uuid.uuid5(namespace, text)
    
    return str(deterministic_uuid)

In [14]:
def ignivec(docs, labels: Optional[List[str]] = None, pref:str="", norm: bool = True, batch_size:int = 2):
    wrapped_docs = []
    if labels is not None:
        it = chunked(zip(docs, labels), n=batch_size)
    else:
        it = chunked(docs, n=batch_size)
    pbar = tqdm(total=len(docs), desc=" Embeddings for documents")
    for chunk in it:
        if labels is not None:
            raw_docs, raw_labels = [c[0] for c in chunk], [c[1] for c in chunk]
        else:
            raw_docs = chunk
        _docs = [prefix(x, pref=pref) for x in raw_docs]
        batch_dict = tokenize(_docs, tokenizer=tokenizer)
        with torch.no_grad():
            outputs = model(**batch_dict)
            _embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask']).cpu()
        if norm:
            _embeddings = F.normalize(_embeddings, p=2, dim=len(_embeddings.shape) - 1)
        _embeddings = _embeddings.numpy()
        if labels is not None:
            chunk = [Document.from_dict({"content": doc, "embedding": list(emb), "labels": [text_to_rethinkdb_uuid(label)]}) for doc, label, emb in zip(raw_docs, raw_labels, _embeddings)]
        else:
            chunk = [Document.from_dict({"content": doc, "embedding": list(emb)}) for doc, emb in zip(raw_docs, _embeddings)]
        wrapped_docs.extend(chunk)
        pbar.update(n=len(raw_docs))
    return wrapped_docs

#### Initialize `WeaviateDocStore` for information retrieval

In [15]:
from llma.storing.weaviate import WeaviateDocStore
store = WeaviateDocStore(url="http://localhost:2211", collection_name="justatom")

In [18]:
store.count_documents()

0

#### Ignite `vectorization` pipeline for each document to have its own semantic vector for `dense` or `hybrid` search

In [19]:
# In the model card https://huggingface.co/intfloat/multilingual-e5-base it's said to prefix "indexed" passages
wrapped_docs = ignivec(docs=documents, labels=queries, pref="context:")

 Embeddings for documents:   0%|          | 0/6 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
 Embeddings for documents: 100%|██████████| 6/6 [00:00<00:00, 14.01it/s]


In [20]:
assert wrapped_docs[0].meta["labels"][0] == text_to_rethinkdb_uuid(queries[0]), f"Unexptected error due to labeling errors"

In [21]:
wrapped_docs[0].meta

{'labels': ['3c8c081f-c9be-5d49-a008-e08830736a4f']}

In [22]:
np.array(wrapped_docs[0].embedding).shape

(768,)

#### Now run `indexing` pipeline

In [24]:
if store.count_documents() > 0:
    store.delete_all_documents()
store.write_documents(documents=wrapped_docs)
logger.info(f"Total docs {store.count_documents()}")

[32m2024-04-30 08:21:13.509[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mTotal docs 6[0m


#### Let's ignite vectors for queries to perform semantic `dense` or `hybrid` search.

In [26]:
# According to the model card https://huggingface.co/intfloat/multilingual-e5-base each query is meant to be prefixed with `query:`
query_vecs = ignivec(queries, pref="query:")

 Embeddings for documents: 100%|██████████| 6/6 [00:00<00:00, 40.30it/s]


In [27]:
query = query_vecs[-2].content
logger.info(query)

[32m2024-04-30 08:23:48.570[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mWhat are the rules of thu hunger games?[0m


#### Let's now try different retrievals

#### 1. Search `by embedding` only

In [30]:
store.search_by_embedding(query_embedding=list(query_vecs[-2].embedding), top_k=2)

[<Document: {'content': 'The rules of the Hunger Games are simple. In punishment for the uprising, each of the twelve districts must provide one girl and one boy, called tributes, to participate. The twenty-four tributes will be imprisoned in a vast outdoor arena that could hold anything from a burning desert to a frozen wasteland. Over a period of several weeks, the competitors must fight to the death. The last tribute standing wins.', 'content_type': 'text', 'score': 0.9229498505592346, 'meta': {'labels': [UUID('cd0f8f56-5595-5673-89d0-c46fb8bf8e8b')], 'dataframe': None}, 'embedding': '<embedding of shape (768,)>', 'id': 'b29a0fed3ec5170b122a7bcaf3496273'}>,
 <Document: {'content': "But here's the catch. Say you are poor and starving, as we were. You can opt to add your name more times in exchange for tesserae. Each tessera is worth a meagre year's supply of grain and oil for one person. You may do this for each of your family members as well. So, at the age of twelve, I had my name 

#### 2. Search `by keywords` only

In [31]:
store.search_by_keywords(query=query, top_k=2)

[<Document: {'content': 'The rules of the Hunger Games are simple. In punishment for the uprising, each of the twelve districts must provide one girl and one boy, called tributes, to participate. The twenty-four tributes will be imprisoned in a vast outdoor arena that could hold anything from a burning desert to a frozen wasteland. Over a period of several weeks, the competitors must fight to the death. The last tribute standing wins.', 'content_type': 'text', 'score': 2.4290518760681152, 'meta': {'labels': [UUID('cd0f8f56-5595-5673-89d0-c46fb8bf8e8b')], 'dataframe': None}, 'embedding': None, 'id': 'b29a0fed3ec5170b122a7bcaf3496273'}>,
 <Document: {'content': 'When I look back, the four adults are watching me like hawks."Don’t be ridiculous, Katniss. How could you possibly know an Avox?" snaps Effie. \n"The very thought."\n"What’s an Avox?" I ask stupidly.\n"Someone who committed a crime. They cut her tongue so she can’t speak," says Haymitch. \n"She’s probably a traitor of some sort. 

#### 3. Search by both `keywords` and `embedding` ~ `hybrid` search.

In [37]:
# See https://weaviate.io/developers/weaviate/search/hybrid

In [53]:
alpha = 0.85
query = query_vecs[0].content
query_emb = query_vecs[0].embedding

response = store.search(query=query, query_embedding=list(query_emb), alpha=alpha, top_k=2)

response_docs = "\n---".join([f"\nDOC[{str(pos)}]\n" + x.content for pos, x in enumerate(response)])
logger.info(f" >> | {query}")
logger.info(f" << | {response_docs}")

[32m2024-04-30 09:17:37.981[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1m >> | What Peeta Meelark told about Katniss ability to shoot  at the dinner before the 74 hunger games begun?[0m
[32m2024-04-30 09:17:37.982[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1m << | 
DOC[0]
I have to think about it. I’ve been putting food on the table for four years. That’s no small task. I’m not as good as my father was, but he’d had more practice. I’ve better aim than Gale, but I’ve had more practice. He’s a genius with traps and snares. "I’m all right," I say.
"She’s excellent," says Peeta.
"My father buys her squirrels. He always comments on how the arrows never pierce the body. She hits every one in the eye. It’s the same with the rabbits she sells the butcher. She can even bring down deer."
---
DOC[1]
I realize I do want to talk to someone about the girl. Someone who might be able to help me figure out her story. Gale would be my 

In [45]:
list(range(11))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [117]:
def evaluate_per_alpha(query: str, query_emb: List[float], store, top_k:int=2) -> Dict[str, Any]:
    # hit-rate @ k
    # dcg @ k
    score = []
    for x in range(11):
        alpha = x * 1.0 / 10.0
        docs = store.search(query=query, query_embedding=list(query_emb),alpha=alpha, top_k=top_k)
        status: bool = False
        pos: int = 0
        while not status and pos < len(docs):
            labels = docs[pos].meta["labels"]
            if any([str(l) == text_to_rethinkdb_uuid(query) for l in labels]):
                status = True
            pos += 1
        if status:
            value = dict(
                hit_rate=1,
                recall=1.0 / pos * 1.0,
                alpha=alpha
            )
        else:
            value = dict(
                hit_rate=0,
                recall=0,
                alpha=alpha
            )
        score.append(value)
    return score

In [118]:
query, query_emb = queries[0], query_vecs[0].embedding

In [122]:
eval_result = evaluate_per_alpha(query=query, query_emb=query_emb, store=store, top_k=5)

In [124]:
# for one specific query
eval_result

[{'hit_rate': 1, 'recall': 0.25, 'alpha': 0.0},
 {'hit_rate': 1, 'recall': 0.25, 'alpha': 0.1},
 {'hit_rate': 1, 'recall': 0.25, 'alpha': 0.2},
 {'hit_rate': 1, 'recall': 0.25, 'alpha': 0.3},
 {'hit_rate': 1, 'recall': 0.3333333333333333, 'alpha': 0.4},
 {'hit_rate': 1, 'recall': 0.3333333333333333, 'alpha': 0.5},
 {'hit_rate': 1, 'recall': 1.0, 'alpha': 0.6},
 {'hit_rate': 1, 'recall': 1.0, 'alpha': 0.7},
 {'hit_rate': 1, 'recall': 1.0, 'alpha': 0.8},
 {'hit_rate': 1, 'recall': 1.0, 'alpha': 0.9},
 {'hit_rate': 1, 'recall': 1.0, 'alpha': 1.0}]

In [125]:
# let's now average those results for entire dataset
eval_results = []
for query, query_wrap in zip(queries, query_vecs):
    query_emb = query_wrap.embedding
    eval_result = evaluate_per_alpha(query=query, query_emb=query_emb, store=store, top_k=5)
    eval_results.extend(eval_result)

In [131]:
### let's average
pl_view = pl.from_dicts(eval_results)

In [132]:
pl_view.head()

hit_rate,recall,alpha
i64,f64,f64
1,0.25,0.0
1,0.25,0.1
1,0.25,0.2
1,0.25,0.3
1,0.333333,0.4


In [136]:
pl_eval_results = pl_view.with_row_count().with_columns([
    pl.mean("hit_rate").over("alpha").alias("mean_hit_rate"),
    pl.mean("recall").over("alpha").alias("mean_recall"),
    pl.first("row_nr").over("alpha").alias("mask")
]).filter(pl.col("mask") == pl.col("row_nr"))

In [137]:
pl_eval_results

row_nr,hit_rate,recall,alpha,mean_hit_rate,mean_recall,mask
u32,i64,f64,f64,f64,f64,u32
0,1,0.25,0.0,1.0,0.625,0
1,1,0.25,0.1,1.0,0.625,1
2,1,0.25,0.2,1.0,0.708333,2
3,1,0.25,0.3,1.0,0.791667,3
4,1,0.333333,0.4,1.0,0.888889,4
5,1,0.333333,0.5,1.0,0.888889,5
6,1,1.0,0.6,1.0,1.0,6
7,1,1.0,0.7,1.0,1.0,7
8,1,1.0,0.8,1.0,1.0,8
9,1,1.0,0.9,1.0,1.0,9
