In [26]:
from pathlib import Path
import os
from loguru import logger
import simplejson as json
import polars as pl
import json_repair
from justatom.tooling.stl import reuuid
from justatom.storing.dataset import API as DatasetApi

In [27]:
def source_from_dataset(dataset_name_or_path, **props):
    """Load a dataset via ``DatasetApi`` and hand it back as a polars DataFrame.

    :param dataset_name_or_path: value forwarded to ``DatasetApi.named``.
    :param props: keyword arguments forwarded to the ``iterator`` call.
    :return: the object produced by ``iterator`` when it already is a
        ``pl.DataFrame``; otherwise the result is materialised into a list
        and converted with ``pl.from_dicts``.
    """
    import polars as pl
    from justatom.storing.dataset import API as DatasetApi

    loaded = DatasetApi.named(dataset_name_or_path).iterator(**props)
    if isinstance(loaded, pl.DataFrame):
        return loaded
    # Anything that is not already a DataFrame is consumed as an iterable of records.
    records = list(loaded)
    return pl.from_dicts(records)

In [29]:
# Load the dataset file from the `.data` folder under the current working directory.
pl_docs = source_from_dataset(Path(os.getcwd()) / ".data" / "polaroids.ai.data.all.json")

In [38]:
# Log the number of rows (one row per document) in the loaded frame.
logger.info(f"There are |ALL|=[{pl_docs.shape[0]}] total docs")

[32m2025-02-18 05:16:03.443[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mThere are |ALL|=[4992] total docs[0m


In [39]:
# Show the column names available in the loaded frame.
pl_docs.columns

['content',
 'title',
 'author',
 'type',
 'has_image',
 'img_path',
 'speaker',
 'queries',
 'keywords_or_phrases',
 'chunk_id',
 'monitor']

### Statistics
Next, we compute per-document statistics: the average number of words per query (`LQ`) and the number of words in the paragraph (`LD`).

In addition, we measure word overlap as a fraction of the paragraph's word count: between each query and its relevant paragraph (`LQD`, averaged over the paragraph's queries), between each keyword or phrase and the paragraph (`LKD`), and between each keyword together with its explanation and the paragraph (`LKED`). `LKD` and `LKED` are summed over the paragraph's keywords.

In [40]:
def coverage_for_doc(x: str, doc: str) -> str:
    """Return ``x[0]`` and ``doc`` joined by a single space.

    Only the first element of ``x`` is used: for a list of strings that is
    the first string, for a plain string it is its first character.

    NOTE(review): despite its name, this function has never returned a
    coverage value. The token-overlap code that used to follow the
    ``return`` was unreachable and has been removed; the observable
    behaviour is unchanged. The overlap statistics are computed by
    ``pl_coverage_for_doc``. Confirm whether this helper is still needed.

    :param x: indexable whose first element is a string.
    :param doc: document text appended after the first element of ``x``.
    :return: the concatenated string.
    """
    return x[0] + " " + doc

In [41]:
def pl_coverage_for_doc(d):
    """Compute length and word-overlap statistics for a single document row.

    Text is lower-cased, stripped, and every run of characters outside
    Latin/Cyrillic letters, digits and the listed punctuation symbols is
    replaced by a space before splitting on whitespace. Punctuation is kept,
    so a token such as ``"meadow,"`` differs from ``"meadow"``.

    :param d: mapping with the keys ``queries`` (iterable of query strings;
        ``None`` entries are skipped), ``content`` (document text) and
        ``keywords_or_phrases`` (iterable of mappings with the keys
        ``keyword_or_phrase`` and ``explanation``). A ``None`` value for any
        of the three is treated as empty.
    :return: dict with
        ``LQ``   - number of query tokens divided by ``len(queries)``
                   (skipped ``None`` queries still count in the divisor),
        ``LD``   - number of document tokens,
        ``LQD``  - per-query share of document tokens that also occur in the
                   query, averaged over ``len(queries)``,
        ``LKD``  - same share for each keyword/phrase, summed over keywords,
        ``LKED`` - same share for keyword plus explanation, summed over
                   keywords that have an explanation.
        All values except ``LD`` are floats; they are ``0.0`` when the
        document or the query list is empty instead of raising
        ``ZeroDivisionError``.
    """
    import re

    symbols = ["\\'", "'", '#', '!', '@', '$', '%', '^', '&', '*', '(', ')', '-', '=', '+', '[', ']', '{', '}', '|', '\\', ':', ';', '"', "'", '<', '>', ',', '.', '?', '/']
    escape = ''.join(re.escape(sym) for sym in symbols)
    pattern = re.compile(f"[^A-Za-zа-яА-Я0-9{escape}]+")

    def tokenize(text):
        # Same normalisation for documents, queries and keywords.
        return pattern.sub(" ", text.strip().lower()).split()

    queries = d['queries']
    if queries is None:
        queries = []
    kwargs = d['keywords_or_phrases']
    if kwargs is None:
        kwargs = []
    doc = d['content'] or ""

    # The document is tokenised once; it does not change inside the loops.
    doc_tokens = tokenize(doc)
    doc_vocab = set(doc_tokens)
    LD = len(doc_tokens)

    def overlap_share(tokens):
        # Share of the document length covered by distinct shared tokens.
        if LD == 0:
            return 0.0
        return len(set(tokens) & doc_vocab) / LD

    # Float accumulators keep the field types identical for every row.
    LQ: float = 0.0
    LQD: float = 0.0
    LKD: float = 0.0
    LKED: float = 0.0

    for q in queries:
        if q is None:
            continue
        q_tokens = tokenize(q)
        LQ += len(q_tokens)
        LQD += overlap_share(q_tokens)

    for kwarg in kwargs:
        keyword_or_phrase = kwarg['keyword_or_phrase']
        if keyword_or_phrase is None:
            continue
        LKD += overlap_share(tokenize(keyword_or_phrase))
        if kwarg['explanation'] is None:
            continue
        LKED += overlap_share(tokenize(keyword_or_phrase + "\n" + kwarg['explanation']))

    n_queries = len(queries)
    if n_queries:
        LQ = LQ / n_queries
        LQD = LQD / n_queries

    # TODO: add keywords
    return {
        "LQ": LQ,
        "LD": LD,
        "LQD": LQD,
        "LKD": LKD,
        "LKED": LKED
    }

In [58]:
# Apply `pl_coverage_for_doc` row by row to the three input columns and store
# the resulting struct under "monitor" (an existing column of that name is replaced).
pl_docs = pl_docs.with_columns([
    pl.struct(["queries", "content", "keywords_or_phrases"]).map_elements(pl_coverage_for_doc).alias("monitor")
])

In [59]:
# Preview the first rows of the frame.
pl_docs.head()

content,title,author,type,has_image,img_path,speaker,queries,keywords_or_phrases,chunk_id,monitor
str,str,str,str,bool,str,str,list[str],list[struct[2]],str,struct[5]
"""Inside was the…","""Сойка-пересмеш…","""Сьюзен Коллинз…","""book""",False,,,"[""What elements of nature were present in the artificial meadow created in District 13 in the 'Mockingjay' universe?"", ""In the 'Mockingjay' novel, who is observed sitting in a wheelchair in the middle of the recreated meadow in District 13, and what are they watching?"", … ""Gamers and sci-fi fans, have you pictured the scene where Beetee is observing a hummingbird in a blooming meadow within District 13? Could this symbolize hope or a pause from their usual grim setting? Let's chat! 🕹️ #GamingMeetsLiterature""]","[{""District 13"",""A place in the story 'The Hunger Games' series where people live and work, known for being less natural and more industrial.""}, {""replica of a meadow"",""An artificial version of a meadow created to look like a real one with trees and plants.""}, … {""orange tree"",""A fruit tree that produces oranges. It's known for its fragrant blossoms.""}]","""3973894b-1c86-…","{29.4,62,0.103226,0.145161,0.467742}"
"""To them, you'r…","""Темный рыцарь …",,"""movie""",True,"""./img/batman/j…","""Джокер""","[""Yo gamers! In the Dark Knight universe, why does the Joker believe that 'civilized people will eat each other' when things get rough? What's up with that dark view on humanity?"", ""Hey fellow gamers, in the context of The Dark Knight (2008), how does the Joker's perspective on Gotham's 'principles' and 'code' challenge Batman's ideology? Doesn't it make you think about the rules in our own game worlds?""]","[{""just a freak"",""Someone who is very different or unusual in a way that is not accepted by most people.""}, {""needed"",""Being necessary or required for a particular role or moment.""}, … {""see right through them"",""To understand or detect the true nature or motives of someone easily.""}]","""f88ec39e-bc84-…","{34.0,73,0.075342,0.219178,0.383562}"
"""He woke up the…","""Преступление и…","""Достоевский Ф.…","""book""",False,"""""",,"[""In the novel 'Crime and Punishment', how does Raskolnikov feel about the squalid conditions of his living space?"", ""Does Raskolnikov show any sign of discomfort or distress due to his overcrowded and dilapidated living conditions in 'Crime and Punishment'?"", … ""Yo, how does the minimal interaction with his maid reflect Raskolnikov's overall social withdrawal in 'Crime and Punishment'? Could this isolation be a strategic game move in his psychological survival game?""]","[{""restless sleep"",""Sleep that is not calm or peaceful, causing one to wake up still feeling tired.""}, {""gall bitter"",""Feeling very bitter or upset inside, often leading to irritation or anger.""}, … {""monomaniacs"",""People who are obsessively focused on a single thing, often ignoring everything else.""}]","""e2f09185-ce5c-…","{24.25,291,0.024914,0.024055,0.109966}"
"""And there she …","""Дюна""","""Герберт Ф.""","""book""",False,"""""",,"[""What emotions overwhelmed Paul after encountering the girl in the 'Dune' universe?"", ""Describe how Paul physically followed the girl after their meeting in the 'Dune' universe."", … ""Hey fellow gamers! Did you ever experience a wave of joy in your gaming victory that was as intense as Paul's when meeting the girl in Dune? Share your epic win!""]","[{""furious Shai-Hulud"",""Shai-Hulud is a giant desert worm in the story. Described as furious, it means very angry or aggressive.""}, {""easier path"",""An easier route or way compared to what was previously used.""}, … {""wave of joy"",""A strong, overwhelming feeling of happiness.""}]","""abaf6622-2a79-…","{22.5,95,0.047368,0.105263,0.252632}"
"""The instructio…","""Метро 2034""","""Дмитрий Глухов…","""book""",False,,,"[""How long were the metro inhabitants initially told they would need to stay in the refuge according to the book 'Metro 2034'?"", ""What happened to the metro trains after Judgment Day in the universe of 'Metro 2034'?"", … ""What's up folks? How did people from the Metro 2034 universe adapt to their new life underground, using train parts for survival?""]","[{""Judgment Day"",""In the context, it refers to a catastrophic event, likely a disaster or apocalypse, that drastically changes life as it was known.""}, {""frozen at the platform"",""This means the trains were stopped and left idle at the stations where they were located when the disaster happened.""}, … {""metro inhabitants"",""Refers to the people living inside the metro system, using it as a shelter.""}]","""b446c78f-993f-…","{20.5,59,0.144068,0.20339,0.508475}"


In [60]:
# Export the "monitor" column as a list of dicts, one per row.
js_docs_metrics = pl_docs.select("monitor").to_dicts()

In [61]:
def compute_statistcs(pl_df: pl.DataFrame) -> dict:
    """Aggregate the per-document ``monitor`` statistics over the whole frame.

    For each of the struct fields ``LQ``, ``LD``, ``LQD``, ``LKD`` and
    ``LKED`` the mean and the standard deviation across all rows are
    computed.

    :param pl_df: frame with a struct column named ``monitor`` that carries
        the five fields listed above.
    :return: dict with the keys ``<field>Mean`` and ``<field>Std`` for every
        field, in the order ``LQ``, ``LD``, ``LQD``, ``LKD``, ``LKED``.
        For a frame without rows the values are ``None`` rather than an
        ``IndexError`` being raised.
    """
    stat_fields = ("LQ", "LD", "LQD", "LKD", "LKED")
    aggregations = []
    for name in stat_fields:
        field = pl.col("monitor").struct.field(name)
        aggregations.append(field.mean().alias(f"{name}Mean"))
        aggregations.append(field.std().alias(f"{name}Std"))
    # Selecting only aggregations yields a single-row frame, so there is no
    # need to broadcast the scalars onto every row and de-duplicate them.
    return pl_df.select(aggregations).to_dicts()[0]

In [46]:
# Mean and standard deviation of every "monitor" field across all documents.
compute_statistcs(pl_docs)

{'LQMean': 22.48569043803419,
 'LQStd': 9.563272517598573,
 'LDMean': 144.125,
 'LDStd': 78.51795160154565,
 'LQDMean': 0.04775010984639291,
 'LQDStd': 0.03916782202521952,
 'LKDMean': 0.08644840221554007,
 'LKDStd': 0.06688186460471794,
 'LKEDMean': 0.21452636258872992,
 'LKEDStd': 0.11869616880230745}

In [47]:
# Exploding the "queries" list column yields one row per query, so the row count is the number of queries.
logger.info(f'Queries[D] = {pl_docs.explode("queries").shape[0]}')

[32m2025-02-18 05:16:48.941[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mQueries[D] = 14498[0m


In [48]:
# Convert the documents to plain dicts, leaving out the "monitor" column.
js_docs = pl_docs.drop(["monitor"]).to_dicts()

In [55]:
def run_fusion_pipeline(js_data: list[dict], source_col: str, target_col: str) -> tuple[int, int]:
    """
    Counts the keywords or phrases that do not occur verbatim in their document.

    The documents are only inspected, never modified. A keyword counts as
    missing when it is not an exact, case-sensitive substring of the text
    stored under `target_col`.

    :param js_data: documents to inspect.
    :type js_data: list[dict]
    :param source_col: column (field) within the `js_data` documents where keywords or phrases are taken from;
        each entry is a mapping with the keys `keyword_or_phrase` and `explanation`.
        A `None` value is treated as an empty list.
    :type source_col: str
    :param target_col: column (field) within the `js_data` documents, e.g. [content].
        A `None` value is treated as an empty string.
    :type target_col: str
    :return: `(total_missing_kwargs, total_keywords)`. `total_keywords` counts every keyword entry,
        including those skipped because the keyword or its explanation is `None`;
        `total_missing_kwargs` counts the non-skipped keywords absent from the document text.
    """
    total_missing_kwargs: int = 0
    total_keywords: int = 0
    for js_doc in js_data:
        ls_kwargs = js_doc[source_col] or []
        cur_doc_content = js_doc[target_col] or ""
        for js_kwarg in ls_kwargs:
            kwp = js_kwarg['keyword_or_phrase']
            exp = js_kwarg['explanation']
            # Every entry is counted, even when it is skipped below.
            total_keywords += 1
            if kwp is None or exp is None:
                continue
            if kwp not in cur_doc_content:
                total_missing_kwargs += 1
    return total_missing_kwargs, total_keywords

In [56]:
# Count the keywords that are not a substring of their document's "content".
total_missing_kwargs, total_keywords = run_fusion_pipeline(js_data=js_docs, source_col="keywords_or_phrases", target_col="content")

In [57]:
# Report the missing-keyword count next to the total number of keyword entries.
logger.info(f"There are total L1=[{total_missing_kwargs}] out of L=[{total_keywords}] keywords that are NOT substring of the respective content")

[32m2025-02-18 05:19:49.121[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mThere are total L1=[3363] out of L=[28900] keywords that are NOT substring of the respective content[0m
