In [1]:
import os
import re
from typing import List
from urllib.parse import unquote

import pandas as pd
import pymorphy2

In [2]:
# Configuration.
# The project root can be overridden with the PROJECT_PATH environment
# variable, so the notebook is not tied to one machine; the original
# location remains the fallback when the variable is not set.
PROJECT_PATH = os.environ.get(
    "PROJECT_PATH",
    "/home/alex/paper-2025-anonymous-submission",
)

In [3]:
# Load the processed dataset from the project tree. With orient="index"
# the top-level keys of the JSON object become the row labels.
dataset = pd.read_json(os.path.join(PROJECT_PATH, "Data/processed_data/dataset.json"), orient="index")

  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)


In [4]:
# Give every row the same explanation text.
# NOTE(review): the literal looks like a placeholder — confirm that real
# explanations replace it before the hit rate below is interpreted.
dataset["deepseak_explain"] = ["какой то текст для объяснения"] * len(dataset)

In [5]:
# Discard rows whose "annotations" value is missing.
dataset = dataset.loc[dataset["annotations"].notna()]

In [6]:
def _collect_reference_values(value):
    """Return the non-blank strings held by ``value``.

    ``value`` may be a single string or a list; anything else (for example
    ``None``) yields an empty list. Strings are returned unmodified.
    """
    if isinstance(value, str):
        candidates = [value]
    elif isinstance(value, list):
        candidates = value
    else:
        return []

    # The same blank filter is applied to list items and to single strings:
    # an empty needle would later "match" any text at position 0.
    return [item for item in candidates if isinstance(item, str) and item.strip() != ""]


def parse_one_annotation(annot):
    """Extract the reference strings and reference URLs of one annotation.

    Args:
        annot: mapping that may carry the keys ``"reference_string"`` and
            ``"reference_url"``; each value is a string or a list of
            strings. A missing key is treated like an empty value.

    Returns:
        A ``(searcheable_elements, searcheable_links)`` pair of lists that
        contain only non-blank strings.
    """
    searcheable_elements = _collect_reference_values(annot.get("reference_string"))
    searcheable_links = _collect_reference_values(annot.get("reference_url"))

    return searcheable_elements, searcheable_links

In [7]:
def extract_searcheable_elements_and_links(annotations):
    """Gather reference strings and reference links from a row's annotations.

    Each entry of ``annotations`` is either a single annotation dict or a
    list of annotations; entries of any other type are ignored. Every
    annotation is handed to ``parse_one_annotation`` and the results are
    concatenated in encounter order.

    Returns:
        A ``(searcheable_elements, searcheable_links)`` pair of flat lists.
    """
    found_strings, found_urls = [], []

    for entry in annotations:
        # Normalise both accepted shapes to "a group of annotations".
        if isinstance(entry, dict):
            group = [entry]
        elif isinstance(entry, list):
            group = entry
        else:
            continue

        for item in group:
            strings, urls = parse_one_annotation(item)
            found_strings.extend(strings)
            found_urls.extend(urls)

    return found_strings, found_urls

In [8]:
# Length of each row's "annotations" value.
dataset["annotations_len"] = dataset["annotations"].apply(len)

In [9]:
# Parse each row's annotations once and split the resulting
# (elements, links) pair into two columns, instead of running the
# extraction twice per row.
extracted_pairs = dataset["annotations"].apply(extract_searcheable_elements_and_links)
dataset["searcheable_elements"] = extracted_pairs.apply(lambda pair: pair[0])
dataset["searcheable_links"] = extracted_pairs.apply(lambda pair: pair[1])

In [10]:
# How many reference strings / reference links each row yielded.
dataset["searcheable_elements_len"] = dataset["searcheable_elements"].apply(len)
dataset["searcheable_links_len"] = dataset["searcheable_links"].apply(len)

In [11]:
# Keep only the rows that yielded at least one reference string or at least
# one reference link.
dataset = dataset.query("searcheable_elements_len >= 1 or searcheable_links_len >= 1")

In [12]:
def normalize_link(link):
    """Reduce a reference URL to a lowercase, letters-only phrase.

    Steps:
      1. Strip the Russian Wikipedia / Wiktionary ``/wiki/`` prefix when
         present. A link without that exact prefix is left untouched here
         rather than raising, and simply goes through the generic cleanup.
      2. Drop the ``#fragment`` part.
      3. Decode percent-escapes, so that URL-encoded Cyrillic titles become
         readable text instead of hex-letter debris.
      4. Remove parenthesised qualifiers, replace every character that is
         not a Latin or Cyrillic letter with a space, and collapse
         whitespace.

    Args:
        link: URL or page title as a string.

    Returns:
        The normalized, lowercased phrase (possibly empty).
    """
    for wiki_prefix in (
        "https://ru.wikipedia.org/wiki/",
        "https://ru.wiktionary.org/wiki/",
    ):
        if wiki_prefix in link:
            link = link.split(wiki_prefix)[1]
            break

    # Cut the fragment before decoding, so an escaped "#" inside a title
    # is not mistaken for a fragment separator.
    link = link.split("#", 1)[0]
    link = unquote(link)

    link = re.sub(r'\(.*?\)', '', link).strip()

    link = re.sub(r'[^a-zA-Zа-яА-ЯёЁ]', ' ', link)
    link = re.sub(r'\s+', ' ', link).strip()

    return link.lower()

In [13]:
def tokenize(text: str):
    """Lowercase ``text`` and return its word tokens joined by single spaces."""
    lowered = text.lower()
    words = re.findall(r'\b\w+\b', lowered)
    return " ".join(words)

# Shared analyzer instance. Building a MorphAnalyzer is expensive, so it
# is created on first use and then reused by every lemmatize() call.
_MORPH_ANALYZER = None


def _get_morph_analyzer():
    """Return the shared pymorphy2 analyzer, creating it on first use."""
    global _MORPH_ANALYZER
    if _MORPH_ANALYZER is None:
        _MORPH_ANALYZER = pymorphy2.MorphAnalyzer()
    return _MORPH_ANALYZER


def lemmatize(tokens: str):
    """Replace every token by the normal form of its first parse.

    Args:
        tokens: tokens separated by single spaces.

    Returns:
        The normal forms joined by single spaces, in the original order.
    """
    morph = _get_morph_analyzer()
    return " ".join(morph.parse(token)[0].normal_form for token in tokens.split(" "))

def normalize_text(text):
    """Tokenize and lemmatize ``text``.

    Returns:
        The lemmatized tokens as one space-separated string.
    """
    tokens = tokenize(text)
    # lemmatize() already returns a space-joined string; joining it again
    # would insert a space between every single character.
    return lemmatize(tokens)

In [14]:
# Lowercase the reference strings, then derive a tokenized and a lemmatized
# variant. Every column keeps one list per row, with one entry per string.
dataset["searcheable_elements"] = dataset["searcheable_elements"].apply(
    lambda items: list(map(str.lower, items))
)
dataset["searcheable_elements_tokenized"] = dataset["searcheable_elements"].apply(
    lambda items: list(map(tokenize, items))
)
dataset["searcheable_elements_normalized"] = dataset["searcheable_elements_tokenized"].apply(
    lambda items: list(map(lemmatize, items))
)

In [15]:
# Turn each reference link into a lowercase phrase, then derive the
# tokenized and lemmatized variants (one list per row).
dataset["searcheable_links"] = dataset["searcheable_links"].apply(
    lambda urls: [str.lower(normalize_link(url)) for url in urls]
)
dataset["searcheable_links_tokenized"] = dataset["searcheable_links"].apply(
    lambda phrases: list(map(tokenize, phrases))
)
dataset["searcheable_links_normalized"] = dataset["searcheable_links_tokenized"].apply(
    lambda phrases: list(map(lemmatize, phrases))
)

In [16]:
# Per-row search needles: the lowercased, tokenized and lemmatized
# reference strings, concatenated in that order.
# NOTE(review): the link-derived columns are not part of this list —
# confirm that leaving them out of the search is intended.
dataset["searcheable_all"] = (
    dataset["searcheable_elements"]
    + dataset["searcheable_elements_tokenized"]
    + dataset["searcheable_elements_normalized"]
)

In [17]:
# Spot-check the derived columns for one article, selected by its URL.
dataset.query("article_url == 'https://www.kommersant.ru/doc/6209906'")

Unnamed: 0,annotations,summary,is_word_play,date,article_url,headline,lead,deepseak_explain,annotations_len,searcheable_elements,searcheable_links,searcheable_elements_len,searcheable_links_len,searcheable_elements_tokenized,searcheable_elements_normalized,searcheable_links_tokenized,searcheable_links_normalized,searcheable_all
2014,"[{'headline_substring': 'Дели', 'start_index':...",Саммит лидеров G20 завершился подписанием итог...,True,2023-09-10,https://www.kommersant.ru/doc/6209906,На самом Дели,Что участники саммита G20 предпочли украинской...,какой то текст для объяснения,2,"[деле, на са]",[],2,0,"[деле, на са]","[дело, на са]",[],[],"[деле, на са, деле, на са, дело, на са]"


In [18]:
def do_search(searcheable_elements, all_texts):
    """Look up every search element in every text variant.

    Args:
        searcheable_elements: iterable of needle strings.
        all_texts: list of exactly three haystack strings.

    Returns:
        A flat list of ``str.find`` positions, ordered element by element
        and, within an element, text by text. ``-1`` means "not found".
        An empty needle is reported as ``-1``: ``str.find`` would return 0
        for it on any text, which would be counted as a hit.

    Raises:
        AssertionError: if ``all_texts`` is not a list of length 3.
    """
    assert isinstance(all_texts, list)
    assert len(all_texts) == 3

    return [
        str.find(text, elem) if elem else -1
        for elem in searcheable_elements
        for text in all_texts
    ]

In [19]:
EXPLIAN_COLUMNS = ["deepseak_explain"]

# For every explanation column: build the lowercased, tokenized and
# lemmatized variants of the text, search all reference needles in them and
# store whether at least one needle was found.
# NOTE: the explanation column itself is replaced by a single-item list, so
# this cell cannot be re-run without re-creating that column first.
for explain_column in EXPLIAN_COLUMNS:
    lowered = dataset[explain_column].apply(str.lower)
    tokenized = lowered.apply(tokenize)
    normalized = tokenized.apply(lemmatize)

    # Wrap each variant in a list so the three columns can be concatenated
    # into the three-item list that do_search() expects.
    dataset[explain_column] = lowered.apply(lambda x: [x])
    dataset[f"{explain_column}_tokenized"] = tokenized.apply(lambda x: [x])
    dataset[f"{explain_column}_normalized"] = normalized.apply(lambda x: [x])
    dataset[f"{explain_column}_all"] = (
        dataset[explain_column]
        + dataset[f"{explain_column}_tokenized"]
        + dataset[f"{explain_column}_normalized"]
    )

    # default=-1 covers rows without any needle (for example rows that only
    # have reference links): they count as "not found" instead of making
    # max() raise on an empty list.
    best_positions = dataset.apply(
        lambda row: max(
            do_search(row["searcheable_all"], row[f"{explain_column}_all"]),
            default=-1,
        ),
        axis=1,
    )

    # Any position >= 0 means at least one needle occurred in some variant.
    dataset[f"{explain_column}_search_results"] = best_positions >= 0




In [20]:
# Show up to two rows; the fixed random_state makes the displayed sample
# reproducible, and the min() guard keeps the cell working on a frame with
# fewer than two rows.
dataset.sample(n=min(2, len(dataset)), random_state=0)

Unnamed: 0,annotations,summary,is_word_play,date,article_url,headline,lead,deepseak_explain,annotations_len,searcheable_elements,...,searcheable_links_len,searcheable_elements_tokenized,searcheable_elements_normalized,searcheable_links_tokenized,searcheable_links_normalized,searcheable_all,deepseak_explain_tokenized,deepseak_explain_normalized,deepseak_explain_all,deepseak_explain_search_results
2077,[{'headline_substring': 'Культурная резолюция'...,Белый дом утвердил концепцию развития творческ...,True,2021-09-28,https://www.kommersant.ru/doc/5007033,Культурная резолюция,В России планируется создать систему поддержки...,[какой то текст для объяснения],1,[культурная революция],...,1,[культурная революция],[культурный революция],[культурная революция],[культурный революция],"[культурная революция, культурная революция, к...",[какой то текст для объяснения],[какой то текст для объяснение],"[какой то текст для объяснения, какой то текст...",False
2297,"[{'headline_substring': 'Код накликал', 'start...",Зародившееся как антагонист глобальным IТ-лиде...,True,2021-10-28,https://www.kommersant.ru/doc/5050986,Код накликал,Зачем и кому нужен Open Source в России,[какой то текст для объяснения],1,[кот наплакал],...,1,[кот наплакал],[кот наплакать],[кот наплакал],[кот наплакать],"[кот наплакал, кот наплакал, кот наплакать]",[какой то текст для объяснения],[какой то текст для объяснение],"[какой то текст для объяснения, какой то текст...",False


In [23]:
# Share of rows for which at least one reference was found, per explanation column.
result_columns = [f"{t}_search_results" for t in EXPLIAN_COLUMNS]
dataset[result_columns].mean()

deepseak_explain_search_results    0.000968
dtype: float64