In [21]:
import pandas as pd
import jsonlines
import sys
import uuid
from typing import Any, Optional
import os
sys.path.append("../")
# sys.path.append("../src/")

from src.index import create_index, index_jsonl
from src.elastic.elastic_api import ElasticIndex
from src.utils import create_suggests_jsonl

In [2]:
# Client for the "suggest" index. Port and password can be overridden through
# the ELASTIC_HOST_PORT / ELASTIC_PASSWORD environment variables so that the
# secret does not have to be edited into the notebook.
# FIXME(security): the fallback password below is still a committed secret —
# rotate it and drop the default once ELASTIC_PASSWORD is provisioned.
suggest_elastic_client = ElasticIndex(
    index_name="suggest",
    # Make sure you are using the correct port.
    elastic_host_port=os.environ.get("ELASTIC_HOST_PORT", "8201"),
    elastic_password=os.environ.get("ELASTIC_PASSWORD", "F4ky9EiA-VfSjy6ygY6B"),
    elastic_ca_certs_path="../src/elastic/certs/http_ca.crt",
)

In [3]:
# Build the suggestions JSONL from the post-processed parquet dump.
# NOTE(review): no output path is passed, so the helper's default is used —
# presumably ../data/suggests.jsonl, which is what gets indexed later; confirm.
create_suggests_jsonl(path_to_pq="../data/yappy_hackaton_2024_40k_postprocessed.pq")

In [4]:
# Create the index from its settings file using the suggest client.
create_index(
        path_to_index_json="../src/elastic/settings/suggest_index.json",
        elastic_client=suggest_elastic_client,
    )

# Load the generated suggestion documents into that index.
index_jsonl(path_to_jsonl="../data/suggests.jsonl", elastic_client=suggest_elastic_client)

[32m2024-06-14 12:35:32.250[0m | [1mINFO    [0m | [36msrc.elastic.elastic_api[0m:[36mcreate_index[0m:[36m68[0m - [1mIndex with name 'suggest' is created.[0m
[32m2024-06-14 12:35:32.340[0m | [1mINFO    [0m | [36msrc.elastic.elastic_api[0m:[36mbulk_documents[0m:[36m170[0m - [1mIndexing documents... Overall documents: 50644[0m
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 50501/50644 [00:05<00:00, 12108.04docs/s][32m2024-06-14 12:35:38.155[0m | [1mINFO    [0m | [36msrc.elastic.elastic_api[0m:[36mbulk_documents[0m:[36m186[0m - [1mIndexed 50644/50644 documents[0m
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50644/50644 [00:05<00:00, 8700.84docs/s]
[32m2024-06-14 12:35:38.217[0m | [1mINFO    [0m | [36msrc.elastic.elastic_api[0m:[36mcount_documents

In [62]:
def search_suggests(user_query: str, elastic_client: Any, size: int = 5) -> dict:
    """Fetch completion suggestions for a user query from Elasticsearch.

    :param user_query: Text typed by the user; sent as the completion text.
    :type user_query: str
    :param elastic_client: Object exposing ``local_client`` (with a ``search``
        method) and ``index_name``.
    :type elastic_client: Any
    :param size: Maximum number of suggestions to request. Defaults to 5.
    :type size: int
    :return: Dictionary with a single ``"suggests"`` key holding the list of
        suggestion texts (empty for an empty or blank query).
    :rtype: dict
    """
    # Nothing to complete for an empty/blank query, so skip the round trip.
    if not user_query or not user_query.strip():
        return {"suggests": []}

    body = {
        # Only the "suggest" section of the response is read below, so neither
        # document sources nor regular hits need to be returned.
        "_source": False,
        "size": 0,
        "suggest": {
            "suggest-bucket": {
                "text": user_query,
                "completion": {
                    "field": "suggest",
                    "size": size,
                    "skip_duplicates": True,
                },
            }
        },
    }

    # Errors from the client propagate unchanged to the caller.
    response = elastic_client.local_client.search(index=elastic_client.index_name, body=body)
    options = response["suggest"]["suggest-bucket"][0]["options"]
    return {"suggests": [option["text"] for option in options]}

In [82]:
# Try the suggester on a short prefix against the suggest index client.
res = search_suggests("robl", suggest_elastic_client)

In [83]:
# Display the suggestions returned for the query above.
res

{'suggests': ['roblox', 'robloxalan', 'robloxcore', 'robloxdead', 'robloxdj']}

## Create jsonl

In [38]:
def create_suggests_jsonl(
    data: Optional[pd.DataFrame] = None,
    path_to_pq: Optional[str] = None,
    path_to_save: str = "../data/suggests.jsonl",
):
    """Write the JSONL documents used for word-level suggestions.

    Whole values of the ``song_author``, ``song_name`` and
    ``song_author_transliterated`` columns, plus the space-separated tokens of
    ``text_hashtags``, are collected when they are longer than 3 characters.
    Each unique suggestion is written as ``{"_id": <uuid4 hex>, "suggest": <text>}``.
    Non-string cells (e.g. missing values) are skipped.

    :param data: DataFrame with the data. If omitted, ``path_to_pq`` is read.
    :type data: Optional[pd.DataFrame]
    :param path_to_pq: Path to a Parquet file with the data. Used only when
        ``data`` is not given.
    :type path_to_pq: Optional[str]
    :param path_to_save: Path of the JSONL file to write; an existing file is
        overwritten.
    :type path_to_save: str
    :raises ValueError: If neither ``data`` nor ``path_to_pq`` is provided.
    """
    # NOTE(review): this definition rebinds the `create_suggests_jsonl` name that
    # the notebook also imports from `src.utils` — confirm which one is intended.
    if data is None and path_to_pq is None:
        raise ValueError("Either `data` or `path_to_pq` must be provided.")
    if data is None:
        data = pd.read_parquet(path_to_pq)

    final_suggests = set()

    # Author / title columns are kept as whole phrases.
    for column in ("song_author", "song_name", "song_author_transliterated"):
        for sentence in data[column].tolist():
            # Missing values come through as None/NaN and have no usable text.
            if isinstance(sentence, str) and len(sentence) > 3:
                final_suggests.add(sentence)

    # Hashtag text contributes its individual space-separated tokens.
    for sentence in data["text_hashtags"].tolist():
        if not isinstance(sentence, str):
            continue
        final_suggests.update(token for token in sentence.split(" ") if len(token) > 3)

    # "w" truncates any previous file; sorting keeps the line order stable
    # between runs (the generated ids are still random).
    with jsonlines.open(path_to_save, mode="w") as writer:
        for suggest in sorted(final_suggests):
            writer.write({"_id": uuid.uuid4().hex, "suggest": suggest})

In [40]:
# Regenerate the suggestions file from data_postproc.pq using the
# create_suggests_jsonl defined in this notebook.
create_suggests_jsonl(path_to_pq="../data/data_postproc.pq", path_to_save="../data/suggests.jsonl")