In [None]:
import collections
import os
from functools import partial
from pathlib import Path
from pprint import pprint as pp

import dotenv
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import polars as pl
import random_name
import sklearn
import torch
from bertopic import BERTopic
from hdbscan import HDBSCAN
from more_itertools import chunked
from sklearn.cluster import KMeans
from umap import UMAP
import plotly.express as px
import plotly.io as pio
from typing import Union, List, Iterable
from catalyst import metrics

from vdorogu.inferencer.inference import Inferencer
from vdorogu.modeling.module import M1Model

if sklearn.__version__ < "1.2.2":
    from sklearn.feature_extraction import CountVectorizer
else:
    from sklearn.feature_extraction.text import CountVectorizer

In [None]:
class Monitor:
    """Keeps one running ``metrics.AdditiveMetric`` per class name.

    The trackers live in ``self.monitor`` (class name -> metric object) and are
    created with ``compute_on_call=False``.
    """

    def __init__(self, classes):
        """Create an empty tracker for every entry of ``classes``."""
        self.monitor = {}
        for name in list(classes):
            self.monitor[name] = metrics.AdditiveMetric(compute_on_call=False)

    def update(self, key, value: Union[np.ndarray, List[np.ndarray]], n: int = 1):
        """Feed ``value`` into the tracker registered under ``key``.

        key: the class name whose tracker is updated.
        value: a single observation (a numpy array or any non-iterable value),
            or an iterable of observations.
        n: sample count forwarded with a single observation; every element of
            an iterable is forwarded with a count of 1 instead.

        Returns ``self`` so calls can be chained.
        """
        single_observation = isinstance(value, np.ndarray) or not isinstance(value, Iterable)
        if single_observation:
            self.monitor[key].update(value, n)
            return self

        for item in value:
            self.monitor[key].update(item, 1)
        return self

    def compute(self, key=None):
        """Return the first element of ``compute()`` for one tracker or for all.

        With ``key`` given, the value for that class is returned directly;
        otherwise a ``{class name: value}`` dict covering every tracker.
        """
        if key is None:
            return {name: tracker.compute()[0] for name, tracker in self.monitor.items()}
        return self.monitor[key].compute()[0]

In [None]:
def evaluate(
    _docs,
    _topic_per_doc,
    _label_per_doc,
    _topics_to_show=None,
    _labels_to_show=None,
    _reduced_embeddings=None,
    sample: float = None,
    hide_annotations: bool = False,
    hide_document_hover: bool = False,
    custom_labels: bool = False,
    title: str = "<b>Documents and Topics</b>",
    width: int = 1200,
    height: int = 750) -> "tuple[go.Figure, dict]":
    """Plot documents grouped by topic and score how labels line up with topics.

    Every selected topic becomes one scatter trace named after its three most
    frequent labels. For each of those labels the share ``count / topic size``
    is fed into a per-label ``Monitor``; the aggregated values are returned
    next to the figure.

    Arguments:
            _docs: The documents.
            _topic_per_doc: Topic id assigned to each document.
            _label_per_doc: Ground-truth label of each document.
            _topics_to_show: A selection of topics to visualize.
                    Not to be confused with the topics that you get from `.fit_transform`.
                    For example, if you want to visualize only topics 1 through 5:
                    `topics = [1, 2, 3, 4, 5]`. Defaults to every topic.
            _labels_to_show: Accepted for interface compatibility; not used.
            _reduced_embeddings: The 2D reduced embeddings of all documents in `_docs`.
                    Required, despite the `None` default.
            sample: The fraction of documents in each topic that you would like to keep.
                    Value can be between 0 and 1. `None` or values above 1 keep everything.
                    Topics with fewer than 100 documents are never down-sampled.
            hide_annotations: Hide the names of the traces on top of each cluster.
            hide_document_hover: Hide the content of the documents when hovering over
                                specific points. Helps to speed up generation of visualization.
            custom_labels: Accepted for interface compatibility; not used.
            title: Title of the plot (see the NOTE next to the layout code).
            width: The width of the figure.
            height: The height of the figure.

    Returns:
            A `(figure, scores)` tuple where `scores` maps each label to the value
            computed by its monitor; `None` and NaN values are left out.

    Raises:
            ValueError: if `_reduced_embeddings` is not provided.
    """

    if _reduced_embeddings is None:
        # Without coordinates the frame has no x/y columns and the plotting code
        # below would fail later with an opaque AttributeError.
        raise ValueError("`_reduced_embeddings` is required: pass the 2D coordinates of every document.")
    _reduced_embeddings = np.asarray(_reduced_embeddings)

    if sample is None or sample > 1:
        sample = 1

    unique_topics = set(_topic_per_doc)
    topic_array = np.array(_topic_per_doc)  # built once instead of on every iteration

    indices = []
    for topic in unique_topics:
        # All document positions that belong to this topic
        s = np.where(topic_array == topic)[0]
        # Small topics are kept whole; larger ones are sub-sampled
        size = len(s) if len(s) < 100 else int(len(s) * sample)
        indices.extend(np.random.choice(s, size=size, replace=False))

    # An explicit integer dtype keeps an empty selection usable as an index array
    indices = np.array(indices, dtype=int)

    idf = pd.DataFrame(
        {
            "topic": [_topic_per_doc[idx] for idx in indices],
            "doc": [_docs[idx] for idx in indices],
            "_label": [_label_per_doc[idx] for idx in indices]
        }
    )

    monitor = Monitor(set(_label_per_doc))

    fig = go.Figure()

    embeddings_2d = _reduced_embeddings[indices]
    idf["x"] = embeddings_2d[:, 0]
    idf["y"] = embeddings_2d[:, 1]

    _topics_to_show = unique_topics if _topics_to_show is None else _topics_to_show

    non_selected_topics = unique_topics.difference(_topics_to_show)

    if len(non_selected_topics) == 0:
        non_selected_topics = [-1]  # `bertopic` assigns `-1` to documents that belong to no cluster

    # Copy + fresh 0..n-1 index: the annotation row below is written at label
    # `len(selection)`, which must not collide with a label inherited from `idf`
    # (a collision would overwrite a real document instead of appending a row).
    selection = idf.loc[idf.topic.isin(non_selected_topics), :].copy().reset_index(drop=True)
    selection["text"] = ""
    selection.loc[len(selection), :] = [None, None, "", selection.x.mean(), selection.y.mean(), "Other documents"]

    fig.add_trace(
        go.Scattergl(
            x=selection.x,
            y=selection.y,
            hovertext=selection.doc if not hide_document_hover else None,
            hoverinfo="text",
            mode="markers+text",
            name="other",
            showlegend=False,
            marker=dict(color="#CFD8DC", size=5, opacity=0.5),
        )
    )

    # Now draw every selected topic as its own trace
    for topic in unique_topics:
        if topic in _topics_to_show and topic != -1:
            selection = idf.loc[idf.topic == topic, :].copy().reset_index(drop=True)
            selection["text"] = ""

            _selection = pl.from_pandas(selection)

            # Attach to every row the number of documents sharing its label within this topic
            _selection = _selection.with_row_count().\
                with_columns([
                    pl.count().over("_label").alias("label_len")
                ])

            # One representative row per label (deduplicating on the count would
            # drop labels that happen to have equal counts), then the 3 largest.
            _selection_per_topk = _selection.sort("label_len", descending=True).unique(subset=["_label"], maintain_order=True).top_k(3, by="label_len")

            _topk = _selection.join(_selection_per_topk, on=pl.col("row_nr")).select(pl.col("_label"), pl.col("label_len")).to_arrow()

            _label, _mass = [str(yi) for yi in _topk["_label"]], [str(pi) for pi in _topk["label_len"]]

            for li, mi in zip(_label, _mass):
                # The score is the share of the topic taken by this label: mi / len(_selection)
                monitor.update(li, int(mi) * 1.0 / len(_selection))

            _label_on_doc = "  ".join([yi.strip()[:20] for yi, pi in zip(_label, _mass)])  # always visible on top of the cloud

            _label_on_topic = " ".join([yi.strip()[:22] + " (" + pi + ")" for yi, pi in zip(_label, _mass)])  # trace name shown in the legend

            if not hide_annotations:
                selection.loc[len(selection), :] = [None, None, "", selection.x.mean(), selection.y.mean(), _label_on_doc[:50]]  # TODO: change topic to label

            fig.add_trace(
                go.Scattergl(
                    x=selection.x,
                    y=selection.y,
                    hovertext=selection.doc if not hide_document_hover else None,
                    hoverinfo="text",
                    text=selection.text,
                    mode="markers+text",
                    name=_label_on_topic,
                    textfont=dict(size=12),
                    marker=dict(size=5, opacity=0.5),
                )
            )

    # Add grid in a 'plus' shape
    x_range = (idf.x.min() - abs((idf.x.min()) * .15), idf.x.max() + abs((idf.x.max()) * .15))
    y_range = (idf.y.min() - abs((idf.y.min()) * .15), idf.y.max() + abs((idf.y.max()) * .15))
    fig.add_shape(type="line",
                  x0=sum(x_range) / 2, y0=y_range[0], x1=sum(x_range) / 2, y1=y_range[1],
                  line=dict(color="#CFD8DC", width=2))
    fig.add_shape(type="line",
                  x0=x_range[0], y0=sum(y_range) / 2, x1=x_range[1], y1=sum(y_range) / 2,
                  line=dict(color="#9E9E9E", width=2))
    fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10)
    fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10)

    # Stylize layout
    fig.update_layout(
        template="simple_white",
        title={
            'text': f"{title}",
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
        },
        width=width,
        height=height
    )

    fig.update_traces(textposition='top center')

    # NOTE(review): this unconditionally replaces the text of `title` set above,
    # so the `title` argument currently has no visible effect. Kept as is to
    # preserve the rendered report; confirm which title is actually wanted.
    fig.update_layout(
        title_text='Распределение по топикам и TOP_3 соответствующих класса (по частоте) на каждый топик '
    )
    fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')

    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)

    # NaN never compares identical to `np.nan` unless it is that very object,
    # so missing scores are detected by value instead of with `is`.
    scores = {
        label: score
        for label, score in monitor.compute().items()
        if score is not None and not (isinstance(score, (float, np.floating)) and np.isnan(score))
    }

    return fig, scores

In [None]:
def silo(_df, col, labels):
    """Flag rows whose `col` value contains at least one of `labels`.

    `_df` is returned with an extra boolean column named "silo". Labels are
    matched as literal substrings: the pattern handed to `str.contains` is a
    regular expression, so regex syntax characters inside a label are
    backslash-escaped first. With no labels at all nothing can match, so the
    column is all False (an empty pattern would otherwise match every row).
    """
    # Characters with a special meaning in the regex syntax
    escape_table = {ord(ch): "\\" + ch for ch in "\\.+*?()|[]{}^$#&-~"}
    escaped = [str(label).translate(escape_table) for label in labels]

    if not escaped:
        return _df.with_columns([pl.lit(False).alias("silo")])

    pattern = "|".join(escaped)
    _df = _df.with_columns([pl.col(col).str.contains(pattern).alias("silo")])
    return _df

In [None]:
# Load variables from a .env file into the environment; later cells read
# MODEL, DATASET and EMBEDDINGS from os.environ.
dotenv.load_dotenv()

In [None]:
# Indexing os.environ (instead of .get) fails with a KeyError naming the
# missing variable, rather than a TypeError raised from Path(None).
inf = Inferencer(
    model='vdorogu/inferencer/models/web/labse_dense_retrieval_title.py',
    storage_path=None,
    model_data_path=str(Path(os.environ["MODEL"]).expanduser()),
    batch_size=256,
    mode='document_emb',
    half=True,
    gpus=0,
    model_params={},
)

In [None]:
# Wrap the inferencer; `model.embed_documents` is what produces the document embeddings later on.
model = M1Model(model=inf)

In [None]:
# Read the labelled dataset. Indexing os.environ makes a missing DATASET
# variable fail with a KeyError that names it, instead of a TypeError from Path(None).
df = pl.read_csv(str(Path(os.environ["DATASET"]).expanduser()))

In [None]:
# Dimensionality reduction handed to BERTopic: 5 components, cosine metric.
# NOTE(review): no random_state is set, so runs are presumably not reproducible - confirm whether that matters.
umap_model = UMAP(n_components=5, metric="cosine")

In [None]:
# KMeans configured for 40 clusters.
clustering_model = KMeans(n_clusters=40)

In [None]:
# 1. The labelling is poorly separable - so much so that it is sometimes hard
#       even for a human to tell which class a banner belongs to
# 2. There are very many classes with fewer than a hundred examples each

# Therefore, let's do a little preprocessing first.

In [None]:
print(df.shape[0]) # total number of examples

<p>Сколько всего <b><u>уникальных</u></b> классов из разметки</p>

In [None]:
print(df.select("taxons").unique().shape[0]) # total number of distinct classes

<p>Кол-во примеров, приходящихся на классы, в которых мало примеров</p>

$$p_i < 100$$

In [None]:
# Number of rows (examples) whose class has fewer than 100 examples:
# every row gets the size of its "taxons" group in a "count" column, then rows are filtered on it.
print(df.with_columns([pl.count().over(pl.col("taxons"))]).filter(pl.col("count") < 100).shape[0])

<p>Это очень много, поэтому просто удалить эти классы нельзя - пропадет много данных.</p>

Давайте оставим только те $x_i$, у которых название класса содержит название популярного класса (т.к. разметка представляет собой дерево)

In [None]:
# Same data with a "count" column holding the number of rows that share each row's "taxons" value.
_df = df.with_columns([pl.count().over(pl.col("taxons"))])

In [None]:
# One row per class, largest classes first, converted to an Arrow table for plotting.
_pdf = _df.unique(subset="taxons").sort(by="count", descending=True).to_arrow()

In [None]:
# Class-size curve: one point per class, in the order stored in `_pdf`.
# The tick label is "<position>|<first 10 characters of the class name>".
tick_labels = [str(i) + "|" + str(x)[:10] for i, x in enumerate(_pdf["taxons"])]  # TODO: make unique
class_sizes = [int(str(x)) for x in _pdf["count"]]

fig = px.line(x=tick_labels, y=class_sizes)
fig.update_layout(
    title=dict(text="Распределение размеченных баннеров", font=dict(size=12), automargin=True, yref='paper'),
    yaxis_title=None,
    xaxis_title=None,
)
fig.update_xaxes(tickfont_size=14, ticks="outside", ticklen=1, tickwidth=1)
fig.show()

In [None]:
# Export the class-size figure as a PNG using the kaleido engine.
pio.write_image(fig, file="original_label_mass.png", scale=5, engine="kaleido")

<p>Сначала фиксируем популярные классы, которые подходят для анализа</p>

In [None]:
labels = [str(li) for li in _df.filter(pl.col("count") >= 100).unique("taxons").select("taxons").to_arrow()["taxons"]]

In [None]:
labels[:10]

In [None]:
dff = df.pipe(partial(silo, col="taxons", labels=labels)).filter("silo") # Оставим те фразы, где есть такой "топовый" префикс

In [None]:
dff.shape # much better - at least it is not half of the data :-)

In [None]:
# Plain Python string lists for the texts and their classes, taken from a single Arrow conversion.
_filtered_arrow = dff.to_arrow()
docs = [str(x) for x in _filtered_arrow["text"]]
labels = [str(y) for y in _filtered_arrow["taxons"]]

In [None]:
# Optional on-disk cache for the document embeddings, enabled through the
# EMBEDDINGS environment variable (an empty value counts as "not set").
embeddings_env = os.environ.get("EMBEDDINGS")
embeddings_path = Path(embeddings_env).expanduser() if embeddings_env else None

if embeddings_path is not None and embeddings_path.exists():
    embeddings = np.load(str(embeddings_path))
else:
    embeddings = model.embed_documents(docs)
    if embeddings_path is not None:
        # Saving through an open file handle writes to exactly this path.
        # Given a file name, np.save appends ".npy" when the suffix is missing,
        # and the `exists()` check above would then never find the cache.
        with open(embeddings_path, "wb") as cache_file:
            np.save(cache_file, embeddings)

assert embeddings.shape[0] == len(docs), (
    f"got {embeddings.shape[0]} embeddings for {len(docs)} documents; "
    "if they were loaded from the EMBEDDINGS cache it is probably stale - delete it and re-run"
)

In [None]:
# Display the shape of the embeddings array.
embeddings.shape

In [None]:
# Reuse the `clustering_model` configured earlier instead of building a second,
# identical KMeans inline, so the cluster count is defined in exactly one place.
botpic = BERTopic(umap_model=umap_model, hdbscan_model=clustering_model)

In [None]:
# Fit the topic model on the documents, passing the embeddings computed above.
botpic = botpic.fit(docs, embeddings=embeddings)

In [None]:
# Separate 2-component UMAP projection of the embeddings, used as plot coordinates.
# NOTE(review): no random_state is set, so the layout presumably differs between runs - confirm whether that matters.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit_transform(embeddings)

In [None]:
# Disabled: BERTopic's own document plot. The `evaluate` function defined in this notebook is called further down instead.
# botpic.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [None]:
# Sanity check: documents, assigned topics and labels should all have the same length.
print(len(docs))
print(len(botpic.topics_))
print(len(labels))

<p>Теперь уже запускаем оценку, используя результаты кластеризации </p>

In [None]:
# `fig` is the plot; `monitor` is the dict of per-label scores returned by `evaluate`.
fig, monitor = evaluate(
    _docs=docs,
    _topic_per_doc=botpic.topics_,
    _label_per_doc=labels,
    _reduced_embeddings=reduced_embeddings,
    width=1920,
    height=1080,
)

In [None]:
# Display the per-label scores.
monitor

In [None]:
# Save the interactive figure as an HTML file.
fig.write_html("report.html")

<p>Посмотрим результаты оценки. Какие же классы плохо сепарабельны? </p>

In [None]:
# Two-column table (label in "x", score in "y") converted to Arrow for plotting.
_response = pl.from_dict({"x": monitor.keys(), "y": monitor.values()}).to_arrow()

In [None]:
# Per-label score curve, in the order stored in `_response`.
# The tick label is "<position>|<first 22 characters of the label>".
score_ticks = [str(i) + "|" + str(x)[:22] for i, x in enumerate(_response["x"])]  # TODO: make unique
score_values = [float(str(x)) for x in _response["y"]]

fig = px.line(x=score_ticks, y=score_values)
fig.update_layout(
    title=dict(text="Сепарабельность фраз по кластеризации", font=dict(size=12), automargin=True, yref='paper'),
    yaxis_title=None,
    xaxis_title=None,
)
fig.update_xaxes(tickfont_size=14, ticks="outside", ticklen=1, tickwidth=1)
fig.show()

In [None]:
# Export the per-label score figure as a PNG using the kaleido engine.
pio.write_image(fig, file="metrica_per_topic.png", scale=5, engine="kaleido")