In [None]:
import os
from pathlib import Path
from typing import List, Dict

import pandas as pd
import simplejson as json
import numpy as np
from itertools import chain
from loguru import logger

import polars as pl
import torch

from justatom.modeling.mask import ILanguageModel

from justatom.configuring.prime import Config
from justatom.running.cluster import IBTRunner, IHFWrapperBackend
from justatom.modeling.prime import DocEmbedder
from justatom.clustering.prime import IUMAPDimReducer
from justatom.viewing.prime import PlotlyScatterChart

import altair as alt

In [2]:
def source_from_dataset(dataset_name_or_path, **props):
    """Resolve a named dataset and return its contents as a polars DataFrame.

    Args:
        dataset_name_or_path: Identifier handed to ``DatasetApi.named``.
        **props: Keyword arguments forwarded unchanged to ``.iterator(...)``.

    Returns:
        The iterator result itself when it already is a ``pl.DataFrame``;
        otherwise the result is drained into a list and converted with
        ``pl.from_dicts``.
    """
    from justatom.storing.dataset import API as DatasetApi
    import polars as pl

    loaded = DatasetApi.named(dataset_name_or_path).iterator(**props)
    if not isinstance(loaded, pl.DataFrame):
        # Materialize the iterable first so polars receives a concrete list.
        loaded = pl.from_dicts(list(loaded))
    return loaded

In [3]:
# Load the full corpus from <current working directory>/.data/polaroids.ai.data.json.
pl_docs = source_from_dataset(
    Path.cwd() / ".data" / "polaroids.ai.data.json"
)

In [4]:
# Titles used to select the subset of documents for clustering
# (matched against the "title" column when filtering pl_docs).
sub_sections = [
    "Гладиатор",
    "451 градус по Фаренгейту",
    "Гарри Поттер и Узник Азкабана",
    "Гарри Поттер и философский камень",
    "Цветы для Элджернона",
    "Гарри Поттер и Дары Смерти",
    "Ведьмак",
    "Сойка-пересмешница",
    "Голодные игры",
    "Голодные игры: И вспыхнет пламя"
]

In [5]:
# Keep only the rows whose "title" value is one of the entries in sub_sections.
pl_sub_docs = pl_docs.filter(pl.col("title").is_in(sub_sections))

In [6]:
# Log the row count of the filtered subset next to the row count of the full corpus.
logger.info(f"There are S=[{pl_sub_docs.shape[0]}] / [{pl_docs.shape[0]}] subset of documents selected for clustering")

[32m2025-03-21 04:44:32.306[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mThere are S=[1530] / [4992] subset of documents selected for clustering[0m


In [7]:
# Names of the columns holding the document text and the document title.
content_col, title_col = "content", "title"

In [8]:
# Distinct values of the title column within the filtered subset, as a Python list.
js_titles = pl_sub_docs.select(title_col).unique().to_series().to_list()
# Row-wise copy of the subset: a list with one dict per row.
js_sub_docs = pl_sub_docs.to_dicts()

In [9]:
# Parallel lists over the subset rows: js_docs[i] is the text of row i and
# js_labels[i] is its title.
js_docs = [row[content_col] for row in js_sub_docs]
js_labels = [row[title_col] for row in js_sub_docs]

In [10]:
def maybe_cuda_or_mps():
    """Pick the torch device string to run on.

    Returns:
        "cuda:0" when CUDA is available, otherwise "mps" when the MPS backend
        is available at runtime, otherwise "cpu".
    """
    if torch.cuda.is_available():
        return "cuda:0"
    # `torch.has_mps` is deprecated and only reflects whether torch was built
    # with MPS support; `torch.backends.mps.is_available()` checks that the
    # backend can actually be used. The getattr guard keeps torch builds that
    # predate the `mps` backend module working.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    return "cpu"

In [11]:
# Resolve the compute device once and log the choice; `device` is later passed to DocEmbedder.
device = maybe_cuda_or_mps()
logger.info(f"Using device {device}")

[32m2025-03-21 04:44:52.629[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mUsing device mps[0m


In [12]:
# Model identifier used for both the tokenizer and the language model loaded below.
model_name_or_path = "intfloat/multilingual-e5-base" 

In [13]:
from justatom.processing.mask import IProcessor
from justatom.processing.prime import INFERProcessor, TripletProcessor
from justatom.processing import ITokenizer

In [14]:
# Tokenizer for the chosen model, plus the inference processor that reads text
# from `content_col` with max_seq_len=512 and the "query:" prefix.
# NOTE(review): the E5 model card writes the prefix as "query: " (with a trailing
# space) — confirm INFERProcessor adds the separator itself.
tokenizer = ITokenizer.from_pretrained(model_name_or_path)
processor = INFERProcessor(
    tokenizer=tokenizer,
    max_seq_len=512,
    content_field=content_col,
    prefix="query:"
)

In [15]:
# Load the language model for model_name_or_path (the cell output shows it is
# fetched from the Hugging Face hub).
lm_model = ILanguageModel.load(model_name_or_path)

[32m2025-03-21 04:44:57.025[0m | [1mINFO    [0m | [36mjustatom.modeling.mask[0m:[36mload[0m:[36m144[0m - [1mLoading from huggingface hub via "intfloat/multilingual-e5-base"[0m


In [16]:
# Combine model and processor into a document embedder on `device`, then wrap it
# in IHFWrapperBackend (batch_size=32) so it can be passed to IBTRunner as `model`.
embedder = DocEmbedder(model=lm_model, processor=processor, device=device)
backend_wrapper = IHFWrapperBackend(embedder, batch_size=32)

In [17]:
# Keyword arguments that are unpacked into IBTRunner when the runner is built.
clustering_config = {
    "top_n_words": 10,
    "n_gram_range": [1, 1],
    "min_topic_size": 5,
    "calculate_probabilities": False,
}

In [18]:
# Build the clustering runner from clustering_config, using the wrapped embedder
# as its model and verbose logging enabled.
bt_runner = IBTRunner(**clustering_config, model=backend_wrapper, verbose=True)

In [19]:
# Encode all documents and flatten the result into one list of embeddings
# (presumably encode() yields one batch at a time — TODO confirm).
# These embeddings feed the 2D projection later in the notebook.
embeddings = list(chain.from_iterable(embedder.encode(js_docs, verbose=True, batch_size=4)))
# NOTE(review): fit_transform embeds js_docs a second time through backend_wrapper
# (the cell output shows a second "Preprocessing dataset" pass) — confirm whether
# IBTRunner can reuse `embeddings` instead of recomputing them.
# NOTE(review): `topics` and `probs` are not referenced by any later cell in view.
topics, probs = bt_runner.fit_transform(docs=js_docs)

Preprocessing dataset:   0%|          | 0/383 [00:00<?, ? Dicts/s]

  0%|          | 0/383 [00:00<?, ?it/s]

2025-03-21 04:45:30,988 - BERTopic - Embedding - Transforming documents to embeddings.


Preprocessing dataset:   0%|          | 0/48 [00:00<?, ? Dicts/s]

  0%|          | 0/48 [00:00<?, ?it/s]

2025-03-21 04:45:56,502 - BERTopic - Embedding - Completed ✓
2025-03-21 04:45:56,503 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2025-03-21 04:46:00,723 - BERTopic - Dimensionality - Completed ✓
2025-03-21 04:46:00,723 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-03-21 04:46:00,750 - BERTopic - Cluster - Completed ✓
2025-03-21 04:46:00,754 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-03-21 04:46:00,866 - BERTopic - Representation - Completed ✓


In [24]:
# Keyword arguments that are unpacked into IUMAPDimReducer; n_components=2
# yields the two coordinates plotted later.
umap_config = {
    "n_components": 2,
    "n_neighbors": 3,
    "min_dist": 0.1,
    "metric": "cosine",
}

In [25]:
# Reduce the document embeddings with IUMAPDimReducer configured from umap_config.
# NOTE(review): no seed is passed here, so the layout may differ between runs —
# confirm whether IUMAPDimReducer accepts a random_state.
reducer = IUMAPDimReducer(**umap_config)
points = reducer.fit_transform(embeddings)

In [26]:
def prepare2d(docs, topics, labels, reduced_embeddings):
    """Assemble the per-document table used for 2D plotting.

    Args:
        docs: Document texts, one per point.
        topics: Topic value for each document.
        labels: Label value for each document.
        reduced_embeddings: Array-like exposing ``.shape`` and ``[:, i]``
            indexing, with exactly two columns (x and y coordinates).

    Returns:
        A polars DataFrame with columns ``text``, ``topic``, ``label``, ``x``
        and ``y``.

    Raises:
        AssertionError: If ``reduced_embeddings`` does not have two columns.
    """
    # Report the dimensionality of the argument itself. The previous message
    # read the notebook-global `embeddings`, which masked the real failure.
    n_dims = reduced_embeddings.shape[1]
    assert n_dims == 2, f"Embeddings shape mismatch. Expected 2D, got {n_dims}D"
    # Rows are built positionally, so the frame initially carries the
    # auto-generated names column_0..column_4, which are renamed below.
    COLS_MAPPING = dict(
        column_0="text",
        column_1="topic",
        column_2="label",
        column_3="x",
        column_4="y"
    )
    # zip() stops at the shortest input, so inputs of unequal length are
    # truncated silently.
    rows = zip(docs, topics, labels, reduced_embeddings[:, 0], reduced_embeddings[:, 1])
    pl_view = pl.from_dicts(rows)
    pl_view = pl_view.rename(COLS_MAPPING)
    return pl_view

In [27]:
# Build the plotting table from the documents, their titles and the 2D points.
# NOTE(review): `topics` receives js_labels (the titles), not the `topics` returned
# by bt_runner.fit_transform, so the "topic" and "label" columns are identical —
# confirm this is intended.
pl_view = prepare2d(docs=js_docs, topics=js_labels, labels=js_labels, reduced_embeddings=points)

In [28]:
from justatom.viewing.prime import PlotlyScatterChart

In [29]:
# Create the scatter chart from the 2D table; label_to_view is set to "Вселенная"
# (presumably the legend/label caption — TODO confirm against PlotlyScatterChart).
chart = PlotlyScatterChart().view(pl_view, label_to_view="Вселенная")

In [None]:
# Bare expression: renders the chart as this cell's output.
chart

In [33]:
# Export the chart as a PNG into the current working directory.
chart.write_image(
    "clustering_model=[e5]_dataset=[universe].png",
    engine="kaleido",
    scale=2,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Sanity check: display the (rows, columns) shape of the filtered subset.
pl_sub_docs.shape