In [16]:
import os
from pathlib import Path
from typing import List, Dict

import pandas as pd
import simplejson as json
import numpy as np
from itertools import chain

import polars as pl
import torch

from justatom.configuring.prime import Config
from justatom.running.cluster import IBTRunner, IHFWrapperBackend
from justatom.modeling.prime import HFDocEmbedder
from justatom.clustering.prime import IUMAPDimReducer
from justatom.viewing.prime import IPlotlyChart

import altair as alt

In [2]:
def ignite_dataset(where, mask:str = None) -> List[Dict]:
    """Read a UTF-8 JSON file and return its parsed content.

    Args:
        where: Location of the JSON file (string or ``Path``).
        mask: Optional key. When truthy, only ``payload[mask]`` is returned;
            a falsy value (``None`` or ``""``) returns the whole payload.

    Returns:
        The parsed JSON payload, or the entry stored under ``mask``.
    """
    source = str(Path(where))
    with open(source, encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload[mask] if mask else payload

In [3]:
# Load the raw records from `.data/` under the current working directory.
docs = ignite_dataset(
    where=Path.cwd().joinpath(".data", "polaroids.ai.data.json")
)

In [4]:
# Split each record into the text to embed and the title used as its label.
documents = [record["content"] for record in docs]
labels = [record["title"] for record in docs]

In [17]:
# Use the first CUDA device when torch reports one; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [5]:
# Build the document embedder from the `clustering.embedder` config section
# on the selected device, then wrap it using the `transformers_backend` section.
embedder = HFDocEmbedder(
    **Config.clustering.embedder.toDict(),
    device=device,
)
backend_wrapper = IHFWrapperBackend(
    embedder,
    **Config.clustering.transformers_backend.toDict(),
)

In [6]:
# Build the runner from the `clustering.bertopic` config section, passing the
# wrapped backend as `model`.
# NOTE(review): the other config sections in this notebook are unpacked via
# `.toDict()`; this one is unpacked directly — confirm that is intentional.
bt_runner = IBTRunner(
    **Config.clustering.bertopic,
    model=backend_wrapper,
    verbose=True,
)

In [7]:
# Flatten the per-batch results of `encode` into a single flat list.
embeddings = [
    item
    for batch in embedder.encode(documents, verbose=True, batch_size=4)
    for item in batch
]
# NOTE(review): the captured log shows a second embedding pass inside
# `fit_transform`; check whether the runner can reuse `embeddings` instead.
topics, probs = bt_runner.fit_transform(docs=documents)

100%|██████████| 392/392 [00:15<00:00, 24.60it/s]
2024-06-27 06:49:42,457 - BERTopic - Embedding - Transforming documents to embeddings.
100%|██████████| 157/157 [00:16<00:00,  9.34it/s]
2024-06-27 06:49:59,273 - BERTopic - Embedding - Completed ✓
2024-06-27 06:49:59,273 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-27 06:50:05,360 - BERTopic - Dimensionality - Completed ✓
2024-06-27 06:50:05,361 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-27 06:50:05,595 - BERTopic - Cluster - Completed ✓
2024-06-27 06:50:05,600 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-27 06:50:05,722 - BERTopic - Representation - Completed ✓


In [8]:
# Configure the reducer from the `clustering.umap` config section and project
# the embeddings; `prepare2d` later requires the result to have two columns.
reducer = IUMAPDimReducer(
    **Config.clustering.umap.toDict(),
)
points = reducer.fit_transform(embeddings)

In [9]:
def prepare2d(docs, topics, labels, reduced_embeddings):
    """Assemble a polars DataFrame holding one row per point for 2D plotting.

    Args:
        docs: Sequence of values stored in the ``text`` column.
        topics: Sequence of values stored in the ``topic`` column.
        labels: Sequence of values stored in the ``label`` column.
        reduced_embeddings: Array-like exposing ``.shape`` and ``[:, i]``
            indexing; must be two-dimensional with exactly two columns,
            which become ``x`` and ``y``.

    Returns:
        ``pl.DataFrame`` with columns ``text``, ``topic``, ``label``, ``x``, ``y``.

    Raises:
        AssertionError: If ``reduced_embeddings`` is not of shape ``(n, 2)``.
    """
    shape = tuple(reduced_embeddings.shape)
    # Report the shape of the argument itself: the previous message read a
    # global `embeddings`, so a failing check raised an unrelated error.
    assert len(shape) == 2 and shape[1] == 2, (
        f"Embeddings shape mismatch. Expected (n, 2), got {shape}"
    )
    # Name the columns explicitly instead of relying on auto-generated
    # `column_N` names followed by a rename. Inputs of differing lengths now
    # fail loudly instead of being silently truncated by `zip`.
    return pl.DataFrame(
        {
            "text": docs,
            "topic": topics,
            "label": labels,
            "x": reduced_embeddings[:, 0],
            "y": reduced_embeddings[:, 1],
        }
    )

In [10]:
# Combine texts, topics, titles and 2D coordinates into one plotting frame.
pl_view = prepare2d(
    docs=documents,
    topics=topics,
    labels=labels,
    reduced_embeddings=points,
)

In [11]:
# Scatter plot of the 2D coordinates, one circle per row, coloured by `label`.
chart = (
    alt.Chart(pl_view)
    .mark_circle()
    .encode(x="x", y="y", color="label")
)

In [12]:
# Export the chart as a PNG at 200 ppi (plain literal: no interpolation is needed).
chart.save("polaroids.ai.clustering.e5-base.png", ppi=200)

In [14]:
# Export the chart as an HTML file (plain literal: no interpolation is needed).
chart.save("polaroids.ai.clustering.e5-base.html")