In [1]:
from pathlib import Path

models_dir = Path("../../models/finetuning_studies/")

### Embedding Model

In [2]:
from src.bertopic_finetuning import BERTopicTrainer

--------------------------------------------------------------------------------

  CuPy may not function correctly because multiple CuPy packages are installed
  in your environment:

    cupy, cupy-cuda12x

  Follow these steps to resolve this issue:

    1. For all packages listed above, run the following command to remove all
       existing CuPy installations:

         $ pip uninstall <package_name>

      If you previously installed CuPy via conda, also run the following:

         $ conda uninstall cupy

    2. Install the appropriate CuPy package.
       Refer to the Installation Guide for detailed instructions.

         https://docs.cupy.dev/en/stable/install.html

--------------------------------------------------------------------------------



In [3]:
from src.bertopic_finetuning import BERTopicTrainer
from src.hyperparameters import BertopicHyperparameters
from src.util import initialize_logger, get_logfile_name, load_config

In [4]:
from pathlib import Path

config = load_config(config_file=Path("config/bertopic_finetuning_config_dataset_1.yml"))
logger = initialize_logger(logfile_name=config['logfile_name'], log_level=config['logging_level'])

2023-06-12 19:17:45,351 - INFO - util.py line: 42 - Logger initialized, Refer Logfile: /mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/bertopic_trainer_logs.txt, LogLevel: 10


In [5]:
trainer = BERTopicTrainer(config=config, logger=logger)
trainer

<src.bertopic_finetuning.BERTopicTrainer at 0x7fd1b8c1a670>

In [6]:
import optuna
from optuna.trial._state import TrialState

distributions={
    'model_name': optuna.distributions.CategoricalDistribution([
        'sentence-transformers/distilroberta-base-paraphrase-v1',
        'sentence-transformers/stsb-distilroberta-base-v2',
        'sentence-transformers/distilbert-base-nli-stsb-quora-ranking',
        'distilbert-base-nli-mean-tokens'
    ]),

    'nr_topics': optuna.distributions.CategoricalDistribution([5, 15, 25, 45, 75, 95, 135, 160, 190, 200, 210, 215, 300, 400]),
    'top_n_words': optuna.distributions.CategoricalDistribution([30, 60, 80, 100]),
    'min_topic_size': optuna.distributions.CategoricalDistribution([1, 5, 15, 30, 50, 100]),
    'n_gram_range_start': optuna.distributions.CategoricalDistribution([1]),
    'n_gram_range_end': optuna.distributions.CategoricalDistribution([1, 2, 3]),
    'max_features': optuna.distributions.CategoricalDistribution([500, 5000, 10000, 20000, 30000, 60000]),
    'max_df': optuna.distributions.CategoricalDistribution([0.75, 0.8, 0.86, 0.999]),
    'min_df': optuna.distributions.CategoricalDistribution([0.001, 0.01, 0.05, 0.1, 0.13]),
    'lowercase': optuna.distributions.CategoricalDistribution([True, False]),
    'n_neighbors': optuna.distributions.CategoricalDistribution([3, 5, 7, 15, 25, 40]),
    'n_components': optuna.distributions.CategoricalDistribution([64, 128, 256]),
    'umap_metric': optuna.distributions.CategoricalDistribution(['euclidean']),
    'n_epochs': optuna.distributions.CategoricalDistribution([200]),
    'learning_rate': optuna.distributions.CategoricalDistribution([1.0]),
    'min_dist': optuna.distributions.CategoricalDistribution([0.1]),
    'random_state': optuna.distributions.CategoricalDistribution([65]),
    'min_cluster_size': optuna.distributions.CategoricalDistribution([1, 5, 15, 30, 50, 100]),
    'cluster_selection_epsilon': optuna.distributions.CategoricalDistribution([0.001, 0.01, 0.1, 0.2, 0.3, 0.4]),
    'hdbscan_metric': optuna.distributions.CategoricalDistribution(['euclidean']),
    'cluster_selection_method': optuna.distributions.CategoricalDistribution(['eom']),
    'topk': optuna.distributions.CategoricalDistribution([10])
}

params = dict(zip(
                    config['hyperparameters'].keys(),
                    [config['hyperparameters'][k][0] for k in config['hyperparameters'].keys()]
                ))

trial = optuna.create_trial(state=TrialState.RUNNING, params=params, distributions=distributions)

In [7]:
trainer.set_hyperparameters(trial)

2023-06-12 19:17:52,400 - DEBUG - hyperparameters.py line: 13 - Preparing hyperparameters
2023-06-12 19:17:52,404 - DEBUG - hyperparameters.py line: 78 - Finished preparing hyperparameters


### Embeddings and Documents

In [8]:
test_embeddings = trainer.load_embeddings(split_name='test')
test_embeddings.shape

(903489, 768)

In [9]:
test_dataset = trainer.load_dataset(split='test', lemmatized=True, cache=True)
test_dataset

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-f74f09cb4e4c5952/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['title', 'categories', 'abstract', 'categories_list', 'label_counts', 'abstract_word_count', '__index_level_0__'],
    num_rows: 903489
})

In [10]:
documents = test_dataset['abstract']
len(documents)

903489

### Model

In [11]:
import os

os.listdir(models_dir)

['dataset1_finetuning.sql',
 'dataset4_finetuning.sql',
 'dataset5_best_params_training',
 'dataset5_best_params_tuning4',
 'dataset5_finetuning.sql',
 'dataset5_partialfit.sql',
 'distilled_model_choice_experiments_dataset5.sql',
 'distilled_model_selection_sample_dataset5_3',
 'generate_embeddings.sql',
 'model.bin',
 'sample',
 'tuning_dataset1_study2',
 'tuning_dataset5_study1',
 'tuning_dataset5_study2',
 'tuning_dataset5_study3',
 'umap_and_hdbscan1_dataset5']

In [12]:
study_name = "dataset5_best_params_training"

In [13]:
from custom_bertopic import CustomBERTopic
model = CustomBERTopic.load(models_dir / study_name / 'best_model.bin')
model

<custom_bertopic.CustomBERTopic at 0x7fd1b24f73d0>

In [14]:
topics = model.get_topics()
len(topics)

7

In [15]:
topics

{0: [('galaxies', 0.08412319089986078),
  ('star', 0.06056312968198693),
  ('galaxy', 0.05689535553234979),
  ('stellar', 0.05615637574100979),
  ('xray', 0.05077410387258758),
  ('gas', 0.04289091012983385),
  ('radio', 0.03890325435105336),
  ('disk', 0.037223190064581246),
  ('galactic', 0.03429675810268571),
  ('accretion', 0.02932459615571607),
  ('dust', 0.029007975641121123),
  ('redshift', 0.028473031884930455),
  ('agn', 0.02408188481412899),
  ('telescope', 0.022132631874871102),
  ('population', 0.021995951702517565),
  ('flux', 0.02170907711133291),
  ('dwarf', 0.021567092800666716),
  ('solar', 0.019624359453384147),
  ('metallicity', 0.019036003701192107),
  ('halo', 0.018876395693048522),
  ('ngc', 0.018494635805674897),
  ('gammaray', 0.01838456829208168),
  ('planets', 0.017442121421139035),
  ('host', 0.016709838894235128),
  ('spectroscopic', 0.01659355347185796),
  ('disc', 0.016127861961515645),
  ('photometric', 0.015777544063232056),
  ('planet', 0.01542684173770

In [16]:
N  = 5000

In [32]:
transformed = model.transform(documents=documents[:N], embeddings=test_embeddings[:N])
transformed

([-1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  4,
  -1,
  4,
  -1,
  -1,
  -1,
  -1,
  -1,
  4,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  4,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  4,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  4,
  -1,
  6,
  -1,
  6,
  -1,
  4,
  -1,
  4,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  4,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  4,
  -1,
  -1,
  -1,
  4,
  -1,
  6,
  -1,
  -1,
  4,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  4,
  -1,
  -1,
  4,
  -1,
  -1,
  -1,
  6,
  -1,
  -1,
  -1,
  4,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  6,
  -1,
  -1,
  -1,
  -1,
  -1,
  4,
  -1,
  -1,
  4,
  -1,
  -1,
  -1,
  -1,
  -1,
  

In [38]:
np.array(transformed).shape

(2, 5000)

In [20]:
transformed[1]

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [34]:
model.hierarchical_topics(docs=transformed[0])

ValueError: All arrays must be of the same length

In [24]:
model.(documents=documents, embeddings=test_embeddings)

NameError: name 'embeddings' is not defined

In [35]:
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))["data"]