In [1]:
from src.bertopic_finetuning import BERTopicTrainer
from src.hyperparameters import BertopicHyperparameters
from src.util import initialize_logger, get_logfile_name, load_config

### Load Config

In [2]:
from pathlib import Path

# Parse the dataset-1 fine-tuning configuration and echo the result so the
# effective settings are visible in the notebook output.
config_path = Path("config/bertopic_finetuning_config_dataset_1.yml")
config = load_config(config_file=config_path)
config

{'logging_dir': '../logs',
 'logfile_name': 'bertopic_trainer_logs.txt',
 'logging_level': 'DEBUG',
 'study_name': 'distilled_nli_models',
 'dataset_path': '../../dataset',
 'models_path': '../../models',
 'batch_size': 384,
 'dataset_index': 1,
 'n_trials': 5,
 'hyperparameters': {'model_name': ['sentence-transformers/distilroberta-base-paraphrase-v1',
   'sentence-transformers/stsb-distilroberta-base-v2',
   'sentence-transformers/distilbert-base-nli-stsb-quora-ranking',
   'distilbert-base-nli-mean-tokens'],
  'nr_topics': [5, 15, 25, 45, 75, 95, 135, 160, 190, 200, 210, 215, 300, 400],
  'top_n_words': [10, 20, 30, 60, 80, 100],
  'min_topic_size': [1, 5, 15, 30, 50, 100],
  'n_gram_range': [[1, 1], [1, 2], [1, 3]],
  'max_features': [500, 5000, 10000, 20000, 30000, 60000],
  'max_df': [0.75, 0.8, 0.86, 0.999],
  'min_df': [0.001, 0.01, 0.05, 0.1, 0.13],
  'lowercase': [True, False],
  'n_neighbors': [3, 5, 7, 15, 25, 40],
  'n_components': [64, 128, 256],
  'umap_metric': ['euclid

In [3]:
# Build the notebook logger from the logfile name and level stored in the
# loaded config; the trailing expression echoes the logger for inspection.
logger = initialize_logger(
    logfile_name=config['logfile_name'],
    log_level=config['logging_level'],
)
logger

2023-06-07 10:51:44,658 - INFO - util.py line: 42 - Logger initialized, Refer Logfile: /mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/bertopic_trainer_logs.txt, LogLevel: 10


<Logger src.util (DEBUG)>

### Load or generate abstract embeddings for every dataset, model, and split

In [4]:
# Embedding models to process, in the order the embedding loop visits them.
models_list = [
    "sentence-transformers/distilroberta-base-paraphrase-v1",
    "sentence-transformers/stsb-distilroberta-base-v2",
    "distilbert-base-nli-mean-tokens",
    "sentence-transformers/distilbert-base-nli-stsb-quora-ranking",
]

In [5]:
# `hyperparams` aliases the dict stored in `config`, so the assignment below
# also updates config['hyperparameters'] in place.
hyperparams = config['hyperparameters']
# The config holds each n-gram range as a list; turn every entry into a tuple
# while keeping the outer container a list.
hyperparams['n_gram_range'] = list(map(tuple, hyperparams['n_gram_range']))

In [6]:
import optuna

# Candidate values for each hyperparameter, keyed by parameter name.
# A single-element list pins that parameter to one fixed value.
_distribution_choices = {
    'model_name': [
        'sentence-transformers/distilroberta-base-paraphrase-v1',
        'sentence-transformers/stsb-distilroberta-base-v2',
        'sentence-transformers/distilbert-base-nli-stsb-quora-ranking',
        'distilbert-base-nli-mean-tokens',
    ],
    'nr_topics': [5, 15, 25, 45, 75, 95, 135, 160, 190, 200, 210, 215, 300, 400],
    'top_n_words': [10, 20, 30, 60, 80, 100],
    'min_topic_size': [1, 5, 15, 30, 50, 100],
    'n_gram_range': [(1, 1), (1, 2), (1, 3)],
    'max_features': [500, 5000, 10000, 20000, 30000, 60000],
    'max_df': [0.75, 0.8, 0.86, 0.999],
    'min_df': [0.001, 0.01, 0.05, 0.1, 0.13],
    'lowercase': [True, False],
    'n_neighbors': [3, 5, 7, 15, 25, 40],
    'n_components': [64, 128, 256],
    'umap_metric': ['euclidean'],
    'n_epochs': [200],
    'learning_rate': [1.0],
    'min_dist': [0.1],
    'random_state': [65],
    'min_cluster_size': [1, 5, 15, 30, 50, 100],
    'cluster_selection_epsilon': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4],
    'hdbscan_metric': ['euclidean'],
    'cluster_selection_method': ['eom'],
    'topk': [10],
}

# Wrap every candidate list in an Optuna categorical distribution, keeping the
# parameter names and their order exactly as declared above.
distributions = {
    param_name: optuna.distributions.CategoricalDistribution(choices)
    for param_name, choices in _distribution_choices.items()
}

In [7]:
import gc
import torch

import optuna
from optuna.trial._state import TrialState


# For every dataset / model / split combination, try to load existing
# embeddings and only generate new ones when none (or an empty set) come back.
for dataset_index in [1, 2, 3, 4]:
    logger.info(f"Processing dataset - {dataset_index}")

    for model_name in models_list:
        logger.info(f"Processing model {model_name}")

        # Bind the name before entering `try` so the `finally` clause can
        # always release it. The previous `del trainer` raised NameError (and
        # hid the real error) whenever a failure happened before the first
        # trainer was constructed.
        trainer = None
        try:
            for split_name in ['train', 'test', 'validation']:
                # The trial needs one concrete value per hyperparameter: take
                # the first candidate of each list from the config.
                params = {
                    name: choices[0]
                    for name, choices in config['hyperparameters'].items()
                }
                # Must be a tuple to match the tuple choices declared for
                # 'n_gram_range' in `distributions`.
                params['n_gram_range'] = (1, 1)

                trial = optuna.create_trial(
                    state=TrialState.RUNNING,
                    params=params,
                    distributions=distributions,
                )

                # A fresh trainer is built for every split, then pointed at the
                # current dataset and model. The model name taken from the
                # trial is overridden with the one being processed.
                trainer = BERTopicTrainer(config=config, logger=logger)
                trainer.set_hyperparameters(trial=trial)
                trainer.dataset_index = dataset_index
                trainer.datasets = trainer.load_datasets(dataset_index=dataset_index)
                trainer.hyperparameters.model_name = model_name
                # NOTE(review): hard-coded, and different from the config's
                # 'batch_size' entry - confirm this override is intentional.
                trainer.batch_size = 1024

                # try locating embeddings first
                logger.info(f"Looking for embeddings for split {split_name} for dataset {dataset_index}")
                embeddings = trainer.load_embeddings(split_name=split_name)

                if embeddings is not None and embeddings.shape[0] != 0:
                    logger.info(f"Found existing embeddings, skipping generating new ones")
                else:
                    logger.info(f"Extracting embeddings for split {split_name} for dataset {dataset_index}")
                    trainer.extract_or_load_embeddings(split_name=split_name)
        except Exception as e:
            logger.error(f"Exception occurred - {e}")
            # Bare `raise` re-raises the active exception unchanged.
            raise
        finally:
            # Drop the trainer reference before collecting garbage and asking
            # PyTorch to release its cached CUDA memory.
            trainer = None
            gc.collect()
            torch.cuda.empty_cache()

2023-06-07 10:51:44,729 - INFO - 2199698994.py line: 9 - Processing dataset - 4
2023-06-07 10:51:44,731 - INFO - 2199698994.py line: 12 - Processing model sentence-transformers/distilroberta-base-paraphrase-v1
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:51:46,905 - DEBUG - hyperparameters.py line: 14 - Preparing hyperparameters
2023-06-07 10:51:46,908 - DEBUG - hyperparameters.py line: 86 - Finished preparing hyperparameters
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ff94db2645fa5e2d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-cf285e0ca1bbb326/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-5433279a47d817c4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:51:48,389 - INFO - 2199698994.py line: 33 - Looking for embeddings for split train for dataset 4
2023-06-07 10:51:56,488 - INFO - 2199698994.py line: 37 - Found existing embeddings, skipping generating new ones
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:51:58,482 - DEBUG - hyperparameters.py line: 14 - Preparing hyperparameters
2023-06-07 10:51:58,489 - DEBUG - hyperparameters.py line: 86 - Finished preparing hyperparameters
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ff94db2645fa5e2d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-cf285e0ca1bbb326/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-5433279a47d817c4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:52:00,116 - INFO - 2199698994.py line: 33 - Looking for embeddings for split test for dataset 4
2023-06-07 10:52:03,865 - INFO - 2199698994.py line: 37 - Found existing embeddings, skipping generating new ones
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:52:05,418 - DEBUG - hyperparameters.py line: 14 - Preparing hyperparameters
2023-06-07 10:52:05,423 - DEBUG - hyperparameters.py line: 86 - Finished preparing hyperparameters
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ff94db2645fa5e2d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-cf285e0ca1bbb326/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-5433279a47d817c4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:52:06,912 - INFO - 2199698994.py line: 33 - Looking for embeddings for split validation for dataset 4
2023-06-07 10:52:07,572 - INFO - 2199698994.py line: 37 - Found existing embeddings, skipping generating new ones
2023-06-07 10:52:07,770 - INFO - 2199698994.py line: 12 - Processing model sentence-transformers/stsb-distilroberta-base-v2
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:52:09,259 - DEBUG - hyperparameters.py line: 14 - Preparing hyperparameters
2023-06-07 10:52:09,260 - DEBUG - hyperparameters.py line: 86 - Finished preparing hyperparameters
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ff94db2645fa5e2d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-cf285e0ca1bbb326/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-5433279a47d817c4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:52:10,914 - INFO - 2199698994.py line: 33 - Looking for embeddings for split train for dataset 4
2023-06-07 10:52:19,877 - INFO - 2199698994.py line: 37 - Found existing embeddings, skipping generating new ones
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:52:21,650 - DEBUG - hyperparameters.py line: 14 - Preparing hyperparameters
2023-06-07 10:52:21,657 - DEBUG - hyperparameters.py line: 86 - Finished preparing hyperparameters
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ff94db2645fa5e2d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-cf285e0ca1bbb326/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-5433279a47d817c4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:52:23,411 - INFO - 2199698994.py line: 33 - Looking for embeddings for split test for dataset 4
2023-06-07 10:52:26,892 - INFO - 2199698994.py line: 37 - Found existing embeddings, skipping generating new ones
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:52:28,422 - DEBUG - hyperparameters.py line: 14 - Preparing hyperparameters
2023-06-07 10:52:28,429 - DEBUG - hyperparameters.py line: 86 - Finished preparing hyperparameters
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ff94db2645fa5e2d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-cf285e0ca1bbb326/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-5433279a47d817c4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:52:29,919 - INFO - 2199698994.py line: 33 - Looking for embeddings for split validation for dataset 4
2023-06-07 10:52:30,625 - INFO - 2199698994.py line: 37 - Found existing embeddings, skipping generating new ones
2023-06-07 10:52:30,830 - INFO - 2199698994.py line: 12 - Processing model distilbert-base-nli-mean-tokens
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:52:32,296 - DEBUG - hyperparameters.py line: 14 - Preparing hyperparameters
2023-06-07 10:52:32,299 - DEBUG - hyperparameters.py line: 86 - Finished preparing hyperparameters
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ff94db2645fa5e2d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-cf285e0ca1bbb326/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-5433279a47d817c4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:52:33,958 - INFO - 2199698994.py line: 33 - Looking for embeddings for split train for dataset 4
2023-06-07 10:52:45,267 - INFO - 2199698994.py line: 37 - Found existing embeddings, skipping generating new ones
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:52:46,843 - DEBUG - hyperparameters.py line: 14 - Preparing hyperparameters
2023-06-07 10:52:46,846 - DEBUG - hyperparameters.py line: 86 - Finished preparing hyperparameters
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ff94db2645fa5e2d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-cf285e0ca1bbb326/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-5433279a47d817c4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:52:48,358 - INFO - 2199698994.py line: 33 - Looking for embeddings for split test for dataset 4
2023-06-07 10:52:51,119 - INFO - 2199698994.py line: 37 - Found existing embeddings, skipping generating new ones
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:52:52,638 - DEBUG - hyperparameters.py line: 14 - Preparing hyperparameters
2023-06-07 10:52:52,645 - DEBUG - hyperparameters.py line: 86 - Finished preparing hyperparameters
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ff94db2645fa5e2d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-cf285e0ca1bbb326/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-5433279a47d817c4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:52:54,243 - INFO - 2199698994.py line: 33 - Looking for embeddings for split validation for dataset 4
2023-06-07 10:52:54,955 - INFO - 2199698994.py line: 37 - Found existing embeddings, skipping generating new ones
2023-06-07 10:52:55,161 - INFO - 2199698994.py line: 12 - Processing model sentence-transformers/distilbert-base-nli-stsb-quora-ranking
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:52:56,825 - DEBUG - hyperparameters.py line: 14 - Preparing hyperparameters
2023-06-07 10:52:56,827 - DEBUG - hyperparameters.py line: 86 - Finished preparing hyperparameters
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ff94db2645fa5e2d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-cf285e0ca1bbb326/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-5433279a47d817c4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:52:58,389 - INFO - 2199698994.py line: 33 - Looking for embeddings for split train for dataset 4
2023-06-07 10:53:09,473 - INFO - 2199698994.py line: 37 - Found existing embeddings, skipping generating new ones
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:53:11,000 - DEBUG - hyperparameters.py line: 14 - Preparing hyperparameters
2023-06-07 10:53:11,009 - DEBUG - hyperparameters.py line: 86 - Finished preparing hyperparameters
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ff94db2645fa5e2d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-cf285e0ca1bbb326/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-5433279a47d817c4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:53:12,663 - INFO - 2199698994.py line: 33 - Looking for embeddings for split test for dataset 4
2023-06-07 10:53:15,279 - INFO - 2199698994.py line: 37 - Found existing embeddings, skipping generating new ones
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:53:16,822 - DEBUG - hyperparameters.py line: 14 - Preparing hyperparameters
2023-06-07 10:53:16,828 - DEBUG - hyperparameters.py line: 86 - Finished preparing hyperparameters
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ff94db2645fa5e2d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-cf285e0ca1bbb326/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-5433279a47d817c4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-07 10:53:18,447 - INFO - 2199698994.py line: 33 - Looking for embeddings for split validation for dataset 4
2023-06-07 10:53:19,046 - INFO - 2199698994.py line: 37 - Found existing embeddings, skipping generating new ones
