In [1]:
from src.bertopic_finetuning import BERTopicTrainer
from src.hyperparameters import BertopicHyperparameters
from src.util import initialize_logger, get_logfile_name, load_config

--------------------------------------------------------------------------------

  CuPy may not function correctly because multiple CuPy packages are installed
  in your environment:

    cupy-cuda11x, cupy-cuda12x

  Follow these steps to resolve this issue:

    1. For all packages listed above, run the following command to remove all
       existing CuPy installations:

         $ pip uninstall <package_name>

      If you previously installed CuPy via conda, also run the following:

         $ conda uninstall cupy

    2. Install the appropriate CuPy package.
       Refer to the Installation Guide for detailed instructions.

         https://docs.cupy.dev/en/stable/install.html

--------------------------------------------------------------------------------



### Load Config

In [2]:
from pathlib import Path

# Relative path to the YAML settings file for dataset 1.
config_path = Path("config") / "bertopic_finetuning_config_dataset_1.yml"

# Parse the settings once; later cells read `config` for logging and trainer setup.
config = load_config(config_file=config_path)

# Bare last expression so the notebook displays the parsed settings.
config

{'logging_dir': '../logs',
 'logfile_name': 'bertopic_trainer_logs.txt',
 'logging_level': 'DEBUG',
 'dataset_path': '../../dataset',
 'model_name': 'distilbert-base-nli-mean-tokens',
 'batch_size': 384,
 'dataset_index': 1,
 'hyperparameters': {'nr_topics': 30,
  'top_n_words': 100,
  'min_topic_size': 10,
  'n_gram_range': '(1, 1)',
  'min_categories': 5,
  'max_categories': 5,
  'max_features': 100,
  'max_df': 0.8,
  'min_df': 0.05,
  'lowercase': True,
  'topk': 10}}

In [3]:
# Create the logger shared by the rest of the notebook, using the logfile name
# and log level read from `config`.
# NOTE(review): config['logging_dir'] is not passed here, and the recorded
# output shows the logfile created in the notebook's own directory — confirm
# whether that is intended.
logger = initialize_logger(
    logfile_name=config['logfile_name'],
    log_level=config['logging_level'],
)

# Bare last expression so the notebook displays the logger repr.
logger

2023-06-06 20:45:30,750 - INFO - util.py line: 42 - Logger initialized, Refer Logfile: /mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/bertopic_trainer_logs.txt, LogLevel: 10


<Logger src.util (DEBUG)>

### Extract (or load existing) abstract embeddings for all splits, for each model in the list

In [4]:
# Model names to iterate over in the embedding cell; each one is assigned to
# `trainer.model_name` in turn, in the order listed here.
models_list = [
    "sentence-transformers/distilroberta-base-paraphrase-v1",
    "sentence-transformers/stsb-distilroberta-base-v2",
    "distilbert-base-nli-mean-tokens",
    "sentence-transformers/distilbert-base-nli-stsb-quora-ranking",
]

In [None]:
import gc
import torch

# Batch size forced onto every trainer created in this cell.
# NOTE(review): the loaded config shows batch_size 384; presumably this value
# deliberately overrides it for embedding extraction — confirm.
EMBEDDING_BATCH_SIZE = 1024

# Dataset splits processed for every model, in this order.
SPLIT_NAMES = ('train', 'test', 'validation')

for dataset_index in [1]:
    # Not used again in this cell. Kept because constructing it emits the
    # "Preparing hyperparameters" debug lines and the name stays available to
    # any later cell — NOTE(review): drop it if nothing downstream reads it.
    hyperparameters = BertopicHyperparameters(config=config, logger=logger)

    logger.info(f"Processing dataset - {dataset_index}")

    for model_name in models_list:
        logger.info(f"Processing model {model_name}")

        # Bind the name before the try block so the cleanup in `finally` never
        # hits an unbound `trainer` (e.g. when the constructor itself raises,
        # or after the previous model's iteration deleted the name).
        trainer = None

        try:
            for split_name in SPLIT_NAMES:
                # A fresh trainer is built for each split and pointed at the
                # current dataset and model.
                # NOTE(review): the recorded output shows the dataset files
                # being re-read for every split; hoisting trainer creation to
                # once per model may be possible if the trainer keeps no
                # per-split state — confirm before changing.
                trainer = BERTopicTrainer(config=config, logger=logger)
                trainer.dataset_index = dataset_index
                trainer.datasets = trainer.load_datasets(dataset_index=dataset_index)
                trainer.model_name = model_name
                trainer.batch_size = EMBEDDING_BATCH_SIZE

                # try locating embeddings first
                logger.info(f"Looking for embeddings for split {split_name} for dataset {dataset_index}")
                embeddings = trainer.load_embeddings(split_name=split_name)

                if embeddings is not None and embeddings.shape[0] != 0:
                    logger.info("Found existing embeddings, skipping generating new ones")
                else:
                    logger.info(f"Extracting embeddings for split {split_name} for dataset {dataset_index}")
                    trainer.extract_or_load_embeddings(split_name=split_name)
        except Exception as e:
            # Best-effort: a failure skips the remaining splits of this model
            # and moves on to the next model. logger.exception logs at ERROR
            # level and appends the traceback, which logger.error dropped.
            logger.exception(f"Exception occurred - {e}")
        finally:
            # Drop the reference to the last trainer, then run the garbage
            # collector and release cached CUDA memory before the next model.
            del trainer
            gc.collect()
            torch.cuda.empty_cache()

2023-06-06 20:45:30,767 - DEBUG - hyperparameters.py line: 11 - Preparing hyperparameters
2023-06-06 20:45:30,769 - DEBUG - hyperparameters.py line: 32 - Finished preparing hyperparameters
2023-06-06 20:45:30,769 - INFO - 2524697238.py line: 7 - Processing dataset - 1
2023-06-06 20:45:30,770 - INFO - 2524697238.py line: 10 - Processing model sentence-transformers/distilroberta-base-paraphrase-v1
2023-06-06 20:45:30,771 - DEBUG - hyperparameters.py line: 11 - Preparing hyperparameters
2023-06-06 20:45:30,772 - DEBUG - hyperparameters.py line: 32 - Finished preparing hyperparameters
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-06 20:45:33,829 - INFO - 2524697238.py line: 21 - Looking for embeddings for split train for dataset 1
2023-06-06 20:45:33,841 - INFO - 2524697238.py line: 27 - Extracting embeddings for split train for dataset 1


Batches:   0%|          | 0/883 [00:00<?, ?it/s]

2023-06-06 21:12:48,775 - DEBUG - hyperparameters.py line: 11 - Preparing hyperparameters
2023-06-06 21:12:48,776 - DEBUG - hyperparameters.py line: 32 - Finished preparing hyperparameters
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-06 21:12:52,602 - INFO - 2524697238.py line: 21 - Looking for embeddings for split test for dataset 1
2023-06-06 21:12:52,616 - INFO - 2524697238.py line: 27 - Extracting embeddings for split test for dataset 1


Batches:   0%|          | 0/883 [00:00<?, ?it/s]

2023-06-06 21:39:54,721 - DEBUG - hyperparameters.py line: 11 - Preparing hyperparameters
2023-06-06 21:39:54,731 - DEBUG - hyperparameters.py line: 32 - Finished preparing hyperparameters
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-06 21:39:58,121 - INFO - 2524697238.py line: 21 - Looking for embeddings for split validation for dataset 1
2023-06-06 21:39:58,134 - INFO - 2524697238.py line: 27 - Extracting embeddings for split validation for dataset 1


Batches:   0%|          | 0/442 [00:00<?, ?it/s]

2023-06-06 21:53:31,841 - INFO - 2524697238.py line: 10 - Processing model sentence-transformers/stsb-distilroberta-base-v2
2023-06-06 21:53:31,852 - DEBUG - hyperparameters.py line: 11 - Preparing hyperparameters
2023-06-06 21:53:31,856 - DEBUG - hyperparameters.py line: 32 - Finished preparing hyperparameters
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-06 21:53:34,967 - INFO - 2524697238.py line: 21 - Looking for embeddings for split train for dataset 1
2023-06-06 21:53:34,975 - INFO - 2524697238.py line: 27 - Extracting embeddings for split train for dataset 1


Downloading (…)e581e/.gitattributes:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)8aa7be581e/README.md:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

Downloading (…)a7be581e/config.json:   0%|          | 0.00/680 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)aa7be581e/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)e581e/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading (…)aa7be581e/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)7be581e/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/883 [00:00<?, ?it/s]

2023-06-06 22:12:03,326 - DEBUG - hyperparameters.py line: 11 - Preparing hyperparameters
2023-06-06 22:12:03,332 - DEBUG - hyperparameters.py line: 32 - Finished preparing hyperparameters
Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-ac9b42d7382cd426/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-8dca40a1fc3c3914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/bertopic_trainer/../../dataset/cache_dir/parquet/default-caa8a136896649d1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

2023-06-06 22:12:06,406 - INFO - 2524697238.py line: 21 - Looking for embeddings for split test for dataset 1
2023-06-06 22:12:06,416 - INFO - 2524697238.py line: 27 - Extracting embeddings for split test for dataset 1


Batches:   0%|          | 0/883 [00:00<?, ?it/s]