## 0. Code Setup and Helper Functions

### 0.1 Path Adjustments

In [1]:
import os
import sys
# Make this cell idempotent: it must work on the first run (cwd == <repo>/notebooks)
# and on any re-run (cwd is already the repository root after the chdir below).
# The directory name is compared via os.path.basename so the check does not depend
# on '/' being the path separator.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

# After the optional chdir the working directory is expected to contain `src`.
assert os.path.exists('./src'), f"[ERROR] The path src not detected in the directory '{os.getcwd()}'."

# Register `./src` only once so repeated executions do not keep growing sys.path.
if './src' not in sys.path:
    sys.path.append('./src')

print(f'[INFO] Current Directory: "{os.getcwd()}".')

[INFO] Current Directory: "/home/iceking/Desktop/22SS-TUM Lecture Docs/3. Praktikum (IN2106, IN4249)/Repo/topic-modeling-advancements".


### 0.2 Import all necessary packages

In [4]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from time import time
import shutil
from src import preprocessor
from src.utils import load_documents
from src.main_runner import OUTPUT_FOLDER
from src.main_runner import main_runner

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Run Main Runner with Example Configs

In [5]:
# Remove the OUTPUT_FOLDER directory tree when it is already present,
# so the runs below start from an empty output location.
output_folder_present = os.path.exists(OUTPUT_FOLDER)
if output_folder_present:
    shutil.rmtree(OUTPUT_FOLDER)

In [12]:
# One configuration dict per example run. Every entry carries three keys:
#   - "algorithm_args":      settings for the topic model; its "algorithm" value names the model
#   - "preprocessing_funcs": names of preprocessing steps (an empty list means none are listed)
#   - "dataset":             name of the dataset to run on
# Each dict is passed as `config` to `main_runner` in the loop cell below.
example_configs = [
    # Config #0: top2vec with a doc2vec embedding model on "crisis_12", no preprocessing listed.
    {
        "algorithm_args": {
            "embedding_model": "doc2vec",
            "min_count": 30,
            "algorithm": "top2vec",
            "num_topics": 4,
            "doc2vec_speed": "learn",
            "umap_args": {
                "n_neighbors": 15,
                "n_components": 5,
                "metric": "cosine",
                "random_state": 42
            },
            "hdbscan_args": {
                "min_cluster_size": 10,
                "metric": "euclidean",
                "cluster_selection_method": "eom"
            }
        },
        "preprocessing_funcs": [],
        "dataset": "crisis_12"
    },
    # Config #1: lda-bert with the "all-MiniLM-L12-v2" embedding model on "crisis_01".
    {
        "algorithm_args": {
            "algorithm": "lda-bert",
            "embedding_model": "all-MiniLM-L12-v2",
            "top_n_words": 10,
            "gamma": 10,
            "num_topics": 4,
            "random_state": 42
        },
        "preprocessing_funcs": [
            "to_lowercase",
            "standardize_accented_chars",
            "remove_url",
            "expand_contractions",
            "remove_mentions",
            "remove_hashtags",
            "keep_only_alphabet",
            "remove_english_stop_words",
            "lemmatize_noun"
        ],
        "dataset": "crisis_01"
    },
    # Config #2: ctm with a multilingual MiniLM embedding model on "crisis_12", no preprocessing listed.
    {
        "algorithm_args": {
            "embedding_model": "paraphrase-multilingual-MiniLM-L12-v2",
            "num_epochs": 100,
            "batch_size": 64,
            "algorithm": "ctm",
            "num_topics": 4,
            "random_state": 42,
            "learning_rate": 0.002
        },
        "preprocessing_funcs": [],
        "dataset": "crisis_12"
    },
    # Config #3: lda on "ag_news_short" with the longer preprocessing list
    # (adds "expand_missing_delimiter" and "remove_new_lines" compared to config #1).
    {
        "dataset": "ag_news_short",
        "preprocessing_funcs": [
            "to_lowercase",
            "standardize_accented_chars",
            "remove_url",
            "expand_contractions",
            "expand_missing_delimiter",
            "remove_mentions",
            "remove_hashtags",
            "remove_new_lines",
            "keep_only_alphabet",
            "remove_english_stop_words",
            "lemmatize_noun"
        ],
        "algorithm_args": {
            "algorithm": "lda",
            "num_topics": 4,
            "random_state": 42,
            "alpha": "asymmetric"
        }
    },
    # Config #4: nmf on "ag_news_long" with the same preprocessing list as config #3.
    {
        "dataset": "ag_news_long",
        "preprocessing_funcs": [
            "to_lowercase",
            "standardize_accented_chars",
            "remove_url",
            "expand_contractions",
            "expand_missing_delimiter",
            "remove_mentions",
            "remove_hashtags",
            "remove_new_lines",
            "keep_only_alphabet",
            "remove_english_stop_words",
            "lemmatize_noun"
        ],
        
        "algorithm_args": {
            "algorithm": "nmf",
            "num_topics": 4,
            "random_state": 42,
        }
    },
    # Config #5: bertopic with the "all-mpnet-base-v2" embedding model on "yahoo",
    # no preprocessing listed. This is the only entry asking for 10 topics instead of 4.
    {
        "algorithm_args": {
            "embedding_model": "all-mpnet-base-v2",
            "top_n_words": 10,
            "algorithm": "bertopic",
            "n_gram_range_tuple": (1, 1),
            "min_docs_per_topic": 15,
            "num_topics": 10,
            "cluster_model": "hdbscan",
            "hdbscan_args": {
                "min_cluster_size": 10,
                "metric": "euclidean",
                "cluster_selection_method": "eom",
                "prediction_data": True
            },
            "umap_args": {
                "n_neighbors": 15,
                "n_components": 5,
                "min_dist": 0.0,
                "metric": "cosine",
                "low_memory": False,
                "random_state": 42
            }
        },
        "preprocessing_funcs": [],
        "dataset": "yahoo"
    },
]


In [None]:
# Run every example configuration in list order, announcing each run before it
# starts and after it returns, followed by a 32-character dashed separator line.
run_separator = '-' * 32
for i, example_config in enumerate(example_configs):
    print(f'[INFO] [MAIN RUNNER] Example Config #{i} is running..')
    main_runner(config=example_config)
    print(f'[INFO] [MAIN RUNNER] Example Config #{i} executed successfully.')
    print(run_separator)


2022-08-26 14:02:54,579 - top2vec - INFO - Pre-processing documents for training
2022-08-26 14:02:54,579 INFO:Pre-processing documents for training


[INFO] [MAIN RUNNER] Example Config #0 is running..
[WARN] [PREPROCESSOR] Preprocessing functions are empty or None, given:"[]", preprocessing is skipped.
[INFO] Running with 4 topics.
[INFO] Top2Vec Parameters:
{
    "embedding_model": "doc2vec",
    "min_count": 30,
    "algorithm": "top2vec",
    "num_topics": 4,
    "doc2vec_speed": "learn",
    "umap_args": {
        "n_neighbors": 15,
        "n_components": 5,
        "metric": "cosine",
        "random_state": 42
    },
    "hdbscan_args": {
        "min_cluster_size": 10,
        "metric": "euclidean",
        "cluster_selection_method": "eom"
    },
    "data_name": "crisis_12",
    "run_id": 1661515374
}
[INFO] The embedding model folder:"./pretrained_models/sentence-transformers_distiluse-base-multilingual-cased" found, so no need to download.
[INFO] The embedding model folder:"./pretrained_models/sentence-transformers_all-MiniLM-L6-v2" found, so no need to download.
[INFO] The embedding model folder:"./pretrained_models/se

2022-08-26 14:02:54,786 - top2vec - INFO - Creating joint document/word embedding
2022-08-26 14:02:54,786 INFO:Creating joint document/word embedding
2022-08-26 14:03:13,196 - top2vec - INFO - Creating lower dimension embedding of documents
2022-08-26 14:03:13,196 INFO:Creating lower dimension embedding of documents
2022-08-26 14:03:29,158 - top2vec - INFO - Finding dense areas of documents
2022-08-26 14:03:29,158 INFO:Finding dense areas of documents
2022-08-26 14:03:29,320 - top2vec - INFO - Finding topics
2022-08-26 14:03:29,320 INFO:Finding topics


[INFO] Original (Non-reduced) Number of Topics: 2.
[WARN] # of topics is pre-specified but non_reduced_num_topics <= num_topics, so not reduced!
   > non_reduced_num_topics:2, given num_topics:4!
[INFO] Topic #00:
     > From Reduced Model:False.
     > Topic Size:7199.
     > Topic Words: ['hoping' 'damn' 'mass' 'crazy' 'abc' 'tweets' 'thing' 'zone' 'let' 'wow'
		 'something' 'car' 'pretty' 'without' 'he' 'these' 'francisco' 'oakland'
		 'tweet' 'forces' 'lmao' 'closed' 'house' 'going' 'anyone' 'there' 'make'
		 'im' 'second' 'then' 'every' 'gt' 'whole' 'had' 'your' 'off' 'twitter'
		 'okay' 'trying' 'now' 'due' 'trippin' 'fuck' 'into' 'little' 'just'
		 'change' 'so' 'should' 'ass']
     > Topic Word Scores: [0.9794671  0.9759551  0.97558117 0.97059864 0.969261   0.96847427
		 0.9670983  0.96033615 0.9595072  0.9581351  0.9568556  0.95136064
		 0.94945693 0.9489452  0.9441134  0.9434247  0.94135416 0.94085383
		 0.93793714 0.9349394  0.9348033  0.9347808  0.9292307  0.9271242
		 0.92

[0826/140333.931813:INFO:headless_shell.cc(660)] Written to file /home/iceking/Desktop/22SS-TUM Lecture Docs/3. Praktikum (IN2106, IN4249)/Repo/topic-modeling-advancements/output/1661515374_top2vec/vis_topic_similarity_matrix.png.


[INFO] Created Topic Similarity Matrix successfully.
[INFO] Creating Labels Per Topic Visualization..


[0826/140334.713071:INFO:headless_shell.cc(660)] Written to file /home/iceking/Desktop/22SS-TUM Lecture Docs/3. Praktikum (IN2106, IN4249)/Repo/topic-modeling-advancements/output/1661515374_top2vec/vis_labels_per_topic.png.


[INFO] Created Labels Per Topic Visualization successfully.
[INFO] Creating Top Words Barchart Visualization..


[0826/140335.446678:INFO:headless_shell.cc(660)] Written to file /home/iceking/Desktop/22SS-TUM Lecture Docs/3. Praktikum (IN2106, IN4249)/Repo/topic-modeling-advancements/output/1661515374_top2vec/vis_top_words_barchart.png.


[INFO] Created Top Words Barchart Visualization successfully.
[INFO] [MAIN RUNNER] Example Config #0 executed successfully.
--------------------------------
[INFO] [MAIN RUNNER] Example Config #1 is running..
[INFO] [PREPROCESSOR] Available Preprocessing Functions in the Module:['to_lowercase', 'standardize_accented_chars', 'remove_url', 'expand_missing_delimiter', 'remove_mentions', 'remove_hashtags', 'keep_only_alphabet', 'remove_new_lines', 'remove_extra_spaces', 'remove_html_tags', 'expand_contractions', 'remove_english_stop_words', 'lemmatize', 'lemmatize_verb', 'lemmatize_noun', 'lemmatize_adjective', 'correct_typo']
[INFO] [PREPROCESSOR] Preprocessing starting..
[INFO] [PREPROCESSOR] These string preprocessing methods will be applied to the data in order:
(  'to_lowercase',
   'standardize_accented_chars',
   'remove_url',
   'remove_mentions',
   'remove_hashtags',
   'keep_only_alphabet')
[INFO] [PREPROCESSOR] Then, these tokenized preprocessing methods will be applied in orde

Batches:   0%|          | 0/642 [00:00<?, ?it/s]

[INFO] Embeddings encoded successfully.
[INFO] Embedding Model created with SentenceTransformer.
[INFO] Tokenizing raw texts...
[INFO] Tokenizing raw texts. Done!
Clustering embeddings ...
[INFO] Getting vector representations for LDA ...
Getting vector representations for LDA. Done!
[INFO] Getting vector representations for BERT ...
[INFO] Getting vector representations for BERT. Done!
Fitting Autoencoder ...
Fitting Autoencoder Done!
Clustering embeddings. Done!
[INFO] Creating Topic Similarity Matrix..


[0826/141210.290350:INFO:headless_shell.cc(660)] Written to file /home/iceking/Desktop/22SS-TUM Lecture Docs/3. Praktikum (IN2106, IN4249)/Repo/topic-modeling-advancements/output/1661515415_lda-bert/vis_topic_similarity_matrix.png.


[INFO] Created Topic Similarity Matrix successfully.
[INFO] Creating Labels Per Topic Visualization..


[0826/141211.362620:INFO:headless_shell.cc(660)] Written to file /home/iceking/Desktop/22SS-TUM Lecture Docs/3. Praktikum (IN2106, IN4249)/Repo/topic-modeling-advancements/output/1661515415_lda-bert/vis_labels_per_topic.png.


[INFO] Created Labels Per Topic Visualization successfully.
[INFO] Creating Top Words Barchart Visualization..


[0826/141212.155371:INFO:headless_shell.cc(660)] Written to file /home/iceking/Desktop/22SS-TUM Lecture Docs/3. Praktikum (IN2106, IN4249)/Repo/topic-modeling-advancements/output/1661515415_lda-bert/vis_top_words_barchart.png.


[INFO] Created Top Words Barchart Visualization successfully.
[INFO] [MAIN RUNNER] Example Config #1 executed successfully.
--------------------------------
[INFO] [MAIN RUNNER] Example Config #2 is running..
[WARN] [PREPROCESSOR] Preprocessing functions are empty or None, given:"[]", preprocessing is skipped.
[INFO] Running with 4 topics.
[WARN] CTM is experimental and does not guarantee reproducibility. Please use with caution!
[INFO] The embedding model folder:"./pretrained_models/sentence-transformers_bert-base-nli-mean-tokens" found, so no need to download.
[INFO] The embedding model folder:"./pretrained_models/sentence-transformers_all-mpnet-base-v2" found, so no need to download.
[INFO] The embedding model folder:"./pretrained_models/sentence-transformers_all-distilroberta-v1" found, so no need to download.
[INFO] The embedding model folder:"./pretrained_models/sentence-transformers_all-MiniLM-L12-v2" found, so no need to download.
[INFO] The embedding model folder:"./pretrained

Batches:   0%|          | 0/64 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

[INFO] Model trained successfully!
[INFO] Creating Labels Per Topic Visualization..


[0826/141507.801916:INFO:headless_shell.cc(660)] Written to file /home/iceking/Desktop/22SS-TUM Lecture Docs/3. Praktikum (IN2106, IN4249)/Repo/topic-modeling-advancements/output/1661515932_ctm/vis_labels_per_topic.png.


[INFO] Created Labels Per Topic Visualization successfully.
[INFO] Creating Top Words Barchart Visualization..


[0826/141508.634853:INFO:headless_shell.cc(660)] Written to file /home/iceking/Desktop/22SS-TUM Lecture Docs/3. Praktikum (IN2106, IN4249)/Repo/topic-modeling-advancements/output/1661515932_ctm/vis_top_words_barchart.png.


[INFO] Created Top Words Barchart Visualization successfully.
[INFO] [MAIN RUNNER] Example Config #2 executed successfully.
--------------------------------
[INFO] [MAIN RUNNER] Example Config #3 is running..
[INFO] [PREPROCESSOR] Available Preprocessing Functions in the Module:['to_lowercase', 'standardize_accented_chars', 'remove_url', 'expand_missing_delimiter', 'remove_mentions', 'remove_hashtags', 'keep_only_alphabet', 'remove_new_lines', 'remove_extra_spaces', 'remove_html_tags', 'expand_contractions', 'remove_english_stop_words', 'lemmatize', 'lemmatize_verb', 'lemmatize_noun', 'lemmatize_adjective', 'correct_typo']
[INFO] [PREPROCESSOR] Preprocessing starting..
[INFO] [PREPROCESSOR] These string preprocessing methods will be applied to the data in order:
(  'to_lowercase',
   'standardize_accented_chars',
   'remove_url',
   'expand_missing_delimiter',
   'remove_mentions',
   'remove_hashtags',
   'remove_new_lines',
   'keep_only_alphabet')
[INFO] [PREPROCESSOR] Then, these t

[0826/141733.520865:INFO:headless_shell.cc(660)] Written to file /home/iceking/Desktop/22SS-TUM Lecture Docs/3. Praktikum (IN2106, IN4249)/Repo/topic-modeling-advancements/output/1661516108_lda/vis_labels_per_topic.png.


[INFO] Created Labels Per Topic Visualization successfully.
[INFO] Creating Top Words Barchart Visualization..


[0826/141734.277837:INFO:headless_shell.cc(660)] Written to file /home/iceking/Desktop/22SS-TUM Lecture Docs/3. Praktikum (IN2106, IN4249)/Repo/topic-modeling-advancements/output/1661516108_lda/vis_top_words_barchart.png.


[INFO] Created Top Words Barchart Visualization successfully.
[INFO] [MAIN RUNNER] Example Config #3 executed successfully.
--------------------------------
[INFO] [MAIN RUNNER] Example Config #4 is running..
[INFO] [PREPROCESSOR] Available Preprocessing Functions in the Module:['to_lowercase', 'standardize_accented_chars', 'remove_url', 'expand_missing_delimiter', 'remove_mentions', 'remove_hashtags', 'keep_only_alphabet', 'remove_new_lines', 'remove_extra_spaces', 'remove_html_tags', 'expand_contractions', 'remove_english_stop_words', 'lemmatize', 'lemmatize_verb', 'lemmatize_noun', 'lemmatize_adjective', 'correct_typo']
[INFO] [PREPROCESSOR] Preprocessing starting..
[INFO] [PREPROCESSOR] These string preprocessing methods will be applied to the data in order:
(  'to_lowercase',
   'standardize_accented_chars',
   'remove_url',
   'expand_missing_delimiter',
   'remove_mentions',
   'remove_hashtags',
   'remove_new_lines',
   'keep_only_alphabet')
[INFO] [PREPROCESSOR] Then, these t

[0826/142429.704912:INFO:headless_shell.cc(660)] Written to file /home/iceking/Desktop/22SS-TUM Lecture Docs/3. Praktikum (IN2106, IN4249)/Repo/topic-modeling-advancements/output/1661516254_nmf/vis_labels_per_topic.png.


[INFO] Created Labels Per Topic Visualization successfully.
[INFO] Creating Top Words Barchart Visualization..


[0826/142430.513487:INFO:headless_shell.cc(660)] Written to file /home/iceking/Desktop/22SS-TUM Lecture Docs/3. Praktikum (IN2106, IN4249)/Repo/topic-modeling-advancements/output/1661516254_nmf/vis_top_words_barchart.png.


[INFO] Created Top Words Barchart Visualization successfully.
[INFO] [MAIN RUNNER] Example Config #4 executed successfully.
--------------------------------
[INFO] [MAIN RUNNER] Example Config #5 is running..
[WARN] [PREPROCESSOR] Preprocessing functions are empty or None, given:"[]", preprocessing is skipped.
[INFO] Running with 10 topics.
[INFO] The embedding model folder:"./pretrained_models/sentence-transformers_all-mpnet-base-v2" found, so no need to download.
[INFO] The embedding model folder:"./pretrained_models/sentence-transformers_all-distilroberta-v1" found, so no need to download.
[INFO] The embedding model folder:"./pretrained_models/sentence-transformers_all-MiniLM-L12-v2" found, so no need to download.
[INFO] The embedding model folder:"./pretrained_models/sentence-transformers_all-MiniLM-L6-v2" found, so no need to download.
[INFO] The embedding model folder:"./pretrained_models/sentence-transformers_paraphrase-multilingual-MiniLM-L12-v2" found, so no need to download.


Batches:   0%|          | 0/1875 [00:00<?, ?it/s]