## Import libraries

In [1]:
import os

# When the kernel starts inside a `notebooks` folder, move up to its parent so
# the relative paths used below (./src, ./images) resolve from there.
# Comparing the last path component instead of the '/notebooks' suffix also
# works with Windows path separators.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

# Fail fast when the `src` package directory is not reachable from the working directory.
assert os.path.exists('./src'), f"[ERROR] The path src not detected in the current directory '{os.getcwd()}'."

print(f'[INFO] Current Directory: "{os.getcwd()}".')

# Ensure an `images` directory exists in the working directory; exist_ok keeps
# the cell re-runnable without a separate existence check.
os.makedirs("images", exist_ok=True)

[INFO] Current Directory: "/home/iceking/Desktop/22SS-TUM Lecture Docs/3. Praktikum (IN2106, IN4249)/Repo/topic-modeling-advancements".


In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `src` package) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2


from src import preprocessor
from src.utils import load_documents
import nltk
# Download the NLTK resources used in this notebook; NLTK reports
# "already up-to-date" when nothing needs to be fetched.
# Presumably required by the stop-word removal and lemmatization steps of
# `preprocessor` - TODO confirm.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/iceking/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/iceking/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/iceking/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Parametric run: BERTopic

If we set `hdbscan_args` to `None`, KMeans is used for clustering. That way we can guarantee that there are no noise clusters/documents.
As a consequence, we also have no assignment scores, since KMeans does not produce any (they are set to 1 by default).

In [21]:
# Experiment configuration for the BERTopic run.
configs = {
    "dataset": "crisis_12",
    # Preprocessing function names handed to `preprocessor.run`.
    "preprocessing_funcs": [
        "to_lowercase",
        "standardize_accented_chars",
        "remove_url",
        "expand_contractions",
        "remove_mentions",
        "remove_hashtags",
        "remove_new_lines",
        "keep_only_alphabet",
        # "remove_extra_spaces",   (disabled)
        "remove_english_stop_words",
        "lemmatize_noun",
        # "correct_typo",          (disabled)
    ],
    "algorithm": "bertopic",
    "algorithm_args": {
        # Alternative embedding models that were tried:
        #   "all-MiniLM-L6-v2", "all-distilroberta-v1", "doc2vec"
        "embedding_model": "paraphrase-multilingual-MiniLM-L12-v2",
        "top_n_words": 10,
        "n_gram_range_tuple": (1, 1),
        # Original note: "both the same as below" - presumably this value is
        # meant to mirror hdbscan_args["min_cluster_size"]; confirm.
        "min_docs_per_topic": 15,
        "number_topics": 4,
        # Either "hdbscan" or "kmeans". With "kmeans" the hdbscan_args are not
        # needed; if present they are ignored.
        "cluster_model": "hdbscan",
        "hdbscan_args": {
            "min_cluster_size": 15,
            "metric": "euclidean",
            "cluster_selection_method": "eom",
            "prediction_data": True,
            # "min_samples": 15,   (disabled)
        },
        "umap_args": {
            "n_neighbors": 15,
            "n_components": 5,
            "min_dist": 0.0,
            "metric": "cosine",
            "low_memory": False,
            "random_state": 42,
        },
    },
}

In [22]:
# Load the documents and their labels for the dataset named in the config.
docs, labels = load_documents(dataset=configs['dataset'])

# Preprocessing is optional: it only runs when the config lists the functions to apply.
if 'preprocessing_funcs' in configs:
    docs = preprocessor.run(
        data=docs,
        prep_functions=configs['preprocessing_funcs'],
    )


[INFO] [PREPROCESSOR] Available Preprocessing Functions in the Module:['to_lowercase', 'standardize_accented_chars', 'remove_url', 'expand_missing_delimiter', 'remove_mentions', 'remove_hashtags', 'keep_only_alphabet', 'remove_new_lines', 'remove_extra_spaces', 'remove_html_tags', 'expand_contractions', 'remove_english_stop_words', 'lemmatize', 'lemmatize_verb', 'lemmatize_noun', 'lemmatize_adjective', 'correct_typo']
[INFO] [PREPROCESSOR] Preprocessing starting..
[INFO] [PREPROCESSOR] These string preprocessing methods will be applied to the data in order:
(  'to_lowercase',
   'standardize_accented_chars',
   'remove_url',
   'remove_mentions',
   'remove_hashtags',
   'remove_new_lines',
   'keep_only_alphabet')
[INFO] [PREPROCESSOR] Then, these tokenized preprocessing methods will be applied in order:
(  '__tokenize',
   'expand_contractions',
   'remove_english_stop_words',
   'lemmatize_noun',
   '__glue')
[INFO] [PREPROCESSOR] Preprocessing completed in 4.883 seconds..


In [17]:
from src.bertopic_runner import BertopicTrainer
from src.evaluator import compute_topic_scores

# Build the trainer parameters as a new dict rather than calling .update() on
# configs['algorithm_args']: the in-place update injected the full `docs` and
# `labels` objects into the shared `configs`, changing it as a side effect of
# running this cell.
algorithm_args = {
    **configs['algorithm_args'],
    'data_name': configs['dataset'],
    'docs': docs,
    'labels': labels,
}
print(f'Running with {algorithm_args["number_topics"]} topics')

# Only train when the active config is the BERTopic one; with any other config
# this cell leaves `model` and the output frames untouched.
if configs['algorithm'] == 'bertopic':
    trainer = BertopicTrainer(dataset=configs['dataset'],
                              model_name=configs['algorithm'],
                              params=algorithm_args)

    # train() returns the fitted model plus the per-document and per-topic tables.
    model, df_output_doc_topic, df_output_topic_word = trainer.train()

    # Replace the per-topic table with the version returned by the evaluator
    # (presumably with the evaluation score columns added - confirm in src.evaluator).
    df_output_topic_word = compute_topic_scores(df_output_doc_topic, df_output_topic_word)

Running with 4 topics
[INFO] The embedding model folder:"./pretrained_models/sentence-transformers_all-mpnet-base-v2" found, so no need to download.
[INFO] The embedding model folder:"./pretrained_models/sentence-transformers_all-distilroberta-v1" found, so no need to download.
[INFO] The embedding model folder:"./pretrained_models/sentence-transformers_all-MiniLM-L12-v2" found, so no need to download.
[INFO] The embedding model folder:"./pretrained_models/sentence-transformers_all-MiniLM-L6-v2" found, so no need to download.
[INFO] The embedding model folder:"./pretrained_models/sentence-transformers_paraphrase-multilingual-MiniLM-L12-v2" found, so no need to download.
[INFO] Embeddings are being encoded..


Batches:   0%|          | 0/250 [00:00<?, ?it/s]

[INFO] Embeddings encoded successfully.
[INFO] Embedding Model created with SentenceTransformer.


2022-07-21 12:14:36,982 - BERTopic - Reduced dimensionality
2022-07-21 12:14:37,275 - BERTopic - Clustered reduced embeddings
2022-07-21 12:14:37,879 - BERTopic - Reduced number of topics from 112 to 5


In [None]:
# Labels per topic, limited to 10 topics via top_n_topics.
from src import visualizer

visualizer.visualize_labels_per_topic(
    df_output_doc_topic=df_output_doc_topic,
    df_output_topic_word=df_output_topic_word,
    top_n_topics=10,
)

In [35]:
# Heatmap for the trained model. `topics`, `top_n_topics` and `n_clusters` are
# all passed as None (n_clusters: unknown for now).
visualizer.visualize_heatmap(
    model=model,
    df_output_doc_topic=df_output_doc_topic,
    df_output_topic_word=df_output_topic_word,
    topics=None,
    top_n_topics=None,
    n_clusters=None,
)

In [None]:
# 2-D UMAP scatter plot of the run. The return value is bound to `_` so the
# cell does not echo it; `target_dir` is presumably where the figure is
# written - confirm in src.visualizer.
_ = visualizer.draw_umap2d_scatter_plot(
    model=model,
    df_output_topic_word=df_output_topic_word,
    df_output_doc_topic=df_output_doc_topic,
    target_dir='./output/visualization',
)

In [None]:
# Bar chart of the topic words, requesting 5 words via n_words.
visualizer.visualize_barchart(
    df_output_topic_word=df_output_topic_word,
    n_words=5,
)

In [None]:
from src.visualizer import draw_representative_docs
# Build the representative-documents view with top_n_docs=3 (presumably the
# 3 most representative documents per topic - confirm in src.visualizer) and
# show it as the cell output.
x = draw_representative_docs(df_output_doc_topic,top_n_docs=3)
x

## Parametric Run: LDA-BERT

In [18]:
# Experiment configuration for the LDA-BERT run.
configs = {
    "dataset": "crisis_12",
    # Preprocessing function names handed to `preprocessor.run`.
    "preprocessing_funcs": [
        "to_lowercase",
        "standardize_accented_chars",
        "remove_url",
        "expand_contractions",
        "expand_missing_delimiter",
        "remove_mentions",
        "remove_hashtags",
        "remove_new_lines",
        "keep_only_alphabet",
        # "remove_extra_spaces",   (disabled)
        "remove_english_stop_words",
        "lemmatize_noun",
        # "correct_typo",          (disabled)
    ],
    "algorithm": "lda-bert",
    "algorithm_args": {
        # Alternative embedding model that was tried: "all-MiniLM-L6-v2"
        "embedding_model": "paraphrase-multilingual-MiniLM-L12-v2",
        "number_topics": 4,
        "top_n_words": 10,
        "gamma": 15,
        "random_state": 42,
    },
}

In [19]:
from src import preprocessor
from src.utils import load_documents
# BertopicTrainer was previously imported only by the BERTopic section, so this
# cell raised NameError on a fresh kernel when that section had not been run.
from src.bertopic_runner import BertopicTrainer
from src.evaluator import compute_topic_scores

# Load the documents and labels for the configured dataset, then apply the
# preprocessing functions when the config lists any.
docs, labels = load_documents(dataset=configs['dataset'])

if 'preprocessing_funcs' in configs:
    docs = preprocessor.run(data=docs, prep_functions=configs['preprocessing_funcs'])

# Build the trainer parameters as a new dict rather than calling .update() on
# configs['algorithm_args'], so the shared `configs` is not modified (the
# in-place update injected the full `docs` and `labels` objects into it).
algorithm_args = {
    **configs['algorithm_args'],
    'data_name': configs['dataset'],
    'docs': docs,
    'labels': labels,
}
print(f'Running with {algorithm_args["number_topics"]} topics')

# Only train when the active config is the LDA-BERT one; with any other config
# this cell leaves `model` and the output frames untouched.
if configs['algorithm'] == 'lda-bert':
    print('[WARN] Lda-Bert is experimental and use with caution!')

    # NOTE(review): the LDA-BERT run goes through BertopicTrainer with
    # model_name='lda-bert' - confirm that the trainer dispatches on model_name.
    trainer = BertopicTrainer(dataset=configs['dataset'],
                              model_name=configs['algorithm'],
                              params=algorithm_args)

    # train() returns the fitted model plus the per-document and per-topic tables.
    model, df_output_doc_topic, df_output_topic_word = trainer.train()

    # Replace the per-topic table with the version returned by the evaluator
    # (presumably with the evaluation score columns added - confirm in src.evaluator).
    df_output_topic_word = compute_topic_scores(df_output_doc_topic, df_output_topic_word)

[INFO] Available Preprocessing Functions in the Module:['to_lowercase', 'standardize_accented_chars', 'remove_url', 'expand_missing_delimiter', 'remove_mentions', 'remove_hashtags', 'keep_only_alphabet', 'remove_new_lines', 'remove_extra_spaces', 'remove_html_tags', 'expand_contractions', 'remove_english_stop_words', 'lemmatize', 'lemmatize_verb', 'lemmatize_noun', 'lemmatize_adjective', 'correct_typo']
[INFO] Preprocessing starting..
[INFO] These string preprocessing methods will be applied to the data in order:
(  'to_lowercase',
   'standardize_accented_chars',
   'remove_url',
   'expand_missing_delimiter',
   'remove_mentions',
   'remove_hashtags',
   'remove_new_lines',
   'keep_only_alphabet')
[INFO] Then, these tokenized preprocessing methods will be applied to the data in order:
(  '__tokenize',
   'expand_contractions',
   'remove_english_stop_words',
   'lemmatize_noun',
   '__glue')
[INFO] Preprocessing completed in 6.242 seconds..
Running with 4 topics
[INFO] The embeddin

Batches:   0%|          | 0/250 [00:00<?, ?it/s]

[INFO] Embeddings encoded successfully.
[INFO] Embedding Model created with SentenceTransformer.
[INFO] Tokenizing raw texts...
[INFO] Tokenizing raw texts. Done!
Clustering embeddings ...
[INFO] Getting vector representations for LDA ...
Getting vector representations for LDA. Done!
[INFO] Getting vector representations for BERT ...
[INFO] Getting vector representations for BERT. Done!
Fitting Autoencoder ...


2022-07-21 12:17:27.059464: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-07-21 12:17:27.059487: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-07-21 12:17:27.059504: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (iceking-ThinkPad-T490): /proc/driver/nvidia/version does not exist
2022-07-21 12:17:27.059686: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Fitting Autoencoder Done!
Clustering embeddings. Done!


In [20]:
# Display the per-topic result table (one row per topic: size, topic words,
# word scores and the evaluation columns) as the cell output.
df_output_topic_word

Unnamed: 0,run_id,method,method_specific_params,dataset,num_given_topics,reduced,topic_num,topic_size,topic_words,word_scores,num_detected_topics,num_final_topics,duration_secs,diversity_unique,diversity_inv_rbo,coherence_npmi,coherence_v,rand_index
0,1658398610,lda-bert,{'embedding_model': 'paraphrase-multilingual-M...,crisis_12,4,False,0,2461,"[flood, wildfire, california, fort, mcmurray, ...","[0.0549645390070922, 0.04284071305347901, 0.02...",4,4,59.277545,0.75,0.756802,-0.067271,0.376991,0.745998
1,1658398610,lda-bert,{'embedding_model': 'paraphrase-multilingual-M...,crisis_12,4,False,1,1275,"[flood, wildfire, california, rain, smoke, hea...","[0.028576782535369626, 0.025297479621474748, 0...",4,4,59.277545,0.75,0.756802,-0.067271,0.376991,0.745998
2,1658398610,lda-bert,{'embedding_model': 'paraphrase-multilingual-M...,crisis_12,4,False,2,2027,"[cyclone, debbie, flood, ex, queensland, schoo...","[0.06512013256006628, 0.06484396575531622, 0.0...",4,4,59.277545,0.75,0.756802,-0.067271,0.376991,0.745998
3,1658398610,lda-bert,{'embedding_model': 'paraphrase-multilingual-M...,crisis_12,4,False,3,2237,"[earthquake, felt, feel, flood, like, californ...","[0.13113254615000833, 0.02286712123731914, 0.0...",4,4,59.277545,0.75,0.756802,-0.067271,0.376991,0.745998


In [18]:
# Display the per-document table (document text, real label, assigned topic
# number and assignment score) as the cell output.
# NOTE(review): the saved output of this cell carries run_id 1658359258, while
# the saved topic-word table shows run_id 1658398610 - it appears to come from
# an earlier run; re-run the cell to refresh it.
df_output_doc_topic

Unnamed: 0,run_id,Document ID,Document,Real Label,Assigned Topic Num,Assignment Score
0,1658359258,0,thereformedcrow nah going to go earthquake,earthquake,2,1
1,1658359258,1,think earthquake,earthquake,2,1
2,1658359258,2,uhh else felt earthquake though,earthquake,2,1
3,1658359258,3,bay area nice size earthquake,earthquake,2,1
4,1658359258,4,thought dad farting turn earthquake,earthquake,2,1
...,...,...,...,...,...,...
7995,1658359258,7995,due severity weather ex tropical cyclone debbi...,hurricane,3,1
7996,1658359258,7996,wot wet weather plan today,hurricane,1,1
7997,1658359258,7997,folk rain south east queensland coming ex cycl...,hurricane,1,1
7998,1658359258,7998,ahh man realised put bin weather low blow ex t...,hurricane,1,1
