## Import libraries

In [1]:
import os

# The notebook may be started from <repo>/notebooks; in that case step up to the
# repository root so that ./src and ./images resolve. Comparing the last path
# component (instead of a '/notebooks' suffix) does not hard-code the POSIX
# path separator.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

# Fail early if we are still not in the repository root: the src package is imported below.
assert os.path.exists('./src'), f"[ERROR] The path src not detected in the current directory '{os.getcwd()}'."

print(f'[INFO] Current Directory: "{os.getcwd()}".')

# Output folder for figures. exist_ok=True keeps the cell re-runnable without a
# separate existence check (and without the check-then-create race).
os.makedirs("images", exist_ok=True)

[INFO] Current Directory: "/home/ferdi/topic-modeling-advancements".


In [2]:
%load_ext autoreload
%autoreload 2
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer

from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.dataset.dataset import Dataset
from octis.models.LDA import LDA

import time
import pandas as pd
import numpy as np
import zlib
import pickle as pkl
import sklearn
import pandas as pd
from tabulate import tabulate
import kaleido

## Preencode data with embeddings
from sentence_transformers import SentenceTransformer
## Load data with the data loader from src
from src.utils import load_documents
from src.bertopic_runner import Trainer
from umap import UMAP
from hdbscan import HDBSCAN

2022-07-19 12:27:31.155007: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-19 12:27:31.155065: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [5]:
## Embedding models to try
#list_emb_models = ["all-mpnet-base-v2", "all-distilroberta-v1", "all-MiniLM-L12-v2", "all-MiniLM-L6-v2"]
list_emb_models = ["all-MiniLM-L6-v2"]
list_emb_models.reverse()

## Datasets to try
#list_data_sets = ["crisis_12", "crisis_12_preprocessed", "crisis_1", "crisis_1_preprocessed", "20news"]
list_data_sets = ["crisis_toy"]

## Models to try
list_models = ['lda-bert', "bertopic"]

## Number of topics requested per dataset
nr_topics_dict = {"crisis_1": 12,
                  "crisis_1_preprocessed": 12,
                  "crisis_12": 10,
                  "crisis_12_preprocessed": 4,
                  "20news": 20,
                  "crisis_toy": 2}

## Number of top words per topic; used both in the model params and as the Trainer's topk
TOP_N_WORDS = 10


def build_params(model_name, embedding_name, number_topics):
    """Return the Trainer parameter dict for one experiment.

    Args:
        model_name: 'lda-bert' or 'bertopic'.
        embedding_name: name of the sentence-transformers model, stored under
            'embedding_model'.
        number_topics: number of topics to request from the model.

    Returns:
        A new dict of parameters for the given model.

    Raises:
        ValueError: if model_name is not one of the supported models. Without
            this, an unknown name would leave `params` undefined on the first
            iteration or silently reuse the previous iteration's params.
    """
    if model_name == "lda-bert":
        return {
            'embedding_model': embedding_name,
            'number_topics': number_topics,
            'top_n_words': TOP_N_WORDS,
            'gamma': 15
        }

    if model_name == "bertopic":
        min_docs_per_topic = 15
        return {
            "embedding_model": embedding_name,
            "top_n_words": TOP_N_WORDS,
            "n_gram_range_tuple": (1, 1),
            "min_docs_per_topic": min_docs_per_topic,
            "number_topics": number_topics,
            ## Assign almost all docs to a topic
            "no_noise": True,
            "prob_threshold": 0.01,
            ## Setting min_samples to reduce #docs classified as noise/in topic -1
            "hdbscan_args": {
                "min_cluster_size": min_docs_per_topic,
                "metric": 'euclidean',
                "cluster_selection_method": 'eom',
                "prediction_data": True,
                "min_samples": 1
            },
            "umap_args": {
                "n_neighbors": 15,
                "n_components": 5,
                "min_dist": 0.0,
                "metric": 'cosine',
                "low_memory": False
            }
        }

    raise ValueError(f"Unknown model '{model_name}'; expected 'lda-bert' or 'bertopic'.")


## One (model, df_output_doc_topic, df_output_topic_word) tuple per experiment,
## in loop order: models (outermost) x embeddings x datasets
experiment_results = []

for m in list_models:
    for e in list_emb_models:
        # The encoder depends only on the embedding name, so load it once per
        # embedding instead of once per dataset. It gets its own name so it is not
        # confused with the topic model bound to `model` below.
        encoder = SentenceTransformer(e)

        for dataset in list_data_sets:
            # Build (and validate) the params first so a bad model or dataset name
            # fails before data loading and encoding.
            params = build_params(m, e, nr_topics_dict[dataset])

            print(f'Running experiment for model {m}, dataset {dataset} and embedding {e}.')

            # Load data
            data, labels = load_documents(dataset)

            # Encode data with embedding model
            embeddings = encoder.encode(data, show_progress_bar=True)

            trainer = Trainer(dataset = dataset,
                              model_name = m,
                              params = params,
                              topk = TOP_N_WORDS,
                              bt_embeddings = embeddings,
                              custom_model = None,
                              verbose = True,
                              )

            model, df_output_doc_topic, df_output_topic_word = trainer.train()

            # Keep the fitted model and both output tables for inspection in later cells
            experiment_results.append((model, df_output_doc_topic, df_output_topic_word))

Running experiment for model lda-bert, dataset crisis_toy and embedding all-MiniLM-L6-v2.


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Preprocessing raw texts ...
Preprocessing raw texts. Done!
Clustering embeddings ...
Getting vector representations for LDA ...
Getting vector representations for LDA. Done!
Getting vector representations for BERT ...
Getting vector representations for BERT. Done!
Fitting Autoencoder ...
Fitting Autoencoder Done!
Clustering embeddings. Done!
the words scores are: [[0.03368683718028696, 0.029320024953212728, 0.021002287377833228, 0.020794343938448742, 0.019754626741526306, 0.019754626741526306, 0.018922852983988356, 0.017883135787065917, 0.016635475150758992, 0.01538781451445207], [0.04025423728813559, 0.030508474576271188, 0.02923728813559322, 0.025423728813559324, 0.02076271186440678, 0.019491525423728815, 0.019491525423728815, 0.015254237288135594, 0.014830508474576272, 0.013135593220338982]]
and have length: 2
Running experiment for model bertopic, dataset crisis_toy and embedding all-MiniLM-L6-v2.


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

2022-07-19 09:42:08,855 - BERTopic - Reduced dimensionality
2022-07-19 09:42:08,877 - BERTopic - Clustered reduced embeddings
2022-07-19 09:42:09,054 - BERTopic - Reduced number of topics from 3 to 2


In [6]:
# lda-bert: with the single embedding model and dataset configured above, index 0 is the
# LDA-BERT run because 'lda-bert' comes first in list_models (the outermost loop).
# NOTE(review): this positional lookup silently points at another run if the lists change.
model, df_output_doc_topic, df_output_topic_word = experiment_results[0]

In [8]:
# Display the per-topic table (topic size, top words and their scores) of the run unpacked above.
df_output_topic_word

Unnamed: 0,run_id,method,method_specific_params,dataset,num_given_topics,reduced,topic_num,topic_size,topic_words,word_scores,num_detected_topics,num_final_topics,duration_secs
0,1658216488,lda-bert,"{'embedding_model': 'all-MiniLM-L6-v2', 'numbe...",crisis_toy,2,False,0,278,"[the, ., :, I, in, https, earthquake, to, ,, a]","[0.03368683718028696, 0.029320024953212728, 0....",2,2,10.47802
1,1658216488,lda-bert,"{'embedding_model': 'all-MiniLM-L6-v2', 'numbe...",crisis_toy,2,False,1,118,"[#, ., :, https, !, ,, the, in, to, of]","[0.04025423728813559, 0.030508474576271188, 0....",2,2,10.47802


In [9]:
# bertopic: with the single embedding model and dataset configured above, index 1 is the
# BERTopic run because 'bertopic' comes second in list_models (the outermost loop).
# NOTE(review): this positional lookup silently points at another run if the lists change.
model, df_output_doc_topic, df_output_topic_word = experiment_results[1]

In [15]:
# Display the per-topic table (topic size, top words and their scores) of the run unpacked above.
df_output_topic_word

Unnamed: 0,run_id,method,method_specific_params,dataset,num_given_topics,reduced,topic_num,topic_size,topic_words,word_scores,num_detected_topics,num_final_topics,duration_secs
0,1658216513,bertopic,"{'embedding_model': 'all-MiniLM-L6-v2', 'top_n...",crisis_toy,2,True,0,296,"[the, co, https, to, in, is, flood, debbie, of...","[0.106000510582472, 0.08556013489946064, 0.085...",3,2,15.908521
1,1658216513,bertopic,"{'embedding_model': 'all-MiniLM-L6-v2', 'top_n...",crisis_toy,2,True,1,100,"[earthquake, was, an, that, just, felt, in, th...","[0.4025949207930253, 0.21330151994995877, 0.18...",3,2,15.908521


In [16]:
# Inspect the clustering model attached to the fitted topic model, to check which
# HDBSCAN settings (e.g. min_cluster_size, min_samples) were actually applied.
model.hdbscan_model

HDBSCAN(min_cluster_size=15, min_samples=1, prediction_data=True)

## Parametric Run: BERTopic

In [21]:
import nltk

# Fetch the NLTK corpora requested by this notebook (stop-word list and WordNet data).
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Kept as the trailing expression so the cell still displays the download status.
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /home/ferdi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ferdi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ferdi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

If we set `hdbscan_args` to `None`, KMeans is used for clustering instead of HDBSCAN. That way we can enforce that no documents end up in a noise cluster.
As a consequence, there are also no meaningful assignment scores, since KMeans does not produce any (they are set to 1 by default).

In [7]:
# Configuration of the parametric BERTopic run on the toy crisis dataset.
configs = {
    "dataset": "crisis_toy",
    # Names of the preprocessing steps handed to preprocessor.run.
    "preprocessing_funcs": [
        "to_lowercase",
        "standardize_accented_chars",
        "remove_url",
        "expand_contractions",
        "remove_mentions",
        "remove_hashtags",
        "remove_new_lines",
        "keep_only_alphabet",
        # "remove_extra_spaces" is currently disabled
        "remove_english_stop_words",
        "lemmatize_noun",
    ],
    "algorithm": "bertopic",
    "algorithm_args": {
        "embedding_model": "all-MiniLM-L6-v2",
        "top_n_words": 10,
        "n_gram_range_tuple": (1, 1),
        # Same value as hdbscan_args["min_cluster_size"] below
        "min_docs_per_topic": 15,
        "number_topics": 4,
        # Disabled options that would assign almost all docs to a topic:
        #   "no_noise": True,
        #   "prob_threshold": 0.01,
        # min_samples can be set to reduce the number of docs classified as
        # noise (topic -1); it is currently left unset:
        #   "min_samples": 15
        "hdbscan_args": dict(
            min_cluster_size=15,
            metric="euclidean",
            cluster_selection_method="eom",
            prediction_data=True,
        ),
        "umap_args": dict(
            n_neighbors=15,
            n_components=5,
            min_dist=0.0,
            metric="cosine",
            low_memory=False,
            random_state=42,
        ),
    },
}

In [8]:
## Possible workaround for the huggingface/tokenizers fork warning shown in the
## output below (currently not enabled):
#import os
#os.environ["TOKENIZERS_PARALLELISM"] = "false"

from src import preprocessor
from src.utils import load_documents

# This cell runs BERTopic only. Fail loudly on a mismatching config instead of
# silently skipping the training and leaving stale `model` / `df_output_*`
# variables from earlier cells in scope.
if configs['algorithm'] != 'bertopic':
    raise ValueError(f"This cell runs 'bertopic', but configs['algorithm'] is '{configs['algorithm']}'.")

docs, labels = load_documents(dataset=configs['dataset'])

if 'preprocessing_funcs' in configs:
    docs = preprocessor.run(data=docs, prep_functions=configs['preprocessing_funcs'])

# Build a new dict rather than update()-ing configs['algorithm_args'] in place,
# so the config cell's dict is not polluted with the full corpus and labels.
algorithm_args = {
    **configs['algorithm_args'],
    'data_name': configs['dataset'],
    'docs': docs,
    'labels': labels,
}
print(f'Running with {algorithm_args["number_topics"]} topics')

# Encode data with embedding model. The encoder has its own name so that `model`
# only ever refers to the fitted topic model returned by the Trainer.
embedding_model = SentenceTransformer(algorithm_args['embedding_model'])
embeddings = embedding_model.encode(docs, show_progress_bar=True)

trainer = Trainer(dataset = configs['dataset'],
                  model_name = configs['algorithm'],
                  params = algorithm_args,
                  topk = algorithm_args["top_n_words"],
                  bt_embeddings = embeddings,
                  )

model, df_output_doc_topic, df_output_topic_word = trainer.train()

[INFO] Available Preprocessing Functions in the Module:['to_lowercase', 'standardize_accented_chars', 'remove_url', 'expand_contractions', 'remove_mentions', 'remove_hashtags', 'keep_only_alphabet', 'remove_new_lines', 'remove_extra_spaces', 'remove_html_tags', 'remove_english_stop_words', 'lemmatize', 'lemmatize_verb', 'lemmatize_noun', 'lemmatize_adjective']
[INFO] Preprocessing starting..
[INFO] These string preprocessing methods will be applied to the data in order:
(  'to_lowercase',
   'standardize_accented_chars',
   'remove_url',
   'expand_contractions',
   'remove_mentions',
   'remove_hashtags',
   'remove_new_lines',
   'keep_only_alphabet')
[INFO] Then, these tokenized preprocessing methods will be applied to the data in order:
(  '__tokenize',
   'remove_english_stop_words',
   'lemmatize_noun',
   '__glue')
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `toke

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

2022-07-19 12:30:06,358 - BERTopic - Reduced dimensionality
2022-07-19 12:30:06,386 - BERTopic - Clustered reduced embeddings
2022-07-19 12:30:06,999 - BERTopic - Reduced number of topics from 5 to 5


In [11]:
# Display the per-topic table of the parametric BERTopic run; a row with topic_num -1
# is the noise/outlier topic.
df_output_topic_word

Unnamed: 0,run_id,method,method_specific_params,dataset,num_given_topics,reduced,topic_num,topic_size,topic_words,word_scores,num_detected_topics,num_final_topics,duration_secs
0,1658226595,bertopic,"{'embedding_model': 'all-MiniLM-L6-v2', 'top_n...",crisis_toy,4,True,0,114,"[flood, rain, water, basement, flash, heavy, f...","[0.17884271362411105, 0.05607179255504992, 0.0...",5,5,11.276808
1,1658226595,bertopic,"{'embedding_model': 'all-MiniLM-L6-v2', 'top_n...",crisis_toy,4,True,1,102,"[earthquake, felt, feel, small, twitter, sf, t...","[0.4155759390043396, 0.13284892105804227, 0.06...",5,5,11.276808
2,1658226595,bertopic,"{'embedding_model': 'all-MiniLM-L6-v2', 'top_n...",crisis_toy,4,True,2,95,"[smoke, wildfire, fire, smell, california, lik...","[0.1808532212759992, 0.15586154467642044, 0.07...",5,5,11.276808
3,1658226595,bertopic,"{'embedding_model': 'all-MiniLM-L6-v2', 'top_n...",crisis_toy,4,True,3,83,"[debbie, cyclone, school, ex, closed, cancelle...","[0.2168539266597446, 0.21021521924112715, 0.06...",5,5,11.276808
4,1658226595,bertopic,"{'embedding_model': 'all-MiniLM-L6-v2', 'top_n...",crisis_toy,4,True,-1,2,"[windy, shitttttt, zacharylevi, holyy, wonder,...","[0.5348637446613509, 0.5348637446613509, 0.534...",5,5,11.276808


## Parametric Run: LDA-BERT

In [22]:
# Configuration of the parametric LDA-BERT run on the toy crisis dataset.
configs = {
    "dataset": "crisis_toy",
    # Names of the preprocessing steps handed to preprocessor.run.
    "preprocessing_funcs": [
        "to_lowercase",
        "standardize_accented_chars",
        "remove_url",
        "expand_contractions",
        "expand_missing_delimiter",
        "remove_mentions",
        "remove_hashtags",
        "remove_new_lines",
        "keep_only_alphabet",
        # "remove_extra_spaces" is currently disabled
        "remove_english_stop_words",
        "lemmatize_noun",
        "correct_typo",
    ],
    "algorithm": "lda-bert",
    "algorithm_args": dict(
        embedding_model="all-MiniLM-L6-v2",
        number_topics=3,
        top_n_words=10,
        gamma=15,
    ),
}

In [23]:
from src import preprocessor
from src.utils import load_documents

# This cell runs LDA-BERT only. Fail loudly on a mismatching config instead of
# silently skipping the training and leaving stale `model` / `df_output_*`
# variables from earlier cells in scope.
if configs['algorithm'] != 'lda-bert':
    raise ValueError(f"This cell runs 'lda-bert', but configs['algorithm'] is '{configs['algorithm']}'.")

docs, labels = load_documents(dataset=configs['dataset'])

if 'preprocessing_funcs' in configs:
    docs = preprocessor.run(data=docs, prep_functions=configs['preprocessing_funcs'])

# Build a new dict rather than update()-ing configs['algorithm_args'] in place,
# so the config cell's dict is not polluted with the full corpus and labels.
algorithm_args = {
    **configs['algorithm_args'],
    'data_name': configs['dataset'],
    'docs': docs,
    'labels': labels,
}
print(f'Running with {algorithm_args["number_topics"]} topics')

# Encode data with embedding model. The encoder has its own name so that `model`
# only ever refers to the fitted topic model returned by the Trainer.
embedding_model = SentenceTransformer(algorithm_args['embedding_model'])
embeddings = embedding_model.encode(docs, show_progress_bar=True)

trainer = Trainer(dataset = configs['dataset'],
                  model_name = configs['algorithm'],
                  params = algorithm_args,
                  topk = algorithm_args["top_n_words"],
                  bt_embeddings = embeddings,
                  )

model, df_output_doc_topic, df_output_topic_word = trainer.train()

[INFO] Available Preprocessing Functions in the Module:['to_lowercase', 'standardize_accented_chars', 'remove_url', 'expand_contractions', 'expand_missing_delimiter', 'remove_mentions', 'remove_hashtags', 'keep_only_alphabet', 'remove_new_lines', 'remove_extra_spaces', 'remove_html_tags', 'remove_english_stop_words', 'lemmatize', 'lemmatize_verb', 'lemmatize_noun', 'lemmatize_adjective', 'correct_typo']
[INFO] Preprocessing starting..
[INFO] These string preprocessing methods will be applied to the data in order:
(  'to_lowercase',
   'standardize_accented_chars',
   'remove_url',
   'expand_contractions',
   'expand_missing_delimiter',
   'remove_mentions',
   'remove_hashtags',
   'remove_new_lines',
   'keep_only_alphabet')
[INFO] Then, these tokenized preprocessing methods will be applied to the data in order:
(  '__tokenize',
   'remove_english_stop_words',
   'lemmatize_noun',
   'correct_typo',
   '__glue')
huggingface/tokenizers: The current process just got forked, after paral

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Preprocessing raw texts ...
Preprocessing raw texts. Done!
Clustering embeddings ...
Getting vector representations for LDA ...
Getting vector representations for LDA. Done!
Getting vector representations for BERT ...
Getting vector representations for BERT. Done!
Fitting Autoencoder ...
Fitting Autoencoder Done!
Clustering embeddings. Done!
the words scores are: [[0.09523809523809523, 0.023809523809523808, 0.022675736961451247, 0.015873015873015872, 0.015873015873015872, 0.013605442176870748, 0.013605442176870748, 0.01020408163265306, 0.009070294784580499, 0.007936507936507936], [0.03349673202614379, 0.03022875816993464, 0.028594771241830064, 0.0196078431372549, 0.0196078431372549, 0.01715686274509804, 0.014705882352941176, 0.00980392156862745, 0.008986928104575163, 0.008986928104575163], [0.055165496489468405, 0.05215646940822467, 0.03610832497492478, 0.01905717151454363, 0.013039117352056168, 0.011033099297893681, 0.010030090270812437, 0.010030090270812437, 0.009027081243731194, 0.0

In [19]:
# Display the per-topic table (topic size, top words and their scores) of the parametric LDA-BERT run.
df_output_topic_word

Unnamed: 0,run_id,method,method_specific_params,dataset,num_given_topics,reduced,topic_num,topic_size,topic_words,word_scores,num_detected_topics,num_final_topics,duration_secs
0,1658231276,lda-bert,"{'embedding_model': 'all-MiniLM-L6-v2', 'numbe...",crisis_toy,3,False,0,181,"[debbie, cyclone, smoke, wildfire, earthquake,...","[0.041499330655957165, 0.038821954484605084, 0...",3,3,9.363553
1,1658231276,lda-bert,"{'embedding_model': 'all-MiniLM-L6-v2', 'numbe...",crisis_toy,3,False,1,102,"[flood, cyclone, debbie, rain, smoke, work, fl...","[0.06338028169014084, 0.01643192488262911, 0.0...",3,3,9.363553
2,1658231276,lda-bert,"{'embedding_model': 'all-MiniLM-L6-v2', 'numbe...",crisis_toy,3,False,2,113,"[earthquake, flood, fire, felt, smoke, califor...","[0.06869220607661823, 0.034346103038309116, 0....",3,3,9.363553


In [20]:
# Display the per-document table: preprocessed text, real label, assigned topic and assignment score.
df_output_doc_topic

Unnamed: 0,run_id,Document ID,Document,Real Label,Assigned Topic Num,Assignment Score
0,1658231276,0,thereformedcrow nah going go earthquake,earthquake,2,1
1,1658231276,1,think earthquake,earthquake,2,1
2,1658231276,2,uhh else felt earthquake though,earthquake,2,1
3,1658231276,3,bay area nice size earthquake,earthquake,0,1
4,1658231276,4,thought dad farting turn earthquake,earthquake,2,1
...,...,...,...,...,...,...
391,1658231276,391,people keep asking good safe live cyclone debb...,hurricane,0,1
392,1658231276,392,ayyeeee work got cancelled flood thank cyclone...,hurricane,1,1
393,1658231276,393,jetstarairways helpful need change flight due ...,hurricane,0,1
394,1658231276,394,getting hit ex tropical cyclone named debbie r...,hurricane,0,1
