## Import libraries

In [2]:
import os

# The notebook can be started either from the repository root or from its
# `notebooks` sub-folder. Step up one level in the latter case so that the
# relative `src` and `images` paths used below resolve.
# basename() compares the last path component with the platform's own
# separator rules instead of a hard-coded '/'.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

# Fail early with a readable message when the project layout is not found.
assert os.path.exists('./src'), f"[ERROR] The path src not detected in the current directory '{os.getcwd()}'."

print(f'[INFO] Current Directory: "{os.getcwd()}".')

# Create the `images` folder if it is missing (presumably the target for
# exported figures -- confirm against the plotting cells). exist_ok=True makes
# the call safe to re-run and avoids the check-then-create race.
os.makedirs("images", exist_ok=True)

[INFO] Current Directory: "/home/ferdi/topic-modeling-advancements".


In [3]:
%load_ext autoreload
%autoreload 2
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer

from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.dataset.dataset import Dataset
from octis.models.LDA import LDA

import time
import pandas as pd
import numpy as np
import zlib
import pickle as pkl
import sklearn
import pandas as pd
from tabulate import tabulate
import kaleido

## Preencode data with embeddings
from sentence_transformers import SentenceTransformer
## Load data with dataloader from src
from src.utils import load_documents
from src.bertopic_runner import Trainer
from umap import UMAP
from hdbscan import HDBSCAN

2022-07-20 11:21:11.864820: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-20 11:21:11.864957: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Parametric run: BERTopic

In [21]:
import nltk

# Corpora fetched up front; presumably consumed by the stop-word removal and
# lemmatization steps of the preprocessing module -- confirm against src.
NLTK_RESOURCES = ('stopwords', 'wordnet', 'omw-1.4')

# nltk.download() reports a failed download through a False return value
# instead of raising, so collect the failures and make them visible here.
# Only a warning is printed: resources may already be available locally even
# when the download itself could not be completed.
failed_downloads = [name for name in NLTK_RESOURCES if not nltk.download(name)]
if failed_downloads:
    print(f'[WARN] NLTK resources that failed to download: {failed_downloads}')

[nltk_data] Downloading package stopwords to /home/ferdi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ferdi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ferdi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

If `hdbscan_args` is set to `None`, KMeans is used for clustering. That way we can enforce that there are no noise clusters/documents.
As a consequence, there are no real assignment scores either, since KMeans does not produce any (the score is set to 1 by default).

In [4]:
# Experiment definition for the BERTopic run on the `crisis_toy` dataset.
# Keys are declared in the same order as before so that any serialized form of
# the parameters stays unchanged.
configs = dict(
    dataset='crisis_toy',
    # Names of the preprocessing steps handed to the preprocessor.
    preprocessing_funcs=[
        'to_lowercase',
        'standardize_accented_chars',
        'remove_url',
        'expand_contractions',
        'remove_mentions',
        'remove_hashtags',
        'remove_new_lines',
        'keep_only_alphabet',
        # 'remove_extra_spaces' is intentionally left disabled.
        'remove_english_stop_words',
        'lemmatize_noun',
    ],
    algorithm='bertopic',
    algorithm_args=dict(
        embedding_model="all-MiniLM-L6-v2",
        top_n_words=10,
        n_gram_range_tuple=(1, 1),
        # Kept equal to hdbscan_args["min_cluster_size"] below.
        min_docs_per_topic=15,
        number_topics=4,
        # Disabled switches to assign almost all docs to a topic:
        #   no_noise=True,
        #   prob_threshold=0.01,
        hdbscan_args=dict(
            min_cluster_size=15,
            metric='euclidean',
            cluster_selection_method='eom',
            prediction_data=True,
            # Disabled: set min_samples to reduce the number of docs
            # classified as noise (topic -1):
            #   min_samples=15,
        ),
        umap_args=dict(
            n_neighbors=15,
            n_components=5,
            min_dist=0.0,
            metric='cosine',
            low_memory=False,
            random_state=42,
        ),
    ),
)

In [5]:
# NOTE: the original author left a hint that tokenizer parallelism may need to
# be disabled. If so, set os.environ["TOKENIZERS_PARALLELISM"] = "false"
# before the embedding model is created.

from src import preprocessor
from src.evaluator import compute_topic_scores
from src.utils import load_documents

# Load the raw documents and their ground-truth labels for the configured dataset.
docs, labels = load_documents(dataset=configs['dataset'])

# Preprocessing is optional: it only runs when the config lists functions.
if 'preprocessing_funcs' in configs:
    docs = preprocessor.run(data=docs, prep_functions=configs['preprocessing_funcs'])

# Build the trainer parameters as a NEW dict. Updating
# configs['algorithm_args'] in place would inject the whole corpus and the
# labels into the config defined in the previous cell.
algorithm_args = {
    **configs['algorithm_args'],
    'data_name': configs['dataset'],
    'docs': docs,
    'labels': labels,
}
print(f'Running with {algorithm_args["number_topics"]} topics')

if configs['algorithm'] == 'bertopic':
    # Encode the documents once up front and hand the embeddings to the trainer.
    # The encoder gets its own name so that `model` only ever refers to the
    # trained topic model returned by trainer.train().
    encoder = SentenceTransformer(algorithm_args['embedding_model'])
    embeddings = encoder.encode(docs, show_progress_bar=True)

    trainer = Trainer(dataset = configs['dataset'],
                      model_name = configs['algorithm'],
                      params = algorithm_args,
                      topk = algorithm_args["top_n_words"],
                      bt_embeddings = embeddings,
                      )

    model, df_output_doc_topic, df_output_topic_word = trainer.train()

    # Replace the per-topic table with the result of compute_topic_scores.
    df_output_topic_word = compute_topic_scores(df_output_doc_topic, df_output_topic_word)

[INFO] Available Preprocessing Functions in the Module:['to_lowercase', 'standardize_accented_chars', 'remove_url', 'expand_missing_delimiter', 'remove_mentions', 'remove_hashtags', 'keep_only_alphabet', 'remove_new_lines', 'remove_extra_spaces', 'remove_html_tags', 'expand_contractions', 'remove_english_stop_words', 'lemmatize', 'lemmatize_verb', 'lemmatize_noun', 'lemmatize_adjective', 'correct_typo']
[INFO] Preprocessing starting..
[INFO] These string preprocessing methods will be applied to the data in order:
(  'to_lowercase',
   'standardize_accented_chars',
   'remove_url',
   'remove_mentions',
   'remove_hashtags',
   'remove_new_lines',
   'keep_only_alphabet')
[INFO] Then, these tokenized preprocessing methods will be applied to the data in order:
(  '__tokenize',
   'expand_contractions',
   'remove_english_stop_words',
   'lemmatize_noun',
   '__glue')
[INFO] Preprocessing completed in 5.488 seconds..
Running with 4 topics


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

2022-07-20 11:22:06,279 - BERTopic - Reduced dimensionality
2022-07-20 11:22:06,297 - BERTopic - Clustered reduced embeddings


In [6]:
# Show the per-topic table assigned by the BERTopic run cell above; a bare
# last expression uses the notebook's rich DataFrame display.
df_output_topic_word

Unnamed: 0,run_id,method,method_specific_params,dataset,num_given_topics,reduced,topic_num,topic_size,topic_words,word_scores,num_detected_topics,num_final_topics,duration_secs,diversity_unique,diversity_inv_rbo,coherence_npmi,coherence_v,rand_index
0,1658308911,bertopic,"{'embedding_model': 'all-MiniLM-L6-v2', 'top_n...",crisis_toy,4,False,0,201,"[flood, debbie, cyclone, rain, today, school, ...","[0.12487296520516175, 0.11498390801027375, 0.1...",3,3,15.202784,1.0,1.0,-0.05357,0.409122,0.853663
1,1658308911,bertopic,"{'embedding_model': 'all-MiniLM-L6-v2', 'top_n...",crisis_toy,4,False,1,101,"[earthquake, felt, feel, small, twitter, sf, t...","[0.5103614735047877, 0.1515561989210337, 0.069...",3,3,15.202784,1.0,1.0,-0.05357,0.409122,0.853663
2,1658308911,bertopic,"{'embedding_model': 'all-MiniLM-L6-v2', 'top_n...",crisis_toy,4,False,2,94,"[smoke, wildfire, fire, smell, california, lik...","[0.21855425061383502, 0.18618706849685615, 0.0...",3,3,15.202784,1.0,1.0,-0.05357,0.409122,0.853663


## Parametric Run: LDA-BERT

In [None]:
# Experiment definition for the LDA-BERT run on the `crisis_toy` dataset.
# Note that this rebinds `configs`, replacing the BERTopic definition above.
configs = dict(
    dataset='crisis_toy',
    # Names of the preprocessing steps handed to the preprocessor.
    preprocessing_funcs=[
        'to_lowercase',
        'standardize_accented_chars',
        'remove_url',
        'expand_contractions',
        'expand_missing_delimiter',
        'remove_mentions',
        'remove_hashtags',
        'remove_new_lines',
        'keep_only_alphabet',
        # 'remove_extra_spaces' is intentionally left disabled.
        'remove_english_stop_words',
        'lemmatize_noun',
        'correct_typo',
    ],
    algorithm='lda-bert',
    algorithm_args=dict(
        embedding_model="all-MiniLM-L6-v2",
        number_topics=3,
        top_n_words=10,
        gamma=15,
        random_state=42,
    ),
)

In [None]:
from src import preprocessor
from src.evaluator import compute_topic_scores
from src.utils import load_documents

# Load the raw documents and their ground-truth labels for the configured dataset.
docs, labels = load_documents(dataset=configs['dataset'])

# Preprocessing is optional: it only runs when the config lists functions.
if 'preprocessing_funcs' in configs:
    docs = preprocessor.run(data=docs, prep_functions=configs['preprocessing_funcs'])

# Build the trainer parameters as a NEW dict. Updating
# configs['algorithm_args'] in place would inject the whole corpus and the
# labels into the config defined in the previous cell.
algorithm_args = {
    **configs['algorithm_args'],
    'data_name': configs['dataset'],
    'docs': docs,
    'labels': labels,
}
print(f'Running with {algorithm_args["number_topics"]} topics')

if configs['algorithm'] == 'lda-bert':
    # Encode the documents once up front and hand the embeddings to the trainer.
    # The encoder gets its own name so that `model` only ever refers to the
    # trained topic model returned by trainer.train().
    encoder = SentenceTransformer(algorithm_args['embedding_model'])
    embeddings = encoder.encode(docs, show_progress_bar=True)

    trainer = Trainer(dataset = configs['dataset'],
                      model_name = configs['algorithm'],
                      params = algorithm_args,
                      topk = algorithm_args["top_n_words"],
                      bt_embeddings = embeddings,
                      )

    model, df_output_doc_topic, df_output_topic_word = trainer.train()

    # Replace the per-topic table with the result of compute_topic_scores.
    df_output_topic_word = compute_topic_scores(df_output_doc_topic, df_output_topic_word)

In [19]:
# Show the per-topic table left by the most recently executed run cell
# (the LDA-BERT run when the notebook is executed top to bottom).
df_output_topic_word

Unnamed: 0,run_id,method,method_specific_params,dataset,num_given_topics,reduced,topic_num,topic_size,topic_words,word_scores,num_detected_topics,num_final_topics,duration_secs
0,1658231276,lda-bert,"{'embedding_model': 'all-MiniLM-L6-v2', 'numbe...",crisis_toy,3,False,0,181,"[debbie, cyclone, smoke, wildfire, earthquake,...","[0.041499330655957165, 0.038821954484605084, 0...",3,3,9.363553
1,1658231276,lda-bert,"{'embedding_model': 'all-MiniLM-L6-v2', 'numbe...",crisis_toy,3,False,1,102,"[flood, cyclone, debbie, rain, smoke, work, fl...","[0.06338028169014084, 0.01643192488262911, 0.0...",3,3,9.363553
2,1658231276,lda-bert,"{'embedding_model': 'all-MiniLM-L6-v2', 'numbe...",crisis_toy,3,False,2,113,"[earthquake, flood, fire, felt, smoke, califor...","[0.06869220607661823, 0.034346103038309116, 0....",3,3,9.363553


In [20]:
# Show the per-document table (document text, real label, assigned topic and
# assignment score) left by the most recently executed run cell.
df_output_doc_topic

Unnamed: 0,run_id,Document ID,Document,Real Label,Assigned Topic Num,Assignment Score
0,1658231276,0,thereformedcrow nah going go earthquake,earthquake,2,1
1,1658231276,1,think earthquake,earthquake,2,1
2,1658231276,2,uhh else felt earthquake though,earthquake,2,1
3,1658231276,3,bay area nice size earthquake,earthquake,0,1
4,1658231276,4,thought dad farting turn earthquake,earthquake,2,1
...,...,...,...,...,...,...
391,1658231276,391,people keep asking good safe live cyclone debb...,hurricane,0,1
392,1658231276,392,ayyeeee work got cancelled flood thank cyclone...,hurricane,1,1
393,1658231276,393,jetstarairways helpful need change flight due ...,hurricane,0,1
394,1658231276,394,getting hit ex tropical cyclone named debbie r...,hurricane,0,1
