# Code preparation

In [1]:
%load_ext autoreload
%autoreload 2

import os

import nltk
import pandas as pd
# Fetch the NLTK data packages this notebook relies on. Per the recorded output,
# a package that is already up to date is simply reported as such.
for _nltk_package in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_package)
# Kept as the cell's final expression so its return value is still echoed as the cell output.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/iceking/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/iceking/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/iceking/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/iceking/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# When the kernel is started from the `notebooks` folder, step up one level so
# that the `src` directory checked below (and imported by later cells) is in the
# current directory.
# Comparing the last path component, rather than a '/notebooks' suffix, keeps
# the check independent of the platform's path separator.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

# Fail fast with a readable message if `src` is still not visible from here.
assert os.path.exists('./src'), f"[ERROR] The path src not detected in the current directory '{os.getcwd()}'."

print(f'[INFO] Current Directory: "{os.getcwd()}".')

[INFO] Current Directory: "/home/iceking/Desktop/22SS-TUM Lecture Docs/3. Praktikum (IN2106, IN4249)/Repo/topic-modeling-advancements".


# Training

In [345]:
# Experiment configuration read by the training cell.

# Preprocessing steps, referenced by function name.
# NOTE: the recorded preprocessor log shows string-level steps running before
# token-level ones, so e.g. 'expand_contractions' executes after
# 'keep_only_alphabet' despite its position in this list.
_preprocessing_funcs = [
    'to_lowercase',
    'standardize_accented_chars',
    'remove_url',
    'expand_contractions',
    'remove_mentions',
    'remove_hashtags',
    # 'remove_new_lines',
    'keep_only_alphabet',
    # 'remove_extra_spaces',
    'remove_english_stop_words',
    'lemmatize_noun',
]

# Arguments forwarded to the selected algorithm's runner.
_algorithm_args = dict(
    num_topics=4,
    # data_save_dir='./OCTIS_data_tmp',   # Specific for LDA/NMF
    random_state=42,
    # Other embedding models left commented out by the author:
    #   'bert-base-nli-mean-tokens'
    #   'doc2vec'
    #   'universal-sentence-encoder'
    #   'universal-sentence-encoder-large'   (WORKS VERY WELL)
    #   'distiluse-base-multilingual-cased'
    embedding_model='all-mpnet-base-v2',
    num_epochs=100,
    learning_rate=2e-3,
    batch_size=64,
)

configs = dict(
    dataset='crisis_12',
    preprocessing_funcs=_preprocessing_funcs,
    algorithm='ctm',
    algorithm_args=_algorithm_args,
)

In [346]:
from src import preprocessor
from src.utils import load_documents

# Load the documents and their labels for the configured dataset.
docs, labels = load_documents(dataset=configs['dataset'])
# Preprocessing is optional: it only runs when the config lists functions.
if 'preprocessing_funcs' in configs:
    docs = preprocessor.run(data=docs, prep_functions=configs['preprocessing_funcs'])

# Build the runner arguments on a copy. Updating configs['algorithm_args'] in
# place would inject the documents and labels into the config itself, so
# `configs` would no longer be a plain description of the experiment.
algorithm_args = {
    **configs['algorithm_args'],
    'data_name': configs['dataset'],
    'docs': docs,
    'labels': labels,
}

algorithm_name = configs['algorithm'].lower()
if algorithm_name in {'lda', 'nmf', 'ctm'}:
    if algorithm_name == 'ctm':
        print('[WARN] CTM is experimental and does not guarantee reproducibility. Please use with caution!')
    from src import LDA_NMF_CTM_runner
    df_output_doc_topic, df_output_topic_word = LDA_NMF_CTM_runner.runner(args=algorithm_args, model_name=algorithm_name)
else:
    # Without this, an unknown algorithm is skipped silently and the cells below
    # either raise NameError or display frames left over from an earlier run.
    raise ValueError(f"[ERROR] Unsupported algorithm '{algorithm_name}'. Expected one of: lda, nmf, ctm.")


[INFO] [PREPROCESSOR] Available Preprocessing Functions in the Module:['to_lowercase', 'standardize_accented_chars', 'remove_url', 'expand_missing_delimiter', 'remove_mentions', 'remove_hashtags', 'keep_only_alphabet', 'remove_new_lines', 'remove_extra_spaces', 'remove_html_tags', 'expand_contractions', 'remove_english_stop_words', 'lemmatize', 'lemmatize_verb', 'lemmatize_noun', 'lemmatize_adjective', 'correct_typo']
[INFO] [PREPROCESSOR] Preprocessing starting..
[INFO] [PREPROCESSOR] These string preprocessing methods will be applied to the data in order:
(  'to_lowercase',
   'standardize_accented_chars',
   'remove_url',
   'remove_mentions',
   'remove_hashtags',
   'keep_only_alphabet')
[INFO] [PREPROCESSOR] Then, these tokenized preprocessing methods will be applied in order:
(  '__tokenize',
   'expand_contractions',
   'remove_english_stop_words',
   'lemmatize_noun',
   '__glue')
[INFO] [PREPROCESSOR] Preprocessing completed in 2.686 seconds..
[WARN] CTM is experimental and does not guarantee reproducibility. Please use with caution!

Batches:   0%|          | 0/64 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

[INFO] Model trained successfully!


In [347]:
# See doc_topic_matrix: one row per document with its real label,
# assigned topic number and assignment score.
df_output_doc_topic

Unnamed: 0,run_id,Document ID,Document,Real Label,Assigned Topic Num,Assignment Score
0,1658505700,0,nah going to go earthquake,earthquake,3,0.452579
1,1658505700,1,uhh else felt earthquake though,earthquake,3,0.678245
2,1658505700,2,bay area nice size earthquake,earthquake,0,0.287449
3,1658505700,3,thought dad turn earthquake,earthquake,3,0.457613
4,1658505700,4,earthquake,earthquake,3,0.628831
...,...,...,...,...,...,...
6395,1658505700,6395,due weather ex tropical cyclone debbie pool cl...,hurricane,2,0.771200
6396,1658505700,6396,wet weather plan today,hurricane,1,0.705613
6397,1658505700,6397,folk rain south east queensland coming ex cycl...,hurricane,2,0.798299
6398,1658505700,6398,man put weather low blow ex tropical cyclone d...,hurricane,2,0.704594


In [348]:
# See Topic_Word_matrix: one row per topic with its size, topic words,
# word scores and run metadata.
df_output_topic_word

Unnamed: 0,run_id,method,method_specific_params,dataset,num_given_topics,reduced,topic_num,topic_size,topic_words,word_scores,num_detected_topics,num_final_topics,duration_secs
0,1658505700,ctm,,crisis_12,4,False,0,1436,"[fort, canada, wildfire, wine, california, fir...","[0.1191793, 0.11775019, 0.11415977, 0.10807861...",4,4,134.85
1,1658505700,ctm,,crisis_12,4,False,1,1718,"[flood, flash, rain, warning, heavy, flooding,...","[0.20493537, 0.1441246, 0.12001129, 0.09863417...",4,4,134.85
2,1658505700,ctm,,crisis_12,4,False,2,1277,"[cyclone, debbie, queensland, ex, australia, s...","[0.22596043, 0.18861528, 0.12113283, 0.0951154...",4,4,134.85
3,1658505700,ctm,,crisis_12,4,False,3,1969,"[eu, reaction, radar, shot, otherwise, nuclear...","[0.15293045, 0.11327045, 0.09993666, 0.0985998...",4,4,134.85


# Visualization

In [349]:
from src import visualizer

# Plot the labels found in each topic, for as many topics as the model was
# configured with.
n_topics_to_show = algorithm_args['num_topics']
visualizer.visualize_labels_per_topic(
    df_output_doc_topic=df_output_doc_topic,
    df_output_topic_word=df_output_topic_word,
    top_n_topics=n_topics_to_show,
)

In [350]:
# Bar chart of the topic words, capped at 5 words per topic (n_words).
visualizer.visualize_barchart(
    df_output_topic_word=df_output_topic_word,
    n_words=5,
)

In [None]:
# First Documents: for every topic, show the five documents with the highest
# assignment score.
# `pd` comes from the notebook's import cell; `display` is provided by the
# IPython kernel.
# max_colwidth=None keeps the full document text from being truncated.
with pd.option_context('display.max_colwidth', None):
    for topic_num in sorted(df_output_topic_word['topic_num'].to_list()):
        top_docs = (
            df_output_doc_topic
            .query(f'`Assigned Topic Num` == {topic_num}')
            .sort_values(by='Assignment Score', ascending=False)
            .drop('run_id', axis=1)
            .head()
        )
        display(top_docs)

In [None]:
from gensim.corpora import Dictionary

# Small hand-written corpus used to experiment with gensim's Dictionary.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Tokenize each sentence with NLTK, then build the vocabulary from the token lists.
dataset = [nltk.word_tokenize(sentence) for sentence in documents]
vocab = Dictionary(documents=dataset, prune_at=20000)

In [None]:
# Shrink the vocabulary in place. Per gensim's documented semantics, keep_n=10
# keeps only the 10 most frequent tokens, while no_below=1 / no_above=1 leave
# the document-frequency filters effectively disabled.
vocab.filter_extremes(no_below=1, no_above=1, keep_n=10) 

In [None]:
# Collect the vocabulary tokens (the keys of token2id) into a plain list and show it.
word_list = list(vocab.token2id.keys())
word_list

In [None]:
# Then vocabulary.txt is obtained using the function we have

In [None]:
# Rewrite every document in place, keeping only the tokens present in word_list.
for doc_idx, sentence in enumerate(documents):
    kept_tokens = [token for token in nltk.word_tokenize(sentence) if token in word_list]
    documents[doc_idx] = ' '.join(kept_tokens).strip()

In [None]:
# Inspect `documents` after the in-place vocabulary filtering above.
documents

In [None]:
# Token -> integer id mapping of the vocabulary.
vocab.token2id

In [None]:
# Id -> token mapping.
# NOTE(review): gensim fills id2token lazily, so this may display {} until the
# dictionary has been indexed (e.g. vocab[0]) — confirm against the installed version.
vocab.id2token

In [None]:
# Collection frequencies: token id -> number of occurrences across the corpus
# (per gensim's Dictionary.cfs documentation).
vocab.cfs

In [None]:
# Second in-place filtering pass: drop tokens that appear in fewer than 2 documents.
# NOTE(review): no_above and keep_n are not passed, so gensim's defaults apply
# (documented as 0.5 and 100000) — confirm this is intended.
vocab.filter_extremes(no_below=2)

# Archive

In [None]:
# Deliberate stop: appears intended to halt "Run All" here so that the archived
# cells below are not executed.
assert False

In [None]:
# In Archive
# Legacy LDA/NMF entry point, kept for reference only.
# Imported locally so this archived cell is self-contained: `importlib` is not
# imported anywhere else in the notebook, so the reload below raised NameError.
import importlib

from src import LDA_NMF_runner
importlib.reload(LDA_NMF_runner)

# Please choose the entries from the following sets:

# model: ['LDA', 'NMF']
# dataset: ['crisis_12', '20news']

# Other parameters
# top_n_topics: int (This decides how many topics are shown)
# n_words: int (This decides how many words are shown)

settings = {'model': 'NMF',
            'dataset': 'crisis_12',
            'top_n_topics': 5,
            'n_words': 5}

# NOTE(review): absolute path that looks like a Google Colab Drive mount; it
# will not exist on a local machine — confirm before reviving this cell.
save_dir_crisis_12 = '/content/drive/MyDrive/SS_2022_Praktikum/Crisis Dataset/Dataset_12'

Doc_Topic_df, Topic_Word_df = LDA_NMF_runner.runner(model_name = settings['model'],
                                                    dataset_name = settings['dataset'],
                                                    top_n_topics = settings['top_n_topics'],
                                                    n_words = settings['n_words'],
                                                    save_dir = save_dir_crisis_12,
                                                    random_state = 100)