## 0. Code Setup and Helper Functions

### 0.1 Path Adjustments

In [1]:
import os
import sys

# The notebook lives in <repo>/notebooks but imports the `src` package from the
# repository root. Move to the root *before* validating, so this cell gives the
# same result on the first run and on every re-run in the same kernel.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

# Checked after the chdir: at this point `src` must sit directly under the
# working directory, otherwise the `from src import ...` cells cannot work.
assert os.path.exists('./src'), f"[ERROR] The path src not detected in the current directory '{os.getcwd()}'."

# Register ./src once only; re-running the cell must not pile up duplicates.
if './src' not in sys.path:
    sys.path.append('./src')

print(f'[INFO] Current Directory: "{os.getcwd()}".')

[INFO] Current Directory: "/home/iceking/Desktop/22SS-TUM Lecture Docs/3. Praktikum (IN2106, IN4249)/Repo/topic-modeling-advancements".


### 0.2 Import all necessary packages

In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from time import time
from src import preprocessor
from src.bertopic_runner import BertopicTrainer
from src.utils import load_documents

2022-08-22 21:32:51.448700: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-22 21:32:51.448725: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## 1. Train

In [3]:
# Experiment configuration read by the training cell below.
configs = {
    # Dataset identifier passed to `load_documents`.
    'dataset': 'ag_news_long',

    # Names of the preprocessing steps passed to `preprocessor.run`.
    # Currently disabled steps: 'remove_extra_spaces', 'correct_typo'.
    'preprocessing_funcs': [
        'to_lowercase',
        'standardize_accented_chars',
        'remove_url',
        'expand_contractions',
        'expand_missing_delimiter',
        'remove_mentions',
        'remove_hashtags',
        'remove_new_lines',
        'keep_only_alphabet',
        'remove_english_stop_words',
        'lemmatize_noun',
    ],

    # Which branch of the training cell to execute.
    'algorithm': 'nmf',

    # Keyword-style arguments forwarded to the runner as `args`.
    # Disabled alternatives kept for reference:
    #   'learning_rate': 0.002
    #   'batch_size': 64
    #   'embedding_model': "paraphrase-multilingual-MiniLM-L12-v2"
    'algorithm_args': {
        'embedding_model': "all-MiniLM-L6-v2",
        'num_epochs': 100,
        'num_topics': 4,
        'top_n_words': 10,
        'gamma': 10,
        'random_state': 42,
        'alpha': 0.1,
    },
}

In [None]:
# Root folder under which each run gets its own `<run_id>_<algorithm>` directory.
OUTPUT_FOLDER = './output'

# Load the corpus with its labels, then apply the configured preprocessing
# steps (skipped entirely when the config has no 'preprocessing_funcs' key).
docs, labels = load_documents(dataset=configs['dataset'])
if 'preprocessing_funcs' in configs:
    docs = preprocessor.run(data=docs, prep_functions=configs['preprocessing_funcs'])

# Build the runner arguments on a copy. Updating configs['algorithm_args'] in
# place would inject the whole corpus and labels into `configs`, so the config
# cell's displayed value would no longer match what is in memory.
algorithm_args = dict(
    configs['algorithm_args'],
    data_name=configs['dataset'],
    docs=docs,
    labels=labels,
)
print(f'Running with {algorithm_args["num_topics"]} topics')

algorithm_name = configs['algorithm']
if algorithm_name == 'nmf':
    # The Unix timestamp doubles as run identifier and output-folder prefix.
    run_id = int(time())
    output_folder = f'{OUTPUT_FOLDER}/{run_id}_{algorithm_name}'
    os.makedirs(output_folder, exist_ok=True)

    from src import LDA_NMF_CTM_runner
    df_output_doc_topic, df_output_topic_word = LDA_NMF_CTM_runner.runner(
        args=algorithm_args, run_id=run_id, output_folder=output_folder, model_name=algorithm_name)

    # No model object is kept for this algorithm; `model` is only referenced by
    # the commented-out visualisation calls further down.
    model = None

    from src.evaluator import compute_topic_scores
    df_output_topic_word = compute_topic_scores(df_output_doc_topic, df_output_topic_word)
else:
    # Fail loudly: without this, the cells below would hit a NameError or, worse,
    # silently display frames left over from an earlier run in the same kernel.
    raise ValueError(
        f"Unsupported algorithm {algorithm_name!r}: this notebook only handles 'nmf'."
    )

[INFO] [PREPROCESSOR] Available Preprocessing Functions in the Module:['to_lowercase', 'standardize_accented_chars', 'remove_url', 'expand_missing_delimiter', 'remove_mentions', 'remove_hashtags', 'keep_only_alphabet', 'remove_new_lines', 'remove_extra_spaces', 'remove_html_tags', 'expand_contractions', 'remove_english_stop_words', 'lemmatize', 'lemmatize_verb', 'lemmatize_noun', 'lemmatize_adjective', 'correct_typo']
[INFO] [PREPROCESSOR] Preprocessing starting..
[INFO] [PREPROCESSOR] These string preprocessing methods will be applied to the data in order:
(  'to_lowercase',
   'standardize_accented_chars',
   'remove_url',
   'expand_missing_delimiter',
   'remove_mentions',
   'remove_hashtags',
   'remove_new_lines',
   'keep_only_alphabet')
[INFO] [PREPROCESSOR] Then, these tokenized preprocessing methods will be applied in order:
(  '__tokenize',
   'expand_contractions',
   'remove_english_stop_words',
   'lemmatize_noun',
   '__glue')
[INFO] [PREPROCESSOR] Preprocessing complet

## 2. Display Outputs

In [None]:
# Document/topic table returned by `LDA_NMF_CTM_runner.runner` in the training cell.
df_output_doc_topic

In [None]:
# Show the topic/word table (as returned by `compute_topic_scores`) with the
# column width raised to 100 characters; the option only applies inside the
# `with` block.
with pd.option_context('display.max_colwidth', 100):
    display(df_output_topic_word)

In [None]:
# For each topic, in ascending topic-number order, show the five documents with
# the highest assignment score.
for topic_num in sorted(df_output_topic_word['topic_num'].to_list()):
    top_docs = (
        df_output_doc_topic
        .query(f'`Assigned Topic Num` == {topic_num}')
        .sort_values(by='Assignment Score', ascending=False)
        .drop('run_id', axis=1)
        .head()
    )
    # max_colwidth=None turns off truncation so cell contents are shown in full.
    with pd.option_context('display.max_colwidth', None):
        display(top_docs)

## 3. Visualization

### 3.1 Visualization - UMAP 2D Scatter Plot

In [None]:
# from src import visualizer

# umap2d_scatter_plot = visualizer.draw_umap2d_scatter_plot(model, df_output_topic_word,df_output_doc_topic,target_dir='.')

### 3.2 Visualization - Topic Words Bar Chart

In [None]:
# Topic/word table that feeds the bar chart in the next cell.
df_output_topic_word

In [None]:
from src import visualizer

# Build the top-words bar chart from the topic/word table and show it as the
# cell output.
# NOTE(review): `target_dir` presumably is where the figure gets written — confirm in src/visualizer.
top_words_barchart = visualizer.visualize_top_words_barchart(
    df_output_topic_word=df_output_topic_word,
    n_words=5,
    target_dir='./',
    top_n_topics=10,
)
top_words_barchart

### 3.3 Visualization - Labels per Topic

In [None]:
from src import visualizer

# Labels-per-topic figure built from both output tables, shown as the cell output.
# Optional arguments left at their defaults here: top_n_labels=5, top_n_topics=4.
labels_per_topic = visualizer.visualize_labels_per_topic(
    df_output_doc_topic=df_output_doc_topic,
    df_output_topic_word=df_output_topic_word,
)
labels_per_topic

### 3.4 Visualization - Topic Similarity Matrix

In [None]:
import random

In [None]:
# Draws 5 distinct integers from 1..19 (sampling without replacement). No seed is
# set, so the output differs from run to run.
# NOTE(review): the result is not assigned and no visible cell uses it — looks
# like leftover scratch work; confirm before removing.
random.sample(list(range(1,20)), 5)

In [None]:
from src import visualizer

# visualizer.visualize_topic_similarity_matrix(
#    model=model,
#    df_output_doc_topic=df_output_doc_topic,
#    df_output_topic_word=df_output_topic_word,
#    topics= None,
#    top_n_topics = None,
#    n_clusters = None, # Unknown for now
#    width = 1000,
#    height = 1000
#)

### 3.5 Visualization - Representative Docs

In [None]:
from src.visualizer import draw_representative_docs

# Build the representative-documents view (top_n_docs=3) from the document/topic
# table and show it as the cell output.
df_style = draw_representative_docs(
    df_output_doc_topic,
    top_n_docs=3,
)
df_style

# TODO: export the rendered table to a file, e.g.:
# import dataframe_image as dfi
# dfi.export(df_style, 'successful_test.png')