In [None]:
from pathlib import Path
import os
import sys
# Make the parent of the working directory importable (presumably where the
# project-local `utils` package lives -- confirm against the repo layout).
# The entry is appended so it is always the LAST element of sys.path no matter
# how many entries the interpreter already has; inserting at a fixed index only
# lands at the end when sys.path is short. The trailing '/' is kept so that a
# relative path can be string-concatenated onto this entry.
sys.path.append(str(Path(os.getcwd()).resolve().parents[0]) + '/')

# Importing libraries
import numpy as np
import pickle
import pandas as pd
from tqdm import tqdm
import tomotopy as tp
from utils.data_selection import DocumentSelection
from utils.preprocesslib import Preprocess
from utils.metrics import calculate_perplexity, calculate_coherence
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the data for the Reuters or Wikipedia dataset.
# The project root is derived directly from the working directory rather than
# read back from sys.path[-1]: that entry is only the project root while it
# happens to be the last element of sys.path.
PROJECT_ROOT = Path(os.getcwd()).resolve().parents[0]
FOLDER_PATH = str(PROJECT_ROOT / 'dataset' / 'reuters.csv')
# FOLDER_PATH = str(PROJECT_ROOT / 'dataset' / 'wiki.csv')

# Only the 'texts' column is used; `.values` turns it into a NumPy array.
reference_documents = pd.read_csv(FOLDER_PATH)['texts'].values

In [None]:
# Build the document selector over the raw reference texts, then run
# `document_selector_hyper()` on it (implemented in utils.data_selection).
# NOTE(review): name='Reuters' is hard-coded; update it if the Wikipedia CSV is
# loaded instead of the Reuters one.
# NOTE(review): this instance is bound to a CamelCase name that later cells use
# verbatim, so it cannot be renamed in this cell alone.
DocumentSelector = DocumentSelection(reference_documents, name='Reuters')
DocumentSelector.document_selector_hyper()

In [None]:
# Template placeholder: replace the string below with the actual threshold
# before running this cell.
# NOTE(review): the expected type/range of `threshold` is defined by
# DocumentSelection.select_hyper_document -- presumably numeric; confirm.
DocumentSelector.select_hyper_document(threshold="PUT_THRESHOLD_HERE")

In [None]:
# Wrap the selector's `selected_doc_hyper` attribute in a one-column DataFrame;
# the column is named 'texts', matching the column read from the dataset CSV.
selected_documents_hyper = pd.DataFrame(data={'texts': DocumentSelector.selected_doc_hyper})

## 2. Data Cleaning

In [None]:
# Turn the full reference documents and the selected subset into the two
# corpora used for training/scoring below.
# NOTE(review): the two inputs have different types here -- `reference_documents`
# is a NumPy array (from `.values`) while `selected_documents_hyper` is a
# DataFrame; confirm that make_training_corpora expects exactly this.
preprocess = Preprocess()
(reference_corpus, our_model_selected_corpus) = preprocess.make_training_corpora(reference_documents, selected_documents_hyper, lemma_model='efficient')

## 3. Training Topic Model

In [None]:
# Template placeholder: set `n` to an integer (largest topic count to try)
# before running the training cells.
n = "CHOOSE_MAXIMUM_NUMBER_OF_TOPICS"

def topic_model_compare(topic_reference_corpus, model_selected_corpus, max_topics=None):
    """Train one LDA model per topic count on the selected corpus and score it.

    For k = 1..max_topics an LDA model (seed 100, 1000 training iterations) is
    fitted on ``model_selected_corpus``. Perplexity is obtained from
    ``calculate_perplexity`` on that same corpus; coherence is obtained from
    ``calculate_coherence``, which is handed ``topic_reference_corpus``.

    Args:
        topic_reference_corpus: Corpus forwarded to ``calculate_coherence``.
        model_selected_corpus: Corpus the models are trained on and whose
            perplexity is measured.
        max_topics: Largest topic count to try. When omitted, the module-level
            ``n`` is used, which keeps existing two-argument calls working.

    Returns:
        dict with the keys 'Topics', 'Perplexity' and 'c_v'; each value is a
        list holding one entry per topic count, in increasing order of k.

    Raises:
        TypeError: If the topic limit is still a string (i.e. the placeholder
            was never replaced).
    """
    if max_topics is None:
        max_topics = n
    if isinstance(max_topics, str):
        # Fail with an actionable message instead of an obscure error from
        # tqdm/range when the template placeholder was left in place.
        raise TypeError(
            "maximum number of topics must be an integer, got %r; "
            "set `n` or pass `max_topics`" % (max_topics,)
        )

    presets = ['c_v']
    model_results = {'Topics'    : [],
                     'Perplexity': [],
                     'c_v'       : []
                    }
    # The context manager closes the bar even if training or scoring raises.
    with tqdm(total=max_topics * len(presets), file=sys.stdout, ascii=' >=') as pbar:
        for i in range(1, max_topics + 1):
            mdl = tp.LDAModel(k=i, seed=100, corpus=model_selected_corpus)
            mdl.train(1000)
            perplexity_score = calculate_perplexity(model=mdl, corpus=model_selected_corpus)
            model_results['Topics'].append(i)
            model_results['Perplexity'].append(perplexity_score)

            for preset in presets:
                average_coherence_score = calculate_coherence(
                    model=mdl,
                    topic_reference_corpus=topic_reference_corpus,
                    preset=preset,
                )
                model_results[preset].append(average_coherence_score)
                # One tick per (topic count, coherence preset) pair.
                pbar.update(1)
    return model_results

def topic_model(topic_reference_corpus, max_topics=None):
    """Train one LDA model per topic count on the reference corpus and score it.

    For k = 1..max_topics an LDA model (seed 100, 1000 training iterations) is
    fitted on ``topic_reference_corpus``. Perplexity is obtained from
    ``calculate_perplexity`` on that same corpus; coherence comes from
    ``tp.coherence.Coherence`` applied to the trained model with ``top_n=10``.

    Args:
        topic_reference_corpus: Corpus the models are trained on and whose
            perplexity is measured.
        max_topics: Largest topic count to try. When omitted, the module-level
            ``n`` is used, which keeps existing one-argument calls working.

    Returns:
        dict with the keys 'Topics', 'Perplexity' and 'c_v'; each value is a
        list holding one entry per topic count, in increasing order of k.

    Raises:
        TypeError: If the topic limit is still a string (i.e. the placeholder
            was never replaced).
    """
    if max_topics is None:
        max_topics = n
    if isinstance(max_topics, str):
        # Fail with an actionable message instead of an obscure error from
        # tqdm/range when the template placeholder was left in place.
        raise TypeError(
            "maximum number of topics must be an integer, got %r; "
            "set `n` or pass `max_topics`" % (max_topics,)
        )

    presets = ['c_v']
    model_results = {'Topics'    : [],
                     'Perplexity': [],
                     'c_v'       : []
                    }
    # The context manager closes the bar even if training or scoring raises.
    with tqdm(total=max_topics * len(presets), file=sys.stdout, ascii=' >=') as pbar:
        for i in range(1, max_topics + 1):
            mdl = tp.LDAModel(k=i, seed=100, corpus=topic_reference_corpus)
            mdl.train(1000)

            perplexity_score = calculate_perplexity(model=mdl, corpus=topic_reference_corpus)
            model_results['Topics'].append(i)
            model_results['Perplexity'].append(perplexity_score)

            for preset in presets:
                average_coherence_score = tp.coherence.Coherence(
                    mdl, coherence=preset, top_n=10
                ).get_score()
                model_results[preset].append(average_coherence_score)
                # One tick per (topic count, coherence preset) pair.
                pbar.update(1)
    return model_results

In [None]:
# Sweep topic counts twice:
#  - models trained on the selected subset, with coherence scored through
#    calculate_coherence using the reference corpus;
#  - models trained on the full reference corpus, with coherence scored through
#    tp.coherence.Coherence on the model itself.
# Both calls train a 1000-iteration model for every topic count up to `n`.
model_results_hyper = topic_model_compare(reference_corpus, our_model_selected_corpus)
model_results_reference = topic_model(reference_corpus)

## 4. If you want to save the results (uncomment the code)

In [None]:
# with open('model_results/PUT_FILENAME_HERE.pkl', 'wb') as handle:
#     pickle.dump(model_results_reference, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# with open('model_results/PUT_FILENAME_HERE.pkl', 'wb') as handle:
#     pickle.dump(model_results_hyper, handle, protocol=pickle.HIGHEST_PROTOCOL)