In [None]:
import numpy as np
import pandas as pd

import gensim

import matplotlib.pyplot as plt

from datetime import datetime

from IPython.core.display import HTML
import tba3102



# Inject a CSS rule into the notebook page so that <pre> output is not
# soft-wrapped (wide text output then stays on a single row).
no_wrap_css = "<style>pre { white-space: pre !important; }</style>"
display(HTML(no_wrap_css))

# Project helper; presumably forwards these limits to pandas' display
# options -- TODO confirm against the tba3102 module.
tba3102.set_default_pandas_options(
    max_columns=16,
    width=1500,
    max_colwidth=300,
)

# Log the wall-clock start so it can be compared with the matching
# "ended at" message printed by the last cell.
start_time = datetime.now()
print('Text processing started at {}'.format(start_time))

In [None]:
# Load the cleaned dataset; the first CSV column becomes the DataFrame index.
df = pd.read_csv('../data/voted-kaggle-dataset-cleaned.csv', index_col=0)

# Turn every cleaned description into a list of tokens using the project helper.
# NOTE(review): assumes 'Cleaned_Description' has no missing values that the
# tokenizer cannot handle -- confirm against the cleaning step.
documents = list(map(tba3102.tokenize_sentence_to_words, df['Cleaned_Description']))

# Learn frequent two-word collocations and join them with '_'.
bigram = gensim.models.Phrases(
    documents,
    min_count=20,
    threshold=20,  # higher threshold fewer phrases.
    delimiter='_',
)
# Wrap the trained phrase detector in gensim's Phraser for applying it to documents.
bigram_model = gensim.models.phrases.Phraser(bigram)

# Re-express every document with detected bigrams merged into single tokens.
corpus_bigrams = [bigram_model[tokens] for tokens in documents]

# Build the vocabulary, then prune it with gensim's document-frequency
# bounds (no_below=20, no_above=0.6).
dictionary = gensim.corpora.Dictionary(corpus_bigrams)
dictionary.filter_extremes(no_below=20, no_above=0.6)

# Bag-of-words representation of every document.
bow_corpus = list(map(dictionary.doc2bow, corpus_bigrams))

In [None]:
# Topic-model families to evaluate, in the order they are processed.
MODEL_NAMES = [
    'LSI',
    'LDA',
]

# Inclusive lower and upper bounds of the topic-count sweep.
START_TOPIC_COUNT, END_TOPIC_COUNT = 2, 10

# Counter of models processed so far; starts at zero.
model_count = 0

In [None]:
# Compare topic-model families: for each name in MODEL_NAMES, sweep the topic
# count from START_TOPIC_COUNT to END_TOPIC_COUNT (inclusive), print the
# coherence scores ranked best-first, and draw one coherence curve per model.

# Set the default figure background BEFORE any figure is created; setting it
# after plt.figure() (as was done previously) leaves the first chart unaffected.
plt.rcParams['figure.facecolor'] = 'white'

# Candidate topic counts, shared by the summary table and the x-axis.
topic_counts = range(START_TOPIC_COUNT, END_TOPIC_COUNT + 1, 1)

# enumerate() supplies the figure number, so re-running this cell always
# reuses figures 0..len(MODEL_NAMES)-1 instead of depending on a counter
# mutated across runs.
for figure_number, MODEL_NAME in enumerate(MODEL_NAMES):

    # Project helper; presumably fits one model per topic count and returns
    # the fitted models together with one coherence score per count -- TODO
    # confirm in tba3102. cpus=-1 presumably means "use all cores".
    # `models` and `coherence_scores` are overwritten on every iteration, so
    # after the loop they hold the results of the last entry in MODEL_NAMES.
    models, coherence_scores = tba3102.topic_model_coherence_generator(model_name=MODEL_NAME,
                                                                       corpus=bow_corpus,
                                                                       texts=corpus_bigrams,
                                                                       dictionary=dictionary,
                                                                       start_topic_count=START_TOPIC_COUNT,
                                                                       end_topic_count=END_TOPIC_COUNT,
                                                                       step=1,
                                                                       cpus=-1,
                                                                       print_topics=True)

    coherence_df = pd.DataFrame({'Number of Topics': topic_counts,
                                 'Coherence Score': np.round(coherence_scores, 4)})

    # The ranking used to be computed and thrown away (sort_values returns a
    # new frame); keep the result and print it so the best topic counts come first.
    ranked_coherence_df = coherence_df.sort_values(by=['Coherence Score'], ascending=False).head(10)
    print('{} coherence scores (best first):'.format(MODEL_NAME))
    print(ranked_coherence_df)

    plt.figure(num=figure_number, figsize=(15, 12))
    plt.plot(topic_counts, coherence_scores, c='r')
    # Dashed horizontal reference lines at fixed coherence levels.
    for reference_level in (0.3, 0.4, 0.5):
        plt.axhline(y=reference_level, c='k', linestyle='--', linewidth=2)
    # Title each chart with its model so the figures can be told apart.
    plt.title('{} coherence score by number of topics'.format(MODEL_NAME))
    plt.xlabel('Number of Topics')
    plt.ylabel('Coherence Score')

# Keep the module-level counter consistent with a single full pass, without
# accumulating across re-runs of this cell.
model_count = len(MODEL_NAMES)

plt.show()

In [None]:
# Log the wall-clock finish time; pairs with the "started at" message
# printed by the first cell.
end_time = datetime.now()
print('Text processing ended at {}'.format(end_time))