In [None]:
import pandas as pd

from nltk import word_tokenize

import gensim
from gensim import corpora, models
from gensim.models import CoherenceModel
from gensim.models.nmf import Nmf
from gensim.models.lsimodel import LsiModel
from gensim.models.ldamodel import LdaModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
%matplotlib inline

In [None]:
# Load the pre-cleaned frame and turn its index into a regular column.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
df = pd.read_pickle('tesla_clean.pkl').reset_index()

In [None]:
# Pull the cleaned discussion column out of the frame as a plain Python list
clean_docs = df['Discussion_Clean'].tolist()

In [None]:
# Tokenize every cleaned document; each entry of clean_docs is passed whole to word_tokenize
tokenized_docs = list(map(word_tokenize, clean_docs))

In [None]:
# Build the gensim Dictionary from the tokenized documents and show its summary
dictionary = corpora.Dictionary(tokenized_docs)
print(dictionary)

In [None]:
# Bag-of-words corpus: run every tokenized document through dictionary.doc2bow
bow_corpus = list(map(dictionary.doc2bow, tokenized_docs))

# Peek at the vector built for the first document
print(bow_corpus[0])

In [None]:
# Print the second document's bag-of-words entries in readable form.
# Each entry is a pair: its first item is looked up in `dictionary`, its second is the count.
doc = bow_corpus[1]
for token_id, count in doc:
    print (f"Word {token_id} ({dictionary[token_id]}) appears {count} times")

In [None]:
# Number of topics requested from the LDA, LSI and NMF models built in the following cells
NUM_TOPICS = 6

In [None]:
# Fit an LDA model on the bag-of-words corpus (fixed random_state for repeatable runs)
lda_model = LdaModel(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    random_state=42,
)

In [None]:
# Print the topics reported by the LDA model, one per line
topics = lda_model.print_topics()
for topic_summary in topics:
    print(topic_summary)

In [None]:
def model_scoring(model, corpus, text, dictionary, perplex=False, coherence='c_v'):
    """
    Print evaluation scores for a fitted topic model.

    Parameters:
    ----------
    model : Fitted topic model; must provide log_perplexity() when perplex is True
    corpus : Corpus passed to model.log_perplexity() (only read when perplex is True)
    text : Tokenized texts handed to CoherenceModel
    dictionary : Gensim dictionary handed to CoherenceModel
    perplex : When True, also print the per-word log bound and the perplexity
    coherence : Name of the coherence measure forwarded to CoherenceModel

    Returns:
    -------
    None; the scores are printed.
    """
    if perplex:
        # gensim documents log_perplexity() as returning the per-word likelihood
        # bound, with perplexity = 2^(-bound). The bound itself is NOT a
        # "lower is better" number, so report the real perplexity alongside it.
        log_bound = model.log_perplexity(corpus)
        print('Per-word log bound: ', log_bound)
        print('Perplexity: ', 2 ** (-log_bound))  # lower is better

    # Compute Coherence Score
    coherence_model = CoherenceModel(model=model,
                                     texts=text,
                                     dictionary=dictionary,
                                     coherence=coherence)

    coherence_score = coherence_model.get_coherence()
    print('Coherence Score: ', coherence_score)

In [None]:
# Fit the LDA model, then report its perplexity and coherence
lda_model = LdaModel(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    random_state=42,
)

model_scoring(
    lda_model,
    bow_corpus,
    tokenized_docs,
    dictionary,
    perplex=True,
)

In [None]:
# Fit an LSI model with the same number of topics and report its coherence
lsi_model = LsiModel(
    corpus=bow_corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
)

model_scoring(
    lsi_model,
    bow_corpus,
    tokenized_docs,
    dictionary,
)

In [None]:
# Fit an NMF model with the same number of topics and report its coherence
nmf_model = Nmf(
    corpus=bow_corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    random_state=42,
)

model_scoring(
    nmf_model,
    bow_corpus,
    tokenized_docs,
    dictionary,
)

In [None]:
def compute_coherence_values(dictionary, corpus, texts, cohere, limit, start=2, step=3):
    """
    Train one LDA model per candidate number of topics and score each with a coherence measure.

    Parameters:
    ----------
    dictionary : Gensim dictionary, passed to LdaModel as id2word and to CoherenceModel
    corpus : Gensim corpus every LDA model is trained on
    texts : List of input texts handed to CoherenceModel
    cohere : Name of the coherence measure forwarded to CoherenceModel
    limit : Exclusive upper bound on the number of topics (used as the range() stop)
    start : First number of topics to try
    step : Increment between consecutive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models, one per number of topics tried
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    model_list, coherence_values = [], []

    for n_topics in range(start, limit, step):
        # Same seed for every candidate, so runs differ only in the topic count
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics, random_state=42)
        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence=cohere)
        score = scorer.get_coherence()

        model_list.append(lda)
        coherence_values.append(score)

    return model_list, coherence_values

In [None]:
# Sweep the number of topics from 2 up to (but not including) 20 and score each LDA model.
# cohere is one of {'u_mass', 'c_v', 'c_uci', 'c_npmi'}
model_list, coherence_values = compute_coherence_values(
    dictionary=dictionary,
    corpus=bow_corpus,
    texts=tokenized_docs,
    cohere='c_v',
    start=2,
    limit=20,
    step=1,
)

In [None]:
# Plot the coherence score obtained for each number of topics in the sweep
fig, ax = plt.subplots(figsize=(8,5))

# These must mirror the start/limit/step passed to compute_coherence_values
limit=20
start=2
step=1

x = list(range(start, limit, step))
# x and y go in by keyword: seaborn >= 0.12 rejects them as positional arguments
sns.lineplot(x=x, y=coherence_values, color='dimgray', ax=ax)

# Set titles and labels
ax.set_title("Best Number of Topics for LDA Model")
ax.set_xlabel("Num Topics")
ax.set_ylabel("Coherence score")


# Draw a custom legend; the label names the measure the sweep was run with (cohere='c_v')
legend_elements = [Line2D([0], [0], color='dimgray', ls='-', label='Coherence (c_v)')]
ax.legend(handles=legend_elements, loc='lower left')

fig.tight_layout()
fig.savefig('topic_coherence.png', dpi=300)

In [None]:
# Topic count for the LDA model built in the next cell
# (presumably picked from the coherence plot above; confirm)
NUM_TOPICS = 8

In [None]:
# Fit the LDA model with the updated NUM_TOPICS; this is the model visualised below
model = LdaModel(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    random_state=42,
)

In [None]:
# Switch pyLDAvis to notebook output
pyLDAvis.enable_notebook()

# Prepare the visualisation data from the LDA model, its corpus and its dictionary
lda_viz = gensimvis.prepare(
    model,
    bow_corpus,
    dictionary,
    sort_topics=False,
)

pyLDAvis.display(lda_viz)