In [None]:
#pip install -U gensim

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

Perform some basic data pre-processing. Note that the data has largely been cleaned beforehand.

In [None]:
#documents = pd.read_csv("")
#original = documents
#documents = documents[['Created time','Permalink URL','Message','Tag']]
#documents = documents.dropna()
#documents['Message'] = documents['Message'].astype("string")

In [None]:
def preprocess(text):
    """Tokenise one document and normalise its tokens.

    The text is tokenised with gensim's ``simple_preprocess``. Tokens that
    appear in gensim's ``STOPWORDS`` or that are a single character long are
    discarded. Each surviving token is lemmatised with WordNet using
    ``pos='v'`` and the result is then passed through the Porter stemmer.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The normalised tokens, in their original order. Always a list
        (possibly empty), never ``None``.
    """
    # Build the stemmer and lemmatizer once per document instead of once per
    # token; both objects are reusable across tokens.
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stopwords = gensim.parsing.preprocessing.STOPWORDS

    return [
        stemmer.stem(lemmatizer.lemmatize(token, pos='v'))
        for token in gensim.utils.simple_preprocess(text)
        if token not in stopwords and len(token) > 1
    ]

In [None]:
# Run every message through preprocess(), then preview the first ten token lists.
processed_docs = documents['Message'].map(preprocess)
processed_docs.head(10)

In [None]:
def remove_none(list_of_words):
    """Return ``list_of_words`` as-is, substituting an empty list for ``None``."""
    return [] if list_of_words is None else list_of_words

In [None]:
# Defensive pass: swap any None entry for an empty token list.
processed_docs = processed_docs.map(remove_none)

In [None]:
#Remove common words that may not help in identifying useful topics. This is an iterative process.  

In [None]:
def remove_words(list_of_words, words_to_remove=('com', 'http')):
    """Return a copy of ``list_of_words`` without any of the unwanted words.

    Every occurrence of each unwanted word is dropped, and the input list is
    left unmodified. (``list.remove`` returns ``None`` and only deletes the
    first match, so assigning its result back would turn the whole document
    into ``None``; a filtering comprehension avoids both problems.)

    Parameters
    ----------
    list_of_words : list of str or None
        Tokens of a single document.
    words_to_remove : iterable of str, optional
        Words to filter out. Defaults to the example words ``'com'`` and
        ``'http'``.

    Returns
    -------
    list of str
        The remaining tokens in their original order; an empty list when
        ``list_of_words`` is ``None``.
    """
    if list_of_words is None:
        return []
    unwanted = set(words_to_remove)
    return [word for word in list_of_words if word not in unwanted]

In [None]:
#Remove missing values if any; remove words that do not help in identifying useful topics
processed_docs = processed_docs.dropna().apply(remove_words)

In [None]:
#Sanity check: preview the first ten token lists
processed_docs.head(10)

In [None]:
# Build the gensim Dictionary over all processed documents.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Walk the first dictionary entries (uncomment the print to inspect them);
# the loop stops as soon as the running count exceeds 50.
num_dict_words = 0
for num_dict_words, (dict_index, dict_word) in enumerate(dictionary.iteritems(), start=1):
    #print(dict_index, dict_word)
    if num_dict_words > 50:
        break

In [None]:
# Bag-of-words form of each document: a list of (token id, count) pairs.
bow_corpus = [dictionary.doc2bow(tokens) for tokens in processed_docs]

# The same counts with each id looked up in the dictionary, handy for inspection.
word_counts = [
    [(dictionary[token_id], freq) for token_id, freq in doc_bow]
    for doc_bow in bow_corpus
]
#print(word_counts[:10])

In [None]:
# Train one LDA model per candidate topic count (1 to 10) and record its c_v coherence.
coherence_scores = []
for i in range(1, 11):
    # Pass the integer seed directly. np.random.seed() returns None, so
    # `random_state = np.random.seed(1234)` hands None to LdaModel and is only
    # reproducible through the side effect of reseeding NumPy's global RNG.
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus = bow_corpus,
        num_topics = i,
        id2word = dictionary,
        passes = 5,
        alpha = 'auto',
        eta = 'auto',
        iterations = 5,
        random_state = 1234,
    )
    coherence_lda_model = CoherenceModel(model = lda_model, texts = processed_docs, dictionary = dictionary, coherence = 'c_v')
    # This guard is always true when the cell runs in a notebook kernel or as a
    # script. NOTE(review): presumably kept so the coherence computation is not
    # re-triggered in spawned worker processes -- confirm before removing.
    if __name__ == "__main__":
        coherence = coherence_lda_model.get_coherence()
        print(f"Coherence_v Score for {i} Topics: ", coherence)
        coherence_scores.append(coherence)

In [None]:
#Show coherence scores
# Only draw the curve when there is exactly one score for each topic count
# from 1 to 10; otherwise the cell does nothing.
if len(coherence_scores) == len(range(1, 11, 1)):
    tick_spacing = 1
    fig, ax = plt.subplots()
    ax.plot(range(1, 11, 1), coherence_scores)
    ax.set_xlabel("Number of Topics")
    ax.set_ylabel("Coherence Scores")
    ax.set_title("LDA using BOW: Topics 1 to 10")
    # One major tick per topic count on the x axis.
    ax.xaxis.set_major_locator(ticker.MultipleLocator(tick_spacing))
    plt.show()

In [None]:
#Generate results
# Fit the final 6-topic model and print the top 10 words of each topic.
# The seed is passed as an integer: np.random.seed() returns None, so
# `random_state = np.random.seed(1234)` would hand None to LdaModel and rely
# on reseeding NumPy's global RNG as a side effect.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus = bow_corpus,
    num_topics = 6,
    id2word = dictionary,
    passes = 5,
    alpha = 'auto',
    eta = 'auto',
    iterations = 5,
    random_state = 1234,
)
for idx, topic in lda_model.show_topics(num_words = 10):
    print('Topic {}: Words are {}'.format(idx, topic))