# TOPIC MODELING (DTM, LSI, HDP, LDA, & Coherence)



In [None]:
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel, TfidfModel
from gensim.models.word2vec import LineSentence
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.wrappers import DtmModel
from gensim import similarities
from gensim.corpora import MmCorpus, Dictionary
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
import seaborn as sns
import numpy as np
import pandas as pd
import os
import warnings
#warnings.filterwarnings('ignore')

In [None]:
_data_directory = 'staging2002'
trigram_dictionary_filepath = os.path.join(_data_directory,'trigram_dict_all.dict')

trigram_bow_filepath = os.path.join(_data_directory,'trigram_bow_corpus_all.mm')

In [None]:
# load corpus and dictionary

try:
    trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)
except FileNotFoundError as e:
    # Keep the original error chained and name the path that was looked up,
    # so the traceback shows which file is missing.
    raise ValueError(
        "No dictionary found at {!r}".format(trigram_dictionary_filepath)
    ) from e
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
# corpus must be saved in order of time-slices!

In [None]:
trigram_bow_corpus.index

## DTM Model [C++ implementation]

In [None]:
%%time

# Executable handed to the gensim DTM wrapper (pick the one for this machine).
path_to_dtm_binary = "./dtm_macOS"  # Mac arch version
#path_to_dtm_binary = "./dtm_self_compiled"  # old MacOS arch version
#path_to_dtm_binary = "dtm-win64.exe"

num_topics = 5
time_slices = [89, 102]  # number of docs for each successive year

# Fit the model through the python wrapper; the seed is fixed at 1.
dtm = DtmModel(
    path_to_dtm_binary,
    corpus=trigram_bow_corpus,
    id2word=trigram_dictionary,
    num_topics=num_topics,
    time_slices=time_slices,
    # alternative: time_slices=[1] * len(trigram_bow_corpus)
    model='dtm',
    initialize_lda=True,
    rng_seed=1,
)

In [None]:

# Build one table holding the top-10 terms of every topic in every time slice.
# Each topic contributes a block of rows; within a block, every time slice
# contributes the columns returned by `dtm.show_topic`.
# Frames are collected first and concatenated once per axis, instead of
# repeatedly concatenating onto an empty seed DataFrame.
topic_blocks = []
for topic_id in range(num_topics):
    slice_frames = [
        pd.DataFrame(dtm.show_topic(topicid=topic_id, time=time_idx, topn=10))
        for time_idx in range(len(time_slices))
    ]
    if slice_frames:
        # Reversed so the last time slice is leftmost and the first is rightmost.
        topic_blocks.append(pd.concat(slice_frames[::-1], axis=1))

# pd.concat rejects an empty list, so fall back to an empty frame.
topic_df = pd.concat(topic_blocks, axis=0) if topic_blocks else pd.DataFrame()
topic_df

In [None]:
len(time_slices)

In [None]:
topics = dtm.show_topic(topicid=2, time=1, topn=10)
topics

In [None]:
#topics = dtm.show_topic(topicid=0, time=1, topn=10)
dtm.show_topics(num_topics=5, times=2, num_words=10, formatted=False)


In [None]:
dtm.print_topics(num_topics=5, times=1, num_words=10)

In [None]:
dtm.save(os.path.join(_data_directory,'dtm_model'))
#dtm_model = DtmModel.load('dtm_model')

In [None]:
len(trigram_bow_corpus)

In [None]:
# Pull the five pyLDAvis inputs out of the DTM for time=0, then render them.
(doc_topic, topic_term, doc_lengths, term_frequency, vocab) = dtm.dtm_vis(
    time=0,
    corpus=trigram_bow_corpus,
)
vis_wrapper = pyLDAvis.prepare(
    topic_term_dists=topic_term,
    doc_topic_dists=doc_topic,
    doc_lengths=doc_lengths,
    vocab=vocab,
    term_frequency=term_frequency,
)
pyLDAvis.display(vis_wrapper)

## LSI Model


In [None]:
lsimodel = LsiModel(corpus=trigram_bow_corpus, num_topics=10, id2word=trigram_dictionary)

In [None]:
lsimodel.save(os.path.join(_data_directory,'lsi_topic.model'))

In [None]:
lsimodel.show_topics(num_topics=5, formatted=False)  # Showing only the top 5 topics

## HDP Model

In [None]:
hdpmodel = HdpModel(corpus=trigram_bow_corpus, id2word=trigram_dictionary)

In [None]:
hdpmodel.save(os.path.join(_data_directory,'hdp_topic.model'))

In [None]:
hdpmodel.show_topics(formatted=False)

## LDA Model


In [None]:
ldamodel = LdaModel(corpus=trigram_bow_corpus, num_topics=10, id2word=trigram_dictionary)

In [None]:
ldamodel.save(os.path.join(_data_directory,'lda_topic.model'))

In [None]:
ldamodel.show_topics(formatted=False)

In [None]:
#d = Dictionary.load(os.path.join(_data_directory,'trigram_dict_all.dict'))
#c = MmCorpus(os.path.join(_data_directory,'trigram_bow_corpus_all.mm'))
#lda = LdaModel.load(os.path.join(_data_directory,'lda_topic.model'))

#data = pyLDAvis.gensim.prepare(lda, c, d)
#data

#
pyLDAvis.gensim.prepare(topic_model=ldamodel, corpus=trigram_bow_corpus, dictionary=trigram_dictionary)


In [None]:
# `plt` is only imported in a later cell of this notebook, so import it here;
# otherwise running the cells top to bottom fails with NameError on `plt`.
import matplotlib.pyplot as plt

# One bar chart of the top terms per LDA topic, laid out two charts per row.
# The topic count comes from the model rather than a hard-coded 10.
lda_topic_count = ldamodel.num_topics
grid_columns = 2
grid_rows = (lda_topic_count + grid_columns - 1) // grid_columns  # ceil division

fig = plt.figure(figsize=(15, 30))
for topic_idx in range(lda_topic_count):
    # Terms become the index so they can be used as the y-axis categories.
    topic_terms = pd.DataFrame(
        ldamodel.show_topic(topic_idx), columns=['term', 'prob']
    ).set_index('term')

    plt.subplot(grid_rows, grid_columns, topic_idx + 1)
    plt.title('topic ' + str(topic_idx + 1))
    sns.barplot(x='prob', y=topic_terms.index, data=topic_terms, palette='Reds_d')
    plt.xlabel('probability')

plt.show()

In [None]:
## MULTI-CORE LDA

In [None]:
%%time

lda_model_filepath = os.path.join(_data_directory, 'lda_multicore_topic.model')

# Set to True to force retraining. Training also runs when no saved model
# exists yet, so the load below does not fail on a fresh data directory.
retrain_lda = False

if retrain_lda or not os.path.exists(lda_model_filepath):

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=5,
                           id2word=trigram_dictionary,
                           workers=3)

    lda.save(lda_model_filepath)

# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

In [None]:
def explore_topic(topic_number, topn=25):
    """
    Print a formatted table of the top terms of one topic.

    Reads the module-level ``lda`` model.

    topic_number: id of the topic passed to ``lda.show_topic``
    topn: how many terms to request and print (default 25)
    """

    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    # Forward the caller's topn; it was previously ignored in favour of a
    # hard-coded 25.
    for term, frequency in lda.show_topic(topic_number, topn=topn):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

In [None]:
explore_topic(topic_number=3)

## Coherence tests

In [None]:
def _words_per_topic(shown_topics):
    """Keep only the words from ``show_topics(formatted=False)`` output."""
    return [[word for word, _ in terms] for _, terms in shown_topics]


# Word lists per topic for each model, as input to the coherence comparison.
lsitopics = _words_per_topic(lsimodel.show_topics(formatted=False))

hdptopics = _words_per_topic(hdpmodel.show_topics(formatted=False))

ldatopics = _words_per_topic(ldamodel.show_topics(formatted=False))

In [None]:
trigram_articles_filepath = os.path.join(_data_directory,'trigram_transformed_articles_all.txt')
trigram_articles = LineSentence(trigram_articles_filepath)

In [None]:
def _coherence_for(topics):
    """Score the given topic word lists against the trigram articles."""
    scorer = CoherenceModel(
        topics=topics,
        texts=trigram_articles,
        dictionary=trigram_dictionary,
        window_size=10,
    )
    return scorer.get_coherence()


# LSI and HDP are capped at their first 10 topics; LDA is passed whole.
lsi_coherence = _coherence_for(lsitopics[:10])

hdp_coherence = _coherence_for(hdptopics[:10])

lda_coherence = _coherence_for(ldatopics)

In [None]:
def evaluate_bar_graph(coherences, indices):
    """
    Draw one bar per coherence value on the current pyplot axes.

    coherences: list of coherence values, one per bar
    indices: tick labels for the bars; must have the same length as coherences
    """
    bar_count = len(coherences)
    assert bar_count == len(indices)
    positions = np.arange(bar_count)
    plt.bar(positions, coherences, width=0.2, tick_label=indices, align='center')
    plt.xlabel('Models')
    plt.ylabel('Coherence Value')

In [None]:
import matplotlib.pyplot as plt
evaluate_bar_graph([lsi_coherence, hdp_coherence, lda_coherence],
                   ['LSI', 'HDP', 'LDA'])

## TF-IDF Model

In [None]:
tfidf = TfidfModel(trigram_bow_corpus)
corpus_tfidf = tfidf[trigram_bow_corpus]

In [None]:
# STEP 3 : Create similarity matrix of all files
print("Create similarity matrix of all files")
print('-'*10)
# Reuse the TF-IDF corpus built in the previous cell instead of rebuilding it.
index = similarities.MatrixSimilarity(corpus_tfidf)
print("We compute similarities from the TF-IDF corpus : %s"%type(index))

# Save the index and load it back from the same path.
tfidf_index_filepath = os.path.join(_data_directory, 'tfidf.index')
index.save(tfidf_index_filepath)
index = similarities.MatrixSimilarity.load(tfidf_index_filepath)

# Query the index with the whole TF-IDF corpus.
sims = index[corpus_tfidf]
print("We get a similarity matrix for all documents in the corpus %s"% type(sims))
print()  # a bare `print` is a no-op in Python 3; call it to emit the blank line
sims