# Topic Modeling - LDA

In [1]:
import gensim
from gensim import corpora, models
from sklearn.externals import joblib

import pyLDAvis
import pyLDAvis.gensim

In [2]:
%%time 


# Input artifacts for this notebook: the bag-of-words corpus (Matrix Market
# format) and the token dictionary saved as text.
corpus_doc2bow_file = 'data/eos/processed_unigram_corpus_EOS.mm'
corpus_dictionary_file = 'data/eos/unigram_dictionary_EOS.txt'

# Open both artifacts through the `corpora` alias imported in the first cell
# (the same module object as `gensim.corpora`).
dictionary = corpora.Dictionary.load_from_text(corpus_dictionary_file)
corpus = corpora.MmCorpus(corpus_doc2bow_file)

CPU times: user 1.06 s, sys: 28 ms, total: 1.08 s
Wall time: 1.1 s


In [3]:
%%time

# Single source of truth for where the trained model is pickled; used by both
# the training branch (dump) and the reuse branch (load).
LDA_MODEL_FILE = 'data/eos/LDAmodel_unigram_20_EOS.pkl'

# Training took ~21 min wall time in the recorded run. Set RETRAIN_LDA to
# False to reuse the model pickled by an earlier run instead of retraining.
RETRAIN_LDA = True

# Fixed seed so repeated runs start from the same random initialisation.
# NOTE(review): with several worker processes the result may still vary
# slightly between runs -- confirm against the gensim documentation.
LDA_RANDOM_SEED = 42

if RETRAIN_LDA:
    # Fit a 20-topic LDA model on the bag-of-words corpus and persist it.
    ldamodel = gensim.models.ldamulticore.LdaMulticore(
        corpus,
        id2word=dictionary,
        num_topics=20,
        chunksize=2000,
        workers=7,
        random_state=LDA_RANDOM_SEED,
    )
    joblib.dump(ldamodel, LDA_MODEL_FILE)
else:
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code -- only load model files produced by a trusted run.
    ldamodel = joblib.load(LDA_MODEL_FILE)


CPU times: user 15min 8s, sys: 41.6 s, total: 15min 49s
Wall time: 21min 27s


In [4]:
# Show the 10 highest-weighted words of each of the 20 topics; as the cell's
# last expression the returned list is rendered as the cell output.
ldamodel.print_topics(num_topics=20, num_words=10)

[(0,
  '0.014*"oil" + 0.013*"percent" + 0.012*"price" + 0.009*"year" + 0.009*"market" + 0.007*"per" + 0.006*"energy" + 0.006*"billion" + 0.005*"cent" + 0.005*"million"'),
 (1,
  '0.015*"islamic" + 0.015*"state" + 0.014*"syria" + 0.011*"group" + 0.010*"militant" + 0.010*"iraq" + 0.008*"syrian" + 0.007*"attack" + 0.007*"force" + 0.007*"killed"'),
 (2,
  '0.031*"la" + 0.017*"şi" + 0.016*"în" + 0.009*"le" + 0.008*"din" + 0.008*"cu" + 0.007*"care" + 0.007*"să" + 0.007*"un" + 0.006*"pe"'),
 (3,
  '0.019*"این" + 0.006*"برای" + 0.005*"کرد" + 0.004*"ایران" + 0.003*"وی" + 0.003*"نیز" + 0.003*"سال" + 0.003*"city" + 0.003*"کشور" + 0.003*"باید"'),
 (4,
  '0.009*"government" + 0.007*"state" + 0.007*"country" + 0.007*"minister" + 0.006*"president" + 0.006*"party" + 0.005*"force" + 0.005*"also" + 0.004*"military" + 0.004*"political"'),
 (5,
  '0.007*"one" + 0.006*"people" + 0.005*"say" + 0.005*"time" + 0.005*"year" + 0.005*"can" + 0.004*"like" + 0.004*"just" + 0.003*"many" + 0.003*"life"'),
 (6,
  '0.

In [5]:
# Visualize the LDA topics
# Build the pyLDAvis data from the trained model, the bag-of-words corpus and
# the dictionary loaded in the earlier cells.
lda_vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
# Last expression of the cell, so the visualization is shown as its output.
pyLDAvis.display(lda_vis)

# WordCloud

In [None]:
## word lists
# Collect the top terms of every topic. `terms` ends up as a list with one
# entry per topic; each entry is the list of pairs returned by
# ldamodel.show_topic() for that topic (element 0 of a pair is the word text,
# which is what gets joined for printing below).
n_topics = 20
n_top_terms = 50  # how many terms to fetch and print per topic

terms = []
for topic_id in range(n_topics):
    topic_terms = list(ldamodel.show_topic(topic_id, n_top_terms))
    terms.append(topic_terms)

    # The label now reports the number of terms actually listed (it used to
    # say "Top 10" while 50 terms were printed).
    topic_words = ", ".join(pair[0] for pair in topic_terms)
    print("Top " + str(n_top_terms) + " terms for topic #" + str(topic_id) + ": " + topic_words)

In [None]:
## word clouds
from os import path
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Quick look at the first entry of `terms` before building the word cloud.
print(terms[0])
def terms_to_wordcounts(terms, multiplier=1000):
    """Expand weighted terms into a text where each word repeats by weight.

    Each word is repeated ``int(multiplier * weight)`` times and all
    repetitions are joined with single spaces, so the text can be fed to a
    frequency-based consumer such as a word cloud.

    Parameters
    ----------
    terms : iterable of pairs
        Either ``(word, weight)`` or ``(weight, word)`` pairs. The order is
        detected per pair by checking which side is a string, so the
        original ``(weight, word)`` layout keeps working.
    multiplier : number, optional
        Scale applied to each weight before truncating to an integer count.

    Returns
    -------
    str
        Space-separated repeated words; an empty string when `terms` is
        empty or every scaled weight truncates to zero.
    """
    repeated_words = []
    for pair in terms:
        # Accept both layouts: whichever side is text is the word.
        if isinstance(pair[0], str):
            word, weight = pair[0], pair[1]
        else:
            weight, word = pair[0], pair[1]
        # Terms whose scaled weight truncates to 0 contribute nothing, which
        # also avoids doubled spaces in the joined text.
        repeated_words.extend([word] * int(multiplier * weight))
    return " ".join(repeated_words)

# `terms` holds one list of (word, weight) pairs per topic (element 0 is the
# text joined in the word-list cell). Pool every topic into one flat list and
# hand the pairs over as (weight, word), the order terms_to_wordcounts()
# indexes them in.
# NOTE(review): this builds one cloud over all topics, matching the original
# call that passed the whole `terms`; use a single entry such as terms[0] if a
# per-topic cloud was intended.
weighted_terms = [
    (weight, word)
    for single_topic in terms
    for word, weight in single_topic
]

# The multiplier belongs to terms_to_wordcounts(); WordCloud.generate() takes
# only the text, so passing 1000 to it raised a TypeError.
wordcloud_text = terms_to_wordcounts(weighted_terms, 1000)
wordcloud = WordCloud(font_path="Impact_Label.ttf", background_color="black").generate(wordcloud_text)

# Draw the cloud without axes, save it as "terms1", then release the figure.
fig, ax = plt.subplots()
ax.imshow(wordcloud)
ax.axis("off")
fig.savefig("terms1")

plt.close(fig)