Topic Modeling - Latent Dirichlet Allocation using Gensim
===

Preparing the corpus
---

Load Corpus iterator and dictionary
---

In [4]:
import logging, gensim, bz2
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

corpus_dict = './wikipedia/wiki_en_wordids.txt'
corpus = './wikipedia/wiki_en_tfidf.mm'

# load id->word mapping (the dictionary), one of the results of step 2 above
id2word = gensim.corpora.Dictionary.load_from_text(corpus_dict)
mm = gensim.corpora.MmCorpus(corpus)
# mm = gensim.corpora.MmCorpus(bz2.BZ2File('corpus')) # use if compressed to bz2

print(mm)

2017-11-18 15:57:23,991 : INFO : initializing corpus reader from ./wikipedia/wiki_en_tfidf.mm
2017-11-18 15:57:24,053 : INFO : accepted corpus with 4343550 documents, 100000 features, 682472195 non-zero entries


MmCorpus(4343550 documents, 100000 features, 682472195 non-zero entries)


Run Latent Dirichlet Allocation
---

In [5]:
# extract 100 LDA topics, using 1 pass and updating once every 1 chunk (10,000 documents)
lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=100, update_every=1, chunksize=10000, passes=1)

# print the most contributing words for 20 randomly selected topics
lda.print_topics(20)

2017-11-18 15:57:37,633 : INFO : using symmetric alpha at 0.01
2017-11-18 15:57:37,902 : INFO : using symmetric eta at 1e-05
2017-11-18 15:57:37,917 : INFO : using serial LDA version on this node
2017-11-18 15:59:09,144 : INFO : running online (single-pass) LDA training, 100 topics, 1 passes over the supplied corpus of 4343550 documents, updating model once every 10000 documents, evaluating perplexity every 100000 documents, iterating 50x with a convergence threshold of 0.001000
2017-11-18 15:59:28,943 : INFO : PROGRESS: pass 0, at document #10000/4343550
2017-11-18 16:00:16,336 : INFO : merging changes from 10000 documents into a model of 4343550 documents
2017-11-18 16:00:23,282 : INFO : topic #83 (0.010): 0.001*"politician" + 0.001*"player" + 0.001*"law" + 0.001*"french" + 0.001*"italian" + 0.001*"german" + 0.001*"mozambique" + 0.001*"author" + 0.001*"finland" + 0.001*"footballer"
2017-11-18 16:00:23,284 : INFO : topic #76 (0.010): 0.002*"emperor" + 0.001*"autocad" + 0.001*"system" 

[(66,
  '0.010*"power" + 0.009*"solar" + 0.008*"satellite" + 0.007*"plant" + 0.007*"gas" + 0.007*"energy" + 0.006*"mw" + 0.006*"dam" + 0.006*"space" + 0.005*"oil"'),
 (76,
  '0.080*"polish" + 0.045*"poland" + 0.031*"warsaw" + 0.022*"bydgoszcz" + 0.017*"kraków" + 0.012*"lithuanian" + 0.010*"poznań" + 0.010*"wrocław" + 0.009*"jan" + 0.009*"andrzej"'),
 (3,
  '0.058*"fuscous" + 0.021*"paralympics" + 0.013*"dundee" + 0.012*"chancel" + 0.011*"cornice" + 0.011*"hibernian" + 0.010*"anteaters" + 0.010*"dorsum" + 0.009*"curacy" + 0.009*"subdistrict"'),
 (52,
  '0.020*"bridge" + 0.016*"canadian" + 0.015*"highway" + 0.015*"canada" + 0.013*"route" + 0.011*"ontario" + 0.010*"road" + 0.008*"toronto" + 0.007*"quebec" + 0.007*"river"'),
 (34,
  '0.009*"food" + 0.008*"costal" + 0.006*"restaurant" + 0.005*"chef" + 0.005*"wine" + 0.004*"jae" + 0.004*"eun" + 0.004*"dish" + 0.004*"cuisine" + 0.004*"rice"'),
 (4,
  '0.009*"chemical" + 0.007*"chemistry" + 0.007*"acid" + 0.006*"compound" + 0.006*"reaction" + 

In [6]:
lda.save('lda_wiki.model')

2017-11-18 20:56:16,959 : INFO : saving LdaState object under lda_wiki.model.state, separately None
2017-11-18 20:56:17,399 : INFO : saved lda_wiki.model.state
2017-11-18 20:56:17,449 : INFO : saving LdaModel object under lda_wiki.model, separately ['expElogbeta', 'sstats']
2017-11-18 20:56:17,449 : INFO : not storing attribute state
2017-11-18 20:56:17,450 : INFO : not storing attribute id2word
2017-11-18 20:56:17,450 : INFO : storing np array 'expElogbeta' to lda_wiki.model.expElogbeta.npy
2017-11-18 20:56:17,561 : INFO : not storing attribute dispatcher
2017-11-18 20:56:17,587 : INFO : saved lda_wiki.model


Test Model
---

In [41]:
from gensim import corpora
from six import iteritems

test_corpus_path = 'corpus.txt'
stop_list = set('for is and be are so has they in'.split())

# Generate Dictionary
with open(test_corpus_path, 'r') as file:
    dictionary = corpora.Dictionary(line.lower().split() for line in file)

    # remove stop words and words that appear only once
    stop_ids = [dictionary.token2id[stopword] for stopword in stop_list
        if stopword in dictionary.token2id]
    once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq==1]
    dictionary.filter_tokens(stop_ids+once_ids)
    #dictionary.filter_tokens(stop_ids)
    dictionary.compactify()

# TFIDF or BOW
class MyCorpus(object):
    def __iter__(self):
        with open(test_corpus_path, 'r') as file:
            for line in file:
                # Assume there's one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(line.lower().split())
        

test_corpus = MyCorpus()

same_lda_model = gensim.models.LdaModel.load('lda_wiki.model')
test_lda = same_lda_model[test_corpus]

'''
# Visualize final results
print('Corpus LDA')
topics_index = list()
with open(test_corpus_path, 'r') as file:
    for i, line in enumerate(file):
        topics_index.append(max(test_lda[i], key=lambda item:abs(item[1]))[0])
        print('Topic : ', topics_index[i])
        print(test_lda[i], " # " + line)
       
    print('\n')

'''
topics_index = list() 
with open(test_corpus_path, 'r') as file:
    for topic, doc in zip(test_lda, file):
        topic_chosen = max(topic, key=lambda item:item[1])[0]
        print('topic: ', topic_chosen)
        print(lda.print_topic(topic_chosen))
        print(topic, doc)


2017-11-18 22:24:03,276 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-11-18 22:24:03,276 : INFO : built Dictionary(141 unique tokens: ['develops', 'eukaryotes', 'transforming', 'central', 'development,']...) from 10 documents (total 228 corpus positions)
2017-11-18 22:24:03,277 : INFO : loading LdaModel object from lda_wiki.model
2017-11-18 22:24:03,279 : INFO : loading expElogbeta from lda_wiki.model.expElogbeta.npy with mmap=None
2017-11-18 22:24:03,294 : INFO : setting ignored attribute state to None
2017-11-18 22:24:03,294 : INFO : setting ignored attribute id2word to None
2017-11-18 22:24:03,295 : INFO : setting ignored attribute dispatcher to None
2017-11-18 22:24:03,295 : INFO : loaded lda_wiki.model
2017-11-18 22:24:03,295 : INFO : loading LdaModel object from lda_wiki.model.state
2017-11-18 22:24:03,647 : INFO : loaded lda_wiki.model.state


topic:  71
0.013*"album" + 0.009*"song" + 0.007*"chart" + 0.006*"band" + 0.005*"track" + 0.004*"vocals" + 0.004*"guitar" + 0.004*"label" + 0.004*"you" + 0.004*"songs"
[(18, 0.14428571428571454), (28, 0.14428571322368802), (68, 0.13715354059388421), (71, 0.29427503083468809), (80, 0.14428571428571454)] Artificial intelligence (AI, also machine intelligence, MI) is Intelligence displayed by machines, in contrast with the natural intelligence (NI) displayed by humans and other animals.

topic:  43
0.024*"league" + 0.019*"club" + 0.019*"cup" + 0.015*"goals" + 0.015*"football" + 0.014*"apps" + 0.011*"fc" + 0.010*"match" + 0.009*"round" + 0.009*"rugby"
[(1, 0.091818181818181951), (18, 0.12533013336624779), (40, 0.090004314025152507), (43, 0.18272727272727296), (48, 0.18262258567661377), (49, 0.15113387602289577), (77, 0.091818181818181951)] AI research is defined as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of succ