In [124]:
from gensim.utils import simple_preprocess, ClippedCorpus
from gensim.corpora import Dictionary, MmCorpus
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import LdaModel
import numpy as np

In [125]:
def tokenize(text):
    """Return the preprocessed tokens of ``text`` with stopwords removed.

    The text is split by ``simple_preprocess``; every resulting token that
    appears in ``STOPWORDS`` is dropped. The remaining tokens keep their
    original order and are returned as a list.
    """
    def is_content_word(word):
        return word not in STOPWORDS

    return list(filter(is_content_word, simple_preprocess(text)))
    
# create iterator over the documents in corpus.txt
def corpus_iter(path='./data/corpus.txt'):
    """Yield one token list per line of the corpus file.

    Args:
        path: Text file holding one document per line. Defaults to
            ``./data/corpus.txt``, so zero-argument calls keep working.

    Yields:
        The result of ``tokenize`` applied to each line, in file order.
    """
    # The context manager closes the file handle as soon as the generator is
    # exhausted or closed early, instead of leaving it to garbage collection.
    with open(path) as corpus_file:
        for line in corpus_file:
            yield tokenize(line)

In [131]:
# Build the gensim Dictionary (token <-> integer id mapping) by streaming the
# tokenized lines produced by corpus_iter(); %time reports how long the pass takes.
%time nyt_dictionary = Dictionary(corpus_iter())
# Printing the Dictionary shows the number of unique tokens and a few samples.
print(nyt_dictionary)

CPU times: user 2min 4s, sys: 1.25 s, total: 2min 5s
Wall time: 2min 8s
Dictionary(246605 unique tokens: ['able', 'accomplished', 'acute', 'ads', 'advantage']...)


In [132]:
# Sanity check on one sample sentence: convert it to a bag-of-words vector of
# (word id, count) pairs, then look each matched id back up in the dictionary.
doc = "Phenotypic characterization of the SIRC (Statens Seruminstitut Rabbit Cornea) cell line reveals a mixed epithelial and fibroblastic nature"
bow = nyt_dictionary.doc2bow(tokenize(doc))
print('Word IDs present in dictionary:')
print(bow)
print([nyt_dictionary[word_id] for word_id, _count in bow])

Word IDs present in dictionary:
[(264, 1), (936, 1), (2922, 1), (3752, 1), (11649, 1), (14261, 1), (41103, 1), (118162, 1)]
['nature', 'line', 'cell', 'mixed', 'reveals', 'rabbit', 'cornea', 'epithelial']


In [134]:
# create a stream of bag-of-words vectors
class BOWCorpus:
    """Re-iterable corpus that streams documents as bag-of-words vectors.

    Args:
        corpus_iter: Zero-argument callable returning a fresh iterable of
            token lists; it is called again for every pass over the corpus.
        dictionary: Object whose ``doc2bow(tokens)`` converts one token list
            into its bag-of-words representation.
    """

    def __init__(self, corpus_iter, dictionary):
        self.dictionary = dictionary
        self.corpus_iter = corpus_iter

    def __iter__(self):
        """Lazily yield ``dictionary.doc2bow(tokens)`` for each document."""
        for doc_tokens in self.corpus_iter():
            bow_vector = self.dictionary.doc2bow(doc_tokens)
            yield bow_vector

# Pass the corpus_iter function itself (not a generator it returned) so that
# every iteration over nyt_corpus starts a fresh pass over the documents.
nyt_corpus = BOWCorpus(corpus_iter=corpus_iter, dictionary=nyt_dictionary)

In [135]:
# Store the streamed bag-of-words corpus in a file on disk (timed with %time),
# so it can be loaded from './data/nyt_corpus_bow.mm' instead of being rebuilt.
%time MmCorpus.serialize('./data/nyt_corpus_bow.mm', nyt_corpus)

CPU times: user 2min 21s, sys: 1.44 s, total: 2min 22s
Wall time: 2min 24s


In [93]:
# Load the serialized bag-of-words corpus back from disk; printing it reports
# the document, feature and non-zero entry counts.
mm_corpus = MmCorpus(fname='./data/nyt_corpus_bow.mm')
print(mm_corpus)

MmCorpus(71973 documents, 246605 features, 17912557 non-zero entries)


In [119]:
# LDA training is slow, so expose only the first 10000 documents to the model.
clipped_corpus = ClippedCorpus(mm_corpus, max_docs=10000)

In [120]:
# fit LDA model
%time lda_model = LdaModel(clipped_corpus, num_topics=20, id2word=nyt_dictionary, passes=4)

CPU times: user 1min 16s, sys: 6.18 s, total: 1min 22s
Wall time: 1min 19s


In [123]:
# Inspect topic 15: its 50 highest-weighted words as (word, probability) pairs.
lda_model.show_topic(topicid=15, topn=50)

[('said', 0.018776244),
 ('military', 0.007428504),
 ('state', 0.007309765),
 ('security', 0.006394513),
 ('attacks', 0.0061470517),
 ('government', 0.0060266275),
 ('islamic', 0.0060094655),
 ('united', 0.00581097),
 ('killed', 0.0049792747),
 ('group', 0.00484526),
 ('saudi', 0.0047487537),
 ('people', 0.0045015886),
 ('forces', 0.0043934025),
 ('officials', 0.004319128),
 ('war', 0.004206837),
 ('brussels', 0.0041150996),
 ('al', 0.003945687),
 ('american', 0.003882213),
 ('iran', 0.0037156686),
 ('paris', 0.0037052904),
 ('mr', 0.0036327094),
 ('israel', 0.003547665),
 ('attack', 0.0034859704),
 ('states', 0.0034659172),
 ('year', 0.0034348196),
 ('country', 0.0032869552),
 ('iraq', 0.00323156),
 ('police', 0.0030473035),
 ('syria', 0.0030255055),
 ('french', 0.002898777),
 ('israeli', 0.0028642572),
 ('foreign', 0.0027879404),
 ('minister', 0.002597607),
 ('including', 0.0024606762),
 ('terrorist', 0.0024088093),
 ('arabia', 0.00238324),
 ('france', 0.0023167413),
 ('told', 0.0022

In [113]:
# Show two of the fitted topics as (topic id, formatted word-weight string) pairs.
lda_model.print_topics(num_topics=2)

[(19,
  '0.007*"people" + 0.005*"women" + 0.005*"american" + 0.004*"world" + 0.004*"like" + 0.004*"war" + 0.003*"time" + 0.003*"mr" + 0.003*"years" + 0.003*"way"'),
 (17,
  '0.015*"students" + 0.014*"north" + 0.013*"said" + 0.010*"school" + 0.009*"korea" + 0.007*"nuclear" + 0.007*"university" + 0.007*"college" + 0.007*"workers" + 0.006*"law"')]