In [16]:
import logging 
import itertools 
import numpy as np 
import gensim 

# import some more modules for processing the corpus
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS


import pyLDAvis
import pyLDAvis.gensim

import os 

# configure logging
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# basicConfig() does nothing when the root logger already has handlers, so the
# level is also set explicitly. setLevel() is used instead of assigning
# `.level` directly because it also clears the logging module's cached
# isEnabledFor() results.
logging.getLogger().setLevel(logging.INFO)

def head(stream, n=10):
    """Return the first `n` items of `stream` as a list.

    `stream` may be any iterable; if it is an iterator, the returned items
    are consumed from it.
    """
    leading_items = itertools.islice(stream, n)
    return [item for item in leading_items]


# Iterable lives in collections.abc; the alias in `collections` was removed in
# Python 3.10. The import is placed at column 0 so the cell parses.
from collections.abc import Iterable


In [3]:
def tokenize(text):
    """Split `text` into tokens with gensim's simple_preprocess and drop every
    token that appears in gensim's STOPWORDS set."""
    kept = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue
        kept.append(word)
    return kept

In [4]:
def iter_docs(base_dir):
    """Yield ``(filename, tokens)`` for each non-hidden entry of `base_dir`.

    Parameters
    ----------
    base_dir : str
        Directory containing the text documents, with or without a trailing
        path separator.

    Yields
    ------
    tuple
        The entry name as returned by ``os.listdir`` and the list produced by
        ``tokenize`` for that file's contents.
    """
    # sorted() gives a deterministic document order; os.listdir order is arbitrary.
    for doc in sorted(os.listdir(base_dir)):
        # skip dot-files such as .DS_Store
        if doc.startswith('.'):
            continue
        # os.path.join works whether or not base_dir ends with a separator
        with open(os.path.join(base_dir, doc), "r") as file:
            text = file.read()
        # Tokenize and yield after the file is closed, so no handle stays open
        # while the generator is suspended.
        yield doc, tokenize(text)

In [5]:
# Lazy generator of (filename, tokens) pairs; nothing is read until it is iterated.
stream = iter_docs(base_dir='./test/')

In [9]:
# Token lists only (filenames dropped), streamed lazily from ./test/
doc_stream = (doc_tokens for _doc_name, doc_tokens in iter_docs('./test/'))

# Build the token <-> id mapping from a single pass over the stream
id2word = gensim.corpora.Dictionary(documents=doc_stream)

INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : built Dictionary(4236 unique tokens: ['able', 'accessory', 'accommodate', 'adequate', 'adopt']...) from 106 documents (total 64576 corpus positions)


In [10]:
# Filter out words that occur in only 1 document, keeping the rest.
# keep_n=None disables the default cap of 100000 most frequent tokens, so
# "keeping the rest" holds for any vocabulary size.
id2word.filter_extremes(no_below=2, no_above=1.0, keep_n=None)

INFO : discarding 0 tokens: []...
INFO : keeping 4236 tokens which were in no less than 2 and no more than 106 (=100.0%) documents
INFO : resulting dictionary: Dictionary(4236 unique tokens: ['able', 'accessory', 'accommodate', 'adequate', 'adopt']...)


In [11]:
class Corpus(object):
    """Streamed bag-of-words corpus over the documents in a directory.

    Every iteration re-reads the documents through ``iter_docs`` and yields
    ``dictionary.doc2bow(tokens)`` for each one.

    Parameters
    ----------
    dump_file : str
        Directory handed to ``iter_docs``.
    dictionary : object
        Anything providing ``doc2bow(tokens)`` (here a gensim Dictionary).
    clip_docs : int or None
        Upper bound on the number of documents yielded; None means no limit.
    """

    def __init__(self, dump_file, dictionary, clip_docs=None):
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip_docs = clip_docs
        # Defined up front so the attribute exists before the first iteration.
        self.titles = []

    def __iter__(self):
        # Reset on each pass so titles line up with the vectors of this pass.
        self.titles = []
        for title, tokens in itertools.islice(iter_docs(self.dump_file), self.clip_docs):
            self.titles.append(title)
            yield self.dictionary.doc2bow(tokens)

    def __len__(self):
        """Return the number of documents one iteration yields.

        Counts directory entries with the same dot-file filter ``iter_docs``
        applies, capped at ``clip_docs`` when a limit is set. Returning
        ``clip_docs`` directly would raise TypeError for the default None and
        over-report when the limit exceeds the number of documents.
        """
        n_docs = sum(
            1 for name in os.listdir(self.dump_file) if not name.startswith('.')
        )
        if self.clip_docs is None:
            return n_docs
        return min(n_docs, self.clip_docs)

In [12]:
# Streaming bag-of-words view of ./test/, encoded with the id2word dictionary
news_corpus = Corpus(dump_file='./test/', dictionary=id2word)

In [23]:
# Train a 20-topic LDA model with 10 passes over the corpus.
# random_state fixes the seed so a Restart & Run All reproduces the same topics.
lda_model = gensim.models.ldamodel.LdaModel(
    news_corpus,
    num_topics=20,
    id2word=id2word,
    passes=10,
    random_state=42,
)

INFO : using symmetric alpha at 0.05
INFO : using symmetric eta at 0.05
INFO : using serial LDA version on this node
INFO : running online (multi-pass) LDA training, 20 topics, 10 passes over the supplied corpus of 106 documents, updating model once every 106 documents, evaluating perplexity every 106 documents, iterating 50x with a convergence threshold of 0.001000
INFO : -11.142 per-word bound, 2259.8 perplexity estimate based on a held-out corpus of 106 documents with 64576 words
INFO : PROGRESS: pass 0, at document #106/106
INFO : topic #14 (0.050): 0.008*"hong" + 0.007*"kong" + 0.006*"people" + 0.006*"said" + 0.006*"food" + 0.005*"fiftyforward" + 0.005*"china" + 0.005*"roy" + 0.005*"li" + 0.004*"life"
INFO : topic #10 (0.050): 0.022*"li" + 0.018*"school" + 0.010*"nashville" + 0.009*"said" + 0.007*"tn" + 0.006*"elementary" + 0.006*"food" + 0.005*"meals" + 0.004*"middle" + 0.004*"kong"
INFO : topic #3 (0.050): 0.009*"says" + 0.006*"people" + 0.005*"office" + 0.005*"work" + 0.005*"sc

INFO : topic diff=1.069302, rho=0.353553
INFO : -6.599 per-word bound, 97.0 perplexity estimate based on a held-out corpus of 106 documents with 64576 words
INFO : PROGRESS: pass 7, at document #106/106
INFO : topic #19 (0.050): 0.044*"hong" + 0.042*"kong" + 0.015*"people" + 0.015*"independence" + 0.013*"says" + 0.013*"beijing" + 0.010*"china" + 0.010*"city" + 0.008*"protests" + 0.008*"time"
INFO : topic #3 (0.050): 0.020*"says" + 0.015*"office" + 0.010*"work" + 0.010*"employees" + 0.009*"people" + 0.009*"pandemic" + 0.008*"care" + 0.008*"boston" + 0.007*"percent" + 0.007*"business"
INFO : topic #6 (0.050): 0.022*"dress" + 0.022*"midsommar" + 0.013*"went" + 0.013*"queen" + 0.013*"auction" + 0.013*"academy" + 0.013*"worn" + 0.013*"museum" + 0.013*"sold" + 0.013*"items"
INFO : topic #16 (0.050): 0.025*"says" + 0.011*"people" + 0.011*"boston" + 0.009*"services" + 0.008*"home" + 0.008*"technology" + 0.006*"company" + 0.006*"care" + 0.006*"work" + 0.006*"software"
INFO : topic #7 (0.050): 0

In [27]:
# how to store corpus to disk
from gensim.corpora import MmCorpus

# makedirs(exist_ok=True) replaces the isdir()+mkdir() pair and is safe to re-run
os.makedirs("./gensim_files", exist_ok=True)
MmCorpus.serialize('./gensim_files/news_corpus.mm', news_corpus)

# how to store dictionary to disk
# Save `id2word`, the dictionary the corpus and model were built with; the
# previous name `id2word_ccp` is not defined anywhere in this notebook.
id2word.save('./gensim_files/news_dictionary')

# how to store model to disk
lda_model.save('./gensim_files/lda_news_corpus_10iters.model')

INFO : storing corpus in Matrix Market format to ./gensim_files/news_corpus.mm
INFO : saving sparse matrix to ./gensim_files/news_corpus.mm
INFO : PROGRESS: saving document #0
INFO : saved 106x4236 matrix, density=8.715% (39131/449016)
INFO : saving MmCorpus index to ./gensim_files/news_corpus.mm.index
INFO : saving Dictionary object under ./gensim_files/news_dictionary, separately None
INFO : saved ./gensim_files/news_dictionary
INFO : saving LdaState object under ./gensim_files/lda_news_corpus_10iters.model.state, separately None
INFO : saved ./gensim_files/lda_news_corpus_10iters.model.state
INFO : saving LdaModel object under ./gensim_files/lda_news_corpus_10iters.model, separately ['expElogbeta', 'sstats']
INFO : storing np array 'expElogbeta' to ./gensim_files/lda_news_corpus_10iters.model.expElogbeta.npy
INFO : not storing attribute dispatcher
INFO : not storing attribute id2word
INFO : not storing attribute state
INFO : saved ./gensim_files/lda_news_corpus_10iters.model


In [28]:
# Render the trained model as an interactive pyLDAvis page and write it to disk.
outputFileName = 'news_covid_lda_vis.html'

# Reload the serialized corpus from the Matrix Market file for the visualisation
news_corpus = MmCorpus('./gensim_files/news_corpus.mm')

# NOTE(review): newer pyLDAvis releases expose this module as
# pyLDAvis.gensim_models; confirm against the installed version.
vis = pyLDAvis.gensim.prepare(lda_model, news_corpus, id2word)

# save_html writes the page itself. The earlier prepared_data_to_html(vis) call
# only built an HTML string that was thrown away, so it has been dropped.
pyLDAvis.save_html(vis, outputFileName)

INFO : loaded corpus index from ./gensim_files/news_corpus.mm.index
INFO : initializing cython corpus reader from ./gensim_files/news_corpus.mm
INFO : accepted corpus with 106 documents, 4236 features, 39131 non-zero entries


<gensim.corpora.dictionary.Dictionary at 0x7f7fa505a710>