In [1]:
import os
import itertools # helpful library for iterating through things
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS

# spacy for lemmatization
import spacy
def head(stream, n=10):
    """Return the first `n` items of `stream` as a list.

    Shorter streams simply produce a shorter list.
    """
    first_items = []
    for item in itertools.islice(stream, n):
        first_items.append(item)
    return first_items


import subprocess

In [2]:
# tokenize text
def tokenize(text):
    """Split `text` with simple_preprocess and drop every token in STOPWORDS.

    Returns the surviving tokens as a list, in their original order.
    """
    kept = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue
        kept.append(word)
    return kept

In [3]:
def iter_docs(base_dir):
    """Yield ``(filename, tokens)`` for every document file in `base_dir`.

    Entries whose name starts with '.' and entries that are not regular files
    are skipped. Files are visited in sorted name order so that repeated
    passes over the directory see the documents in the same sequence
    (os.listdir itself makes no ordering guarantee).

    Parameters:
    ----------
    base_dir : Directory holding one text document per file; a trailing
        path separator is optional.

    Yields:
    -------
    (doc, tokens) : The file name (not the full path) and the result of
        ``tokenize`` applied to the file's contents.
    """
    for doc in sorted(os.listdir(base_dir)):
        if doc.startswith('.'):
            continue
        path = os.path.join(base_dir, doc)
        if not os.path.isfile(path):
            continue
        # NOTE(review): the file is decoded with the platform default
        # encoding; pass an explicit encoding if the corpus encoding is known.
        with open(path, "r") as file:
            text = file.read()
        # Tokenize and yield only after the file is closed, so no handle stays
        # open while the consumer is busy with the yielded document.
        yield doc, tokenize(text)

In [4]:
from gensim.models.wrappers import LdaMallet
# Set MALLET_HOME and the path to the mallet launcher script.
# Update both paths to match the Mallet directory on your system.
os.environ['MALLET_HOME'] = r'./mallet-2.0.8/'
mallet_path = r'./mallet-2.0.8/bin/mallet'

In [5]:
# Stream only the token lists (file names dropped) into the gensim Dictionary,
# so the vocabulary is built in a single lazy pass over ./test/.
doc_stream = (doc_tokens for _name, doc_tokens in iter_docs('./test/'))
id2word_news = corpora.Dictionary(doc_stream)

In [6]:
class Corpus(object):
    """Streamed bag-of-words corpus over the documents in a directory.

    Every pass re-reads the documents through ``iter_docs`` and converts each
    token list with ``dictionary.doc2bow``.

    Parameters:
    ----------
    dump_file : Directory handed to ``iter_docs``
    dictionary : Object providing ``doc2bow(tokens)`` (presumably a Gensim dictionary)
    clip_docs : Maximum number of documents to stream; None means no limit
    """

    def __init__(self, dump_file, dictionary, clip_docs=None):
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip_docs = clip_docs
        # File names seen during the most recent pass; empty until iterated.
        self.titles = []
        # Document count, filled in lazily by __len__ or by a completed pass.
        self._num_docs = None

    def _clipped_docs(self):
        """Return the (title, tokens) stream limited to `clip_docs` items."""
        return itertools.islice(iter_docs(self.dump_file), self.clip_docs)

    def __iter__(self):
        """Yield one bag-of-words vector per document, recording its title."""
        self.titles = []
        seen = 0
        for title, tokens in self._clipped_docs():
            self.titles.append(title)
            seen += 1
            yield self.dictionary.doc2bow(tokens)
        # Only reached when the pass ran to completion, so `seen` is the
        # true corpus size.
        self._num_docs = seen

    def __len__(self):
        """Return the number of documents a full pass yields.

        `clip_docs` cannot be returned directly: it is None by default (len()
        would raise TypeError) and may exceed the number of documents that
        actually exist. The count comes from a completed pass when one has
        happened, otherwise from walking the document stream once.
        """
        if self._num_docs is None:
            self._num_docs = sum(1 for _ in self._clipped_docs())
        return self._num_docs

In [7]:
# No clip_docs given, so the corpus streams every document under ./test/.
news_corpus = Corpus(dump_file='./test/', dictionary=id2word_news)

In [8]:
# Fit a 20-topic model through the gensim Mallet wrapper, then print the
# topics as (topic_id, [(word, weight), ...]) pairs.
ldamallet = LdaMallet(
    mallet_path,
    corpus=news_corpus,
    id2word=id2word_news,
    num_topics=20,
)
pprint(ldamallet.show_topics(formatted=False))

[(4,
  [('china', 0.035126722987994664),
   ('news', 0.024899955535793685),
   ('beijing', 0.018674966651845263),
   ('foreign', 0.01600711427301023),
   ('reporters', 0.011116051578479324),
   ('coverage', 0.011116051578479324),
   ('credentials', 0.01067140951534015),
   ('street', 0.009337483325922631),
   ('wrote', 0.008892841262783458),
   ('journal', 0.008892841262783458)]),
 (12,
  [('mail', 0.022072189041807324),
   ('federal', 0.017398078421189303),
   ('safety', 0.01428200467411062),
   ('west', 0.01428200467411062),
   ('ballots', 0.011685276551545054),
   ('results', 0.010906258114775382),
   ('states', 0.010646585302518826),
   ('coronavirus', 0.009867566865749156),
   ('schools', 0.00882887561672293),
   ('leaders', 0.008569202804466372)]),
 (18,
  [('hong', 0.061009077475195274),
   ('kong', 0.054042643023010345),
   ('government', 0.012455140384209416),
   ('china', 0.011399620012666244),
   ('mainland', 0.009921891492505806),
   ('immigration', 0.008866371120962635),
 

# Mallet Topic Coherence Value Testing

In [None]:
def compute_coherence_values(dictionary, corpus, limit, start=2, step=3):
    """
    Compute u_mass coherence for various number of topics

    Trains one Mallet LDA model for every topic count in
    ``range(start, limit, step)`` and scores each with CoherenceModel.

    Parameters:
    ----------
    dictionary : Gensim dictionary, used both to train each model and to score it
    corpus : Gensim corpus
    limit : Max num of topics (exclusive, as in ``range``)
    start : Smallest number of topics to try
    step : Increment between successive topic counts

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        # Train with the dictionary that was passed in (not the notebook-level
        # id2word_news) so the model and its coherence score share one vocabulary.
        model = gensim.models.wrappers.LdaMallet(
            mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Sweep topic counts 2, 7, 12, 17, 22, 27 and keep each model with its score.
model_list, coherence_values = compute_coherence_values(
    dictionary=id2word_news,
    corpus=news_corpus,
    limit=32,
    start=2,
    step=5,
)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
limit=32; start=2; step=5;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [16]:
# Settings for the command-line MALLET run: where the documents live and how
# many topics to train.
inputDir = "./test/"
numTopics = 20

# Files produced by the MALLET commands: the imported corpus, the compressed
# model state, the topic keys and the per-document topic composition.
TXTFiles_MalletFormatted_FileName = "./covid_news_malletformatted_txt_files.mallet"
Compressed_FileName = "./covid_news_NLP-Mallet_Output_Compressed.gz"
Keys_FileName = "./covid_news_NLP-Mallet_Output_Keys.tsv"
Composition_FileName = "./covid_news_NLP-Mallet_Output_Composition.tsv"

In [17]:
# Step 1: import the files in inputDir into a MALLET-format file.
# check_call raises CalledProcessError on a non-zero exit status, so a failed
# import stops the cell instead of letting training run against a missing or
# stale .mallet file.
subprocess.check_call([
    mallet_path, 'import-dir',
    '--input', inputDir,
    '--output', TXTFiles_MalletFormatted_FileName,
    '--keep-sequence',
    '--remove-stopwords',
])

# Step 2: train the topic model on the imported file and write the state,
# topic-keys and doc-topics outputs.
# NOTE(review): --optimize-interval is taken from numTopics; confirm that tying
# the optimisation interval to the topic count is intended.
# check_call returns 0 on success, which becomes the cell's displayed output.
subprocess.check_call([
    mallet_path, 'train-topics',
    '--input', TXTFiles_MalletFormatted_FileName,
    '--num-topics', str(numTopics),
    '--optimize-interval', str(numTopics),
    '--output-state', Compressed_FileName,
    '--output-topic-keys', Keys_FileName,
    '--output-doc-topics', Composition_FileName,
])


0