# Topic Modelling with Latent Dirichlet Allocation

## Coherence vs Perplexity

In [None]:
import logging
# Configure the root logger: timestamped records, WARNING and above only.
logging.basicConfig(
    level=logging.WARNING,
    format="%(asctime)s : %(levelname)s : %(message)s",
)

In [None]:
import matplotlib.pyplot as plt
import multiprocessing as mp
import numpy as np
import os
import pickle
import pyLDAvis
import time

from eda.corpus.reuterscorpus import ReutersCorpus

from gensim.models import CoherenceModel, LdaMulticore
from gensim.corpora import Dictionary, MmCorpus
from os import path
from tqdm.notebook import tqdm

In [None]:
# Absolute path of the "data" directory under the current working
# directory; every file read or written below is resolved against it.
datadir = path.abspath(
    path.join(os.getcwd(), "data")
)

----

In [None]:
# Build the corpus object from the Reuters-21578 archive in the data dir
fin = path.join(
    datadir,
    "reuters21578.tar.gz",
)
rc = ReutersCorpus(fin)

In [None]:
# Prune the vocabulary at both ends: drop tokens found in fewer than 20
# documents (too rare) or in more than 10% of all documents (too common).
rc.dictionary.filter_extremes(
    no_below=20,
    no_above=0.1,
)
# Re-pack the token ids afterwards.
rc.dictionary.compactify()

In [None]:
# Write the Reuters 21578 corpus to disk in Matrix Market format
fout = path.join(datadir, "reuters21578.mm")
MmCorpus.serialize(fname=fout, corpus=rc)

In [None]:
# Persist the (already filtered) dictionary as a plain-text file
fout = path.join(datadir, "reuters21578.dict.txt")
rc.dictionary.save_as_text(fname=fout)

In [None]:
# Materialise the corpus texts and pickle them so they can be reloaded
# later without going back to the archive.
texts = list(rc.get_texts())

fout = path.join(datadir, "texts.pkl")
with open(fout, mode="wb") as fh:
    pickle.dump(texts, fh)

----

In [None]:
# Reload the dictionary from its text file in the data directory
fin = path.join(datadir, "reuters21578.dict.txt")
dictionary = Dictionary.load_from_text(fname=fin)

In [None]:
# Open the serialized Matrix Market corpus from the data directory
fin = path.join(datadir, "reuters21578.mm")
corpus = MmCorpus(fname=fin)

In [None]:
# Reload the pickled texts from the data directory.
# NOTE: unpickling executes arbitrary code; only load a texts.pkl that this
# notebook (or another trusted source) produced.
fin = path.join(datadir, "texts.pkl")
with open(fin, mode="rb") as fh:
    texts = pickle.load(fh)

In [None]:
# Bounds of the topic-count sweep: range(start, limit, step) -> 5, 10
start, limit, step = 5, 15, 5


In [None]:
# One entry per candidate topic count, in sweep order.
coherence_vals = []
perplexity_vals = []

# Keep one core free for the parent process, but never request zero
# workers: on a single-core machine cpu_count() - 1 evaluates to 0, which
# is not a valid worker count.
n_workers = max(1, mp.cpu_count() - 1)

# Train one LDA model per topic count and record its perplexity and its
# c_v topic coherence.
for num_topics in tqdm(range(start, limit, step)):
    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=20,
        workers=n_workers,
    )

    # log_perplexity() returns a per-word likelihood bound; 2 ** (-bound)
    # converts it into a perplexity value.
    # NOTE(review): this is measured on the training corpus itself, not on
    # held-out documents.
    perplexity_vals.append(
        np.exp2(-lda_model.log_perplexity(corpus))
    )

    # c_v coherence is computed from the tokenised texts, not from the
    # bag-of-words corpus.
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence="c_v",
    )
    coherence_vals.append(
        coherence_model_lda.get_coherence()
    )

In [None]:
# Plot coherence (left y-axis) and perplexity (right y-axis) against the
# number of topics, using the same sweep the models were trained on.
x = range(start, limit, step)

c1, bgcolor, c2 = ["#ef8a62", "#f7f7f7", "#67a9cf"]


fig, ax1 = plt.subplots(figsize=(12, 5))

# coherence plot
ax1.plot(x, coherence_vals, "o-", color=c1)
ax1.set_xlabel("Number of topics")
ax1.set_ylabel("Coherence", color=c1)
ax1.tick_params("y", colors=c1)

# perplexity plot
ax2 = ax1.twinx()
ax2.plot(x, perplexity_vals, "o-", color=c2)
ax2.set_ylabel("Perplexity", color=c2)
ax2.tick_params("y", colors=c2)

# x-axis is shared
ax1.set_xticks(x)
ax1.set_facecolor(bgcolor)

# Set the title on the axes *before* tight_layout() so that the layout
# pass reserves room for it; a title added afterwards is not accounted
# for and can be clipped at the top of the figure.
ax1.set_title("LDA: Coherence vs Perplexity")
fig.tight_layout()
plt.show()

----

In [None]:
# Train a single 35-topic model to use as the example for visualisation.
lda_example = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=35,
    passes=20,
    # Leave one core free, but always request at least one worker:
    # cpu_count() - 1 is 0 on a single-core machine.
    workers=max(1, mp.cpu_count() - 1),
)

In [None]:
# A bare `import pyLDAvis` does not load the gensim adapter submodule, so
# pyLDAvis.gensim_models would raise AttributeError; import it explicitly.
import pyLDAvis.gensim_models

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Build the visualisation data for the example model. sort_topics=False
# keeps the topic order the same as in the model.
vis = pyLDAvis.gensim_models.prepare(
    lda_example,
    corpus,
    dictionary,
    n_jobs=1,
    sort_topics=False,
)

pyLDAvis.display(vis)