# Final LDA Model Comparisons

```
Author:
Zach Wolpe
zachcolinwolpe@gmail.com
zachwolpe.com
```

Before model comparison can be conducted, the models need to by trained. I have chosen to focus on the comparison between Sklearn & Pymc3 autoencoder - although the code below does train a Gensim LDA model.


## Specify Model Parameters
Specify the model parameters:
    - number of topics
    - number of words to keep in the corpus

All 3 models will be trained with these specifications.

In [6]:
# Specify parameters

# number of topics
n_topics = 10

# size of vocab
n_words = 1000

## Imports

In [7]:
%matplotlib inline
import sys, os
# unfortunately I was not able to run it on GPU due to overflow problems
%env THEANO_FLAGS=device=cpu,floatX=float64
import theano

from collections import OrderedDict
from copy import deepcopy
import numpy as np
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt
import seaborn as sns
from theano import shared
import theano.tensor as tt
from theano.sandbox.rng_mrg import MRG_RandomStreams

import pymc3 as pm
from pymc3 import math as pmmath
from pymc3 import Dirichlet
from pymc3.distributions.transforms import t_stick_breaking
plt.style.use('seaborn-darkgrid')



# Gensim & collectives
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys, os
import theano
from time import time
import re
from pprint import pprint

# nltk
import nltk
from nltk.corpus import stopwords

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

env: THEANO_FLAGS=device=cpu,floatX=float64


# Import Data


In [8]:
# The number of words in the vocabulary
n_words = n_words

print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_words,
                                stop_words='english')

t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
feature_names = tf_vectorizer.get_feature_names()
print("done in %0.3fs." % (time() - t0))

Loading dataset...
done in 2.832s.
Extracting tf features for LDA...
done in 3.719s.


# Train-test split

In [9]:
n_samples_tr = round(tf.shape[0] * 0.7) # testing on 70%
n_samples_te = tf.shape[0] - n_samples_tr
docs_tr = tf[:n_samples_tr, :]
docs_te = tf[n_samples_tr:, :]
print('Number of docs for training = {}'.format(docs_tr.shape[0]))
print('Number of docs for testing = {}'.format(docs_te.shape[0]))


n_tokens = np.sum(docs_tr[docs_tr.nonzero()])
print('Number of tokens in training set = {}'.format(n_tokens))
print('Sparsity = {}'.format(
    len(docs_tr.nonzero()[0]) / float(docs_tr.shape[0] * docs_tr.shape[1])))

Number of docs for training = 7920
Number of docs for testing = 3394
Number of tokens in training set = 384502
Sparsity = 0.0255030303030303


# Training

The below script trains the 3 LDA models:
    - pymc3
    - sklearn
    - gensim
    
The cell immediately allows the user to specify the parameters:
    - K: number of topics
    

## Run Pymc3, Skearn & Gensim Models 
The code below computes the fitted LDA parameter estimates for each method.

## pymc3 & sklearn

In [10]:
######################################################## PYMC3 #######################################################
######################################################## PYMC3 #######################################################
print('')
print('Pymc3: ')

# Log-likelihood of documents for LDA
def logp_lda_doc(beta, theta):
  
  """Returns the log-likelihood function for given documents.
  
  K : number of topics in the model
  V : number of words (size of vocabulary)
  D : number of documents (in a mini-batch)
  
  Parameters
  ----------
  beta : tensor (K x V)
      Word distribution.
  theta : tensor (D x K)
      Topic distributions for the documents.
  """
  
  def ll_docs_f(docs):
    
    dixs, vixs = docs.nonzero()
    vfreqs = docs[dixs, vixs]
    ll_docs = vfreqs * pmmath.logsumexp(
          tt.log(theta[dixs]) + tt.log(beta.T[vixs]), axis=1).ravel()
      
    # Per-word log-likelihood times no. of tokens in the whole dataset
    return tt.sum(ll_docs) / (tt.sum(vfreqs)+1e-9) * n_tokens

  return ll_docs_f



# fit the pymc3 LDA

# we have sparse dataset. It's better to have dence batch so that all words accure there
minibatch_size = 128

# defining minibatch
doc_t_minibatch = pm.Minibatch(docs_tr.toarray(), minibatch_size)
doc_t = shared(docs_tr.toarray()[:minibatch_size])

with pm.Model() as model:
    theta = Dirichlet('theta', a=pm.floatX((1.0 / n_topics) * np.ones((minibatch_size, n_topics))),
                   shape=(minibatch_size, n_topics), transform=t_stick_breaking(1e-9),
                   # do not forget scaling
                   total_size = n_samples_tr)
    beta = Dirichlet('beta', a=pm.floatX((1.0 / n_topics) * np.ones((n_topics, n_words))),
                 shape=(n_topics, n_words), transform=t_stick_breaking(1e-9))
    # Note, that we defined likelihood with scaling, so here we need no additional `total_size` kwarg
    doc = pm.DensityDist('doc', logp_lda_doc(beta, theta), observed=doc_t)

    
# Encoder
class LDAEncoder:
    """Encode (term-frequency) document vectors to variational means and (log-transformed) stds.
    """
    def __init__(self, n_words, n_hidden, n_topics, p_corruption=0, random_seed=1):
        rng = np.random.RandomState(random_seed)
        self.n_words = n_words
        self.n_hidden = n_hidden
        self.n_topics = n_topics
        self.w0 = shared(0.01 * rng.randn(n_words, n_hidden).ravel(), name='w0')
        self.b0 = shared(0.01 * rng.randn(n_hidden), name='b0')
        self.w1 = shared(0.01 * rng.randn(n_hidden, 2 * (n_topics - 1)).ravel(), name='w1')
        self.b1 = shared(0.01 * rng.randn(2 * (n_topics - 1)), name='b1')
        self.rng = MRG_RandomStreams(seed=random_seed)
        self.p_corruption = p_corruption

    def encode(self, xs):
        if 0 < self.p_corruption:
            dixs, vixs = xs.nonzero()
            mask = tt.set_subtensor(
                tt.zeros_like(xs)[dixs, vixs],
                self.rng.binomial(size=dixs.shape, n=1, p=1-self.p_corruption)
            )
            xs_ = xs * mask
        else:
            xs_ = xs

        w0 = self.w0.reshape((self.n_words, self.n_hidden))
        w1 = self.w1.reshape((self.n_hidden, 2 * (self.n_topics - 1)))
        hs = tt.tanh(xs_.dot(w0) + self.b0)
        zs = hs.dot(w1) + self.b1
        zs_mean = zs[:, :(self.n_topics - 1)]
        zs_rho = zs[:, (self.n_topics - 1):]
        return {'mu': zs_mean, 'rho':zs_rho}

    def get_params(self):
        return [self.w0, self.b0, self.w1, self.b1]
    
    
# call Encoder
encoder = LDAEncoder(n_words=n_words, n_hidden=100, n_topics=n_topics, p_corruption=0.0)
local_RVs = OrderedDict([(theta, encoder.encode(doc_t))])

# get parameters 
encoder_params = encoder.get_params()


# Train pymc3 Model
η = .1
s = shared(η)
def reduce_rate(a, h, i):
    s.set_value(η/((i/minibatch_size)+1)**.7)

with model:
    approx = pm.MeanField(local_rv=local_RVs)
    approx.scale_cost_to_minibatch = False
    inference = pm.KLqp(approx)
inference.fit(10000, callbacks=[reduce_rate], obj_optimizer=pm.sgd(learning_rate=s),
              more_obj_params=encoder_params, total_grad_norm_constraint=200,
              more_replacements={doc_t:doc_t_minibatch})


# Extracting characteristic words
doc_t.set_value(docs_tr.toarray())
samples = pm.sample_approx(approx, draws=100)
beta_pymc3 = samples['beta'].mean(axis=0)



# Predictive distribution
def calc_pp(ws, thetas, beta, wix):
    """
    Parameters
    ----------
    ws: ndarray (N,)
        Number of times the held-out word appeared in N documents.
    thetas: ndarray, shape=(N, K)
        Topic distributions for N documents.
    beta: ndarray, shape=(K, V)
        Word distributions for K topics.
    wix: int
        Index of the held-out word

    Return
    ------
    Log probability of held-out words.
    """
    return ws * np.log(thetas.dot(beta[:, wix]))

def eval_lda(transform, beta, docs_te, wixs):
    """Evaluate LDA model by log predictive probability.

    Parameters
    ----------
    transform: Python function
        Transform document vectors to posterior mean of topic proportions.
    wixs: iterable of int
        Word indices to be held-out.
    """
    lpss = []
    docs_ = deepcopy(docs_te)
    thetass = []
    wss = []
    total_words = 0
    for wix in wixs:
        ws = docs_te[:, wix].ravel()
        if 0 < ws.sum():
            # Hold-out
            docs_[:, wix] = 0

            # Topic distributions
            thetas = transform(docs_)

            # Predictive log probability
            lpss.append(calc_pp(ws, thetas, beta, wix))

            docs_[:, wix] = ws
            thetass.append(thetas)
            wss.append(ws)
            total_words += ws.sum()
        else:
            thetass.append(None)
            wss.append(None)

    # Log-probability
    lp = np.sum(np.hstack(lpss)) / total_words

    return {
        'lp': lp,
        'thetass': thetass,
        'beta': beta,
        'wss': wss
    }



inp = tt.matrix(dtype='int64')
sample_vi_theta = theano.function(
    [inp],
    approx.sample_node(approx.model.theta, 100,  more_replacements={doc_t: inp}).mean(0)
)
def transform_pymc3(docs):
    return sample_vi_theta(docs)

%time result_pymc3 = eval_lda(transform_pymc3, beta_pymc3, docs_te.toarray(), np.arange(100))
print('Predictive log prob (pm3) = {}'.format(result_pymc3['lp']))

print('')
###################################################### SKLEARN #######################################################
###################################################### SKLEARN #######################################################
print('Sklearn: \n')

from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
%time lda.fit(docs_tr)
%time sklearn_theta = lda.fit_transform(docs_te)
beta_sklearn = lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]




Pymc3: 


  self.shared = theano.shared(data[in_memory_slc])
  rval = inputs[0].__getitem__(inputs[1:])
  rval = inputs[0].__getitem__(inputs[1:])
  rval = inputs[0].__getitem__(inputs[1:])
  rval = inputs[0].__getitem__(inputs[1:])
Average Loss = 2.3883e+06: 100%|██████████| 10000/10000 [04:39<00:00, 38.09it/s]
Finished [100%]: Average Loss = 2.3899e+06
INFO (theano.gof.compilelock): Refreshing lock /Users/zachcolinwolpe/.theano/compiledir_Darwin-18.6.0-x86_64-i386-64bit-i386-3.7.3-64/lock_dir/lock


CPU times: user 46.4 s, sys: 2.06 s, total: 48.5 s
Wall time: 44.3 s
Predictive log prob (pm3) = -6.10896207824732

Sklearn: 

CPU times: user 37.6 s, sys: 848 ms, total: 38.5 s
Wall time: 42.1 s
CPU times: user 17.9 s, sys: 294 ms, total: 18.2 s
Wall time: 18.4 s


In [52]:
###################################################### GENSIM #######################################################
###################################################### GENSIM #######################################################
print('Gensim: ')

# Stop words
# download stopwords
nltk.download('stopwords')

# NLTK Stop words
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Clean data
# Convert to list
data = np.array(dataset.data)

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]


def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# remove stop words & lemmantize
# Define functions for stopwords, bigrams, trigrams and lemmatization
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


# call the above functions
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])





##### Calculate dictionary & corpus #####
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized, prune_at=n_words)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])


#### Fit LDA Mallet ####
time()
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
mallet_path = './mallet-2.0.8/bin/mallet' # update this path
t0 = time()
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=n_topics, id2word=id2word)
print('done in %0.3fs.' % (time() - t0))

Gensim: 


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zachcolinwolpe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


















































































































































Exception ignored in: <function Minibatch.__del__ at 0x1c1b9f4598>
Traceback (most recent call last):
  File "/anaconda3/lib/python3.7/site-packages/pymc3/data.py", line 272, in __del__
    del Minibatch.RNG[id(self)]
KeyError: (120809965104,)


[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 4), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 2), (30, 2), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (1073, 1), (1548, 1), (3839, 1), (4323, 1), (4329, 1), (5343, 1), (5645, 1), (5958, 1), (7436, 1), (7894, 1), (12717, 1), (13604, 1), (13697, 1), (13876, 1), (14168, 1), (14825, 1), (14958, 1), (16462, 1)]]
done in 89.036s.


# Evaluation Time Performance

In [11]:
t0 = time() 
%time result_pymc3 = eval_lda(transform_pymc3, beta_pymc3, docs_te.toarray(), np.arange(100))
pymc3_time = time() - t0
print('Predictive log prob (pm3) = {}'.format(result_pymc3['lp']))

CPU times: user 46.5 s, sys: 2.3 s, total: 48.8 s
Wall time: 45.2 s
Predictive log prob (pm3) = -6.11006905086737


In [12]:
# sklearn eval performance
def transform_sklearn(docs):
    thetas = lda.transform(docs)
    return thetas / thetas.sum(axis=1)[:, np.newaxis]

t0 = time() 
%time result_sklearn = eval_lda(transform_sklearn, beta_sklearn, docs_te.toarray(), np.arange(100))
sklearn_time = time() - t0
print('Predictive log prob (sklearn) = {}'.format(result_sklearn['lp']))

CPU times: user 3min 11s, sys: 1.44 s, total: 3min 12s
Wall time: 2min 41s
Predictive log prob (sklearn) = -5.49455772514246


In [13]:
# save the model times
times = {
    'pymc3 training time': pymc3_time, 
    'sklearn training time': sklearn_time,
}

## Interpretation


## save the results

In [53]:
from sklearn.externals import joblib

# save sklearn theta distribution
np.save('sklearn_theta', sklearn_theta)

# save sklearn model
joblib.dump(lda, 'sklearn_20NewsGroups.pkl') 

# save pymc3 model
joblib.dump(result_pymc3, 'result_pymc3.pkl')

# save gensim model
ldamallet.save('20NewsGroups_gensim.pkl')


# Save times dict
import pickle
pickle_out = open('model_training_times.pkl', 'wb')
pickle.dump(times, pickle_out)
pickle_out.close()

In [None]:
type(ldamallet)

In [None]:
# Compute Coherence Score
t0 = time()
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)
print('done in %0.3fs.' % (time() - t0))

In [None]:
# Compute Coherence Score
t0 = time()
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)
print('done in %0.3fs.' % (time() - t0))

In [None]:
# Compute Coherence Score
t0 = time()
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='u_mass')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)
print('done in %0.3fs.' % (time() - t0))

In [None]:
tf_vectorizer.vocabulary_

docs_te.toarray().shape

In [60]:
feature_names[500:510]

['jesus',
 'jewish',
 'jews',
 'jim',
 'job',
 'jobs',
 'john',
 'jpeg',
 'just',
 'key']

# Compute Coherence

In [None]:
doc_by_word = np.array(tf.toarray())

In [61]:
#####
##### DUMMY TEST : W={w1,w2} ONLY
#####


# take test subset, for two words
doc_by_word_12 = doc_by_word[:,:2]

# caculate D(w_i) for i = 1:(number of words)
from math import log

# Single occurance D(w_i)
w = np.sum(doc_by_word, axis=0)
print(w.shape)


docs_with_12 = 0
# Double occurance D(w_i, w_j)
for i in range(doc_by_word.shape[0]):
    # iterate over each document
    
    if doc_by_word_12[i][0] > 0:
        # word i in doc[i]
        
        if doc_by_word_12[i][1] > 0:
            # word j in doc[i]
            docs_with_12 = docs_with_12 + 1
      
print('')
print('docs_with_12: ', docs_with_12)
print('')
print('w1 occurances: ', w[0])
print('score D(w1, w2)=', log((docs_with_12 + 1)/w[0]))

NameError: name 'doc_by_word' is not defined

In [None]:
# full Coherence Calculation

# iterate over each (wi,wj) combernation, such that i<j


scores = []
D_wi_wj = []

for i in range(doc_by_word.shape[1]-1): 
    """
    Compute the vector of D(wi,wj) values, 
    such that: D={ D(w1,w1000), D(w1,w2), D(w2,w3), ... , D(w999, w1000)}
    """
    # iterate over each w1, excluding the last word
    
    d = 0 # D(wi,wj)
    
    if i == 0:
        # if first node calculate D(w1,w-1)
        sub = np.column_stack([doc_by_word[:,0], doc_by_word[:,-1]]) # get correct index
        
        for j,k in sub:
            if j & k != 0:
                # document contains both words
                d = d+1
            
        D_wi_wj.append(d)
        d = 0
        

    # slice sub-matrix containing all docs but only wi & wj
    sub = doc_by_word[:, i:i+2]

    for j,k in sub:
        if j & k != 0:
            # document contains both words
            d = d+1
                
    
    D_wi_wj.append(d)
    
    
    
    if i == 999:
        print('d: ', d)
        print('i: ', i)
        print('j: ', j)
        print('k: ', k)
        print(D_wi_wj)
        print('len(D_wi_wj): ', len(D_wi_wj))
                


In [None]:
# Calculate Coherence
np.sum([log(i) for i in ([i+1 for i in D_wi_wj] / w)])

In [None]:
np.array(result_pymc3['beta']).shape

In [None]:
np.sum(doc_by_word, axis=1)

In [None]:
calculate_coherence(beta_sklearn)

In [None]:
calculate_coherence(ldamallet.word_topics.T)

In [None]:
np.sum([[1,2,3]], axis=0)
#np.array([1,2,3]).shape

In [None]:
def calculate_coherence(beta):
    """
    Calculate Topic Model Coherence using the UMass technique
    
    Parameters:
        beta: a (Topic x Word) matrix learnt by the model
    """
    from math import log10

    scores = []
    D_wi_wj = []

    for i in range(beta.shape[1]-1): 
        # iterate over each w1, excluding the last word
        """
        Compute the vector of D(wi,wj) values, 
        such that: D={ D(w1,w1000), D(w1,w2), D(w2,w3), ... , D(w999, w1000)}
        """

        d = 0 # D(wi,wj)

        if i == 0:
            # if first node calculate D(w1,w-1)
            sub = np.column_stack([beta[:,0], beta[:,-1]]) # get correct index

            for j,k in sub:
                if (j != 0) & (k != 0):
                    # document contains both words
                    d = d+1

            D_wi_wj.append(d)
            d = 0


        # slice sub-matrix containing all docs but only wi & wj
        sub = beta[:, i:i+2]

        for j,k in sub:
            if (j != 0) & (k != 0):
                # document contains both words
                d = d+1


        D_wi_wj.append(d)


    # Single occurance D(w_i)
    w = np.sum(beta, axis=1)
            
    # Calculate Coherence
    coherence = np.sum([log10(i) for i in ([i+1 for i in D_wi_wj] / w)])
    
    return coherence



In [None]:
doc_by_word = np.array(tf.toarray())
len(doc_by_word.sum(axis=0))

In [None]:
# term frequency (Doc x Word) matrix 
doc_by_word = np.array(tf.toarray())

# order term frequency according to Most Common Words
doc_by_word.sort(axis=0)

In [None]:
def print_top_words(beta, feature_names, n_top_words=10):
    for i in range(len(beta)):
        print(("Topic #%d: " % i) + " ".join([feature_names[j]
            for j in beta[i].argsort()[:-n_top_words - 1:-1]]))


doc_t.set_value(docs_tr.toarray())
samples = pm.sample_approx(approx, draws=100)
beta_pymc3 = samples['beta'].mean(axis=0)

print_top_words(beta_pymc3, feature_names)

In [None]:
[feature_names[j] for j in beta_pymc3[1].argsort()[:-10 - 1:-1]]

In [47]:
#####
#####
##### TOPIC COHERENCE
#####
#####

from math import log10


def coherence(corpus, beta, feature_names, n_words=10, epsilon=1):
    """
    Return:
        Topic Model Coherence
        
    Looking at only a one lag 
        
    Parameters:
        corpus: corpus of documents
        topic_word: 
        beta: learnt Beta Distribution
        feature_names: feature names of tokens
        n_words: number of N most common words to return
        epsilon: smoothing parameter
    
    """
    
    n_topics = beta.shape[0] # number of topics
    corpus = np.array(corpus)
    
    
    common_words = [] # find n_words most common words per topic
    for i in range(len(beta)):
            common_words.append([feature_names[j] for j in beta[i].argsort()[:-n_words - 1:-1]])
    
                        
              
    coherence = []
    for k in range(n_topics):
        # for each topic
    
        
        for vi in range(n_words-1):
            # for each word in each topic
            D_vi = 0
            
            
            for doc in corpus:
                # for each document 
                
                D_vi_vj = 0
                vj = vi+1
                
                if common_words[k][vi] in doc:
                    # word is in the document
                    D_vi = D_vi + 1
                  
                    if common_words[k][vj] in doc:
                        # only check lag-1 words if word vi in doc
                        D_vi_vj = D_vi_vj + 1
            
            if D_vi != 0:
                # catch errors
                coherence.append(log10((D_vi_vj+epsilon)/D_vi))

    model_coherence = np.sum(coherence)/n_topics
    
    print("coherence for each topic: ", model_coherence)
    
    return model_coherence
        

In [49]:
coherence(data_samples, beta_sklearn, feature_names)

coherence for each topic:  -23.574894990451185


-23.574894990451185

In [50]:
coherence(data_samples, beta_pymc3, feature_names)

coherence for each topic:  -25.5253973343703


-25.5253973343703

In [None]:
coherence(data_samples, beta_sklearn, feature_names)

In [None]:
lda