# Fit LDA ensemble

See the gensim Ensemble LDA tutorial: https://radimrehurek.com/gensim/auto_examples/tutorials/run_ensemblelda.html

In [1]:
import pickle
import pandas as pd
from gensim.models import LdaModel
from gensim.models import EnsembleLda
from gensim.models.ldamulticore import LdaMulticore
import numpy as np
import statistics
import os
from matplotlib import pyplot as plt 

%matplotlib inline

In [2]:
# Directory the pickled dictionary and corpora are read from.
DATA_PATH = "data"
# Presumably where fitted models are written; not referenced in the cells shown here.
MODELS_PATH = "models"

## Load dictionary and corpus

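Load the dictionary and the tokenized corpora prepared in the notebook `01_Clean_and_tokenize`. That notebook produces train, validation and test splits; the cell below loads the dictionary together with the train and validation corpora.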

In [3]:
# Load the datasets for Scholarly Document Processing (SDP)
def _load_pickle(filename):
    """Unpickle and return the object stored in ``DATA_PATH/filename``."""
    with open(os.path.join(DATA_PATH, filename), 'rb') as handle:
        return pickle.load(handle)


dictionary_sdp = _load_pickle('dictionary_sdp.pickle')
corpus_train_sdp = _load_pickle('corpus_train_sdp.pickle')
corpus_validate_sdp = _load_pickle('corpus_validate_sdp.pickle')

## Fit ensemble LDA
* Training
* Optimizing the $\epsilon$ parameter
  * Compute perplexity by value of $\epsilon$
  * Plot perplexity curves
  * Find best value of $\epsilon$
* Output SDP topics
* Save the model

### Training

In [4]:
# Settings read as module-level globals by fit_ensemble_lda.

# Per-model training settings, forwarded to EnsembleLda as keyword arguments.
topic_model_class = LdaModel
num_topics = 50
passes = 8
chunksize = 2000
# iterations = 100   (left disabled, so no explicit iterations value is passed)

# Ensemble size and parallelism.
ensemble_workers = 16
num_models = ensemble_workers  # number of models is tied to the number of ensemble workers
distance_workers = 16

In [5]:
def fit_ensemble_lda(dictionary, corpus, random_state=None):
    """Train an ensemble LDA model on ``corpus``.

    Every hyperparameter except ``random_state`` is read from the
    notebook-level settings ``num_topics``, ``passes``, ``num_models``,
    ``topic_model_class``, ``ensemble_workers``, ``distance_workers`` and
    ``chunksize``.

    Args:
        dictionary: id-to-word mapping, forwarded as ``id2word``.
        corpus: training corpus, forwarded as ``corpus``.
        random_state: optional seed forwarded to ``EnsembleLda``. Pass an
            integer to make a single fit repeatable. The default ``None``
            leaves the fit unseeded, exactly as before this parameter existed.

    Returns:
        The ``EnsembleLda`` instance built from these arguments.
    """
    return EnsembleLda(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=passes,
        num_models=num_models,
        topic_model_class=topic_model_class,
        ensemble_workers=ensemble_workers,
        distance_workers=distance_workers,
        chunksize=chunksize,
        random_state=random_state,
    )

### Optimize the epsilon parameter
Parameters for optimizing the $\epsilon$ clustering parameter

In [6]:
# How many separate LDA ensembles are trained.
max_runs = 5
# How many evenly spaced epsilon values are tried for each trained ensemble.
steps = 25

In [7]:
def get_shape_range(ensemble, steps):
    """Return ``steps`` evenly spaced epsilon candidates for ``ensemble``.

    The candidates run from the smallest to the largest off-diagonal entry of
    ``ensemble.asymmetric_distance_matrix``; diagonal entries are ignored.
    """
    distances = ensemble.asymmetric_distance_matrix
    n_rows = distances.shape[0]
    # Boolean mask that is True everywhere except on the main diagonal.
    off_diagonal_mask = np.logical_not(np.eye(n_rows, dtype=bool))
    off_diagonal = distances[off_diagonal_mask].reshape(n_rows, -1)
    lowest, highest = off_diagonal.min(), off_diagonal.max()
    return np.linspace(lowest, highest, num=steps)

In [8]:
def compute_perplexity_for_ensemble(dictionary, corpus_train, corpus_validate, max_runs, steps):
    """
    Train an LDA ensemble 'max_runs' times and score each one across 'steps' values of epsilon.

    For every run a new ensemble is fitted on 'corpus_train'. It is then reclustered once per
    epsilon value returned by get_shape_range, and its perplexity on 'corpus_validate' is recorded.
    @return: a list of dataframes, one per run. Each row holds 'k' (number of stable topics),
             'eps' (clustering parameter epsilon) and 'val' (perplexity, or infinity when the
             reclustering yields at most one stable topic).
    """
    results_per_run = []
    for run_index in range(max_runs):
        print(f"Run {run_index + 1} / {max_runs}")
        model = fit_ensemble_lda(dictionary, corpus_train)
        topic_counts, eps_values, perplexities = [], [], []
        for epsilon in get_shape_range(model, steps):
            model.recluster(eps=epsilon)
            n_stable = len(model.get_topics())
            topic_counts.append(n_stable)
            eps_values.append(epsilon)
            if n_stable <= 1:
                # one or zero stable topics: record perplexity as infinite
                perplexities.append(np.inf)
                continue
            # perplexity for this epsilon, taken as exp of the negated log-perplexity
            bound = model.log_perplexity(corpus_validate)
            perplexities.append(np.exp(-bound))
        results_per_run.append(
            pd.DataFrame.from_dict({'k': topic_counts, 'eps': eps_values, 'val': perplexities})
        )
    return results_per_run

### Compute perplexity by value of $\epsilon$ clustering parameter for the SDP data set

In [9]:
%%time

perplexity_sdp = compute_perplexity_for_ensemble(dictionary_sdp, corpus_train_sdp, corpus_validate_sdp, max_runs, steps)

Run 1 / 10
Run 2 / 10
Run 3 / 10
Run 4 / 10
Run 5 / 10
Run 6 / 10
Run 7 / 10


Process Process-197:
Process Process-195:
Process Process-203:
Traceback (most recent call last):
Process Process-208:
  File "/home/atroncos/anaconda3/envs/openalex_exp/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Process Process-200:
Process Process-196:
  File "/home/atroncos/anaconda3/envs/openalex_exp/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/atroncos/anaconda3/envs/openalex_exp/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/atroncos/anaconda3/envs/openalex_exp/lib/python3.12/site-packages/gensim/models/ensemblelda.py", line 437, in _generate_topic_models_worker
    _generate_topic_models(ensemble=ensemble, num_models=num_models, random_states=random_states)
  File "/home/atroncos/anaconda3/envs/openalex_exp/lib/python3.12/multiprocessing/process.p

KeyboardInterrupt: 

### Plot the perplexity curves

In [10]:
# Draw one perplexity-vs-epsilon curve per training run.
# Loop over the result list itself rather than range(max_runs): if max_runs is
# edited after perplexity_sdp was computed, indexing by max_runs would either
# raise IndexError or silently skip curves.
fig, ax = plt.subplots()
for run_number, run_df in enumerate(perplexity_sdp, start=1):
    ax.plot(run_df['eps'], run_df['val'], label=f"Run {run_number}")
ax.set_title("Perplexity for the SDP data set")
ax.set_xlabel("Epsilon")
ax.set_ylabel("Perplexity")
if perplexity_sdp:
    # Only add a legend when at least one labelled curve was drawn.
    ax.legend()
plt.show()

NameError: name 'perplexity_sdp' is not defined