Ignis: Latent Dirichlet Allocation
============

In [1]:
import ignis

In [2]:
# Python setup: note that the `PYTHONHASHSEED` environment variable needs to be set *before* the Python kernel is initialised --
# we only display it here for easy post-hoc reference.
import os
# `.get` returns None (rather than raising) when the variable was not set for this kernel.
os.environ.get("PYTHONHASHSEED")

'11399'

In [3]:
# Jupyter notebook setup
import ipywidgets as widgets
# `display` and `HTML` are part of the public `IPython.display` API; importing
# `display` from `IPython.core.display` is deprecated and fails on recent IPython.
from IPython.display import display, HTML

# Custom styling:
# - Prevent vertical scrollbars in output subareas
# - Resize to fit pyLDAvis visualisations without causing other cells to overflow
style = """
<style>
   .jupyter-widgets-output-area .output_scroll {
        height: unset !important;
        border-radius: unset !important;
        -webkit-box-shadow: unset !important;
        box-shadow: unset !important;
    }
    .jupyter-widgets-output-area  {
        height: auto !important;
    }
</style>
<style>
    #notebook-container { width: 1370px !important; }
    div.output_area { width: unset !important; }
</style>
"""
# Inject the stylesheet into the notebook page.
display(HTML(style))

Model training (LDA)
----

Load from an `ignis.Corpus`, add the processed docs to an LDA model, and train it.

The random seed and parallelisation can both affect results, so setting the seed and number of workers is necessary for reproducibility.

In [4]:
# Load the previously saved corpus from a path relative to the notebook's working directory.
# (The training log in the next cell reports 2122 documents for this corpus.)
corpus = ignis.load_corpus("bbc-full.corpus")

In [5]:
# Options forwarded to the LDA model. Both the random seed and the degree of
# parallelisation influence the sampled result, so both are pinned here.
model_options = {
    "k": 6,
    "term_weighting": "idf",
    "until_max_ll": False,
    "verbose": True,
    "seed": 7157,
    # Fix the worker count explicitly instead of relying on the library default;
    # 8 is the value reported in the recorded training log for this notebook.
    "workers": 8,
}
vis_options = {"verbose": True}

# Train the model and prepare the pyLDAvis visualisation in one call.
results = ignis.train_model(
    corpus,
    model_type="tp_lda",
    model_options=model_options,
    vis_type="pyldavis",
    vis_options=vis_options,
)

Training LDA model on 2122 documents:
{'term_weighting': 'idf', 'k': 6, 'seed': 7157, 'workers': 8, 'parallel_scheme': 'default', 'iterations': 1000, 'update_every': 100, 'until_max_ll': False, 'max_extra_iterations': 5000, 'verbose': True, 'tw': <TermWeight.IDF: 1>, 'parallel': <ParallelScheme.DEFAULT: 0>}



100%|██████████| 1000/1000 [00:12<00:00, 78.06it/s, Log-likelihood=-21.11047]

Model training complete. (12.814s)
Preparing LDA visualisation


  elif isinstance(data[0], collections.Mapping):


 . Done. (2.133s)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [6]:
# Display the visualisation attached to the training results
# (requested above via vis_type="pyldavis").
results.show_visualisation()

In [7]:
# Unwrap the underlying topic model from the ignis results object.
model = results.ignis_model.model

# Per-document inputs, collected in the order of `model.docs`:
# each document's topic distribution and its number of words.
doc_topic_dists = [doc.get_topic_dist() for doc in model.docs]
doc_lengths = [len(doc.words) for doc in model.docs]

# Vocabulary and its frequencies as exposed by the model.
vocab = model.vocabs
term_frequency = model.vocab_freq

In [8]:
import pandas as pd
import numpy as np

In [9]:
# The model returns one topic's word distribution at a time, so the data is naturally
# row-oriented (one row per topic), whereas pandas reads a flat sequence as a single column.
# There are as many columns as terms but only as many rows as topics, so we build the
# DataFrame in the right orientation ourselves before handing it on, which saves a bunch of runtime.
topic_term_dists_rows = [
    # pandas sees a single column here; transpose it into the single row it really is
    pd.DataFrame(model.get_topic_word_dist(topic_id)).T
    for topic_id in range(model.k)
]
topic_term_dists = pd.concat(topic_term_dists_rows, ignore_index=True)

In [10]:
def _df_with_names(data, index_name, columns_name):
   if type(data) == pd.DataFrame:
      # we want our index to be numbered
      df = pd.DataFrame(data.values)
   else:
      df = pd.DataFrame.from_records(data)
   df.index.name = index_name
   df.columns.name = columns_name
   return df


def _series_with_name(data, name):
   if type(data) == pd.Series:
      data.name = name
      # ensures a numeric index
      return data.reset_index()[name]
   else:
      return pd.Series(data, name=name)

In [11]:
# Wrap the raw model outputs in pandas objects with numeric indices and descriptive axis names.
topic_term_dists = _df_with_names(topic_term_dists, 'topic', 'term')
doc_topic_dists  = _df_with_names(doc_topic_dists, 'doc', 'topic')
term_frequency   = _series_with_name(term_frequency, 'term_frequency')
doc_lengths      = _series_with_name(doc_lengths, 'doc_length')
vocab            = _series_with_name(vocab, 'vocab')
# Number of terms kept per ranking: at most 30, fewer if the vocabulary is smaller.
R = min(30, len(vocab))
# Spacing of the lambda grid used for the relevance rankings further down.
lambda_step = 0.1
# Worker setting handed to joblib.Parallel and _job_chunks; negative values are resolved relative to the CPU count.
n_jobs = -1

In [12]:
# Inspect the document-topic matrix (rows: documents, columns: topics).
doc_topic_dists

topic,0,1,2,3,4,5
doc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.047943,0.280047,0.000097,0.174497,0.058377,0.439036
1,0.052017,0.629893,0.000204,0.253956,0.000227,0.063680
2,0.000213,0.155745,0.026529,0.358298,0.459044,0.000172
3,0.009788,0.030228,0.813703,0.094110,0.000045,0.052115
4,0.918476,0.016484,0.000269,0.064237,0.000301,0.000233
5,0.031558,0.000850,0.136327,0.327627,0.464157,0.039440
6,0.000112,0.126414,0.708836,0.127161,0.006768,0.030721
7,0.000201,0.189146,0.217945,0.246108,0.017629,0.328960
8,0.895383,0.000932,0.000352,0.102632,0.000393,0.000304
9,0.000261,0.667404,0.052385,0.279485,0.000273,0.000211


In [None]:
%%timeit
# Benchmark only: names assigned inside a %%timeit cell are not kept in the notebook namespace.
topic_freq       = (doc_topic_dists.T * doc_lengths).T.sum()

In [None]:
# Weight each document's topic distribution by the document length, then sum over
# documents to get one total per topic.
topic_freq       = (doc_topic_dists.T * doc_lengths).T.sum()
topic_freq

In [None]:
# Per-topic totals: document-topic weights scaled by document length, summed over documents.
topic_freq       = (doc_topic_dists.T * doc_lengths).T.sum()
# topic_freq       = np.dot(doc_topic_dists.T, doc_lengths)
# Normalise so the topic totals sum to 1.
topic_proportion = (topic_freq / topic_freq.sum())

# No sorting is applied to topic_proportion, so this order is just the existing topic index.
topic_order      = topic_proportion.index
# reorder all data based on new ordering of topics
topic_freq       = topic_freq[topic_order]
topic_term_dists = topic_term_dists.iloc[topic_order]
doc_topic_dists  = doc_topic_dists[topic_order]

# token counts for each term-topic combination (widths of red bars)
term_topic_freq = (topic_term_dists.T * topic_freq).T
## Quick fix for red bar width bug.  We calculate the
## term frequencies internally, using the topic term distributions and the
## topic frequencies, rather than using the user-supplied term frequencies.
## For a detailed discussion, see: https://github.com/cpsievert/LDAvis/pull/41
# Note: this replaces the `term_frequency` taken from the model earlier in the notebook.
term_frequency = np.sum(term_topic_freq, axis=0)

In [None]:
from joblib import Parallel, delayed, cpu_count

def _job_chunks(l, n_jobs):
    """Split `l` into roughly one chunk per worker.

    Parameters
    ----------
    l : sequence
        Sliceable sequence with a length (e.g. the array of lambda values).
    n_jobs : int
        Worker count in joblib's convention: a positive number is used as is,
        -1 means all CPUs, -2 all but one, and so on.

    Returns
    -------
    generator
        Consecutive slices of `l`, in order, as produced by `_chunks`.
    """
    n_chunks = n_jobs
    if n_jobs < 0:
        # so, have n chunks if we are using all n cores/cpus
        # (n_jobs is negative here, so it must be *added*: -1 -> cpu_count())
        n_chunks = cpu_count() + 1 + n_jobs

    # Guard against zero / negative results so the slicing step is always valid.
    n_chunks = max(1, n_chunks)

    # `_chunks` expects the *size* of each chunk, not the number of chunks, so
    # convert via ceiling division to end up with at most `n_chunks` pieces.
    chunk_size = max(1, -(-len(l) // n_chunks))
    return _chunks(l, chunk_size)

def _chunks(l, n):
    """ Yield successive n-sized chunks from l.
    """
    for i in range(0, len(l), n):
        yield l[i:i+n]
        
def _find_relevance(log_ttd, log_lift, R, lambda_):
   relevance = lambda_ * log_ttd + (1 - lambda_) * log_lift
   return relevance.T.apply(lambda s: s.sort_values(ascending=False).index).head(R)


def _find_relevance_chunks(log_ttd, log_lift, R, lambda_seq):
    """Run `_find_relevance` for every lambda in `lambda_seq` and stack the results.

    The per-lambda frames are concatenated in the order of `lambda_seq`.
    """
    per_lambda_rankings = []
    for lambda_ in lambda_seq:
        per_lambda_rankings.append(_find_relevance(log_ttd, log_lift, R, lambda_))
    return pd.concat(per_lambda_rankings)

In [None]:
import time
start_time = time.perf_counter()

# marginal distribution over terms (width of blue bars)
term_proportion = term_frequency / term_frequency.sum()

# compute the distinctiveness and saliency of the terms:
# this determines the R terms that are displayed when no topic is selected
topic_given_term = topic_term_dists / topic_term_dists.sum()
kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
distinctiveness = kernel.sum()
saliency = term_proportion * distinctiveness

elapsed = time.perf_counter() - start_time
print(f"distinctiveness/saliency calcs {elapsed:.3f}s")
start_time = time.perf_counter()

# Order the terms for the "default" view by decreasing saliency:
default_term_info = (
    pd.DataFrame({'saliency': saliency, 'Term': vocab,
                  'Freq': term_frequency, 'Total': term_frequency,
                  'Category': 'Default'})
    .sort_values(by='saliency', ascending=False)
    .head(R)
    # Use the keyword form: passing the axis positionally (`.drop('saliency', 1)`)
    # is rejected by pandas >= 2.0.
    .drop(columns='saliency')
)
# Rounding Freq and Total to integer values to match LDAvis code:
default_term_info['Freq'] = np.floor(default_term_info['Freq'])
default_term_info['Total'] = np.floor(default_term_info['Total'])
# In the default view, logprob/loglift just carry the rank (R for the most salient term, down to 1).
ranks = np.arange(R, 0, -1)
default_term_info['logprob'] = default_term_info['loglift'] = ranks

elapsed = time.perf_counter() - start_time
print(f"Ordering stuff {elapsed:.3f}s")
start_time = time.perf_counter()

## compute relevance and top terms for each topic
log_lift = np.log(topic_term_dists / term_proportion)
log_ttd = np.log(topic_term_dists)
# Grid of lambda values from 0 to 1 (inclusive) in steps of lambda_step.
lambda_seq = np.arange(0, 1 + lambda_step, lambda_step)

elapsed = time.perf_counter() - start_time
print(f"loglift stuff {elapsed:.3f}s")
start_time = time.perf_counter()

def topic_top_term_df(tup):
  """Build the term table for one topic.

  `tup` is one item of `enumerate(top_terms.T.iterrows(), 1)`: the new
  1-based topic number paired with `(original_topic_id, topic_terms)`, where
  `topic_terms` holds the term labels ranked for that topic across all
  lambda values. Duplicate terms are collapsed before the lookup.
  """
  new_topic_id, topic_row = tup
  original_topic_id, topic_terms = topic_row
  term_ix = topic_terms.unique()

  table_columns = {
      'Term': vocab[term_ix],
      'Freq': term_topic_freq.loc[original_topic_id, term_ix],
      'Total': term_frequency[term_ix],
      'logprob': log_ttd.loc[original_topic_id, term_ix].round(4),
      'loglift': log_lift.loc[original_topic_id, term_ix].round(4),
      'Category': 'Topic%d' % new_topic_id,
  }
  return pd.DataFrame(table_columns)

# Rank terms for every lambda value in parallel, one task per chunk of lambda_seq,
# and stack the per-chunk results in order.
top_terms = pd.concat(
    Parallel(n_jobs=n_jobs)(
        delayed(_find_relevance_chunks)(log_ttd, log_lift, R, lambda_chunk)
        for lambda_chunk in _job_chunks(lambda_seq, n_jobs)
    )
)
# `map` is lazy: the per-topic frames are only built when consumed by the final concat below,
# so that work is not included in the "concats" timing.
topic_dfs = map(topic_top_term_df, enumerate(top_terms.T.iterrows(), 1))
elapsed = time.perf_counter() - start_time
print(f"concats {elapsed:.3f}s")
start_time = time.perf_counter()
# Default-view table followed by one table per topic; displayed as the cell output.
pd.concat([default_term_info, *topic_dfs])