Ignis: Latent Dirichlet Allocation
============

In [1]:
import ignis

In [2]:
# Jupyter notebook setup
import ipywidgets as widgets

# `display` and `HTML` are part of the public `IPython.display` API; importing
# them from `IPython.core.display` is deprecated and fails on recent IPython.
from IPython.display import display, HTML

# Custom styling:
# - Prevent vertical scrollbars in output subareas
# - Resize to fit pyLDAvis visualisations without causing other cells to overflow
style = """
<style>
   .jupyter-widgets-output-area .output_scroll {
        height: unset !important;
        border-radius: unset !important;
        -webkit-box-shadow: unset !important;
        box-shadow: unset !important;
    }
    .jupyter-widgets-output-area  {
        height: auto !important;
    }
</style>
<style>
    #notebook-container { width: 1370px !important; }
    div.output_area { width: unset !important; }
</style>
"""
# Inject the stylesheet into the notebook page
display(HTML(style))

Model training (LDA)
----

Load from an `ignis.Corpus`, add the processed docs to an LDA model, and train it.

The random seed and parallelisation can both affect results, so setting the seed and number of workers is necessary for reproducibility.

In [3]:
# Load the saved corpus; the path is relative to the notebook's working directory
corpus = ignis.load_corpus("bbc-full.corpus")

With the current public version of `pyLDAvis` (2.1.2), preparing the visualisation data takes a very long time with recent versions of `pandas` (>0.23.4). We have an option here to use an optimised version of the preparation function that is built into Ignis.

In [4]:
# Forwarded to the visualisation step via `vis_options["use_optimised"]`
use_optimised = True

In [5]:
# Options for the LDA model. Both the seed and the number of workers are pinned,
# because parallelisation affects the sampled result just like the seed does.
model_options = {
    "k": 6,
    "term_weighting": "idf",
    "until_max_ll": False,
    "verbose": True,
    "seed": 7157,
    # Same worker count as the recorded run; set explicitly so that the result
    # does not depend on whatever default applies on the current machine.
    "workers": 8,
}
vis_options = {"verbose": True, "use_optimised": use_optimised}
# Train the model and prepare the pyLDAvis data in one call
results = ignis.train_model(
    corpus,
    model_type="tp_lda",
    model_options=model_options,
    vis_type="pyldavis",
    vis_options=vis_options,
)
results.show_visualisation()

Training LDA model on 2122 documents:
{'term_weighting': 'idf', 'k': 6, 'seed': 7157, 'workers': 8, 'parallel_scheme': 'default', 'iterations': 1000, 'update_every': 100, 'until_max_ll': False, 'max_extra_iterations': 5000, 'verbose': True, 'tw': <TermWeight.IDF: 1>, 'parallel': <ParallelScheme.DEFAULT: 0>}



100%|██████████| 1000/1000 [00:19<00:00, 50.86it/s, Log-likelihood=-21.11047]

Model training complete. (19.668s)
Preparing LDA visualisation...




 Done. (0.612s)




In [6]:
# Render the visualisation prepared for the trained model
results.show_visualisation()

In [7]:
# Unwrap the underlying topic model so its raw attributes can be read directly
model = results.ignis_model.model

# Vocabulary and its frequency table, taken as-is from the model
vocab = model.vocabs
term_frequency = model.vocab_freq

# One entry per document, in document order
doc_lengths = [len(model.docs[doc_id].words) for doc_id in range(len(model.docs))]
doc_topic_dists = [
    model.docs[doc_id].get_topic_dist() for doc_id in range(len(model.docs))
]

In [8]:
import pandas as pd
import numpy as np

In [9]:
def _df_with_names(data, index_name, columns_name):
    if type(data) == pd.DataFrame:
        # we want our index to be numbered
        df = pd.DataFrame(data.values)
    else:
        df = pd.DataFrame.from_records(data)
    df.index.name = index_name
    df.columns.name = columns_name
    return df


def _series_with_name(data, name):
    if type(data) == pd.Series:
        data.name = name
        # ensures a numeric index
        return data.reset_index()[name]
    else:
        return pd.Series(data, name=name)

In [10]:
# tomotopy hands back each topic's word distribution as one flat sequence, i.e. naturally a row,
# while pandas is column-oriented. There are as many columns as terms but only as many rows as
# topics, so shaping the frame here saves runtime compared with leaving it to pyLDAvis.

# pandas reads a flat sequence as a single column, so each frame is transposed into a single row
topic_term_dists_rows = [
    pd.DataFrame(model.get_topic_word_dist(topic_id), dtype="float64").transpose()
    for topic_id in range(model.k)
]
topic_term_dists = pd.concat(topic_term_dists_rows, axis=0)
a = _df_with_names(data=topic_term_dists, index_name="topic", columns_name="term")

In [11]:
# Alternative construction: one Series per topic, concatenated side by side.
topic_term_dists_rows = [
    pd.Series(model.get_topic_word_dist(k), dtype="float64") for k in range(model.k)
]
# concat(axis=1) lays the topics out as columns (terms x topics). Transpose
# before storing, because the later cells index `topic_term_dists` by topic along
# the rows (`.iloc[topic_order]`, index named "topic").
topic_term_dists = pd.concat(topic_term_dists_rows, axis=1).T
b = _df_with_names(topic_term_dists, "topic", "term")

In [12]:
# Sanity check: both construction strategies must give element-wise identical frames
(a == b).all(axis=None)

True

In [13]:
# Normalise every input to a pandas object with a numeric index and a named axis.
# NOTE(review): the code below indexes `topic_term_dists` by topic along the rows,
# i.e. it expects a topics x terms frame -- confirm the orientation of the frame
# produced by the preceding cell.
topic_term_dists = _df_with_names(topic_term_dists, "topic", "term")
doc_topic_dists = _df_with_names(doc_topic_dists, "doc", "topic")
term_frequency = _series_with_name(term_frequency, "term_frequency")
doc_lengths = _series_with_name(doc_lengths, "doc_length")
vocab = _series_with_name(vocab, "vocab")
# Number of terms to display: at most 30, fewer if the vocabulary is smaller
R = min(30, len(vocab))
# Step size of the lambda grid used for the relevance computation
lambda_step = 0.1
# Passed to joblib.Parallel further down
n_jobs = -1

# Weight each document's topic distribution by its length and sum over documents
topic_freq = doc_topic_dists.mul(doc_lengths, axis="index").sum()
# ZW: topic_freq       = (doc_topic_dists.T * doc_lengths).T.sum()
# topic_freq       = np.dot(doc_topic_dists.T, doc_lengths)
topic_proportion = topic_freq / topic_freq.sum()

# No sorting is applied here: `topic_order` is simply the existing topic index,
# so the re-indexing below keeps the topics in their current order.
topic_order = topic_proportion.index
# reorder all data based on new ordering of topics
topic_freq = topic_freq[topic_order]
topic_term_dists = topic_term_dists.iloc[topic_order]
doc_topic_dists = doc_topic_dists[topic_order]

# token counts for each term-topic combination (widths of red bars)
term_topic_freq = (topic_term_dists.T * topic_freq).T.astype("float32")
## Quick fix for red bar width bug.  We calculate the
## term frequencies internally, using the topic term distributions and the
## topic frequencies, rather than using the user-supplied term frequencies.
## For a detailed discussion, see: https://github.com/cpsievert/LDAvis/pull/41
# This overwrites the `term_frequency` Series built at the top of this cell.
term_frequency = np.sum(term_topic_freq, axis=0)

In [14]:
from joblib import Parallel, delayed, cpu_count


def _job_chunks(l, n_jobs):
    n_chunks = n_jobs
    if n_jobs < 0:
        # so, have n chunks if we are using all n cores/cpus
        n_chunks = cpu_count() + 1 - n_jobs

    return _chunks(l, n_chunks)


def _chunks(l, n):
    """ Yield successive n-sized chunks from l.
    """
    for i in range(0, len(l), n):
        yield l[i : i + n]


def _find_relevance(log_ttd, log_lift, R, lambda_):
    relevance = lambda_ * log_ttd + (1 - lambda_) * log_lift
    return relevance.T.apply(lambda topic: topic.nlargest(R).index)
    # ZW: return relevance.T.apply(lambda s: s.sort_values(ascending=False).index).head(R)


def _find_relevance_chunks(log_ttd, log_lift, R, lambda_seq):
    """Stack the `_find_relevance` results for every lambda in `lambda_seq`."""
    per_lambda = []
    for lambda_ in lambda_seq:
        per_lambda.append(_find_relevance(log_ttd, log_lift, R, lambda_))
    return pd.concat(per_lambda)

In [15]:
import time

# Time the distinctiveness/saliency computation
start_time = time.perf_counter()

# marginal distribution over terms (width of blue bars)
term_proportion = term_frequency / term_frequency.sum()

# compute the distinctiveness and saliency of the terms:
# this determines the R terms that are displayed when no topic is selected
# NOTE: the pd.eval() expression strings below refer to notebook variables by
# name, so renaming any of those variables requires updating the strings too.
tt_sum = topic_term_dists.sum()
topic_given_term = pd.eval("topic_term_dists / tt_sum")
# ZW: topic_given_term = topic_term_dists.T.div(topic_term_dists.sum(), axis="index").T
# ZW: topic_given_term = topic_term_dists / topic_term_dists.sum()

# `log_1` is computed on the transposed frame and transposed back inside `kernel`
log_1 = np.log(pd.eval("(topic_given_term.T / topic_proportion)"))
kernel = pd.eval("topic_given_term * log_1.T")
# ZW: kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
# Column-wise sum of the kernel, then weighted by each term's marginal proportion
distinctiveness = kernel.sum()
saliency = term_proportion * distinctiveness

elapsed = time.perf_counter() - start_time
print(f"distinctiveness/saliency calcs {elapsed:.3f}s")

distinctiveness/saliency calcs 0.012s


In [16]:
start_time = time.perf_counter()

# Order the terms for the "default" view by decreasing saliency:
default_term_info = (
    pd.DataFrame(
        {
            "saliency": saliency,
            "Term": vocab,
            "Freq": term_frequency,
            "Total": term_frequency,
            "Category": "Default",
        }
    )
    .sort_values(by="saliency", ascending=False)
    .head(R)
    # Name the column explicitly: passing `axis` positionally (`.drop("saliency", 1)`)
    # raises a TypeError from pandas 2.0 onwards.
    .drop(columns="saliency")
)
# Rounding Freq and Total to integer values to match LDAvis code:
default_term_info["Freq"] = np.floor(default_term_info["Freq"])
default_term_info["Total"] = np.floor(default_term_info["Total"])
# The default view has no per-topic scores; descending ranks R..1 are stored
# in both score columns instead.
ranks = np.arange(R, 0, -1)
default_term_info["logprob"] = default_term_info["loglift"] = ranks

elapsed = time.perf_counter() - start_time
print(f"Ordering stuff {elapsed:.3f}s")
start_time = time.perf_counter()

## compute relevance and top terms for each topic
# NOTE: the pd.eval() string refers to notebook variables by name.
log_lift = np.log(pd.eval("topic_term_dists / term_proportion")).astype("float64")
# ZW: log_lift = np.log(topic_term_dists / term_proportion)
log_ttd = np.log(topic_term_dists).astype("float64")
# Lambda grid from 0 to 1 (inclusive) in steps of `lambda_step`
lambda_seq = np.arange(0, 1 + lambda_step, lambda_step)

elapsed = time.perf_counter() - start_time
print(f"loglift stuff {elapsed:.3f}s")

Ordering stuff 0.013s
loglift stuff 0.004s


In [17]:
start_time = time.perf_counter()


def topic_top_term_df(tup):
    """Build the term table for a single topic.

    Args:
        tup: `(new_topic_id, (original_topic_id, topic_terms))`, i.e. one item
            of `enumerate(top_terms.T.iterrows(), 1)`.

    Returns:
        DataFrame with the columns Term, Freq, Total, Category, logprob and
        loglift, one row per distinct entry of `topic_terms`.
    """
    new_topic_id, topic_row = tup
    original_topic_id, topic_terms = topic_row
    # The same term can be selected for several lambda values; keep it once
    unique_terms = topic_terms.unique()
    columns = {
        "Term": vocab[unique_terms],
        "Freq": term_topic_freq.loc[original_topic_id, unique_terms],
        "Total": term_frequency[unique_terms],
        "Category": "Topic%d" % new_topic_id,
        "logprob": log_ttd.loc[original_topic_id, unique_terms].round(4),
        "loglift": log_lift.loc[original_topic_id, unique_terms].round(4),
    }
    return pd.DataFrame(columns)


# Compute the top-R terms for every lambda value: one joblib task per chunk of
# `lambda_seq`, executed on threads, with the per-chunk results stacked in order.
top_terms = pd.concat(
    Parallel(n_jobs=n_jobs, prefer="threads")(
        delayed(_find_relevance_chunks)(log_ttd, log_lift, R, ls)
        for ls in _job_chunks(lambda_seq, n_jobs)
    )
)
elapsed = time.perf_counter() - start_time
print(f"concats 1 {elapsed:.3f}s")
start_time = time.perf_counter()
# One term table per topic; topics are numbered from 1 in the "Category" column
topic_dfs = list(map(topic_top_term_df, enumerate(top_terms.T.iterrows(), 1)))
elapsed = time.perf_counter() - start_time
print(f"concats 2 {elapsed:.3f}s")
# NOTE(review): the timer is restarted here but the elapsed time of the final
# concat is never printed in this cell.
start_time = time.perf_counter()
# Default-view rows first, followed by the per-topic tables
test = pd.concat([default_term_info] + list(topic_dfs))

concats 1 0.228s
concats 2 0.040s


In [19]:
# First (new_topic_id, (original_topic_id, topic_terms)) item, kept for manual inspection
tup = list(enumerate(top_terms.T.iterrows(), 1))[0]

In [20]:
# Manual, step-by-step version of `topic_top_term_df` for the single item in `tup`.
# NOTE: this rebinds the notebook-level names `a` and `b` used in earlier cells.
new_topic_id, (original_topic_id, topic_terms) = tup
# Distinct entries of `topic_terms`, used to index the per-term tables below
term_ix = topic_terms.unique()
a = pd.DataFrame(
    {
        "Term": vocab[term_ix],
        "Freq": term_topic_freq.loc[original_topic_id, term_ix],
        "Total": term_frequency[term_ix],
        "Category": "Topic%d" % new_topic_id,
        "logprob": log_ttd.loc[original_topic_id, term_ix].round(4),
        "loglift": log_lift.loc[original_topic_id, term_ix].round(4),
    }
)
# Default-view rows followed by this one topic's rows
b = pd.concat([default_term_info] + [a])