Ignis: Latent Dirichlet Allocation
============

In [1]:
import glob
import math
import re
import threading
import time

import gensim
import nltk
import pyLDAvis
import tomotopy as tp
import tqdm
import pathlib

In [2]:
import ipywidgets as widgets

Model training (LDA)
----

Load from an `ignis.Corpus`, add the processed docs to an LDA model, and train it.

The random seed and parallelisation can both affect results, so setting the seed and number of workers is necessary for reproducibility.

In [3]:
import ignis

In [4]:
corpus = ignis.load_corpus("bbc-full.corpus")

In [5]:
# model_options = {"k": 10, "term_weighting": "idf", "until_max_ll": True, "verbose": True}
# vis_options = {"verbose": True}
# results = ignis.train_model(corpus, model_type="lda", model_options=model_options, vis_type="pyldavis", vis_options=vis_options)
# results.save("bbc-full.aurum")

In [6]:
results = ignis.load_results("bbc-full.aurum")

In [7]:
results.init_labeller("tomotopy", verbose=True)

Extracting label candidates from model...
Preparing First-order relevance labeller...
Done.


Print results
------

In [8]:
def print_topic(topic_id):
    # Labels
    labels = ", ".join(
        label for label, score in results.get_topic_labels(topic_id, top_n=10)
    )
    print(f"Suggested labels: {labels}")

    # Print this topic
    words_probs = results.get_topic_words(topic_id, top_n=10)
    words = [x[0] for x in words_probs]

    words = ", ".join(words)
    print(words)

In [9]:
for k in range(1, results.get_num_topics() + 1):
    print(f"[Topic {k}]")
    print_topic(k)
    print()

[Topic 1]
Suggested labels: striker, keeper, chelsea, arsenal, header, subs, free-kick, manchester_united, premiership, defender
we, you, they, if, he, not, what, there, all, do

[Topic 2]
Suggested labels: victory, win, injury, grand_slam, coach, victory in, win in, final, matches, of confidence
club, chelsea, liverpool, arsenal, football, game, his, fiat, ferguson, united

[Topic 3]
Suggested labels: awards, actor, nominations, award, for best, nominated, nominated for, film, stars, award for
england, ireland, wales, rugby, win, her, game, his, france, six_nations

[Topic 4]
Suggested labels: gaming, consoles, gamers, pc, graphics, xbox, sonys, console, technology, games consoles
film, best, awards, music, award, her, band, album, actor, show

[Topic 5]
Suggested labels: charges, lawyers, court, lawyer, the charges, hearing in, prosecutors, been charged with, been charged, trial for
games, gaming, game, dvd, sony, apple, computer, gadget, pc, gadgets

[Topic 6]
Suggested labels: labo

Visualise
--------
- Present as a pyLDAvis visualisation

In [10]:
vis_data = results.get_vis_data()
pyLDAvis.display(vis_data, local=True)



Experimenting with slightly different random results
--------------------------------
- Changed the number of workers to 16 (Default is 8)

In [11]:
# model_options = {"k": 10, "term_weighting": "idf", "until_max_ll": True, "verbose": True, "workers": 16}
# results2 = ignis.train_model(corpus, model_type="lda", model_options=model_options)
# results2.save("bbc-full-2.aurum")

In [12]:
results2 = ignis.load_results("bbc-full-2.aurum")

In [13]:
results2.get_num_topics()

10

In [14]:
for doc in results2.ignis_model.model.docs:
    print(doc.get_topics(top_n=2))

[(7, 0.5594062805175781), (8, 0.21345336735248566)]
[(7, 0.711746335029602), (5, 0.17704619467258453)]
[(9, 0.6855354905128479), (7, 0.15512295067310333)]
[(7, 0.8625838756561279), (5, 0.098904088139534)]
[(2, 0.4378875494003296), (7, 0.3382371664047241)]
[(7, 0.7208539247512817), (5, 0.1439865529537201)]
[(7, 0.8827990293502808), (5, 0.11130397766828537)]
[(7, 0.5618029236793518), (0, 0.19989031553268433)]
[(7, 0.5479685664176941), (0, 0.27701276540756226)]
[(9, 0.6910226345062256), (0, 0.16989770531654358)]
[(7, 0.3587944507598877), (8, 0.29951736330986023)]
[(7, 0.7183695435523987), (0, 0.16451458632946014)]
[(7, 0.44843196868896484), (2, 0.3644072711467743)]
[(5, 0.26739269495010376), (7, 0.184526726603508)]
[(7, 0.43277350068092346), (0, 0.391849547624588)]
[(7, 0.6997372508049011), (5, 0.2104395031929016)]
[(9, 0.47500738501548767), (7, 0.29740843176841736)]
[(7, 0.7864019870758057), (5, 0.1678394079208374)]
[(7, 0.49560627341270447), (5, 0.18973073363304138)]
[(5, 0.497266858816

[(1, 0.6029906272888184), (5, 0.19795048236846924)]
[(1, 0.8044866323471069), (4, 0.11142183840274811)]
[(1, 0.7567901015281677), (5, 0.24150002002716064)]
[(1, 0.8970515131950378), (5, 0.10117283463478088)]
[(1, 0.48739656805992126), (5, 0.4791116714477539)]
[(5, 0.4239179193973541), (1, 0.3134559392929077)]
[(1, 0.6906723976135254), (5, 0.21895204484462738)]
[(1, 0.8465461134910583), (5, 0.15128304064273834)]
[(5, 0.6608431935310364), (1, 0.31512555480003357)]
[(1, 0.3992515802383423), (5, 0.3334200978279114)]
[(1, 0.6025018095970154), (5, 0.3758294880390167)]
[(1, 0.8382245302200317), (5, 0.1589391678571701)]
[(1, 0.5373834371566772), (5, 0.3693496882915497)]
[(1, 0.5723985433578491), (5, 0.4203452169895172)]
[(1, 0.570962131023407), (5, 0.42816445231437683)]
[(1, 0.8360345959663391), (5, 0.161085307598114)]
[(1, 0.6721039414405823), (5, 0.3190843164920807)]
[(5, 0.5515499711036682), (1, 0.2951350510120392)]
[(1, 0.5831139087677002), (5, 0.3266565203666687)]
[(5, 0.561468780040741),

In [15]:
# results2.init_vis("pyldavis", verbose=True)

In [16]:
vis_data2 = results2.get_vis_data()

In [17]:
pyLDAvis.display(vis_data)

In [18]:
pyLDAvis.display(vis_data2)

In [30]:
import pprint

topic_docs = [doc for doc, prob in results2.get_topic_documents(4, 2)]

def show_topic_docs(index=0):
    doc_id = topic_docs[index]
    print(results2.get_document_by_id(doc_id))
    print()
    pprint.pprint(results2.get_document_topics(doc_id, 10))

widgets.interact(show_topic_docs, index=(0, len(topic_docs) - 1))

interactive(children=(IntSlider(value=0, description='index', max=370), Output()), _dom_classes=('widget-inter…

<function __main__.show_topic_docs(index=0)>

Iterate
--------
- See what the main topics might be, slice initial corpus and re-run LDA to get sub-topics