Basic Text Pre-processing and Topic Modelling
======

In [1]:
import glob
import math
import re
import threading
import time

import gensim
import nltk
import pyLDAvis
import tomotopy as tp
import tqdm

In [2]:
# Interactively print the documents in the corpus at various stages
import ipywidgets as widgets


def show_all(doc_list):
    """Show a slider widget that prints the selected entry of ``doc_list``.

    The slider covers the indices 0 to ``len(doc_list) - 1``; moving it
    prints the document at that index. The value returned by
    ``widgets.interact`` is passed straight back to the caller.
    """
    index_range = (0, len(doc_list) - 1)
    return widgets.interact(lambda doc: print(doc_list[doc]), doc=index_range)

Data ingestion
-----

In [3]:
# glob returns paths in arbitrary (filesystem-dependent) order; sort them so
# that document indices, and therefore the seeded model, are reproducible.
data_files = sorted(glob.glob("./data/bbc/*/*.txt"))

In [4]:
# Read every file listed in data_files into memory, one string per document.
# NOTE(review): files are opened with the platform default encoding.
raw_docs = []
for file in tqdm.tqdm(data_files):
    with open(file) as f:
        raw_docs.append(f.read())

100%|██████████| 2225/2225 [00:00<00:00, 11600.45it/s]


In [5]:
# Browse the raw, unprocessed documents with the slider widget.
show_all(raw_docs)

interactive(children=(IntSlider(value=1112, description='doc', max=2224), Output()), _dom_classes=('widget-int…

<function __main__.show_all.<locals>.<lambda>(doc)>

Text pre-processing and tokenisation
------

- Naive tokenisation (by whitespace)
- Remove single apostrophes
- Strip leading/trailing non-informative punctuation from tokens

In [6]:
# Characters stripped from the start and end of every token.
remove_punctuation = "'\"()[]?!,.:;<>/|_"


def naive_tokenise(doc):
    """Split a document on whitespace and clean up each token.

    Every whitespace-delimited token has the characters listed in
    ``remove_punctuation`` stripped from both ends and all apostrophes
    removed. Tokens that end up empty are discarded.

    Returns the list of cleaned tokens in their original order.
    """
    edge_stripped = (piece.strip(remove_punctuation) for piece in doc.split())
    cleaned = (piece.replace("'", "").strip() for piece in edge_stripped)
    return [piece for piece in cleaned if piece != ""]


# First-pass tokenisation of every raw document.
docs = list(map(naive_tokenise, raw_docs))

In [7]:
# Re-join the tokens so the result of the first pass can be browsed as text.
display_docs = [" ".join(tokens) for tokens in docs]
show_all(display_docs)

interactive(children=(IntSlider(value=1112, description='doc', max=2224), Output()), _dom_classes=('widget-int…

<function __main__.show_all.<locals>.<lambda>(doc)>

Chunk into significant bigrams/trigrams based on collocation frequency

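- Min count: Set to 0.1% of the number of documents (rounded up). Note that the phraser applies this to a phrase's total number of occurrences in the corpus, not to the number of documents it appears in.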

- Scoring: "default" or "npmi"

- Threshold: Intuitively, higher threshold means fewer phrases. With the default scorer, this is greater than or equal to 0; with the NPMI scorer, this is in the range -1 to 1.

- Common terms: These terms will be ignored if they come between normal words. E.g., if `common_terms` includes the word "of", then when the phraser sees "Wheel of Fortune" it actually evaluates _"Wheel Fortune"_ as an n-gram, putting "of" back in only at the output level.

In [8]:
# Settings passed to gensim's Phrases models in find_ngrams().
# One thousandth of the number of documents, rounded up.
min_count = math.ceil(len(docs) / 1000)
scoring = "npmi"
# We want a relatively high threshold so that we don't start littering spurious n-grams all over our corpus, diluting our results.
# E.g., we want "Lord_of_the_Rings", but not "slightly_better_than_analysts"
threshold = 0.8
# Connecting words that may sit inside a phrase without counting as part of it.
common_terms = ["a", "an", "the", "of", "on", "in", "at"]

This could take a while, so set up a threaded function with a basic progress indicator in the main thread

In [9]:
def find_ngrams(docs, results):
    """Learn bigram and trigram phrase models and store them in ``results``.

    Intended to run in a worker thread, so the outcome is handed back through
    the mutable ``results`` list rather than a return value.

    Args:
        docs: list of tokenised documents (each a list of string tokens).
        results: list with at least two slots; on success ``results[0]`` is
            set to the frozen bigram model and ``results[1]`` to the frozen
            trigram model.

    Uses the module-level ``min_count``, ``threshold``, ``scoring`` and
    ``common_terms`` settings.
    """
    # gensim 4.0 renamed the ``common_terms`` argument to ``connector_words``;
    # passing the old name there raises a TypeError, so pick the keyword that
    # the installed version accepts.
    gensim_major = int(gensim.__version__.split(".")[0])
    connector_arg = "connector_words" if gensim_major >= 4 else "common_terms"
    phrase_options = {
        "min_count": min_count,
        "threshold": threshold,
        "scoring": scoring,
        connector_arg: common_terms,
    }

    bigram = gensim.models.Phrases(docs, **phrase_options)
    # The trigram model is trained on the corpus with bigrams already merged.
    trigram = gensim.models.Phrases(bigram[docs], **phrase_options)

    # Finalise the bigram/trigram generators for efficiency
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    results[0] = bigram_mod
    results[1] = trigram_mod

In [10]:
print("Generating n-grams ", flush=True, end="")

start_time = time.perf_counter()

# The worker writes the bigram and trigram models into this list.
results = [None, None]
t = threading.Thread(target=find_ngrams, args=(docs, results))
t.start()

# Thread.isAlive() was removed in Python 3.9; is_alive() is the supported name.
# join() with a timeout doubles as the one-second progress timer and returns
# as soon as the worker finishes, so there is no floating-point countdown.
while t.is_alive():
    t.join(timeout=1.0)
    if t.is_alive():
        print(". ", flush=True, end="")

bigram_mod = results[0]
trigram_mod = results[1]
if bigram_mod is None or trigram_mod is None:
    # An exception inside the worker thread does not propagate to this thread;
    # fail here with a clear message instead of a confusing error further on.
    print()
    raise RuntimeError(
        "n-gram detection failed in the worker thread; see the traceback above."
    )

elapsed = time.perf_counter() - start_time
print(f"Done. ({elapsed:.3f}s)")

print("Applying n-grams... ", flush=True, end="")
# Merge detected bigrams first, then trigrams, in every document.
docs = [trigram_mod[bigram_mod[doc]] for doc in docs]
print("Done.")

Generating n-grams . . . . . . . . . . Done. (12.511s)
Applying n-grams... Done.


In [11]:
# Re-join the tokens so the corpus can be browsed after n-gram merging.
display_docs = [" ".join(tokens) for tokens in docs]
show_all(display_docs)

interactive(children=(IntSlider(value=1112, description='doc', max=2224), Output()), _dom_classes=('widget-int…

<function __main__.show_all.<locals>.<lambda>(doc)>

Second-pass tokenisation
- Case folding
- Remove stop words (optional), remove purely numeric/non-alphabetic tokens
- Remove single letters/numbers (assuming meaningful tokens would have been picked up earlier by the phraser)

In [12]:
# stopset = set(nltk.corpus.stopwords.words("english"))
# Testing term weighting
stopset = []


def second_tokenise(tokens):
    """Case-fold tokens and drop the uninformative ones.

    A token is discarded when, after case folding, it is listed in
    ``stopset``, contains no character in the range a-z, or is at most one
    character long. The surviving tokens are returned in order.
    """

    def is_dropped(folded):
        return bool(
            folded in stopset
            or re.match("^[^a-z]+$", folded)
            or len(folded) <= 1
        )

    folded_tokens = (token.casefold() for token in tokens)
    return [folded for folded in folded_tokens if not is_dropped(folded)]


# Second-pass tokenisation of every document.
docs = list(map(second_tokenise, docs))

In [13]:
# Re-join the tokens so the fully pre-processed corpus can be browsed.
display_docs = [" ".join(tokens) for tokens in docs]
show_all(display_docs)

interactive(children=(IntSlider(value=1112, description='doc', max=2224), Output()), _dom_classes=('widget-int…

<function __main__.show_all.<locals>.<lambda>(doc)>

Model training (LDA)
----

Add processed docs to the LDA model and train it.

The random seed and parallelisation can affect results, so setting both the seed and number of workers is necessary for reproducibility.

In [24]:
# Persistence
model_seed = 11399
num_workers = 10

# Model options
model_file = "model.bin"
num_topics = 20

# Training iterations
load_saved_model = False
burn_in = 0
train_batch = 10
train_total = 2500

# Extended training
train_until_min_ll = False
max_iterations = 10000

In [25]:
# Either restore a previously saved model or build and train a new one.
if load_saved_model:
    model = tp.LDAModel.load(model_file)
    print(f"Loaded from '{model_file}'.")
else:
    # IDF term weighting, fixed seed and num_topics topics.
    model = tp.LDAModel(tw=tp.TermWeight.IDF, seed=model_seed, k=num_topics)
    
    model.burn_in = burn_in

    # Each document is added as its list of pre-processed tokens.
    for doc in tqdm.tqdm(docs):
        model.add_doc(doc)

    # Zero-iteration call made before the corpus statistics are read.
    # NOTE(review): presumably this initialises the model so the statistics
    # below are populated -- confirm against the tomotopy documentation.
    model.train(0, workers=num_workers, parallel=tp.ParallelScheme.PARTITION)
    print(
        f"Num docs: {len(model.docs)}, Vocab size: {model.num_vocabs}, "
        f"Num words: {model.num_words}"
    )
    print(f"Removed top words: {model.removed_top_words}")

    print("Training model...", flush=True)

    # Train in batches of train_batch iterations so that progress (per-word
    # log-likelihood and wall time) can be reported after each batch.
    try:
        for i in range(0, train_total, train_batch):
            start_time = time.perf_counter()
            model.train(
                train_batch, workers=num_workers, parallel=tp.ParallelScheme.PARTITION
            )
            elapsed = time.perf_counter() - start_time
            print(
                f"Iteration: {i + train_batch}\tLog-likelihood: {model.ll_per_word}\t"
                f"Time: {elapsed:.3f}s",
                flush=True,
            )
    except KeyboardInterrupt:
        # Interrupting the kernel stops training early; the model trained so
        # far is still saved below.
        print("Stopping train sequence.")
    model.save(model_file)
    print(f"Saved to '{model_file}'.")

100%|██████████| 2225/2225 [00:00<00:00, 14747.00it/s]


Num docs: 2225, Vocab size: 34384, Num words: 805379
Removed top words: []
Training model...
Iteration: 10	Log-likelihood: -22.677888419721707	Time: 0.159s
Iteration: 20	Log-likelihood: -22.07540857048148	Time: 0.149s
Iteration: 30	Log-likelihood: -21.804846460684498	Time: 0.150s
Iteration: 40	Log-likelihood: -21.619040925487955	Time: 0.152s
Iteration: 50	Log-likelihood: -21.47973015310507	Time: 0.152s
Iteration: 60	Log-likelihood: -21.37217531561843	Time: 0.154s
Iteration: 70	Log-likelihood: -21.27384627189861	Time: 0.153s
Iteration: 80	Log-likelihood: -21.196773924312883	Time: 0.148s
Iteration: 90	Log-likelihood: -21.131574705105177	Time: 0.147s
Iteration: 100	Log-likelihood: -21.068046530214072	Time: 0.152s
Iteration: 110	Log-likelihood: -21.009791986889958	Time: 0.149s
Iteration: 120	Log-likelihood: -20.960465545538238	Time: 0.149s
Iteration: 130	Log-likelihood: -20.92751689081927	Time: 0.150s
Iteration: 140	Log-likelihood: -20.889633040538527	Time: 0.150s
Iteration: 150	Log-likeli

Iteration: 1280	Log-likelihood: -20.228860334357393	Time: 0.147s
Iteration: 1290	Log-likelihood: -20.228725862496663	Time: 0.149s
Iteration: 1300	Log-likelihood: -20.22920593715187	Time: 0.147s
Iteration: 1310	Log-likelihood: -20.22760738162664	Time: 0.162s
Iteration: 1320	Log-likelihood: -20.23186990583289	Time: 0.153s
Iteration: 1330	Log-likelihood: -20.229841266507798	Time: 0.161s
Iteration: 1340	Log-likelihood: -20.222484648368418	Time: 0.145s
Iteration: 1350	Log-likelihood: -20.227890681624487	Time: 0.149s
Iteration: 1360	Log-likelihood: -20.224676851625695	Time: 0.156s
Iteration: 1370	Log-likelihood: -20.229095778375836	Time: 0.159s
Iteration: 1380	Log-likelihood: -20.225347898786303	Time: 0.146s
Iteration: 1390	Log-likelihood: -20.2249415937564	Time: 0.148s
Iteration: 1400	Log-likelihood: -20.223942932770573	Time: 0.147s
Iteration: 1410	Log-likelihood: -20.222594482011583	Time: 0.147s
Iteration: 1420	Log-likelihood: -20.21946493159511	Time: 0.148s
Iteration: 1430	Log-likelihood:

In [26]:
# Optional extended training: keep training in batches until the per-word
# log-likelihood has dropped on two consecutive batches, max_iterations extra
# iterations have been run, or the kernel is interrupted. The best model seen
# is kept in model_file and reloaded at the end.
if train_until_min_ll and not load_saved_model:
    print("Continuing to train until minimum log-likelihood...")
    print("(N.B.: This may not correlate with increased human interpretability)")
    print()
    last_ll = model.ll_per_word
    # Best log-likelihood seen so far. The checkpoint already in model_file was
    # written by the training cell from this very model state, so it matches.
    best_ll = last_ll
    i = 0
    consecutive_loss = 0

    while True:
        try:
            start_time = time.perf_counter()
            model.train(
                train_batch, workers=num_workers, parallel=tp.ParallelScheme.PARTITION
            )
            i += train_batch
            elapsed = time.perf_counter() - start_time
            current_ll = model.ll_per_word
            print(
                f"Iteration: {i}\tLog-likelihood: {current_ll}\t"
                f"Time: {elapsed:.3f}s",
                flush=True,
            )

            if current_ll < last_ll:
                consecutive_loss += 1
            else:
                consecutive_loss = 0

            # Only overwrite the checkpoint when this state beats the best one
            # seen so far. Comparing with the previous batch alone would let a
            # partial recovery after a dip replace a better saved model.
            if current_ll > best_ll:
                best_ll = current_ll
                model.save(model_file)
            last_ll = current_ll

            if consecutive_loss == 2 or i >= max_iterations:
                break

        except KeyboardInterrupt:
            print("Stopping extended train sequence.")
            break

    # Discard any later, worse iterations by reloading the checkpoint.
    model = tp.LDAModel.load(model_file)
    print(f"Best recent model saved at '{model_file}' (LL: {model.ll_per_word}).")

Topic labelling

In [27]:
print("Extracting suggested topic labels...", flush=True)
# Extract candidate label phrases from the trained model.
# Alternative extractor settings kept for reference:
# extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
extractor = tp.label.PMIExtractor(min_cf=5, min_df=3, max_len=5, max_cand=20000)
candidates = extractor.extract(model)
# Alternative labeler settings kept for reference:
# labeler = tp.label.FoRelevance(model, candidates, min_df=5, smoothing=1e-2,
# mu=0.25)
# The labeler is queried per topic by print_topic() below.
labeler = tp.label.FoRelevance(
    model, candidates, min_df=3, smoothing=1e-2, mu=0.25, workers=num_workers
)
print("Done.")

Extracting suggested topic labels...
Done.


Print results
------

In [28]:
def print_topic(topic_id):
    """Print the suggested labels and the top words for one topic.

    Writes two lines: the ten best labels proposed by the module-level
    ``labeler`` and the ten highest-ranked words reported by the module-level
    ``model`` for ``topic_id``.
    """
    # Suggested labels (the scores are not shown)
    label_scores = labeler.get_topic_labels(topic_id, top_n=10)
    label_text = ", ".join([label for label, _score in label_scores])
    print(f"Suggested labels: {label_text}")

    # Top words of the topic (the probabilities are not shown)
    top_words = [pair[0] for pair in model.get_topic_words(topic_id, top_n=10)]
    print(", ".join(top_words))

In [29]:
# Topics are numbered from 1 in the printout but indexed from 0 in the model.
for topic_id in range(model.k):
    print(f"[Topic {topic_id + 1}]")
    print_topic(topic_id)
    print()

[Topic 1]
Suggested labels: web, catalogues, tailored, or blogs, the web, find what, indexing, people find, search engine, rss
search, tv, google, web, blogs, blog, television, online, show, viewers

[Topic 2]
Suggested labels: government, governments, countries, minister, the government, economic, policy, areas, mr, government has
eu, government, countries, aid, report, european, foreign, children, education, president

[Topic 3]
Suggested labels: first novel, originals, the originals, creatures, reopening, shows such as, shows such, cgi, it can do, pollard
ballet, she, book, argentina, bnp, bates, murray, novel, books, disney

[Topic 4]
Suggested labels: portable, video, gaming, sony, mobile, digital, gadgets, multimedia, pc, devices
mobile, games, phones, phone, gaming, apple, game, video, mobiles, gadget

[Topic 5]
Suggested labels: anti-virus, viruses, malicious, users, security, phishing, virus, sophos, program, security firm
security, software, e-mail, users, microsoft, spam, vi

Visualise
--------
- Present data in the format expected by pyLDAvis

In [30]:
# Keyword arguments for pyLDAvis.prepare(), collected from the trained model.
model_data = dict(
    topic_term_dists=[
        model.get_topic_word_dist(topic_id) for topic_id in range(model.k)
    ],
    doc_topic_dists=[
        model.docs[index].get_topic_dist() for index in range(len(model.docs))
    ],
    doc_lengths=[
        len(model.docs[index].words) for index in range(len(model.docs))
    ],
    vocab=model.vocabs,
    term_frequency=model.vocab_freq,
)

Again, this could take a while

In [31]:
def prepare_vis(model_data, results):
    """Prepare the pyLDAvis data and store it in ``results[0]``.

    ``model_data`` is unpacked as keyword arguments to ``pyLDAvis.prepare``
    together with ``mds="pcoa"``. The outcome is handed back through the
    mutable ``results`` list rather than a return value.
    """
    results[0] = pyLDAvis.prepare(mds="pcoa", **model_data)

In [32]:
print("Preparing LDA visualisation", flush=True, end="")

# The worker writes the prepared visualisation data into this list.
results = [None]
t = threading.Thread(target=prepare_vis, args=(model_data, results))
t.start()

# Thread.isAlive() was removed in Python 3.9; is_alive() is the supported name.
# join() with a timeout doubles as the one-second progress timer and returns
# as soon as the worker finishes.
while t.is_alive():
    t.join(timeout=1.0)
    if t.is_alive():
        print(" .", flush=True, end="")

vis_data = results[0]
if vis_data is None:
    # An exception inside the worker thread does not propagate to this thread;
    # fail here rather than when the visualisation is displayed.
    print()
    raise RuntimeError(
        "pyLDAvis preparation failed in the worker thread; see the traceback above."
    )

print(" Done.")

Preparing LDA visualisation . . . . . . . . . . . . . . . . Done.


In [33]:
# Show the visualisation prepared in the previous cell.
pyLDAvis.display(vis_data, local=False)

Iterate
--------
- See what the main topics might be, slice initial corpus and re-run LDA to get sub-topics