Basic Text Pre-processing and Topic Modelling
======

In [1]:
import glob
import math
import re
import threading
import time

import gensim
import nltk
import pyLDAvis
import tomotopy as tp
import tqdm
import pathlib

In [2]:
import ipywidgets as widgets

Data ingestion
--------------------

We will track the contents and filename of each document, then tokenise them all and feed them into an `ignis.Corpus`.

We should, by all accounts, actually be preparing a separate text cleaning function and running the raw text through it immediately, but this way we can see the effects of each step of the data cleaning.

In [3]:
raw_files = glob.glob("./data/bbc/*/*.txt")

In [4]:
raw_docs = []
for file in tqdm.tqdm(raw_files):
    filename = pathlib.Path(file).as_posix()
    metadata = {
        "filename": filename
    }
    
    with open(file) as f:
        text = f.read()
    
    raw_docs.append([metadata, text])

100%|██████████| 2225/2225 [00:00<00:00, 9233.96it/s]


In [5]:
def show_raw_doc(doc_id=0):
    print(raw_docs[doc_id][0])
    print()
    print(raw_docs[doc_id][1])

widgets.interact(show_raw_doc, doc_id=(0, len(raw_docs) - 1))

interactive(children=(IntSlider(value=0, description='doc_id', max=2224), Output()), _dom_classes=('widget-int…

<function __main__.show_raw_doc(doc_id=0)>

Text pre-processing and tokenisation
------

### Naive tokenisation (by whitespace)
- Case folding
- Strip leading/trailing non-informative punctuation from tokens
- Remove single apostrophes
- Remove single brackets within words
  - For dealing with cases like "the recipient(s)" -- Which will get tokenised to "the recipient(s" otherwise

In [6]:
strip_punctuation = "'\"()[]<>?!,.:;/|_"
bracket_pairs = [
    ["(", ")"],
    ["[", "]"],
]


def naive_tokenise(doc):
    """
    Naively tokenises a document.
    
    Returns
    -------
    str
        The document as a string of space-separated tokens
    """
    new_tokens = []
    
    tokens = doc.split()
    for token in tokens:
        token = token.casefold()
        token = token.strip(strip_punctuation)
        token = token.replace("'", "")
        
        for bracket_pair in bracket_pairs:
            if bracket_pair[0] in token and bracket_pair[1] not in token:
                token = token.replace(bracket_pair[0], "")
            if bracket_pair[1] in token and bracket_pair[0] not in token:
                token = token.replace(bracket_pair[1], "")
        
        if token != "":
            new_tokens.append(token)
            
    return new_tokens

In [7]:
naive_tokenise('This is a t(e)st of the system\'s "tokenisation" operation(s).')

['this',
 'is',
 'a',
 't(e)st',
 'of',
 'the',
 'systems',
 'tokenisation',
 'operations']

In [8]:
naive_docs = []
for raw_doc in raw_docs:
    naive_docs.append([raw_doc[0], naive_tokenise(raw_doc[1])])

In [9]:
def show_naive_doc(doc_id=0):
    print(naive_docs[doc_id][0])
    print()
    print(" ".join(naive_docs[doc_id][1]))

widgets.interact(show_naive_doc, doc_id=(0, len(naive_docs) - 1))

interactive(children=(IntSlider(value=0, description='doc_id', max=2224), Output()), _dom_classes=('widget-int…

<function __main__.show_naive_doc(doc_id=0)>

### Automated n-gram detection

Chunk into significant bigrams/trigrams based on collocation frequency

(N.B.: Gensim implies that the input to the Phraser should be a list of single sentences, but we will feed it a list of documents instead.)

- Min count: How many documents the n-grams need to appear in

- Scoring: "default" or "npmi"

- Threshold: Intuitively, higher threshold means fewer phrases.
  - With the default scorer, this is greater than or equal to 0; with the NPMI scorer, this is in the range -1 to 1.

- Common terms: These terms will be ignored if they come between normal words.
  - E.g., if `common_terms` includes the word "of", then when the phraser sees "Wheel of Fortune" it actually evaluates _"Wheel Fortune"_ as an n-gram, putting "of" back in only at the output level.

In [10]:
min_count = 5
scoring = "npmi"
# We want a relatively high threshold so that we don't start littering spurious n-grams all over our corpus, diluting our results.
# E.g., we want "Lord_of_the_Rings", but not "slightly_better_than_analysts"
threshold = 0.7
common_terms = ["a", "an", "the", "of", "on", "in", "at"]

This could take a while, so set up a threaded function with a basic progress indicator in the main thread

In [11]:
def find_trigrams(docs, results):
    # Build, finalise, and apply the bigram model
    bigram_model = gensim.models.Phrases(
        docs,
        min_count=min_count,
        threshold=threshold,
        scoring=scoring,
        common_terms=common_terms,
    )
    bigram_model = gensim.models.phrases.Phraser(bigram_model)
    
    bigram_docs = bigram_model[docs]
    
    # Repeat to get trigrams
    trigram_model = gensim.models.Phrases(
        bigram_docs,
        min_count=min_count,
        threshold=threshold,
        scoring=scoring,
        common_terms=common_terms,
    )
    trigram_model = gensim.models.phrases.Phraser(trigram_model)
    
    trigram_docs = trigram_model[docs]
    
    results[0] = trigram_docs

In [12]:
print("Finding trigrams", flush=True, end="")
start_time = time.perf_counter()

# Just send the textual content through the Phraser, not the document metadata
for_phrasing = [naive_doc[1] for naive_doc in naive_docs]

# Will contain the documents after trigram processing
results = [None]
t = threading.Thread(target=find_trigrams, args=(for_phrasing, results))
t.start()

progress_countdown = 1.0

while t.isAlive():
    time.sleep(0.1)
    progress_countdown -= 0.1
    if progress_countdown <= 0:
        print(" .", flush=True, end="")
        progress_countdown = 1

elapsed = time.perf_counter() - start_time
print(f" Done. ({elapsed:.3f}s)")

after_phrasing = results[0]

# Put metadata back in
phrased_docs = []
for index, tokens in enumerate(after_phrasing):
    phrased_docs.append([naive_docs[index][0], tokens])

Finding trigrams . . . . . . . . . Done. (11.202s)


In [13]:
def show_phrased_doc(doc_id=0):
    print(phrased_docs[doc_id][0])
    print()
    print(" ".join(phrased_docs[doc_id][1]))

widgets.interact(show_phrased_doc, doc_id=(0, len(phrased_docs) - 1))

interactive(children=(IntSlider(value=0, description='doc_id', max=2224), Output()), _dom_classes=('widget-int…

<function __main__.show_phrased_doc(doc_id=0)>

### Post-phrasing cleaning

- Remove stop words (optional)
- Remove purely numeric/non-alphabetic/single-character tokens
  - Under the assumption that significant tokens, like the "19" in "Covid 19", would have been picked up by the phraser

In [14]:
# stopset = set(nltk.corpus.stopwords.words("english"))
# Not needed if using term weighting
stopset = []


def second_tokenise(tokens):
    new_tokens = []
    for token in tokens:
        if token in stopset or re.match("^[^a-z]+$", token) or len(token) <= 1:
            continue
        new_tokens.append(token)

    return new_tokens

In [15]:
final_docs = []
for phrased_doc in phrased_docs:
    final_docs.append([phrased_doc[0], second_tokenise(phrased_doc[1])])

In [16]:
def show_final_doc(doc_id=0):
    print(final_docs[doc_id][0])
    print()
    print(" ".join(final_docs[doc_id][1]))

widgets.interact(show_final_doc, doc_id=(0, len(final_docs) - 1))

interactive(children=(IntSlider(value=0, description='doc_id', max=2224), Output()), _dom_classes=('widget-int…

<function __main__.show_final_doc(doc_id=0)>

Model training (LDA)
----

Add processed docs to the LDA model and train it.

The random seed and parallelisation can both affect results, so setting the seed and number of workers is necessary for reproducibility.

In [17]:
import ignis

In [18]:
corpus = ignis.Corpus()

for metadata, tokens in final_docs:
    corpus.add_doc(metadata, tokens)

In [19]:
results = ignis.train_model(corpus, model_type="lda", model_options={"verbose": True})

Training LDA model:
{'term_weighting': 'idf', 'k': 20, 'seed': 11399, 'workers': 8, 'parallel_scheme': 'default', 'iterations': 1000, 'update_every': 100, 'until_max_ll': False, 'max_extra_iterations': 5000, 'verbose': True, 'tw': <TermWeight.IDF: 1>, 'parallel': <ParallelScheme.DEFAULT: 0>}

Iteration: 100	Log-likelihood: -21.27849047035462	Time: 2.284s
Iteration: 200	Log-likelihood: -20.933526954007277	Time: 2.296s
Iteration: 300	Log-likelihood: -20.76408325795117	Time: 2.135s
Iteration: 400	Log-likelihood: -20.681196461159733	Time: 1.744s
Iteration: 500	Log-likelihood: -20.62150393437183	Time: 1.386s
Iteration: 600	Log-likelihood: -20.5748602387642	Time: 1.385s
Iteration: 700	Log-likelihood: -20.552054316530977	Time: 1.381s
Iteration: 800	Log-likelihood: -20.51634455334242	Time: 1.373s
Iteration: 900	Log-likelihood: -20.48474095778886	Time: 1.380s
Iteration: 1000	Log-likelihood: -20.45425931609133	Time: 1.369s
Model training complete. (16.782s)


In [30]:
results.save("test.bin")

In [20]:
# Persistence
model_seed = 11399
num_workers = 12

# Model options
model_file = "model.bin"
num_topics = 20

# Training iterations
load_saved_model = False
train_batch = 100
train_total = 1000

# Extended training
train_until_min_ll = True
max_iterations = 10000

In [21]:
# if load_saved_model:
#     model = tp.LDAModel.load(model_file)
#     print(f"Loaded from '{model_file}'.")
# else:
#     model = tp.LDAModel(tw=tp.TermWeight.IDF, seed=model_seed, k=num_topics)

#     for doc in tqdm.tqdm(docs):
#         model.add_doc(doc)

#     model.train(0, workers=num_workers, parallel=tp.ParallelScheme.DEFAULT)
#     print(
#         f"Num docs: {len(model.docs)}, Vocab size: {model.num_vocabs}, "
#         f"Num words: {model.num_words}"
#     )
#     print(f"Removed top words: {model.removed_top_words}")

#     print("Training model...", flush=True)

#     try:
#         for i in range(0, train_total, train_batch):
#             start_time = time.perf_counter()
#             model.train(
#                 train_batch, workers=num_workers, parallel=tp.ParallelScheme.DEFAULT
#             )
#             elapsed = time.perf_counter() - start_time
#             print(
#                 f"Iteration: {i + train_batch}\tLog-likelihood: {model.ll_per_word}\t"
#                 f"Time: {elapsed:.3f}s",
#                 flush=True,
#             )
#     except KeyboardInterrupt:
#         print("Stopping train sequence.")
#     model.save(model_file)
#     print(f"Saved to '{model_file}'.")

In [22]:
# if train_until_min_ll:
#     print("Continuing to train until minimum log-likelihood...")
#     print("(N.B.: This may not correlate with increased human interpretability)")
#     last_ll = model.ll_per_word
#     i = 0
#     consecutive_loss = 0

#     while True:
#         try:
#             start_time = time.perf_counter()
#             model.train(
#                 train_batch, workers=num_workers, parallel=tp.ParallelScheme.DEFAULT
#             )
#             i += train_batch
#             elapsed = time.perf_counter() - start_time
#             print(
#                 f"Iteration: {i}\tLog-likelihood: {model.ll_per_word}\t"
#                 f"Time: {elapsed:.3f}s",
#                 flush=True,
#             )

#             if model.ll_per_word < last_ll:
#                 consecutive_loss += 1
#             else:
#                 consecutive_loss = 0
#                 model.save(model_file)
#             last_ll = model.ll_per_word

#             if consecutive_loss == 2 or i >= max_iterations:
#                 break

#         except KeyboardInterrupt:
#             print("Stopping extended train sequence.")
#             break

#     model = tp.LDAModel.load(model_file)
#     print(f"Best recent model saved at '{model_file}' (LL: {model.ll_per_word}).")

Topic labelling

In [23]:
# print("Extracting suggested topic labels...", flush=True)
# # extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
# extractor = tp.label.PMIExtractor(min_cf=5, min_df=3, max_len=5, max_cand=20000)
# candidates = extractor.extract(model)
# # labeler = tp.label.FoRelevance(model, candidates, min_df=5, smoothing=1e-2,
# # mu=0.25)
# labeler = tp.label.FoRelevance(
#     model, candidates, min_df=3, smoothing=1e-2, mu=0.25, workers=num_workers
# )
# print("Done.")

Print results
------

In [24]:
# def print_topic(topic_id):
#     # Labels
#     labels = ", ".join(
#         label for label, score in labeler.get_topic_labels(topic_id, top_n=10)
#     )
#     print(f"Suggested labels: {labels}")

#     # Print this topic
#     words_probs = model.get_topic_words(topic_id, top_n=10)
#     words = [x[0] for x in words_probs]

#     words = ", ".join(words)
#     print(words)

In [25]:
# for k in range(model.k):
#     print(f"[Topic {k+1}]")
#     print_topic(k)
#     print()

Visualise
--------
- Present data in the format expected by pyLDAvis

In [26]:
# model_data = {
#     "topic_term_dists": [model.get_topic_word_dist(k) for k in range(model.k)],
#     "doc_topic_dists": [model.docs[n].get_topic_dist() for n in range(len(model.docs))],
#     "doc_lengths": [len(model.docs[n].words) for n in range(len(model.docs))],
#     "vocab": model.vocabs,
#     "term_frequency": model.vocab_freq,
# }

Again, this could take a while

In [27]:
# def prepare_vis(model_data, results):
#     vis_data = pyLDAvis.prepare(**model_data)
#     results[0] = vis_data

In [28]:
# print("Preparing LDA visualisation", flush=True, end="")

# results = [None]
# t = threading.Thread(target=prepare_vis, args=(model_data, results))
# t.start()

# progress_countdown = 1.0

# while t.isAlive():
#     time.sleep(0.1)
#     progress_countdown -= 0.1
#     if progress_countdown <= 0:
#         print(" .", flush=True, end="")
#         progress_countdown = 1

# print(" Done.")

# vis_data = results[0]

In [29]:
# pyLDAvis.display(vis_data)

Iterate
--------
- See what the main topics might be, slice initial corpus and re-run LDA to get sub-topics