Basic Text Pre-processing and Topic Modelling
======

In [1]:
import glob
import math
import re
import threading
import time

import gensim
import nltk
import pyLDAvis
import tomotopy as tp
import tqdm
import pathlib

In [2]:
import ipywidgets as widgets

Data ingestion
--------------------

We will track the contents and filename of each document, then tokenise them all and feed them into an `ignis.Corpus`.

We should, by all accounts, actually be preparing a separate text cleaning function and running the raw text through it immediately, but this way we can see the effects of each step of the data cleaning.

In [3]:
# glob returns matches in arbitrary (filesystem-dependent) order; sort them so
# that document IDs and the order documents are fed downstream are reproducible.
raw_files = sorted(glob.glob("./data/bbc/*/*.txt"))

In [4]:
# Each entry is a two-item list: [metadata dict, raw text of the file].
raw_docs = []
for file in tqdm.tqdm(raw_files):
    path = pathlib.Path(file)

    with path.open() as handle:
        contents = handle.read()

    # Store the filename in POSIX form (forward slashes) alongside the text.
    raw_docs.append([{"filename": path.as_posix()}, contents])

100%|██████████| 2225/2225 [00:00<00:00, 9196.17it/s]


In [5]:
def show_raw_doc(doc_id=0):
    """Print the metadata of raw document `doc_id`, a blank line, then its raw text."""
    metadata, text = raw_docs[doc_id]
    print(metadata, "", text, sep="\n")

widgets.interact(show_raw_doc, doc_id=(0, len(raw_docs) - 1))

interactive(children=(IntSlider(value=0, description='doc_id', max=2224), Output()), _dom_classes=('widget-int…

<function __main__.show_raw_doc(doc_id=0)>

Text pre-processing and tokenisation
------

### Naive tokenisation (by whitespace)
- Case folding
- Strip leading/trailing non-informative punctuation from tokens
- Remove single apostrophes
- Remove single brackets within words
  - For dealing with cases like "the recipient(s)" -- Which will get tokenised to "the recipient(s" otherwise

In [6]:
strip_punctuation = "'\"()[]<>?!,.:;/|_"
bracket_pairs = [
    ["(", ")"],
    ["[", "]"],
]


def naive_tokenise(doc):
    """
    Naively tokenise a document by splitting it on whitespace.

    Every token is case-folded, stripped of leading/trailing characters found in
    `strip_punctuation`, and has all apostrophes removed. A bracket left without
    its partner inside a token is removed as well. Tokens that end up empty are
    dropped.

    Parameters
    ----------
    doc: str
        The raw text of the document

    Returns
    -------
    list of str
        The cleaned tokens, in document order
    """
    cleaned = []

    for raw in doc.split():
        word = raw.casefold().strip(strip_punctuation).replace("'", "")

        # Stripping can orphan one half of a bracket pair, e.g. "recipient(s)"
        # becomes "recipient(s" -- remove whichever half is left on its own.
        for opener, closer in bracket_pairs:
            if opener in word and closer not in word:
                word = word.replace(opener, "")
            if closer in word and opener not in word:
                word = word.replace(closer, "")

        if word:
            cleaned.append(word)

    return cleaned

In [7]:
# Sanity check of the tokeniser. Expected: "t(e)st" keeps its paired brackets,
# "system's" -> "systems", the quotes are stripped, and "operation(s)." -> "operations".
naive_tokenise('This is a t(e)st of the system\'s "tokenisation" operation(s).')

['this',
 'is',
 'a',
 't(e)st',
 'of',
 'the',
 'systems',
 'tokenisation',
 'operations']

In [8]:
# Keep each document's metadata and replace its raw text with the naive token list.
naive_docs = [[metadata, naive_tokenise(text)] for metadata, text in raw_docs]

In [9]:
def show_naive_doc(doc_id=0):
    """Print the metadata of document `doc_id`, a blank line, then its naive tokens joined by spaces."""
    metadata, tokens = naive_docs[doc_id]
    print(metadata, "", " ".join(tokens), sep="\n")

widgets.interact(show_naive_doc, doc_id=(0, len(naive_docs) - 1))

interactive(children=(IntSlider(value=0, description='doc_id', max=2224), Output()), _dom_classes=('widget-int…

<function __main__.show_naive_doc(doc_id=0)>

### Automated n-gram detection

Chunk into significant bigrams/trigrams based on collocation frequency

(N.B.: Gensim implies that the input to the Phraser should be a list of single sentences, but we will feed it a list of documents instead.)

- Min count: The minimum total number of times an n-gram must occur across the corpus to be considered (a total occurrence count, not a document count)

- Scoring: "default" or "npmi"

- Threshold: Intuitively, higher threshold means fewer phrases.
  - With the default scorer, this is greater than or equal to 0; with the NPMI scorer, this is in the range -1 to 1.

- Common terms: These terms will be ignored if they come between normal words.
  - E.g., if `common_terms` includes the word "of", then when the phraser sees "Wheel of Fortune" it actually evaluates _"Wheel Fortune"_ as an n-gram, putting "of" back in only at the output level.

In [10]:
# Settings for the phrase-detection step; `find_trigrams` passes these to
# `gensim.models.Phrases` under the same keyword names.
min_count = 5
# With the NPMI scorer, `threshold` below is on a -1 to 1 scale.
scoring = "npmi"
# We want a relatively high threshold so that we don't start littering spurious n-grams all over our corpus, diluting our results.
# E.g., we want "Lord_of_the_Rings", but not "slightly_better_than_analysts"
threshold = 0.7
# Connector words that may sit between normal words inside a phrase (e.g. the "of" in "Wheel of Fortune").
common_terms = ["a", "an", "the", "of", "on", "in", "at"]

This could take a while, so set up a threaded function with a basic progress indicator in the main thread

In [11]:
def _fit_phraser(docs):
    """Train a `Phrases` model on `docs` using the notebook's phrasing settings and return it as a `Phraser`."""
    # NOTE: `common_terms` is the gensim 3.x keyword; gensim 4 renamed it to `connector_words`.
    phrases = gensim.models.Phrases(
        docs,
        min_count=min_count,
        threshold=threshold,
        scoring=scoring,
        common_terms=common_terms,
    )
    return gensim.models.phrases.Phraser(phrases)


def find_trigrams(docs, results):
    """
    Detect bigrams, then trigrams, in `docs` and store the phrased documents in `results[0]`.

    Intended to be run as a thread target, which is why the output is handed back
    through the mutable `results` argument instead of a return value.

    Parameters
    ----------
    docs: list of list of str
        The tokenised documents (token lists only, no metadata)
    results: list
        A list with at least one slot; `results[0]` is overwritten with the list
        of phrased documents
    """
    # First pass: join significant word pairs into bigrams.
    bigram_model = _fit_phraser(docs)
    bigram_docs = bigram_model[docs]

    # Second pass, trained on the bigrammed documents, to get trigrams.
    trigram_model = _fit_phraser(bigram_docs)

    # The trigram model must be applied to the *bigrammed* documents it was trained
    # on; applying it to the raw `docs` would throw away the first pass.
    # Applying a phraser to a whole corpus yields a lazy stream, so materialise it
    # here to make sure the work is done in this (worker) thread.
    results[0] = list(trigram_model[bigram_docs])

In [12]:
# # Commenting out to save processing time, since we are loading the saved corpus/results directly

# print("Finding trigrams", flush=True, end="")
# start_time = time.perf_counter()

# # Just send the textual content through the Phraser, not the document metadata
# for_phrasing = [naive_doc[1] for naive_doc in naive_docs]

# # Will contain the documents after trigram processing
# results = [None]
# t = threading.Thread(target=find_trigrams, args=(for_phrasing, results))
# t.start()

# progress_countdown = 1.0

# while t.is_alive():
#     time.sleep(0.1)
#     progress_countdown -= 0.1
#     if progress_countdown <= 0:
#         print(" .", flush=True, end="")
#         progress_countdown = 1

# elapsed = time.perf_counter() - start_time
# print(f" Done. ({elapsed:.3f}s)")

# after_phrasing = results[0]

# # Put metadata back in
# phrased_docs = []
# for index, tokens in enumerate(after_phrasing):
#     phrased_docs.append([naive_docs[index][0], tokens])

In [13]:
# def show_phrased_doc(doc_id=0):
#     print(phrased_docs[doc_id][0])
#     print()
#     print(" ".join(phrased_docs[doc_id][1]))

# widgets.interact(show_phrased_doc, doc_id=(0, len(phrased_docs) - 1))

### Post-phrasing cleaning

- Remove stop words (optional)
- Remove purely numeric/non-alphabetic/single-character tokens
  - Under the assumption that significant tokens, like the "19" in "Covid 19", would have been picked up by the phraser

In [14]:
# # stopset = set(nltk.corpus.stopwords.words("english"))
# # Not needed if using term weighting
# stopset = []


# def second_tokenise(tokens):
#     new_tokens = []
#     for token in tokens:
#         if token in stopset or re.match("^[^a-z]+$", token) or len(token) <= 1:
#             continue
#         new_tokens.append(token)

#     return new_tokens

In [15]:
# final_docs = []
# for phrased_doc in phrased_docs:
#     final_docs.append([phrased_doc[0], second_tokenise(phrased_doc[1])])

In [16]:
# def show_final_doc(doc_id=0):
#     print(final_docs[doc_id][0])
#     print()
#     print(" ".join(final_docs[doc_id][1]))

# widgets.interact(show_final_doc, doc_id=(0, len(final_docs) - 1))

Model training (LDA)
----

Add processed docs to the LDA model and train it.

The random seed and parallelisation can both affect results, so setting the seed and number of workers is necessary for reproducibility.

In [17]:
import ignis

In [18]:
## Uncomment to perform actual model training instead of loading from file

# corpus = ignis.Corpus()

# for metadata, tokens in final_docs:
#     corpus.add_doc(metadata, tokens)
# corpus.save("bbc-full.corpus")

In [19]:
# Load the previously saved corpus from disk instead of rebuilding it from `final_docs` (see the commented-out cell above).
corpus = ignis.load_corpus("bbc-full.corpus")

In [20]:
# model_options = {"k": 10, "term_weighting": "idf", "until_max_ll": True, "verbose": True}
# vis_options = {"verbose": True}
# results = ignis.train_model(corpus, model_type="lda", model_options=model_options, vis_type="pyldavis", vis_options=vis_options)
# results.save("bbc-full.aurum")

In [21]:
# Load the previously saved training results from disk instead of retraining (see the commented-out cell above).
results = ignis.load_results("bbc-full.aurum")

In [22]:
# Set up the "tomotopy" labeller on the results object.
# NOTE(review): presumably required before `results.get_topic_labels()` can be called -- confirm against the ignis docs.
results.init_labeller("tomotopy")

Print results
------

In [23]:
def print_topic(topic_id):
    """Print the top 10 suggested labels for topic `topic_id`, followed by its top 10 words."""
    # Suggested labels come back as (label, score) pairs; only the label text is shown.
    label_texts = [
        label for label, _score in results.get_topic_labels(topic_id, top_n=10)
    ]
    print(f"Suggested labels: {', '.join(label_texts)}")

    # Only the first item (the word) of each entry is shown.
    top_words = []
    for entry in results.get_topic_words(topic_id, top_n=10):
        top_words.append(entry[0])
    print(", ".join(top_words))

In [24]:
# Topic IDs are 0-based internally; show them 1-based in the headings.
num_topics = results.get_num_topics()
for topic_id in range(num_topics):
    print(f"[Topic {topic_id + 1}]")
    print_topic(topic_id)
    print()

[Topic 1]
Suggested labels: etc, you, do, what, people, if, how, even, government, like
we, you, if, he, they, not, there, all, what, do

[Topic 2]
Suggested labels: striker, keeper, header, chelsea, arsenal, free-kick, subs, premiership, manchester_united, yards
club, chelsea, liverpool, arsenal, football, game, fiat, his, united, ferguson

[Topic 3]
Suggested labels: victory, win, injury, grand_slam, coach, win in, final, victory in, matches, france
england, wales, ireland, rugby, win, her, game, his, france, match

[Topic 4]
Suggested labels: awards, nominations, actor, award, for best, nominated, film, nominated for, stars, award for
film, best, awards, music, award, her, band, album, show, actor

[Topic 5]
Suggested labels: consoles, gaming, gamers, graphics, xbox, sonys, pc, console, the nintendo, games
games, game, gaming, dvd, sony, computer, gadgets, pc, gamers, apple

[Topic 6]
Suggested labels: court, charges, lawyers, lawyer, hearing in, the charges, prosecutors, trial for,

Visualise
--------
- Present as a pyLDAvis visualisation

In [25]:
# Get the visualisation data from the results object and render it with pyLDAvis.
vis_data = results.get_vis_data()
pyLDAvis.display(vis_data)

In [33]:
# Train a fresh LDA model in this session (kept separate from the loaded `results`) using the same options as the commented-out training cell above.
# NOTE(review): presumably done so the underlying model's documents can be inspected below -- confirm.
model_options = {"k": 10, "term_weighting": "idf", "until_max_ll": True, "verbose": True}
results2 = ignis.train_model(corpus, model_type="lda", model_options=model_options)

Training LDA model:
{'term_weighting': 'idf', 'k': 10, 'seed': 11399, 'workers': 8, 'parallel_scheme': 'default', 'iterations': 1000, 'update_every': 100, 'until_max_ll': True, 'max_extra_iterations': 5000, 'verbose': True, 'tw': <TermWeight.IDF: 1>, 'parallel': <ParallelScheme.DEFAULT: 0>}

Iteration: 100	Log-likelihood: -21.706075096679754	Time: 1.448s
Iteration: 200	Log-likelihood: -21.387143670986926	Time: 1.498s
Iteration: 300	Log-likelihood: -21.221527587230437	Time: 1.177s
Iteration: 400	Log-likelihood: -21.123314272466406	Time: 1.155s
Iteration: 500	Log-likelihood: -21.068089958678257	Time: 1.571s
Iteration: 600	Log-likelihood: -21.02252663809977	Time: 1.590s
Iteration: 700	Log-likelihood: -20.997718915873676	Time: 1.209s
Iteration: 800	Log-likelihood: -20.981467437471252	Time: 1.152s
Iteration: 900	Log-likelihood: -20.958075356823937	Time: 1.154s
Iteration: 1000	Log-likelihood: -20.93905816528948	Time: 1.152s

Continuing to train until maximum log-likelihood.
(N.B.: This may n

In [37]:
# Print the top two topics for every document in the newly trained model.
# Each entry appears to be a (topic id, probability) pair, going by the output.
# NOTE(review): this emits one line per document, so the output is very long.
for doc in results2.ignis_model.model.docs:
    print(doc.get_topics(top_n=2))

[(7, 0.5523127317428589), (0, 0.14025264978408813)]
[(7, 0.6678508520126343), (0, 0.26588231325149536)]
[(5, 0.7109578251838684), (7, 0.1552649885416031)]
[(7, 0.8604100346565247), (0, 0.12739431858062744)]
[(7, 0.4312160611152649), (4, 0.353122353553772)]
[(7, 0.6640769839286804), (0, 0.2738098204135895)]
[(7, 0.8760752081871033), (0, 0.1230764240026474)]
[(7, 0.6439716219902039), (0, 0.29854485392570496)]
[(7, 0.6547201871871948), (0, 0.2895469665527344)]
[(5, 0.6924548149108887), (0, 0.14631658792495728)]
[(7, 0.42689716815948486), (8, 0.2632777988910675)]
[(7, 0.6194180846214294), (0, 0.3361169397830963)]
[(7, 0.7394058108329773), (0, 0.22009657323360443)]
[(0, 0.36322906613349915), (7, 0.33551037311553955)]
[(7, 0.4763184189796448), (0, 0.31889253854751587)]
[(7, 0.6797342896461487), (0, 0.30569833517074585)]
[(5, 0.4435613751411438), (7, 0.31237417459487915)]
[(7, 0.8329787254333496), (0, 0.15002185106277466)]
[(7, 0.5361886024475098), (0, 0.2505306005477905)]
[(0, 0.586696505546

[(4, 0.5381830930709839), (0, 0.26150840520858765)]
[(9, 0.5211380124092102), (0, 0.1843048334121704)]
[(5, 0.4330812096595764), (0, 0.3038221597671509)]
[(9, 0.5143186450004578), (0, 0.2278672307729721)]
[(9, 0.5261824727058411), (0, 0.33383503556251526)]
[(9, 0.6691820025444031), (0, 0.2469549924135208)]
[(0, 0.5414095520973206), (9, 0.26191258430480957)]
[(4, 0.6587149500846863), (0, 0.2814241349697113)]
[(8, 0.45485448837280273), (0, 0.2648252248764038)]
[(0, 0.6162623167037964), (8, 0.18200626969337463)]
[(0, 0.46208760142326355), (8, 0.22950297594070435)]
[(8, 0.5846084952354431), (0, 0.2949901819229126)]
[(4, 0.6595343351364136), (0, 0.24587571620941162)]
[(0, 0.3210221827030182), (6, 0.2935340702533722)]
[(9, 0.459940642118454), (7, 0.19921499490737915)]
[(4, 0.4730648100376129), (9, 0.3146366477012634)]
[(8, 0.5550597906112671), (0, 0.31319284439086914)]
[(8, 0.3739808201789856), (0, 0.2826446294784546)]
[(8, 0.630523681640625), (0, 0.22675774991512299)]
[(8, 0.467336803674697

In [31]:
# NOTE(review): on the loaded `results` this raised "Exception: doc doesn't has 'Zs' field!" (see the output below).
# Presumably per-token topic assignments are not available on a model loaded from file -- confirm.
results.ignis_model.model.docs[1].topics

Exception: doc doesn't has 'Zs' field!

Iterate
--------
- See what the main topics might be, slice initial corpus and re-run LDA to get sub-topics