Ignis: Text Pre-processing
==========================

In [1]:
import collections
import glob
import pathlib
import re
import threading
import time

import gensim
import tqdm

In [2]:
# Jupyter notebook setup
import ipywidgets as widgets
from IPython.core.display import display, HTML

# Custom styling:
# - Prevent vertical scrollbars in output subareas
style = """
<style>
   .jupyter-widgets-output-area .output_scroll {
        height: unset !important;
        border-radius: unset !important;
        -webkit-box-shadow: unset !important;
        box-shadow: unset !important;
    }
    .jupyter-widgets-output-area  {
        height: auto !important;
    }
</style>
"""
display(HTML(style))

Data ingestion
--------------------

We will track the contents and filename of each document, then tokenise them all and feed them into an `ignis.Corpus` that will be saved.

We should, by all accounts, actually be preparing a separate text cleaning function and running the raw text through it immediately, but this way we can see the effects of each step of the data cleaning.

In [3]:
raw_files = glob.glob("./data/bbc/*/*.txt")

In [4]:
RawDocument = collections.namedtuple("RawDocument", "metadata, tokens, human_readable")

In [5]:
raw_docs = []
for file in tqdm.tqdm(raw_files):
    filename = pathlib.Path(file).as_posix()

    metadata = {"filename": filename}

    with open(file) as f:
        tokens = f.read()

    raw_doc = RawDocument(metadata, tokens, human_readable=tokens)

    raw_docs.append(raw_doc)

100%|██████████| 2225/2225 [00:00<00:00, 7889.08it/s]


In [6]:
def show_raw_doc(doc_id=0):
    print(raw_docs[doc_id].metadata)
    print()
    print(raw_docs[doc_id].tokens)


widgets.interact(show_raw_doc, doc_id=(0, len(raw_docs) - 1))

interactive(children=(IntSlider(value=0, description='doc_id', max=2224), Output()), _dom_classes=('widget-int…

<function __main__.show_raw_doc(doc_id=0)>

Text pre-processing and tokenisation
------

### Naive tokenisation (by whitespace)
- Case folding
- Strip leading/trailing non-informative punctuation from tokens
- Remove single apostrophes
- Remove single brackets within words
  - For dealing with cases like "the recipient(s)" -- Which will get tokenised to "the recipient(s" otherwise

In [7]:
strip_punctuation = "'\"()[]<>?!,.:;/|_"
bracket_pairs = [
    ["(", ")"],
    ["[", "]"],
]


def naive_tokenise(doc):
    """
    Naively tokenises a document.
    
    Returns
    -------
    str
        The document as a string of space-separated tokens
    """
    new_tokens = []

    tokens = doc.split()
    for token in tokens:
        token = token.casefold()
        token = token.strip(strip_punctuation)
        token = token.replace("'", "")

        for bracket_pair in bracket_pairs:
            if bracket_pair[0] in token and bracket_pair[1] not in token:
                token = token.replace(bracket_pair[0], "")
            if bracket_pair[1] in token and bracket_pair[0] not in token:
                token = token.replace(bracket_pair[1], "")

        if token != "":
            new_tokens.append(token)

    return new_tokens

In [8]:
naive_tokenise('This is a t(e)st of the system\'s "tokenisation" operation(s).')

['this',
 'is',
 'a',
 't(e)st',
 'of',
 'the',
 'systems',
 'tokenisation',
 'operations']

In [9]:
naive_docs = []
for raw_doc in raw_docs:
    naive_docs.append(raw_doc._replace(tokens=naive_tokenise(raw_doc.tokens)))

In [10]:
def show_naive_doc(doc_id=0):
    print(naive_docs[doc_id].metadata)
    print()
    print(" ".join(naive_docs[doc_id].tokens))


widgets.interact(show_naive_doc, doc_id=(0, len(naive_docs) - 1))

interactive(children=(IntSlider(value=0, description='doc_id', max=2224), Output()), _dom_classes=('widget-int…

<function __main__.show_naive_doc(doc_id=0)>

### Automated n-gram detection

Chunk into significant bigrams based on collocation frequency

(N.B.: Gensim implies that the input to the Phraser should be a list of single sentences, but we will feed it a list of documents instead.)

- Min count: How many documents the n-grams need to appear in

- Scoring: "default" or "npmi"

- Threshold: Intuitively, higher threshold means fewer phrases.
  - With the default scorer, this is greater than or equal to 0; with the NPMI scorer, this is in the range -1 to 1.

- Common terms: These terms will be ignored if they come between normal words.
  - E.g., if `common_terms` includes the word "of", then when the phraser sees "Wheel of Fortune" it actually evaluates _"Wheel Fortune"_ as an n-gram, putting "of" back in only at the output level to give _wheel_of_fortune_.
  - With the `common_terms` option set properly,  there do not seem to be many significant trigrams and above left to pick up -- We stick with bigrams to save on runtime.

In [11]:
min_count = 5
scoring = "npmi"
# We want a relatively high threshold so that we don't start littering spurious n-grams all over our corpus, diluting our results.
# E.g., we want "Lord_of_the_Rings", but not "slightly_better_than_analysts"
threshold = 0.7
common_terms = ["a", "an", "the", "of", "on", "in", "at"]

This could take a while, so set up a threaded function with a basic progress indicator in the main thread

In [12]:
def find_phrases(docs, results, trigrams=False):
    # Build, finalise, and apply the bigram model
    bigram_model = gensim.models.Phrases(
        docs,
        min_count=min_count,
        threshold=threshold,
        scoring=scoring,
        common_terms=common_terms,
    )
    bigram_model = gensim.models.phrases.Phraser(bigram_model)

    bigram_docs = bigram_model[docs]

    results[0] = bigram_docs

    if trigrams:
        # Repeat to get trigrams
        trigram_model = gensim.models.Phrases(
            bigram_docs,
            min_count=min_count,
            threshold=threshold,
            scoring=scoring,
            common_terms=common_terms,
        )
        trigram_model = gensim.models.phrases.Phraser(trigram_model)

        trigram_docs = trigram_model[docs]

        results[0] = trigram_docs

In [13]:
print("Finding phrases", flush=True, end="")
start_time = time.perf_counter()

# Just send the textual content through the Phraser, not the document metadata
for_phrasing = [naive_doc.tokens for naive_doc in naive_docs]

# Will contain the documents after trigram processing
results = [None]
t = threading.Thread(target=find_phrases, args=(for_phrasing, results, False))
t.start()

progress_countdown = 1.0

while t.isAlive():
    time.sleep(0.1)
    progress_countdown -= 0.1
    if progress_countdown <= 0:
        print(" .", flush=True, end="")
        progress_countdown = 1

elapsed = time.perf_counter() - start_time
print(f" Done. ({elapsed:.3f}s)")

after_phrasing = results[0]

# Put metadata back in
phrased_docs = []
for index, tokens in enumerate(after_phrasing):
    phrased_docs.append(naive_docs[index]._replace(tokens=tokens))

Finding phrases . . . . Done. (5.492s)


In [14]:
def show_phrased_doc(doc_id=0):
    print(phrased_docs[doc_id].metadata)
    print()
    print(" ".join(phrased_docs[doc_id].tokens))


widgets.interact(show_phrased_doc, doc_id=(0, len(phrased_docs) - 1))

interactive(children=(IntSlider(value=0, description='doc_id', max=2224), Output()), _dom_classes=('widget-int…

<function __main__.show_phrased_doc(doc_id=0)>

In [15]:
# Trigrams and above in the corpus (with `common_terms` set properly)
seen_tokens = set()
for document in phrased_docs:
    for token in document.tokens:
        if token.count("_") >= 2:
            if token not in seen_tokens:
                print(token)
                seen_tokens.add(token)

lord_of_the_rings
house_of_lords
tens_of_thousands
chambers_of_commerce
billions_of_dollars
weapons_of_mass
tip_of_the_iceberg
prisoner_of_azkaban
tells_the_story
shaun_of_the_dead
pirates_of_the_caribbean
passion_of_the_christ
meet_the_fockers
dismantle_an_atomic
hall_of_fame
phantom_of_the_opera
hepburn_in_the_aviator
miscarriages_of_justice
convention_on_human
hands_of_extremists
code_of_conduct
archbishop_of_canterbury
prevention_of_terrorism
tests_in_tel
faking_a_motorcycle
icing_on_the_cake
capt_a_persico
blogs_in_existence
hitting_the_shelves


In [16]:
# Does Gensim's Phraser completely replace all the relevant original tokens, or is it a partial replacement for some reason?
ngrams = [token.split("_") for token in seen_tokens]
first_tokens = [ngram[0] for ngram in ngrams]

test_docs = [
    ["a", "test"],
    ["flash", "memory"],
    ["another", "boop"],
    ["bot", "nets"],
    ["ehh"]
]

actual_docs = [list(document.tokens) for document in phrased_docs]

def test_ngrams(doc_tokens_list):
    for tokens in doc_tokens_list:
        for index, token in enumerate(tokens):
            if token in first_tokens:
                possibles = [ngram for ngram in ngrams if ngram[0] == token]
                for possible in possibles:
                    # Check this ngram
                    if tokens[index:len(possible)] == possible:
                        print(f"Found: {possible}")          
                        
test_ngrams(test_docs)
test_ngrams(actual_docs)
print("Done")

Done


### Post-phrasing cleaning

- Remove stop words (optional)
- Remove purely numeric/non-alphabetic/single-character tokens
  - Under the assumption that significant tokens, like the "19" in "Covid 19" or the "11" in "Chapter 11 (bankruptcy)" would have been picked up by the phraser

In [17]:
# Stoplist based on TF-IDF
# Not theoretically needed if using term weighting during the training process, but stopword
# removal could still help with runtimes and interpretability


import collections
import math

# Calculate IDF (For calculations of this size, the overhead of Pandas/Numpy is probably not worth it)
df_dict = {}
for doc in phrased_docs:
    for token in set(doc.tokens):
        if token in df_dict:
            df_dict[token] += 1
        else:
            df_dict[token] = 1

# While we're at it, we could remove document-level hapax legomena
hapax = [token for token, count in df_dict.items() if count == 1]

token_idf = {}
for token in df_dict:
    token_idf[token] = math.log(len(phrased_docs) / df_dict[token])


def tf_idf(tokens, token_idf):
    """
    Calculates the TF-IDF for each unique term in the given list of document tokens
    
    Parameters
    ----------
    tokens: iterable of str
        Tokens that make up a single document
    token_idf: dict
        Mapping of terms to their global IDF values
    """
    token_tf_idf = {}
    counts = collections.Counter(tokens)
    for token in set(tokens):
        token_tf = counts[token] / len(tokens)
        # In particular, accessing a Pandas Series by string index is much slower than accessing a Dictionary by key
        token_tf_idf[token] = token_tf * token_idf[token]
    return token_tf_idf

In [18]:
# Thresholding

# Number of terms with lowest TF-IDF scores to consider per document
n_lowest = 50
# The proportion of documents each of these terms must appear in to be considered a stopword
stopword_proportion = 0.15

per_doc_stopwords = []
for doc in phrased_docs:
    token_tf_idf = tf_idf(doc.tokens, token_idf)
    # token_tf_idf is a dict of token -> score
    scores = sorted(list(token_tf_idf.items()), key=lambda x: x[1])
    lowest_n = [score[0] for score in scores[:n_lowest]]
    per_doc_stopwords.append(set(lowest_n))

# Check how many *stopword lists* each token appears in; this is more discriminative than 
# checking how many actual *documents* each token appears in instead
stopword_df = {}
for per_doc in per_doc_stopwords:
    for stopword in per_doc:
        if stopword in stopword_df:
            stopword_df[stopword] += 1
        else:
            stopword_df[stopword] = 1

total_docs = len(phrased_docs)
final_stopwords = []
for stopword, count in stopword_df.items():
    if count / total_docs > stopword_proportion:
        final_stopwords.append((stopword, count))
final_stopwords.sort(key=lambda x: x[1], reverse=True)

# List of removed words sorted by TF-IDF (so in roughly decreasing order of commonness)
removed_words = [stopword for stopword, count in final_stopwords]
stopset = set(removed_words).union(set(hapax))

print("Stopwords, ordered by TF-IDF:")
print(", ".join(removed_words))
print(f"({len(removed_words)})")
print("Also removing document-level hapaxes:")
print(", ".join(hapax))

Stopwords, ordered by TF-IDF:
the, to, a, of, and, in, for, on, said, is, has, with, it, at, by, that, was, be, have, as, but, from, an, which, will, been, this, also, are, its, not, up, had, were, who, he, their, more, would, out, one, after, they, last, about, over, -, than, year, all, two, when, there, new, could, other, into, years, his, we, now, first, if, time, or, made, us, only, no, some, being, so, while, added, before, make, three, well, told, can, back, take, because, them, most, what, next, get, just, way, world, do, people, i, since, against, such, very, any, set, many, like, still
(103)
Also removing document-level hapaxes:
$639m, year-earlier, search-engine, 464,000, $42.09bn, $10.9bn, $3.36bn, $284m, catwoman, $11.1bn, warners, bertelsmanns, timewarner, aols, â£600m, loosening, $1.2974, $1.2871, sinche, half-point, greenspans, moscow-based, firestone, â£479m, $540m, rosnefts, broughton, brul, 274.5, surcharge, controllable, surcharges, $141m, short-haul, a321, eddington

In [19]:
def second_tokenise(tokens, stopset):
    new_tokens = []
    for token in tokens:
        if token in stopset or re.match("^[^a-z]+$", token) or len(token) <= 1:
            continue
        new_tokens.append(token)

    return new_tokens

In [20]:
final_docs = []
for phrased_doc in phrased_docs:
    final_docs.append(phrased_doc._replace(tokens=second_tokenise(phrased_doc.tokens, stopset)))

In [21]:
def show_final_doc(doc_id=0):
    print(final_docs[doc_id].metadata)
    print()
    print(" ".join(final_docs[doc_id].tokens))


widgets.interact(show_final_doc, doc_id=(0, len(final_docs) - 1))

interactive(children=(IntSlider(value=0, description='doc_id', max=2224), Output()), _dom_classes=('widget-int…

<function __main__.show_final_doc(doc_id=0)>

In [22]:
# Simple deduplication
seen_docs = set()
deduped_docs = []
for doc in final_docs:
    # Cast the document tokens as a tuple so that we can use it as a deduplicating hash
    doc_hash = tuple(doc.tokens)
    if doc_hash in seen_docs:
        # print(f"Duplicate document: {doc[0]['filename']}")
        pass
    else:
        seen_docs.add(doc_hash)
        deduped_docs.append(doc)

Save to Ignis Corpus
----

In [23]:
import ignis

In [24]:
corpus = ignis.Corpus()

for doc in deduped_docs:
    corpus.add_doc(**doc._asdict())
corpus.save("bbc-tf-idf.corpus")

In [25]:
# And make sure it loads without errors as well.
corpus = ignis.load_corpus("bbc-tf-idf.corpus")

In [26]:
corpus_doc_ids = list(corpus.documents.keys())


def show_corpus_doc(index=0):
    doc = corpus.documents[corpus_doc_ids[index]]
    print(doc.metadata)
    print()
    print("-" * 10)
    print()
    print(doc.human_readable)
    print()
    print("-" * 10)
    print()
    print(" ".join(doc.tokens))


widgets.interact(show_corpus_doc, index=(0, len(corpus_doc_ids) - 1))

interactive(children=(IntSlider(value=0, description='index', max=2118), Output()), _dom_classes=('widget-inte…

<function __main__.show_corpus_doc(index=0)>