Ignis: Text Pre-processing
==========================

In [1]:
# General library imports
import collections
import glob
import html
import pathlib
import re
import threading
import time

import gensim
from tqdm.auto import tqdm

import ignis

In [2]:
# Jupyter notebook setup
import ipywidgets as widgets

# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML

# Custom styling:
# - Prevent vertical scrollbars in output subareas
style = """
<style>
   div.cell > div.output_wrapper > div.output.output_scroll {
     height: auto;
   }
   .jupyter-widgets-output-area .output_scroll {
        height: unset;
        border-radius: unset;
        -webkit-box-shadow: unset;
        box-shadow: unset;
    }
    .jupyter-widgets-output-area, div.output_stdout, div.output_result  {
        height: auto;
        max-height: 60em;
        overflow-y: auto;
    }
</style>
"""
# Inject the stylesheet into the notebook page
display(HTML(style))

Data ingestion
--------------------

We will track the contents and filename of each document, then tokenise them all and feed them into an `ignis.Corpus` that will be saved.

Strictly speaking, we should prepare a single text-cleaning function and run the raw text through it in one pass, but applying the steps one at a time lets us see the effect of each stage of the data cleaning.

In [3]:
# Sort the paths: `glob.glob()` returns matches in an arbitrary, file-system dependent order,
# and document order determines the slider indices below and which copy of a duplicated
# document is the one kept by the deduplication step.
raw_files = sorted(glob.glob("./data/bbc/*/*.txt"))

In [4]:
# Record kept for every document at each cleaning stage: a metadata dict, the document's
# text/tokens at that stage, and the HTML string used to display the document
RawDocument = collections.namedtuple(
    "RawDocument", ["metadata", "tokens", "display_str"]
)

In [5]:
raw_docs = []
for file in tqdm(raw_files):
    filename = pathlib.Path(file).as_posix()

    metadata = {"filename": filename}

    # NOTE(review): the file is decoded with the platform's default encoding; pass an explicit
    # `encoding=` once the encoding of the dataset has been confirmed.
    with open(file) as f:
        tokens = f.read()

    lines = [line for line in tokens.split("\n") if line != ""]
    if not lines:
        # A file with no non-empty lines has no title line (indexing `lines[0]` would raise
        # IndexError) and no content to model, so it is skipped
        continue

    # Basic HTML conversion (we could just use the plain text as well, BeautifulSoup can parse it)
    # The text is escaped first so that any literal "&", "<" or ">" in a document is shown
    # as-is instead of being interpreted as markup.
    # Assume first line is the title
    title = f"<strong>{html.escape(lines[0], quote=False)}</strong>"
    paras = [f"<p>{html.escape(line, quote=False)}</p>" for line in lines[1:]]
    body = "\n".join(paras)
    display_str = f"<html><body>{title}{body}</body></html>"

    # At this stage `tokens` still holds the raw, untokenised text of the file
    raw_doc = RawDocument(metadata, tokens, display_str)

    raw_docs.append(raw_doc)

HBox(children=(FloatProgress(value=0.0, max=2225.0), HTML(value='')))




In [6]:
def show_raw_doc(doc_id=0):
    """
    Prints the metadata and the raw text of one ingested document.

    Parameters
    ----------
    doc_id: int
        Index into `raw_docs`
    """
    selected = raw_docs[doc_id]
    print(selected.metadata, "", selected.tokens, sep="\n")


# Slider covering every valid index into `raw_docs`
widgets.interact(show_raw_doc, doc_id=(0, len(raw_docs) - 1))

interactive(children=(IntSlider(value=0, description='doc_id', max=2224), Output()), _dom_classes=('widget-int…

<function __main__.show_raw_doc(doc_id=0)>

Text pre-processing and tokenisation
------

### Pre-tokenisation cleaning
- URLs/Emails

In [7]:
# Case-insensitive pattern for URLs. A match must contain "//", optionally preceded by an
# http/https/ftp scheme, followed by optional credentials, then either a dotted IP address
# or a host name ending in a TLD, then an optional port and an optional resource path.
# The pattern is assembled from adjacent raw string literals, one logical part per line.
URL_REGEX = re.compile(
    # protocol identifier
    r"(?:(?:(?:https?|ftp):)?//)"
    # user:pass authentication
    # (the second literal on this line opens the group that holds the IP-or-host alternatives)
    r"(?:\S+(?::\S*)?@)?" r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    # second alternative: a host name instead of an IP address
    r"|"
    # host & domain names, may end with dot
    # can be replaced by a shortest alternative
    # r"(?![-_])(?:[-\w\u00a1-\uffff]{0,63}[^-_]\.)+"
    # r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
    # # domain name
    # r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
    # one or more dot-terminated labels; each label is 1 to 64 characters long and
    # may not end with "-" or "_"
    r"(?:"
    r"(?:"
    r"[a-z0-9\u00a1-\uffff]"
    r"[a-z0-9\u00a1-\uffff_-]{0,62}"
    r")?"
    r"[a-z0-9\u00a1-\uffff]\."
    r")+"
    # TLD identifier name, may end with dot
    # (the second literal on this line closes the IP-or-host group)
    r"(?:[a-z\u00a1-\uffff]{2,}\.?)" r")"
    # port number (optional)
    r"(?::\d{2,5})?"
    # resource path (optional)
    r"(?:[/?#]\S*)?",
    re.IGNORECASE,
)

In [8]:
# Patterns whose matches are removed from the raw text before tokenisation,
# applied in the order listed
pre_regexes = [
    # Rough email regex
    re.compile(
        r"\b[a-z0-9._%+-]+@[a-z0-9.-]+[.][a-z]{2,}\b",
        flags=re.I,
    ),
    # URL regex
    URL_REGEX,
    # Rougher URL regex
    re.compile(
        r"\bwww[.][a-z0-9.-]+[.][a-z]{2,}\b",
        flags=re.I,
    ),
]

In [9]:
def pre_regex(doc):
    """
    Removes every match of each pattern in `pre_regexes` from the given text.

    Parameters
    ----------
    doc: str
        Raw document text

    Returns
    -------
    str
        The text with every match replaced by the empty string
    """
    cleaned = doc
    for pattern in pre_regexes:
        cleaned = pattern.sub("", cleaned)
    return cleaned


# Same documents as `raw_docs`, with the cleaned text in place of the raw text
pre_clean_docs = [
    raw_doc._replace(tokens=pre_regex(raw_doc.tokens)) for raw_doc in raw_docs
]

### Naive tokenisation (by whitespace)
- Case folding
- Strip leading/trailing non-informative punctuation from tokens
- Remove single apostrophes
- Remove single brackets within words
  - For dealing with cases like "the recipient(s)" -- Which will get tokenised to "the recipient(s" otherwise

In [10]:
# Characters stripped from both ends of every token
strip_punctuation = "'\"()[]<>?!,.:;/|_"
# [opening, closing] bracket characters that are removed from a token when unpaired
bracket_pairs = [["(", ")"], ["[", "]"]]


def naive_tokenise(doc):
    """
    Naively tokenises a document by splitting it on whitespace.

    Each token is case-folded, stripped of the leading/trailing characters in
    `strip_punctuation`, and has all apostrophes removed. If a token then contains
    only one bracket of a pair from `bracket_pairs`, that bracket is removed as well.
    Tokens that end up empty are discarded.

    Parameters
    ----------
    doc: str
        The document text

    Returns
    -------
    list of str
        The document's tokens, in their original order
    """
    cleaned_tokens = []

    for raw_token in doc.split():
        word = raw_token.casefold().strip(strip_punctuation).replace("'", "")

        for opening, closing in bracket_pairs:
            has_opening = opening in word
            has_closing = closing in word
            if has_opening and not has_closing:
                word = word.replace(opening, "")
            elif has_closing and not has_opening:
                word = word.replace(closing, "")

        if word:
            cleaned_tokens.append(word)

    return cleaned_tokens

In [11]:
# Same documents as `pre_clean_docs`, with a token list in place of the cleaned text
naive_docs = [
    clean_doc._replace(tokens=naive_tokenise(clean_doc.tokens))
    for clean_doc in pre_clean_docs
]

In [12]:
def show_naive_doc(doc_id=0):
    """
    Prints the metadata and the space-joined tokens of one naively tokenised document.

    Parameters
    ----------
    doc_id: int
        Index into `naive_docs`
    """
    selected = naive_docs[doc_id]
    print(selected.metadata, "", " ".join(selected.tokens), sep="\n")


# Slider covering every valid index into `naive_docs`
widgets.interact(show_naive_doc, doc_id=(0, len(naive_docs) - 1))

interactive(children=(IntSlider(value=0, description='doc_id', max=2224), Output()), _dom_classes=('widget-int…

<function __main__.show_naive_doc(doc_id=0)>

### Automated n-gram detection

Chunk into significant bigrams based on collocation frequency

(N.B.: Gensim implies that the input to the Phraser should be a list of single sentences, but we will feed it a list of documents instead.)

- Min count: How many documents the n-grams need to appear in

- Scoring: "default" or "npmi"

- Threshold: Intuitively, higher threshold means fewer phrases.
  - With the default scorer, this is greater than or equal to 0; with the NPMI scorer, this is in the range -1 to 1.

- Common terms: These terms will be ignored if they come between normal words.
  - E.g., if `common_terms` includes the word "of", then when the phraser sees "Wheel of Fortune" it actually evaluates _"Wheel Fortune"_ as an n-gram, putting "of" back in only at the output level to give _wheel_of_fortune_.
  - With the `common_terms` option set properly, there do not seem to be many significant trigrams and above left to pick up, so we stick with bigrams to save on runtime.

In [13]:
# Settings passed to `ignis.util.ImprovedPhraser` on every phrasing pass.

# Passed as the phraser's `min_count`
min_count = 5
# Passed as the phraser's `scoring` ("default" or "npmi")
scoring = "npmi"
# We want a relatively high threshold so that we don't start littering spurious n-grams all over our corpus, diluting our results.
# E.g., we want "Lord_of_the_Rings", but not "slightly_better_than_analysts"
threshold = 0.7
# Passed as the phraser's `common_terms`
common_terms = ["a", "an", "the", "of", "on", "in", "at"]

# Up to 4-grams, not counting `common_terms`
# NOTE(review): if each pass can join two tokens produced by the previous pass, three passes
# could in principle yield longer n-grams than 4 -- confirm against `ImprovedPhraser`.
phraser_iterations = 3

In [14]:
# One token list per document; replaced by the phraser's output after every pass
for_phrasing = [naive_doc.tokens for naive_doc in naive_docs]

for i in range(phraser_iterations):
    print(f"Iteration {i + 1}...")

    # A new phraser is built from the token lists produced by the previous pass
    phraser = ignis.util.ImprovedPhraser(
        for_phrasing,
        scoring=scoring,
        threshold=threshold,
        min_count=min_count,
        common_terms=common_terms,
        drop_non_alpha=True,
        verbose=True,
    )
    for_phrasing = phraser.find_ngrams(for_phrasing, verbose=True)

# `for_phrasing` now holds the post-phrasing tokens of each document, in document order;
# pair every token list with the metadata etc. of the document at the same position.
phrased_docs = [
    naive_docs[position]._replace(tokens=phrased_tokens)
    for position, phrased_tokens in enumerate(for_phrasing)
]

Iteration 1...
Gensim Phraser initialised. 4.848s
Improved Phraser initialised. 0.035s


HBox(children=(FloatProgress(value=0.0, max=2225.0), HTML(value='')))


Iteration 2...
Gensim Phraser initialised. 4.930s
Improved Phraser initialised. 0.007s


HBox(children=(FloatProgress(value=0.0, max=2225.0), HTML(value='')))


Iteration 3...
Gensim Phraser initialised. 4.579s
Improved Phraser initialised. 0.001s


HBox(children=(FloatProgress(value=0.0, max=2225.0), HTML(value='')))




In [15]:
def show_phrased_doc(doc_id=0):
    """
    Prints the metadata and the tokens of one phrased document.

    Tokens are joined with "/" so that multi-word tokens remain distinguishable.

    Parameters
    ----------
    doc_id: int
        Index into `phrased_docs`
    """
    selected = phrased_docs[doc_id]
    print(selected.metadata, "", "/".join(selected.tokens), sep="\n")


# Slider covering every valid index into `phrased_docs`
widgets.interact(show_phrased_doc, doc_id=(0, len(phrased_docs) - 1))

interactive(children=(IntSlider(value=0, description='doc_id', max=2224), Output()), _dom_classes=('widget-int…

<function __main__.show_phrased_doc(doc_id=0)>

In [16]:
# Trigrams and above in the corpus (with `common_terms` set properly)
# Each such token is printed once, the first time it is encountered.
seen_tokens = set()
for document in phrased_docs:
    for token in document.tokens:
        # Fewer than two spaces means fewer than three space-separated words
        if token.count(" ") < 2 or token in seen_tokens:
            continue
        print(token)
        seen_tokens.add(token)

lord of the rings
reuters news agency
founder mikhail khodorkovsky
wall street journal
gross domestic product
sports utility vehicles
radio 4s today programme
standard & poors
chancellor gerhard schroeder
judge letitia clark
gas monopoly gazprom
told bbc news
sir digby jones
president george w bush
house of lords
george w bushs
international monetary fund
tens of thousands
monetary fund imf
governor mervyn king
chambers of commerce
shadow chancellor oliver letwin
billions of dollars
chapter 11 bankruptcy
faces stiff competition
gross domestic product gdp
sir alex ferguson
tampa bay buccaneers
george w bush
jp morgan chase
lula da silva
weapons of mass destruction
tip of the iceberg
german chancellor gerhard schroeder
told the bbc news website
asian tsunami disaster
da vinci code
prisoner of azkaban
tells the story
million dollar baby
dead mans shoes
shaun of the dead
londons leicester square
pirates of the caribbean
eternal sunshine of the spotless mind
passion of the christ
howls movi

### Post-phrasing cleaning

- Remove stop words (optional)
- Remove purely numeric/non-alphabetic/single-character tokens
  - Under the assumption that significant tokens, like the "19" in "Covid 19" or the "11" in "Chapter 11 (bankruptcy)", would have been picked up by the phraser

In [17]:
# Accumulates every token that will be removed in the post-phrasing cleaning step;
# the cells below extend it via set unions.
stopset = set()

In [18]:
# Stoplist based on TF-IDF
# Not theoretically needed if using term weighting during the training process, but stopword
# removal could still help with runtimes and interpretability


import collections
import math

# Calculate IDF (For calculations of this size, the overhead of Pandas/Numpy is probably not worth it)
# `df_dict` maps each token to the number of documents it appears in at least once.
df_dict = {}
for doc in phrased_docs:
    for token in set(doc.tokens):
        df_dict[token] = df_dict.get(token, 0) + 1

# While we're at it, we could remove document-level hapax legomena
# (tokens that appear in exactly one document)
hapax = [token for token in df_dict if df_dict[token] == 1]

# IDF = natural log of (number of documents / document frequency)
token_idf = {
    token: math.log(len(phrased_docs) / doc_freq)
    for token, doc_freq in df_dict.items()
}


def tf_idf(tokens, token_idf):
    """
    Calculates the TF-IDF for each unique term in the given list of document tokens.

    The term frequency is the term's count divided by the total number of tokens
    in the document.

    Parameters
    ----------
    tokens: iterable of str
        Tokens that make up a single document
    token_idf: dict
        Mapping of terms to their global IDF values

    Returns
    -------
    dict
        Mapping of each unique term in `tokens` to its TF-IDF score
    """
    occurrences = collections.Counter(tokens)
    # A plain dict lookup is used for the IDF; in particular, accessing a Pandas Series
    # by string index is much slower than accessing a Dictionary by key
    return {
        term: occurrences[term] / len(tokens) * token_idf[term]
        for term in set(tokens)
    }

In [19]:
# Thresholding

# Number of terms with lowest TF-IDF scores to consider per document
n_lowest = 50
# The proportion of documents each of these terms must appear in to be considered a stopword
stopword_proportion = 0.3

per_doc_stopwords = []
for doc in phrased_docs:
    # token_tf_idf is a dict of token -> score
    token_tf_idf = tf_idf(doc.tokens, token_idf)
    # Sort by score, then by token. Without the secondary key, tokens with equal scores keep
    # the iteration order of the dict, which `tf_idf` fills by iterating over a set of strings;
    # that order changes between kernel sessions because of string hash randomisation, so the
    # tokens that make the `n_lowest` cut could differ from run to run.
    scores = sorted(token_tf_idf.items(), key=lambda item: (item[1], item[0]))
    lowest_n = [token for token, _score in scores[:n_lowest]]
    per_doc_stopwords.append(set(lowest_n))

# Check how many *stopword lists* each token appears in; this is more discriminative than
# checking how many actual *documents* each token appears in instead
stopword_df = {}
for per_doc in per_doc_stopwords:
    for stopword in per_doc:
        stopword_df[stopword] = stopword_df.get(stopword, 0) + 1

total_docs = len(phrased_docs)
final_stopwords = [
    (stopword, count)
    for stopword, count in stopword_df.items()
    if count / total_docs > stopword_proportion
]
# Most widespread first; equal counts are ordered alphabetically so that the printed list
# is the same on every run
final_stopwords.sort(key=lambda item: (-item[1], item[0]))

# List of removed words sorted by the number of per-document lists they appear in
# (so in roughly decreasing order of commonness)
removed_words = [stopword for stopword, count in final_stopwords]
stopset = stopset.union(removed_words)

print("Stopwords, ordered by TF-IDF:")
print(", ".join(removed_words))
print(f"({len(removed_words)})")

Stopwords, ordered by TF-IDF:
the, to, a, of, and, in, for, on, said, is, has, with, it, at, that, by, was, be, have, as, but, from, an, which, will, been, this, also, are, its, not, up, had, were, who, he, their, would, out, one, after, they, about, last, over, -
(46)


In [20]:
# `hapax` holds the tokens that appear in exactly one document
stopset = stopset | set(hapax)
print("Also removing document-level hapaxes.")

Also removing document-level hapaxes.


In [21]:
import nltk.corpus

# English stopword list provided by NLTK
nltk_stopwords = nltk.corpus.stopwords.words("english")
stopset = stopset | set(nltk_stopwords)
print("Also removing NLTK English stopwords.")

Also removing NLTK English stopwords.


In [22]:
# Corpus-specific high-frequency words
domain_stopset = ["mr"]

stopset = stopset | set(domain_stopset)
print("Domain stopset:")
print(", ".join(domain_stopset))

Domain stopset:
mr


In [23]:
# Whitelisting
# Tokens listed here are taken back out of `stopset`, whichever list added them
whitelist = []

stopset.difference_update(whitelist)

In [24]:
def second_tokenise(tokens, stopset):
    """
    Filters uninformative tokens out of a phrased document.

    A token is dropped if it is in `stopset`, if it is at most one character long,
    or if it contains no alphabetic character at all (e.g. purely numeric or purely
    punctuation tokens).

    Alphabetic characters are detected with `str.isalpha()`, so letters outside
    ASCII a-z count as alphabetic; tokens written entirely in a non-Latin script
    or in accented letters are therefore kept instead of being discarded as
    "non-alphabetic".

    Parameters
    ----------
    tokens: iterable of str
        Tokens that make up a single document
    stopset: container of str
        Tokens to remove

    Returns
    -------
    list of str
        The remaining tokens, in their original order
    """
    return [
        token
        for token in tokens
        if len(token) > 1
        and token not in stopset
        and any(char.isalpha() for char in token)
    ]

In [25]:
# Same documents as `phrased_docs`, with stopwords and non-informative tokens removed
final_docs = [
    phrased_doc._replace(tokens=second_tokenise(phrased_doc.tokens, stopset))
    for phrased_doc in phrased_docs
]

In [26]:
# See the top remaining high-frequency words (for further cleaning if necessary)
# `corpus_tf` maps each token to its total number of occurrences across all documents.
corpus_tf = {}
for doc in final_docs:
    doc_counts = collections.Counter(doc.tokens)

    for token in set(doc.tokens):
        corpus_tf[token] = corpus_tf.get(token, 0) + doc_counts[token]

# The 20 most frequent tokens, shown as the cell's output
sorted(corpus_tf.items(), key=lambda entry: entry[1], reverse=True)[:20]

[('people', 1969),
 ('us', 1908),
 ('new', 1816),
 ('year', 1607),
 ('could', 1510),
 ('first', 1282),
 ('years', 1223),
 ('two', 1175),
 ('time', 1147),
 ('government', 1023),
 ('world', 1003),
 ('uk', 956),
 ('make', 927),
 ('best', 925),
 ('get', 890),
 ('made', 856),
 ('like', 838),
 ('film', 834),
 ('game', 831),
 ('many', 829)]

In [27]:
def show_final_doc(doc_id=0):
    """
    Prints the metadata and the space-joined tokens of one fully cleaned document.

    Parameters
    ----------
    doc_id: int
        Index into `final_docs`
    """
    selected = final_docs[doc_id]
    print(selected.metadata, "", " ".join(selected.tokens), sep="\n")


# Slider covering every valid index into `final_docs`
widgets.interact(show_final_doc, doc_id=(0, len(final_docs) - 1))

interactive(children=(IntSlider(value=0, description='doc_id', max=2224), Output()), _dom_classes=('widget-int…

<function __main__.show_final_doc(doc_id=0)>

In [28]:
# Simple deduplication
# Two documents are duplicates when their token sequences are identical; the first one
# encountered is kept.
seen_docs = set()
dupe_count = 0
deduped_docs = []
for doc in final_docs:
    # Documents left without any tokens are dropped, and not counted as duplicates
    if not doc.tokens:
        continue

    # A tuple of the tokens is hashable, so it can be used as the deduplication key
    doc_hash = tuple(doc.tokens)
    if doc_hash not in seen_docs:
        seen_docs.add(doc_hash)
        deduped_docs.append(doc)
        continue

    # print(f"Duplicate document: {doc[0]['filename']}")
    dupe_count += 1

print(f"{dupe_count} dupes.")

107 dupes.


Save to Ignis Corpus
----

In [29]:
corpus = ignis.Corpus()

# Only documents with at least 5 tokens are added; the fields of each record are passed
# to `add_doc` as keyword arguments (`metadata`, `tokens`, `display_str`)
for doc in deduped_docs:
    if len(doc.tokens) >= 5:
        corpus.add_doc(**doc._asdict())

corpus.save("bbc.corpus")

In [30]:
# And make sure it loads without errors as well.
# From here on, `corpus` refers to the copy read back from "bbc.corpus".
corpus = ignis.load_corpus("bbc.corpus")

In [31]:
# Document IDs in the corpus, as a list so that they can be addressed by slider position
corpus_doc_ids = list(corpus.documents.keys())


def show_corpus_doc(index=0):
    """
    Shows one document of the loaded corpus: its metadata, its tokens, and its
    rendered HTML display string.

    Parameters
    ----------
    index: int
        Position in `corpus_doc_ids` of the document to show
    """
    doc = corpus.documents[corpus_doc_ids[index]]
    divider = "-" * 10

    # Metadata and tokens, each section followed by a blank line and set apart by dividers
    for line in (
        doc.metadata,
        "",
        divider,
        "",
        " | ".join(doc.tokens),
        "",
        divider,
        "",
    ):
        print(line)

    # Jupyter notebooks will interpret anything between $ signs as LaTeX formulae when rendering HTML output,
    # so we need to replace them with escaped $ signs (only in Jupyter environments)
    escaped_html = doc.display_str.replace("$", r"\$")
    display(HTML(escaped_html))


# Slider covering every valid position in `corpus_doc_ids`
widgets.interact(show_corpus_doc, index=(0, len(corpus_doc_ids) - 1))

interactive(children=(IntSlider(value=0, description='index', max=2117), Output()), _dom_classes=('widget-inte…

<function __main__.show_corpus_doc(index=0)>