In [1]:
# Tomotopy HLDA Test
import glob
import html
import pathlib
import re
import string
import time

import nltk
import tomotopy as tp
from ipywidgets import widgets
from IPython.core.display import HTML, display

In [2]:
# Configuration -- everything a reader might want to tune lives in this cell

# Folder holding the BBC "tech" articles; every *.txt file in it is ingested.
# NOTE: machine-specific absolute path -- adjust for your environment.
data_path = pathlib.Path("E:/Datasets/BBC/tech")

# Seed handed to the topic model so training runs can be repeated
model_seed = 11399

# Pre-processing switches (changing either one means the model must be retrained)
do_stemming = False
extra_tokenise = False

# When train_and_save is True the model is trained and written to model_file;
# when it is False the model is read back from model_file instead
train_and_save = True
model_file = "bbc_model.bin"

In [3]:
# Tomotopy hierarchical LDA model: depth 3, seeded from the config cell
# N.B.: replaced further down by the saved model when train_and_save is False
model = tp.HLDAModel(
    depth=3,
    seed=model_seed,
)

In [4]:
# Ingest BBC data
#
# Reads every *.txt file under data_path, turns it into a list of cleaned
# tokens and adds it to `model` as one document.
# NOTE: documents are appended to the existing `model`, so re-running this cell
# without re-creating the model first adds every document a second time.

# Stopwords
stopset = set(nltk.corpus.stopwords.words("english") + list(string.punctuation))
# Pronouns, titles
stopset.update(
    ["i", "i'm", "i'd", "i've", "i'll"]
    + ["she", "she's", "she'd", "she'll"]
    + ["he", "he's", "he'd", "he'll"]
    + ["they", "they're", "they'd", "they'll"]
    + ["mr", "dr"]
)
# Modals
stopset.update(["would", "could", "should", "shall", "can"])
# Corpus-specific
stopset.update(["will", "also", "said"])

# Matches tokens made up solely of non-word characters and/or digits
_noise_token = re.compile(r"^(\W|\d)+$")


def _clean_tokens(raw_tokens):
    """Strip leading/trailing punctuation, then drop empty, stopword and symbol/digit-only tokens."""
    stripped = (token.strip(string.punctuation) for token in raw_tokens)
    return [
        token
        for token in stripped
        if token and token not in stopset and not _noise_token.match(token)
    ]


# Stemming
if do_stemming:
    # Save a representative full token for each stem (for friendly display).
    # Each key -> 2nd level dictionary of counts for full forms
    stem_to_word_count = {}
    stemmer = nltk.stem.snowball.SnowballStemmer("english")

# Iterate over data.
# glob returns paths in arbitrary order; sorting keeps document indices (and
# therefore the seeded training run and the document slider) stable between
# runs and machines.
for file in sorted(glob.glob(f"{data_path}/*.txt")):
    # Read raw bytes so the result does not depend on the platform's default
    # text encoding and cannot fail on undecodable bytes.
    with open(file, "rb") as f:
        raw = f.read()

    # Keep ASCII only: every non-ASCII byte is dropped
    doc = raw.decode("ascii", "ignore")

    # Case folding, then removal of leading/trailing punctuation and stopwords
    tokens = _clean_tokens(doc.casefold().split())

    # Extra tokenisation, followed by a second cleaning pass
    if extra_tokenise:
        tokens = _clean_tokens(nltk.word_tokenize(" ".join(tokens)))

    # Stemming
    if do_stemming:
        stems = []
        for token in tokens:
            stem = stemmer.stem(token)

            # Count how often each full form produced this stem
            forms = stem_to_word_count.setdefault(stem, {})
            forms[token] = forms.get(token, 0) + 1

            stems.append(stem)

        tokens = stems

    # Add to model
    model.add_doc(tokens)

# Flatten stem_to_word_count to the full word with the highest count
# (on a tie, the form that was seen first wins)
if do_stemming:
    stem_to_word = {
        stem: max(counts, key=counts.get)
        for stem, counts in stem_to_word_count.items()
    }

In [5]:
# Train the model in chunks of 50 iterations (500 in total) and save it,
# or -- when train_and_save is False -- restore it from model_file
if not train_and_save:
    model = tp.HLDAModel.load(model_file)
else:
    for completed in range(50, 501, 50):
        tick = time.perf_counter()
        model.train(50)
        took = time.perf_counter() - tick
        print(f"Iteration: {completed}\tLog-likelihood: {model.ll_per_word}\tTime: {took:.3f}s", flush=True)
    print(f"Saving to {model_file}.")
    model.save(model_file)

Iteration: 0	Log-likelihood: -8.233996412263439	Time: 8.428s
Iteration: 50	Log-likelihood: -8.143486634034783	Time: 8.531s
Iteration: 100	Log-likelihood: -8.079864114218084	Time: 8.416s
Iteration: 150	Log-likelihood: -8.052688681266682	Time: 8.324s
Iteration: 200	Log-likelihood: -8.027816234759872	Time: 8.438s
Iteration: 250	Log-likelihood: -8.02535193768052	Time: 8.424s
Iteration: 300	Log-likelihood: -8.024104181550895	Time: 8.422s
Iteration: 350	Log-likelihood: -8.013049786072353	Time: 8.426s
Iteration: 400	Log-likelihood: -8.005423091512514	Time: 8.446s
Iteration: 450	Log-likelihood: -8.013088026966864	Time: 8.417s
Saving to bbc_model.bin.


Results
=======

In [6]:
# Utils
def word_by_id(word_id):
    """Return the entry of the global ``model``'s vocabulary stored under ``word_id``."""
    vocabulary = model.vocabs
    return vocabulary[word_id]

In [7]:
# Results by topic
def print_with_parents(topic_id):
    """Print the top-10 words of ``topic_id`` and of all its ancestors, root first.

    One line is printed per topic in the form ``Level <n>: w1, w2, ...``.
    When ``do_stemming`` is set, each stem is replaced by its entry in
    ``stem_to_word`` before printing.
    """
    # Walk upwards until parent_topic reports a negative id (no parent)
    lineage = [topic_id]
    while True:
        parent = model.parent_topic(lineage[-1])
        if parent < 0:
            break
        lineage.append(parent)

    # The walk collected child -> root; print root -> child
    for node in reversed(lineage):
        top_words = [entry[0] for entry in model.get_topic_words(node, top_n=10)]

        # Swap stems for their most common full form if necessary
        if do_stemming:
            top_words = [stem_to_word[stem] for stem in top_words]

        joined = ", ".join(top_words)
        print(f"Level {model.level(node)}: {joined}")


# Print every live topic, preceded by the topics on its path from the root
for k in filter(model.is_live_topic, range(model.k)):
    print(f"Topic {k}")
    print("=-=-=-=-=-=-=")
    print_with_parents(k)
    print()

Topic 0
=-=-=-=-=-=-=
Level 0: people, one, technology, new, many, use, get, make, users, like

Topic 8
=-=-=-=-=-=-=
Level 0: people, one, technology, new, many, use, get, make, users, like
Level 1: music, video, digital, players, devices, market, consumer, technologies, portable, year

Topic 9
=-=-=-=-=-=-=
Level 0: people, one, technology, new, many, use, get, make, users, like
Level 1: music, technology, media, control, player, voice, digital, firms, consumer, able

Topic 10
=-=-=-=-=-=-=
Level 0: people, one, technology, new, many, use, get, make, users, like
Level 1: tv, project, viewers, media, allow, editing, wyver, drama, productions, audiences

Topic 11
=-=-=-=-=-=-=
Level 0: people, one, technology, new, many, use, get, make, users, like
Level 1: pacific, pc, assault, finest, wartime, feel, action, along, hour, rather

Topic 12
=-=-=-=-=-=-=
Level 0: people, one, technology, new, many, use, get, make, users, like
Level 1: robot, human, humans, robots, interaction, professor,

Level 0: people, one, technology, new, many, use, get, make, users, like
Level 1: creative, technology, art, game, artists, stone, industries, engine, understand, worlds
Level 2: river, thirst, practices, arm, 21st, commonly, navigated, bell, keen, bug

Topic 101
=-=-=-=-=-=-=
Level 0: people, one, technology, new, many, use, get, make, users, like
Level 1: digital, network, cinemas, uk, films, film, cinema, prints, print, projectors
Level 2: boxes, sky, tv, cards, set-top, loyalty, viewers, content, whalley, rewarded

Topic 102
=-=-=-=-=-=-=
Level 0: people, one, technology, new, many, use, get, make, users, like
Level 1: pacific, pc, assault, finest, wartime, feel, action, along, hour, rather
Level 2: duty, call, game, games, little, honor, titles, presentation, shoot, atmosphere

Topic 103
=-=-=-=-=-=-=
Level 0: people, one, technology, new, many, use, get, make, users, like
Level 1: content, internet, regulation, net, tv, providers, consumers, currie, lord, regulate
Level 2: progra

Level 1: image, dutch, created, city, lyons, images, gigapixel, tno, viewers, taken
Level 2: piero, bbc, detail, analysis, camera, sport, sports, broadcast, england, wealth

Topic 184
=-=-=-=-=-=-=
Level 0: people, one, technology, new, many, use, get, make, users, like
Level 1: cash, machines, windows, viruses, banks, networks, security, virus, system, risks
Level 2: posters, campaign, phone, pass, stations, port, fitted, transport, beam, londoners

Topic 185
=-=-=-=-=-=-=
Level 0: people, one, technology, new, many, use, get, make, users, like
Level 1: music, video, digital, players, devices, market, consumer, technologies, portable, year
Level 2: mobile, phones, mobiles, camera, multimedia, phone, people, cameras, gartner, sold

Topic 186
=-=-=-=-=-=-=
Level 0: people, one, technology, new, many, use, get, make, users, like
Level 1: released, world, title, confirmed, pc, set, wars, ico, ds, ps2
Level 2: calls, broadband, voip, service, phone, wanadoo, telephony, home, maker, applica

In [8]:
# Interactive results by document

# Display colour for each hierarchy level
colour_map = {0: "blue", 1: "red", 2: "green"}

# Lookup tables for live topics only: topic id -> level, topic id -> top-10 words
topic_to_level = {}
topic_to_words = {}
for k in filter(model.is_live_topic, range(model.k)):
    topic_to_level[k] = model.level(k)

    top_words = [entry[0] for entry in model.get_topic_words(k, top_n=10)]

    # Swap stems for their most common full form if necessary
    topic_to_words[k] = (
        [stem_to_word[stem] for stem in top_words] if do_stemming else top_words
    )

# Each document
def show_doc(d=0):
    """Display document ``d`` of ``model`` as colour-coded HTML.

    One heading is shown per distinct topic used by the document, ordered and
    labelled by that topic's level in ``topic_to_level``. The processed words
    follow, each coloured by the level of the topic it was assigned to.

    Args:
        d: Index into ``model.docs``.

    Raises:
        KeyError: If the document uses a topic that is missing from
            ``topic_to_level`` or ``topic_to_words``, or (when stemming is
            enabled) a stem that is missing from ``stem_to_word``.
    """
    doc = model.docs[d]

    def colour_for(level):
        # Levels without an entry in colour_map fall back to black instead of failing
        return colour_map.get(level, "black")

    # Order the document's distinct topics by their recorded level. Nothing
    # visible here guarantees that topic ids increase with depth, or that a
    # document has words at every level, so the position in an id-sorted list
    # must not be used as the level: heading and word colours would disagree.
    doc_topics = sorted(
        set(doc.topics), key=lambda topic: (topic_to_level[topic], topic)
    )

    # Header
    for topic in doc_topics:
        level = topic_to_level[topic]
        # Words come from the corpus, so escape them before embedding in HTML
        top_words = ", ".join(html.escape(word) for word in topic_to_words[topic])
        output = (
            f"<h{level+1}><span style='color:{colour_for(level)}'>"
            f"Topic {topic} (Level {level}): "
            f"{top_words}"
            f"</span></h{level+1}>"
        )
        display(HTML(output))

    display(HTML("<hr/><h5>Processed Document</h5>"))

    # Document words
    words = [word_by_id(x) for x in doc.words]
    if do_stemming:
        words = [stem_to_word[x] for x in words]

    # One coloured span per word; doc.topics supplies the topic of each word
    word_html = [
        f"<span style='color: {colour_for(topic_to_level[topic])}'>{html.escape(word)}</span>"
        for word, topic in zip(words, doc.topics)
    ]

    display(HTML(" ".join(word_html)))

In [9]:
# Slider over every document index in the model
last_doc_index = len(model.docs) - 1
widgets.interact(show_doc, d=(0, last_doc_index))

interactive(children=(IntSlider(value=0, description='d', max=400), Output()), _dom_classes=('widget-interact'…

<function __main__.show_doc(d=0)>