In [1]:
from timeit import default_timer as timer
from datetime import timedelta
import pickle
from collections import Counter

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import tqdm

In [3]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.nn.functional as F

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is reused below when building batches and constructing the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device {device}")

Using device cuda


In [4]:
%load_ext autoreload
%autoreload 2

import data_handling as data
import preprocess as pp
from models import nvdm

In [5]:
# # Original data
# DATA_RAW_PATH = "./data/bds_1.txt"
# IDs, BDs = data.load_raw(DATA_RAW_PATH)

In [6]:
# Data that has already been preprocessed.
# Generated by applying pp.preprocess_text() to each BD,
# then saved to a TSV.
# load_raw returns two sequences that are consumed in parallel below
# (presumably one ID per BD, in file order -- confirm in data_handling).
DATA_CLEAN_PATH = "./data/bds_1_clean.txt"
IDs_raw, BDs_raw = data.load_raw(DATA_CLEAN_PATH)

In [7]:
# Drop entries whose BD is empty, keeping IDs and BDs aligned.
non_empty_pairs = [
    (entry_id, description)
    for entry_id, description in zip(IDs_raw, BDs_raw)
    if len(description) > 0
]
IDs = [entry_id for entry_id, _ in non_empty_pairs]
BDs = [description for _, description in non_empty_pairs]

# Both counts must match: one ID per retained BD.
print(len(IDs), len(BDs))

2034 2034


Following PyTorch's tutorial for data setup.
https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

In [8]:
# Build frequency table
# (cleaned data joins tokens by space)
# counter = Counter()
# for desc in BDs:
#     counter.update(desc.split(" "))

In [9]:
# PyTorch torchtext vocabulary converts tokens to indices and vice versa.
# Also has an '<unk>' for OOV words (might be useful later).
# vocab = Vocab(counter,
#               max_size=10000,
#               min_freq=1,
#               specials=['<unk>'])
# print(len(vocab))
# actual is 70770 without max_size restriction

# Save the vocab to file
# with open("./vocabs/vocab_bds_1_clean_10000.pickle", "wb") as f:
#     pickle.dump(vocab, f)

# Load the vocab from file.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# vocab files that were produced locally (e.g. by the commented-out dump above),
# never files from an untrusted source.
with open("./vocabs/vocab_bds_1_clean_10000.pickle", "rb") as f:
    vocab = pickle.load(f)

# Vocabulary size; the construction code above caps it at 10000 tokens and
# adds the '<unk>' special.
print(len(vocab))

10001


In [10]:
class BDDataset(Dataset):
    """ Thin map-style dataset around an indexable collection of passages.

    It adds no behaviour of its own; it only exists so the passages can be
    handed to a PyTorch DataLoader.
    """

    def __init__(self, data):
        # Keep a reference to the passages as given (no copy, no preprocessing).
        self.data = data

    def __getitem__(self, idx):
        """ Return the passage stored at position `idx`. """
        passages = self.data
        return passages[idx]

    def __len__(self):
        """ Return the number of stored passages. """
        passages = self.data
        return len(passages)

In [11]:
# "Preprocessing" function: just splits the text
# The file's text is already preprocessed.
def text_pipeline(text):
    """ Map a space-joined string of tokens to a list of vocabulary indices.

    Tokens are obtained with `text.split(" ")` and looked up one by one in the
    notebook-level `vocab`.
    """
    token_ids = []
    for token in text.split(" "):
        token_ids.append(vocab[token])
    return token_ids

def collate_batch(batch):
    """ Turn a batch of texts into the (flat indices, offsets) pair used by EmbeddingBag-style models.

    Adapted from https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html,
    minus the labels, which are not needed here.

    batch: iterable of texts, each passed through `text_pipeline`.

    Returns a tuple (text, offsets), both moved to `device`:
      - text: all token indices of the batch concatenated into one 1-D int64 tensor.
      - offsets: position in `text` at which each individual text starts.
    """
    encoded = [torch.tensor(text_pipeline(raw_text), dtype=torch.int64) for raw_text in batch]

    # Start of text i = total length of texts 0..i-1, i.e. the cumulative sum
    # of [0, len_0, len_1, ...] with the final length dropped.
    lengths = [0] + [tokens.size(0) for tokens in encoded]
    starts = torch.tensor(lengths[:-1]).cumsum(dim=0)

    flat_tokens = torch.cat(encoded)
    return flat_tokens.to(device), starts.to(device)

In [12]:
# Create data loader to iterate over dataset in batches during training/evaluation.
# collate_batch converts each batch of raw texts into (token indices, offsets) tensors.
dataset = BDDataset(BDs)
batch_size = 64
# NOTE(review): shuffle=False visits the batches in the same order every epoch --
# confirm this is intended for training.
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
# Model hyperparameters, passed to nvdm.NVDM when a model is constructed.
hidden_size = 500
num_topics = 10

In [17]:
# Training setup

# Total number of epochs
outer_epochs = 200

# Epochs for training the encoder/decoder on each alternation.
# NOTE: not used by the loop below, which trains the encoder and decoder
# jointly; kept for experiments with the alternating schedule.
inner_epochs = 1

model = nvdm.NVDM(len(vocab), hidden_size, num_topics, 1, device)
model = model.to(device)
model.train()

# Trains both the encoder and decoder at the same time.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Number of batches per epoch. It does not change between epochs, so compute
# it once; it is the denominator of the per-batch averages printed below.
n = len(data_loader)

start_time = timer()

for epoch in range(outer_epochs):

    # Running sums over the batches of this epoch (for reporting only).
    loss_sum = 0.0
    rec_sum = 0.0
    kl_sum = 0.0

    for text, offsets in data_loader:
        # No-ops when collate_batch has already placed the tensors on `device`.
        text = text.to(device)
        offsets = offsets.to(device)

        optimizer.zero_grad()
        # The model returns its loss terms under the keys "total", "rec" and "kl";
        # each is summed to a scalar before use.
        loss_dict = model(text, offsets, kl_weight=1.0)
        loss = loss_dict["total"].sum()
        loss.backward()

        optimizer.step()

        # For printing
        loss_sum += loss.item()
        rec_sum += loss_dict["rec"].sum().item()
        kl_sum += loss_dict["kl"].sum().item()

    print(f"[Time: {timedelta(seconds=timer() - start_time)}, Epoch {epoch + 1}] Loss {loss_sum/n}, Rec {rec_sum/n}, KL {kl_sum/n}")

[Time: 0:00:07.077526, Epoch 1] Loss 3533693.89453125, Rec 3532094.71875, KL 1599.213567018509
[Time: 0:00:13.945147, Epoch 2] Loss 3339778.41796875, Rec 3337470.7890625, KL 2307.6848335266113
[Time: 0:00:20.647853, Epoch 3] Loss 3054039.77734375, Rec 3050626.8125, KL 3412.9466705322266
[Time: 0:00:27.374750, Epoch 4] Loss 2900912.69921875, Rec 2896842.7109375, KL 4070.023857116699
[Time: 0:00:34.038899, Epoch 5] Loss 2848929.4765625, Rec 2844505.6953125, KL 4423.774795532227
[Time: 0:00:40.745445, Epoch 6] Loss 2833701.453125, Rec 2829115.0859375, KL 4586.375625610352
[Time: 0:00:47.944588, Epoch 7] Loss 2828658.16796875, Rec 2824032.140625, KL 4625.994033813477
[Time: 0:00:54.914819, Epoch 8] Loss 2825123.140625, Rec 2820501.44921875, KL 4621.645446777344
[Time: 0:01:01.644543, Epoch 9] Loss 2822650.6875, Rec 2818091.40625, KL 4559.269622802734


KeyboardInterrupt: 

Some observations:

- The original paper alternates between the encoder and decoder when training, i.e., it trains the decoder first for some (e.g. 10) iterations, fixing the encoder's parameters. Then it trains the encoder, fixing the decoder's parameters. This is one epoch, which is repeated some number of times until convergence. However, this results in poor training performance: the KL is observed to fluctuate, and the encoder and decoder are unable to jointly converge. By training them all together, both the reconstruction loss and KL appear to go down.
- Right now we weight the reconstruction and KL losses equally: $L_{total} = L_{rec} + L_{KL}$. We could define a hyperparameter $\beta$ so that $L_{total} = L_{rec} + \beta L_{KL}$, which might help balance the two.

In [None]:
# Checkpoint to analyse. Uncomment the save line to (re)create it from the
# model currently in memory.
MODELSAVE_PATH = "./modelsaves/nvdm_k10_300epochs.pt"
# torch.save(model.state_dict(), MODELSAVE_PATH)

model = nvdm.NVDM(len(vocab), hidden_size, num_topics, 1, device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved from a CUDA run can also be loaded on a CPU-only machine.
model.load_state_dict(torch.load(MODELSAVE_PATH, map_location=device))
# Same placement as in the training cell.
model = model.to(device)
model.eval()

In [None]:
# Extract the vocab-topic matrix (known as R in the paper).
# It has dimensions |V| x K: vocab size x number of topics
decoder = model.decoder[0]
# Work on a detached copy so the analysis below cannot touch the model's parameters.
weights = decoder.weight.data.detach().clone()
# Shown as the cell output to check the dimensions.
weights.size()

In [None]:
# Look at some words
# manual KNN: nearest vocabulary entries to a few probe words, comparing rows
# of the vocab-topic weight matrix.
from nltk.stem import PorterStemmer
PORTER_STEMMER = PorterStemmer()

# Set of words used in the original paper
candidates = ["weapons", "medical", "companies", "define", "israel", "book"]

# Number of entries listed per probe word. The probe itself is part of the
# ranking (its cosine similarity with itself is 1).
N_NEIGHBORS = 15

for candidate in candidates:
    # Probe words are stemmed before lookup (presumably the vocabulary holds
    # stemmed tokens -- confirm against pp.preprocess_text).
    test_word = PORTER_STEMMER.stem(candidate)
    idx = vocab.stoi[test_word]
    print(test_word, idx)

    # Cosine similarity between the probe's row and every row of the matrix,
    # then keep the N_NEIGHBORS highest.
    sims = F.cosine_similarity(weights[idx].unsqueeze(0), weights)
    sim_vals, sim_idxs = torch.topk(sims, N_NEIGHBORS)

    # Show each neighbouring word and its score, most similar first.
    for neighbor_idx, score in zip(sim_idxs, sim_vals):
        print(f"{vocab.itos[neighbor_idx]}\t{score.item()}")

    print("-----------")

In [None]:
# For each topic (column of the weight matrix), list the words whose weights
# have the largest absolute value.

# Number of words listed per topic.
N_TOP_WORDS = 30

V, K = weights.size()
for topic in range(K):
    print(f"Topic {topic+1}")
    vals, idxs = torch.topk(torch.abs(weights[:, topic]), N_TOP_WORDS)
    # A separate name for the word index keeps the outer loop variable intact.
    for word_idx, v in zip(idxs, vals):
        print(f"{vocab.itos[word_idx]}\t{v.item()}")
    print("------------")

# Part 2 Full Evaluation

In [None]:
def analysis(model, candidates=None, n_neighbors=15, n_top_words=30):
    """ Qualitative analysis of topic model.

    Prints two reports built from the weight matrix of `model.decoder[0]`
    (the vocab-topic matrix, known as R in the paper, of dimensions |V| x K):

    1. For every candidate word, the `n_neighbors` vocabulary entries whose
       rows are most similar to the candidate's row (cosine similarity).
    2. For every topic, the `n_top_words` words with the largest absolute weight.

    model: model exposing `decoder[0].weight`.
    candidates: words to probe; they are stemmed before lookup. Defaults to
        the set of words used in the original NVDM paper.
    n_neighbors: number of entries listed per candidate (the candidate itself
        is part of the ranking).
    n_top_words: number of words listed per topic.

    Relies on the notebook-level `vocab` (for `stoi` / `itos`) and on
    `PorterStemmer` being imported. Returns nothing.
    """
    if candidates is None:
        # Set of words used in the original NVDM paper
        candidates = ["weapons", "medical", "companies", "define", "israel", "book"]

    stemmer = PorterStemmer()

    # Detached copy, so nothing below can modify the model's parameters.
    decoder = model.decoder[0]
    weights = decoder.weight.data.detach().clone()

    for candidate in candidates:
        test_word = stemmer.stem(candidate)
        idx = vocab.stoi[test_word]
        print(test_word, idx)

        # Cosine similarity between the candidate's row and every row.
        sims = F.cosine_similarity(weights[idx].unsqueeze(0), weights)
        sim_vals, sim_idxs = torch.topk(sims, n_neighbors)

        # Show each neighbouring word and its score, most similar first.
        for neighbor_idx, score in zip(sim_idxs, sim_vals):
            print(f"{vocab.itos[neighbor_idx]}\t{score.item()}")

        print("-----------")

    _, num_topics = weights.size()
    for topic in range(num_topics):
        print(f"Topic {topic+1}")
        vals, idxs = torch.topk(torch.abs(weights[:, topic]), n_top_words)
        # A separate name for the word index keeps the topic counter intact.
        for word_idx, v in zip(idxs, vals):
            print(f"{vocab.itos[word_idx]}\t{v.item()}")
        print("------------")

In [None]:
# Topic coherence.
def umass_score(tf):
    """ Compute the pairwise UMass scores used for topic coherence.

    Ref: http://qpleple.com/topic-coherence-to-evaluate-topic-models/

    tf: term-frequency matrix (documents x vocabulary).
        Each i^th row is the BOW representation of the i^th document.
        Dense arrays are used as-is; objects exposing `.toarray()`
        (e.g. scipy sparse matrices) are densified first.

    Returns a (V, V) float array whose entry [i, j] is
        log((D(wi, wj) + 1) / D(wj))
    where D(w) is the number of documents containing w and D(wi, wj) the
    number of documents containing both words. Columns of words that occur in
    no document evaluate to +inf (division by zero).
    """
    if hasattr(tf, "toarray"):
        tf = tf.toarray()

    # Boolean presence matrix: does document d contain word w?
    present = np.asarray(tf) > 0

    # D(wi): count of documents containing the word wi (i.e. df), one entry
    # per vocabulary word. The sum over axis 0 is already a length-V vector;
    # indexing it with [0] would keep only the count of word 0.
    Dwi = present.sum(axis=0)

    W_bin = present.astype(np.float64)

    # D(wi, wj): count of documents containing both words wi and wj
    Dwi_wj = W_bin.T @ W_bin

    # Broadcasting divides column j by D(wj).
    score_umass = np.log((Dwi_wj + 1) / Dwi)

    return score_umass


def topic_coherence(topic_vocab, n_top_words, pair_score):
    """ Sum pairwise scores over the top words of every topic.

    topic_vocab: dimensions (number of topics, vocabulary size).
        model.components_ for LDA, and the "semantic embedding" matrix in the decoder for NVDM.
    n_top_words: how many of the highest-weighted words of each topic to use.
    pair_score: matrix of scores (e.g. UMass), indexed as [word, higher-ranked word].

    Returns a list with one coherence value per topic, in topic order.
    """
    def coherence_of(topic):
        # Indices of the n_top_words largest weights, highest first.
        ranked = topic.argsort()[:-n_top_words - 1:-1]
        # Every word is paired with each word ranked above it.
        return sum(
            pair_score[ranked[later], ranked[earlier]]
            for later in range(len(ranked))
            for earlier in range(later)
        )

    return [coherence_of(topic) for topic in topic_vocab]

In [None]:
import os
import re

from nltk.stem import PorterStemmer


# Directory holding one saved state dict per number of topics.
MODELSAVE_PATH = "./modelsaves"

# Checkpoint names look like "nvdm_k10_300epochs.pt": the second "_"-separated
# field is "k" followed by the number of topics.
CHECKPOINT_NAME = re.compile(r"[^_]*_k(\d+)(?:[_.]|$)")

# Loaded models keyed by their number of topics, plus the list of K values found.
models_k = dict()
k_values = []

# Sorted so the load order does not depend on the OS directory order.
for filename in sorted(os.listdir(MODELSAVE_PATH)):

    name_match = CHECKPOINT_NAME.match(filename)
    if name_match is None:
        # Stray entries (e.g. .DS_Store, .ipynb_checkpoints) are not checkpoints.
        print(f"Skipping {filename!r}: not a recognised checkpoint name")
        continue

    num_topics = int(name_match.group(1))
    # Record each K once, even if several checkpoints share it
    # (the last one loaded is the one kept in models_k).
    if num_topics not in models_k:
        k_values.append(num_topics)

    # Everything is loaded on the CPU; the evaluation below converts weights to NumPy.
    model = nvdm.NVDM(len(vocab), hidden_size, num_topics, 1, "cpu")
    model.load_state_dict(torch.load(os.path.join(MODELSAVE_PATH, filename), map_location="cpu"))
    model.eval()
    models_k[num_topics] = model


In [None]:
# Build the documents x vocabulary bag-of-words count matrix by hand,
# reusing the existing Vocab's token-to-index mapping.
n_docs, vocab_size = len(BDs), len(vocab)
bow_mat = np.zeros((n_docs, vocab_size))
for doc_row, description in enumerate(BDs):
    # Count how often each vocabulary index occurs in this document.
    index_counts = Counter(vocab.lookup_indices(description.split(" ")))
    for token_idx, occurrences in index_counts.items():
        bow_mat[doc_row, token_idx] = occurrences
bow_mat.shape

In [None]:
# Pairwise UMass scores for the whole vocabulary, computed once from the BOW
# matrix and reused for every model evaluated below.
score_umass_mat = umass_score(bow_mat)

In [None]:
# Evaluate the models in increasing order of K, so tc_values lines up with k_values.
k_values.sort()
tc_values = []

for k in k_values:
    print(f'Running k = {k}')

    # Decoder weights transposed into one row per topic, as topic_coherence expects.
    topic_vocab_mat = models_k[k].decoder[0].weight.data.detach().clone().numpy().T

    # One coherence value per topic (top 10 words), summarised by the median.
    per_topic_coherence = topic_coherence(topic_vocab_mat, 10, score_umass_mat)
    tc_values.append(np.median(per_topic_coherence))

In [None]:
# Median UMass coherence as a function of the number of topics.
fig, ax = plt.subplots()
ax.plot(k_values, tc_values, marker="o")
ax.set_xlabel("Number of topics (K)")
ax.set_ylabel("Median UMass coherence (top 10 words)")
ax.set_title("NVDM topic coherence vs. number of topics");

In [None]:
def plot_top_words(topic_vocab, feature_names, n_top_words, title):
    """ Draw one horizontal bar chart per topic showing its highest-weighted words.

    topic_vocab: array-like of dimensions (number of topics, vocabulary size);
        each row must support `.argsort()` and integer-array indexing.
    feature_names: sequence mapping a vocabulary index to its token.
    n_top_words: number of words shown per topic.
    title: figure-level title.

    The charts are laid out on a grid with 5 columns and displayed with
    `plt.show()`; nothing is returned.
    """
    K = len(topic_vocab)
    n_x = 5
    n_y = int(np.ceil(K / n_x))
    # squeeze=False always yields a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(n_y, n_x, figsize=(2.5 * n_x, 4 * n_y), sharex=True, squeeze=False)
    axes = axes.flatten()

    # The title belongs to the whole figure, so set it once.
    fig.suptitle(title, fontsize=20)

    for topic_idx, topic in enumerate(topic_vocab):
        # Indices of the n_top_words largest weights, highest first.
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        top_weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, top_weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 14})
        # Highest-weighted word at the top of each chart.
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=12)
        for side in 'top right left'.split():
            ax.spines[side].set_visible(False)

    # When K is not a multiple of n_x the last row has unused slots: hide them.
    # With sharex=True only the bottom row shows x tick labels, so turn the
    # labels back on for the chart directly above each hidden slot.
    for spare_idx in range(K, len(axes)):
        axes[spare_idx].set_visible(False)
        above_idx = spare_idx - n_x
        if above_idx >= 0:
            axes[above_idx].tick_params(axis='x', labelbottom=True)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

In [None]:
# Index-aligned token list: feature_names[i] is the token with vocabulary index i.
feature_names = [vocab.itos[i] for i in range(0, len(vocab))]

In [None]:
# Pick one of the loaded models (keyed by its number of topics) to visualise.
k = 10
this_model = models_k[k]
# Weight matrix of the first element of the decoder, copied out as a NumPy array.
decoder = this_model.decoder[0]
weights = decoder.weight.data.detach().clone().numpy()
# Transposed so that each row is one topic, the layout plot_top_words iterates over.
topic_vocab_mat = weights.T

# Show the 10 highest-weighted words of every topic.
plot_top_words(topic_vocab_mat, feature_names, 10, f"NVDM K={k} Topics")