In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.compat import v1 as tf1
from tensorflow.keras import layers as tfkl
import pandas as pd

tfb = tfp.bijectors
tfd = tfp.distributions
tfk = tfp.math.psd_kernels

from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
from sklearn import metrics

# `imp` is deprecated and was removed in Python 3.12; importlib provides `reload`.
from importlib import reload

import sys
sys.path.append('../src/')
import correlated_topic_model as ctmd
import dynamic_correlated_topic_model as dctm

# DCTM SotU

Download the State of the Union corpus from https://www.kaggle.com/rtatman/state-of-the-union-corpus-1989-2017. Extract the archive, then set `path` in the next cell to the `sotu` folder contained in the zip.

In [None]:
# Load the State of the Union corpus from the extracted `sotu` folder.
# The folder defaults to './data/sotu'; set the SOTU_PATH environment variable
# to use another location instead of hardcoding a machine-specific absolute path.
import os

import datasets

# os.path.join(..., '') appends a trailing separator if one is missing. The
# original path ended with one, and get_sotu (not shown in this notebook) may
# build file names by plain string concatenation.
path = os.path.join(os.environ.get('SOTU_PATH', './data/sotu'), '')

df, corpus, vocabulary = datasets.get_sotu(path)

In [None]:
# Importing the submodule explicitly guarantees `sklearn.preprocessing` is loaded
# (a bare `import sklearn` does not) while still binding the `sklearn` name.
import sklearn.preprocessing

# Rescale the years to [-1, 1]. `feature_range` is documented as a tuple.
scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(-1, 1))

# Go through the underlying NumPy array before adding the column axis:
# multi-dimensional indexing on a Series (`df.years[:, None]`) raises on pandas >= 2.0.
index_points = scaler.fit_transform(df.years.values[:, None])

# Alternatives kept from the original notebook (unscaled index points):
# index_points = year.astype(np.float64)[:, None]
# index_points = df.years.values.astype(np.float64)[:, None]
# inducing_index_points = np.unique(index_points)[:, None]

In [None]:
# Dense copy of the corpus with a singleton axis inserted before the last one,
# i.e. shape (n_rows, 1, n_cols). `toarray()` returns a plain ndarray, whereas
# `todense()` returns the legacy `np.matrix` type.
X = np.expand_dims(corpus.toarray().astype(np.float64), -2)

# Seed NumPy's global RNG before splitting (the split helper is not shown here;
# presumably it draws from this RNG — TODO confirm).
np.random.seed(42)

(X_tr, X_ts, index_tr, index_ts, X_tr_sorted, X_ts_sorted,
 index_tr_sorted, index_ts_sorted
) = datasets.train_test_split(X, index_points)


def inverse_transform_fn(x):
    """Undo the scaling of `x` and parse its first column as '%Y' datetimes.

    `scaler.inverse_transform` returns floats, so the values are rounded to
    whole integer years before parsing; a float such as 1990.0 would otherwise
    be handed to the '%Y' parser with a fractional part.
    """
    years = np.rint(scaler.inverse_transform(x)[:, 0]).astype(int)
    return pd.to_datetime(years, format='%Y')


# One row per document (singleton axis dropped) plus the year of each document.
df_train = pd.DataFrame(X_tr_sorted[:, 0, :])
df_train['years'] = inverse_transform_fn(index_tr_sorted)

df_test = pd.DataFrame(X_ts_sorted[:, 0, :])
df_test['years'] = inverse_transform_fn(index_ts_sorted)

print("Dataset shape: tr: {}, ts: {}".format(X_tr.shape, X_ts.shape))

Save the data before training.

In [None]:
# from scipy import sparse as sp
# dok_tr = sp.dok_matrix(X_tr_sorted[:, 0, :])
# dok_ts = sp.dok_matrix(X_ts_sorted[:, 0, :])

# name = 'sotu'
# save_pickle(dok_tr, '{}_tr_doc.pkl'.format(name))
# save_pickle(dok_ts, '{}_ts_doc.pkl'.format(name))
# save_pickle(vocabulary, '{}_vocabulary.pkl'.format(name))

# save_pickle(index_tr, '{}_tr_index.pkl'.format(name))
# save_pickle(index_ts, '{}_ts_index.pkl'.format(name))

# X_sorted = np.vstack((X_tr_sorted[:, 0, :], X_ts_sorted[:, 0, :]))
# print_to_file_for_gdtm(
#     df_train.append(df_test),
#     vocabulary,
#     sp.dok_matrix(X_sorted),
#     filename='sotu_all',
#     path='../data/')

Train the model.

In [None]:
# Mini-batch input pipeline over (document, index point) pairs.
batch_size = 5
total_samples = X_tr.shape[0]

# Slice each array along its first axis, then pair the slices element-wise.
docs_ds = tf.data.Dataset.from_tensor_slices(X_tr)
index_ds = tf.data.Dataset.from_tensor_slices(index_tr)

# The shuffle buffer covers the whole training set and is redrawn on every pass.
dataset = tf.data.Dataset.zip((docs_ds, index_ds)).shuffle(
    total_samples, reshuffle_each_iteration=True)
data_tr = dataset.batch(batch_size)

In [None]:
dtype = np.float64

# Six evenly spaced inducing locations on [-1, 1] for each of the three processes.
inducing_index_points_beta = np.linspace(-1, 1, 6)[:, None]
inducing_index_points_mu = np.linspace(-1, 1, 6)[:, None]
inducing_index_points_ell = np.linspace(-1, 1, 6)[:, None]


def _softplus_variable(initial_value, name):
    """Create a TransformedVariable constrained through a Softplus bijector."""
    return tfp.util.TransformedVariable(
        initial_value, bijector=tfb.Softplus(), dtype=dtype, name=name)


# Kernel hyper-parameters, created in the order beta, mu, ell.
# beta uses a Matern-1/2 kernel; mu and ell use exponentiated-quadratic kernels.
amplitude_beta = _softplus_variable(1., 'amplitude_beta')
length_scale_beta = _softplus_variable(0.5, 'length_scale_beta')
kernel_beta = tfk.MaternOneHalf(
    amplitude=amplitude_beta, length_scale=length_scale_beta)

amplitude_mu = _softplus_variable(1., "amplitude_mu")
length_scale_mu = _softplus_variable(0.5, "length_scale_mu")
kernel_mu = tfk.ExponentiatedQuadratic(
    amplitude=amplitude_mu, length_scale=length_scale_mu)

amplitude_ell = _softplus_variable(1., 'amplitude_ell')
length_scale_ell = _softplus_variable(0.5, 'length_scale_ell')
kernel_ell = tfk.ExponentiatedQuadratic(
    amplitude=amplitude_ell, length_scale=length_scale_ell)

# Pick up edits made to the local model modules since they were first imported.
reload(ctmd)
reload(dctm);

# Per-epoch training history, filled by the training loop.
losses = []
perplexities = []

# All three processes are indexed at the unique training index points.
mdl = dctm.DCTM(
    n_topics=20,
    n_words=vocabulary.size,
    layer_sizes=(500, 300, 200),
    dtype=dtype,
    encoder_jitter=1e-8,
    # beta
    kernel_beta=kernel_beta,
    index_points_beta=np.unique(index_tr)[:, None],
    inducing_index_points_beta=inducing_index_points_beta,
    jitter_beta=1e-6,
    # mu
    kernel_mu=kernel_mu,
    index_points_mu=np.unique(index_tr)[:, None],
    inducing_index_points_mu=inducing_index_points_mu,
    jitter_mu=1e-5,
    # ell
    kernel_ell=kernel_ell,
    index_points_ell=np.unique(index_tr)[:, None],
    inducing_index_points_ell=inducing_index_points_ell,
    jitter_ell=1e-6,
)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
# optimizer.iterations = tf1.train.get_or_create_global_step()

# import os
# checkpoint_directory = "../tmp/training_checkpoints-30-topics"
# checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt-sou-20t")
# checkpoint = tf.train.Checkpoint(model=mdl)

In [None]:
n_iter = 2 # 1000

# Use the first GPU when one is visible and fall back to the CPU otherwise, so
# the cell also runs on CPU-only machines instead of requesting a missing device.
device_name = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'

pbar = tqdm(range(n_iter), disable=False)
with tf.device(device_name):
    for epoch in pbar:
        # Running totals over the mini-batches of this epoch (sums, not means).
        loss_value = 0
        perplexity_value = 0

        for x_batch, index_points_batch in data_tr:
            loss, perpl = mdl.batch_optimize(
                x_batch,
                optimizer=optimizer,
                observation_index_points=index_points_batch,
                trainable_variables=None,
                # Weight the KL term by this batch's share of the training set.
                kl_weight=float(x_batch.shape[0]) / float(total_samples))
            loss = tf.reduce_mean(loss, 0)
            loss_value += loss
            perplexity_value += perpl

        pbar.set_description(
            'loss {:.3e}, perpl {:.3e}'.format(loss_value, perplexity_value))

        # Keep the per-epoch totals for later inspection and plotting.
        losses.append(loss_value)
        perplexities.append(perplexity_value)

In [None]:
# checkpoint.save(file_prefix=checkpoint_prefix)

In [None]:
# Training loss per epoch on a log-scaled y axis, drawn on explicit axes so the
# figure carries its own labels.
fig, ax = plt.subplots()
ax.plot(losses)
ax.set_yscale('log')
ax.set_xlabel('epoch')
ax.set_ylabel('loss (summed over batches)')
ax.set_title('Training loss');

In [None]:
# Loss and perplexity of the model on the held-out split, one value per line.
loss, perpl = mdl.loss_perplexity(X_ts, index_ts)
print(loss, perpl, sep='\n')

In [None]:
# Held-out perplexity from the ELBO with the KL term switched off (kl_weight=0).
# Use the first GPU when one is visible and fall back to the CPU otherwise, so
# the cell also runs on CPU-only machines instead of requesting a missing device.
device_name = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'
with tf.device(device_name):
    elbo = mdl.elbo(X_ts, index_ts, kl_weight=0.)
    perpl = mdl.perplexity(X_ts, elbo)
    print(perpl)

In [None]:
def inverse_transform_fn(x):
    """Undo the scaling of `x` and format its first column as 'YYYY' strings.

    `scaler.inverse_transform` returns floats, so the values are rounded to
    whole integer years before parsing; a float such as 1990.0 would otherwise
    be handed to the '%Y' parser with a fractional part.
    """
    years = np.rint(scaler.inverse_transform(x)[:, 0]).astype(int)
    return pd.to_datetime(years, format='%Y').strftime('%Y')


# Pick up edits made to the local module since it was first imported.
reload(dctm)

# Print the topics at every 10th unique training index point.
# NOTE(review): `index_points` is 1-D here, while the model cell passes column
# vectors (`[:, None]`) — confirm this is what print_topics expects.
tops = dctm.print_topics(
    mdl, index_points=np.unique(index_tr)[::10], vocabulary=vocabulary,
    inverse_transform_fn=inverse_transform_fn)