In [2]:
from run import get_args, process_data, prep_files, get_model, train
import torch
import scipy
import pandas as pd
import numpy as np

from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from tqdm import tqdm
from analysis_utils import get_detm_topics, topic_diversity

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
### data and file related arguments
arg_str = """
parser.add_argument('--dataset', type=str, default='un', help='name of corpus')
parser.add_argument('--data_path', type=str, default='un/', help='directory containing data')
parser.add_argument('--emb_path', type=str, default='skipgram/embeddings.txt', help='directory containing embeddings')
parser.add_argument('--save_path', type=str, default='./results', help='path to save results')
parser.add_argument('--batch_size', type=int, default=1000, help='number of documents in a batch for training')
parser.add_argument('--min_df', type=int, default=100, help='to get the right data..minimum document frequency')

### model-related arguments
parser.add_argument('--num_topics', type=int, default=50, help='number of topics')
parser.add_argument('--rho_size', type=int, default=300, help='dimension of rho')
parser.add_argument('--emb_size', type=int, default=300, help='dimension of embeddings')
parser.add_argument('--t_hidden_size', type=int, default=800, help='dimension of hidden space of q(theta)')
parser.add_argument('--theta_act', type=str, default='relu', help='tanh, softplus, relu, rrelu, leakyrelu, elu, selu, glu)')
parser.add_argument('--train_embeddings', type=int, default=1, help='whether to fix rho or train it')
parser.add_argument('--eta_nlayers', type=int, default=3, help='number of layers for eta')
parser.add_argument('--eta_hidden_size', type=int, default=200, help='number of hidden units for rnn')
parser.add_argument('--delta', type=float, default=0.005, help='prior variance')

### optimization-related arguments
parser.add_argument('--lr', type=float, default=0.005, help='learning rate')
parser.add_argument('--lr_factor', type=float, default=4.0, help='divide learning rate by this')
parser.add_argument('--epochs', type=int, default=100, help='number of epochs to train')
parser.add_argument('--mode', type=str, default='train', help='train or eval model')
parser.add_argument('--optimizer', type=str, default='adam', help='choice of optimizer')
parser.add_argument('--seed', type=int, default=2019, help='random seed (default: 1)')
parser.add_argument('--enc_drop', type=float, default=0.0, help='dropout rate on encoder')
parser.add_argument('--eta_dropout', type=float, default=0.0, help='dropout rate on rnn for eta')
parser.add_argument('--clip', type=float, default=0.0, help='gradient clipping')
parser.add_argument('--nonmono', type=int, default=10, help='number of bad hits allowed')
parser.add_argument('--wdecay', type=float, default=1.2e-6, help='some l2 regularization')
parser.add_argument('--anneal_lr', type=int, default=0, help='whether to anneal the learning rate or not')
parser.add_argument('--bow_norm', type=int, default=1, help='normalize the bows or not')

### evaluation, visualization, and logging-related arguments
parser.add_argument('--num_words', type=int, default=20, help='number of words for topic viz')
parser.add_argument('--log_interval', type=int, default=10, help='when to log training')
parser.add_argument('--visualize_every', type=int, default=1, help='when to visualize results')
parser.add_argument('--eval_batch_size', type=int, default=1000, help='input batch size for evaluation')
parser.add_argument('--load_from', type=str, default='', help='the name of the ckpt to eval from')
parser.add_argument('--tc', type=int, default=0, help='whether to compute tc or not')
""".split('\n')

In [4]:
class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self

In [5]:
keys = [x.strip("parser.add_argument('").split(',')[0].strip('--').strip("'") for x in arg_str if (len(x) > 0) and (not x.startswith('#'))]
values = [x.strip("parser.add_argument('").split(',')[2].strip(" default=").strip("'") for x in arg_str if (len(x) > 0) and (not x.startswith('#'))]
tmp_dict = dict(zip(keys, values))

for k, v in tmp_dict.items():
    if v.isnumeric():
        tmp_dict[k] = int(v)
    elif ('.' in v) and (v[0].isnumeric()):
        tmp_dict[k] = float(v)    

args = AttrDict()
args.update(tmp_dict)

args.train_embeddings = 0
args.num_topics = 10
args.batch_size = 100
args.epochs = 150
args.num_words = 10

In [6]:
train_rnn_inp, train_tokens, train_counts, train_times, vocab, embeddings, args = process_data(
    file='test_data_t5.npz',
    args=args
)

args.rho_size = embeddings.shape[1]

idx: 0/2


In [8]:
ckpt = prep_files(args, 't5')

In [9]:
model, optimizer, args = get_model(args, embeddings)

In [10]:
%%time
## train model on data by looping through multiple epochs
best_epoch = 0
best_val_ppl = 1e9
all_val_ppls = []
for epoch in range(1, args.epochs):
    train(
        epoch,
        model, 
        optimizer, 
        train_tokens, 
        train_counts, 
        train_times, 
        train_rnn_inp,
        args
    )
    ## check whether to anneal lr
    lr = optimizer.param_groups[0]['lr']
    if args.anneal_lr and (len(all_val_ppls) > args.nonmono and val_ppl > min(all_val_ppls[:-args.nonmono]) and lr > 1e-5):
        optimizer.param_groups[0]['lr'] /= args.lr_factor
model.eval()
with torch.no_grad():
    print('saving topic matrix beta...')
    alpha = model.mu_q_alpha
    beta = model.get_beta(alpha).cpu().numpy()
    scipy.io.savemat(ckpt+'_beta.mat', {'values': beta}, do_compression=True)
    if args.train_embeddings:
        print('saving word embedding matrix rho...')
        rho = model.rho.weight.cpu().numpy()
        scipy.io.savemat(ckpt+'_rho.mat', {'values': rho}, do_compression=True)

Epoch: 1 .. batch: 10/17 .. LR: 0.005 .. KL_theta: 12282.47 .. KL_eta: 2642.55 .. KL_alpha: 11928697.36 .. Rec_loss: 30871900.0 .. NELBO: 42815521.45
****************************************************************************************************
Epoch----->1 .. LR: 0.005 .. KL_theta: 13688.27 .. KL_eta: 2067.8 .. KL_alpha: 11828830.76 .. Rec_loss: 30344598.94 .. NELBO: 42189185.18
****************************************************************************************************
Epoch: 2 .. batch: 10/17 .. LR: 0.005 .. KL_theta: 15023.6 .. KL_eta: 642.39 .. KL_alpha: 11352735.27 .. Rec_loss: 29986411.27 .. NELBO: 41354812.36
****************************************************************************************************
Epoch----->2 .. LR: 0.005 .. KL_theta: 19277.16 .. KL_eta: 569.21 .. KL_alpha: 11277097.18 .. Rec_loss: 29784397.41 .. NELBO: 41081340.94
****************************************************************************************************
Epoch: 3 .. batch: 10

## topic analysis

In [7]:
beta = scipy.io.loadmat('results/t5_detm_un_K_10_Htheta_800_Optim_adam_Clip_0.0_ThetaAct_relu_Lr_0.005_Bsz_100_RhoSize_768_L_3_minDF_100_trainEmbeddings_0_beta.mat')['values']

In [8]:
id2word = Dictionary([vocab])
df = pd.read_parquet('data/combined_clean.parquet')
split_text = df['filtered_text'].str.split().values

In [13]:
with torch.no_grad():
    times = [0, 2]
    topics_words = []
    for k in range(args.num_topics):
        for t in times:
            gamma = beta[k, t, :]
            top_words = list(gamma.argsort()[-args.num_words+1:][::-1])
            topic_words = [id2word[a] for a in top_words]
            topics_words.append(' '.join(topic_words))
            print('Topic {} .. Time: {} ===> {}'.format(k, t, topic_words)) 

Topic 0 .. Time: 0 ===> ['good', 'rate', 'bank', 'financial', 'interest', 'inflation', 'credit', 'risk', 'great']
Topic 0 .. Time: 2 ===> ['great', 'increase', 'good', 'return', 'long', 'rate', 'year', 'new', 'low']
Topic 1 .. Time: 0 ===> ['bank', 'change', 'business', 'policy', 'financial', 'system', 'increase', 'management', 'new']
Topic 1 .. Time: 2 ===> ['financial', 'firm', 'system', 'bank', 'capital', 'stability', 'risk', 'company', 'business']
Topic 2 .. Time: 0 ===> ['rate', 'economy', 'system', 'time', 'monetary', 'bank', 'need', 'model', 'long']
Topic 2 .. Time: 2 ===> ['policy', 'return', 'bank', 'rule', 'reserve', 'change', 'asset', 'requirement', 'condition']
Topic 3 .. Time: 0 ===> ['increase', 'financial', 'year', 'price', 'change', 'rate', 'rise', 'level', 'technology']
Topic 3 .. Time: 2 ===> ['financial', 'bank', 'household', 'consumer', 'family', 'change', 'community', 'example', 'year']
Topic 4 .. Time: 0 ===> ['price', 'productivity', 'rate', 'inflation', 'growth'

In [15]:
def get_detm_topics(beta, time, num_words, vocab, num_topics):
    with torch.no_grad(): 
        topics_words = []
        for k in range(num_topics):
            gamma = beta[k, time, :]
            top_words = list(gamma.argsort()[-num_words:][::-1])
            topics_words.append([vocab[a] for a in top_words])
    return topics_words

In [16]:
coherences = []
for t in tqdm(range(args.num_times)):
    coherences.append(
        CoherenceModel(
            topics=get_detm_topics(beta=beta, time=t, num_words=20, vocab=id2word, num_topics=args.num_topics), # use 20 words to standardize with DTM
            texts=split_text, 
            dictionary=id2word, 
            coherence='c_v'
        ).get_coherence()
    )

coherences = np.array(coherences)

100%|██████████| 4/4 [00:33<00:00,  8.27s/it]


In [17]:
diversities = []
for t in tqdm(range(args.num_times)):
    diversities.append(
        topic_diversity(topics=get_detm_topics(beta=beta, time=t, num_words=20, vocab=id2word, num_topics=args.num_topics))
    )
    
diversities = np.array(diversities)

100%|██████████| 4/4 [00:00<00:00, 270.99it/s]


In [18]:
qualities = diversities * coherences
qualities.mean(), qualities.std()

(0.17688862333264663, 0.035276393007958005)

In [19]:
coherences, diversities

(array([0.30085992, 0.33886024, 0.35465699, 0.36996204]),
 array([0.425, 0.485, 0.54 , 0.605]))

In [20]:
np.savez_compressed(
    'detm_stats_t5.npz',
    coherence=coherences,
    diversity=diversities
)

## dig into topic evolution

In [10]:
import matplotlib.pyplot as plt

In [9]:
def get_word_probs(word, topic, beta):
    word_id = id2word.token2id[word]
    probs = []
    for t in range(4):
        gamma = beta[topic, t, :]
        probs.append(gamma[word_id])
    return probs

In [14]:
for k in range(args.num_topics):
    gamma = beta[k, 0, :]
    top_words = list(gamma.argsort()[-20+1:][::-1])
    topic_words = [id2word[a] for a in top_words]
    print(k, topic_words)

0 ['good', 'rate', 'bank', 'financial', 'interest', 'inflation', 'credit', 'risk', 'great', 'year', 'system', 'long', 'increase', 'growth', 'change', 'asset', 'time', 'business', 'policy']
1 ['bank', 'change', 'business', 'policy', 'financial', 'system', 'increase', 'management', 'new', 'risk', 'growth', 'rate', 'development', 'security', 'work', 'need', 'process', 'long', 'central']
2 ['rate', 'economy', 'system', 'time', 'monetary', 'bank', 'need', 'model', 'long', 'value', 'industry', 'large', 'process', 'way', 'product', 'financial', 'current', 'price', 'result']
3 ['increase', 'financial', 'year', 'price', 'change', 'rate', 'rise', 'level', 'technology', 'household', 'result', 'low', 'income', 'interest', 'country', 'investment', 'consumer', 'economy', 'product']
4 ['price', 'productivity', 'rate', 'inflation', 'growth', 'future', 'technology', 'economy', 'increase', 'percent', 'long', 'decline', 'cost', 'text', 'demand', 'current', 'value', 'year', 'product']
5 ['rate', 'increase

In [30]:
for t in range(4):
    probs = []
    for k in range(10):
        gamma = beta[k,t,:]
        probs.append(gamma[id2word.token2id['stress']])
    print(np.argmax(probs))

4
6
1
1


In [33]:
plot_df = pd.DataFrame({
    'credit': get_word_probs('credit', 6, beta),
    'security': get_word_probs('security', 1, beta),
    'foreclosure': get_word_probs('foreclosure', 3, beta),
    'subprime': get_word_probs('subprime', 3, beta),
    'labor': get_word_probs('labor', 4, beta),
    'technology': get_word_probs('technology', 4, beta),
    #'vol': get_word_probs('vol', 3, model),
    'demand': get_word_probs('demand', 9, beta),
    'regulation': get_word_probs('regulation', 6, beta),
    'requirement': get_word_probs('requirement', 2, beta),
    'inflation': get_word_probs('inflation', 4, beta),
    'unemployment': get_word_probs('unemployment', 9, beta),
    'stress': get_word_probs('stress', 1, beta),
})

plot_df.to_csv('t5_evolve.csv', index=False)

In [None]:
plot_df.to_csv('

In [32]:
plot_df

Unnamed: 0,credit,security,foreclosure,subprime,labor,technology,demand,regulation,requirement,inflation,unemployment,stress
0,0.018971,0.007707,0.000102,3.672978e-07,0.001426,0.021014,0.003194,0.005395,0.001067,0.034638,0.000141,0.000241
1,0.0314,0.01403,0.003911,0.007089669,0.004734,0.001962,0.004127,0.0066,0.002971,0.048814,0.004918,0.000874
2,0.012021,0.005487,0.000442,0.0001170406,0.009753,0.000925,0.007858,0.00936,0.008978,0.047655,0.007841,0.008102
3,0.012666,0.003271,0.000183,6.751358e-06,0.011636,0.000639,0.003544,0.005117,0.005524,0.061391,0.013302,0.008077
