In [1]:
from run import get_args, process_data, prep_files, get_model, train
import torch
import scipy
import pandas as pd
import numpy as np

from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from tqdm import tqdm
from analysis_utils import get_detm_topics, topic_diversity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### data and file related arguments
arg_str = """
parser.add_argument('--dataset', type=str, default='un', help='name of corpus')
parser.add_argument('--data_path', type=str, default='un/', help='directory containing data')
parser.add_argument('--emb_path', type=str, default='skipgram/embeddings.txt', help='directory containing embeddings')
parser.add_argument('--save_path', type=str, default='./results', help='path to save results')
parser.add_argument('--batch_size', type=int, default=1000, help='number of documents in a batch for training')
parser.add_argument('--min_df', type=int, default=100, help='to get the right data..minimum document frequency')

### model-related arguments
parser.add_argument('--num_topics', type=int, default=50, help='number of topics')
parser.add_argument('--rho_size', type=int, default=300, help='dimension of rho')
parser.add_argument('--emb_size', type=int, default=300, help='dimension of embeddings')
parser.add_argument('--t_hidden_size', type=int, default=800, help='dimension of hidden space of q(theta)')
parser.add_argument('--theta_act', type=str, default='relu', help='tanh, softplus, relu, rrelu, leakyrelu, elu, selu, glu)')
parser.add_argument('--train_embeddings', type=int, default=1, help='whether to fix rho or train it')
parser.add_argument('--eta_nlayers', type=int, default=3, help='number of layers for eta')
parser.add_argument('--eta_hidden_size', type=int, default=200, help='number of hidden units for rnn')
parser.add_argument('--delta', type=float, default=0.005, help='prior variance')

### optimization-related arguments
parser.add_argument('--lr', type=float, default=0.005, help='learning rate')
parser.add_argument('--lr_factor', type=float, default=4.0, help='divide learning rate by this')
parser.add_argument('--epochs', type=int, default=100, help='number of epochs to train')
parser.add_argument('--mode', type=str, default='train', help='train or eval model')
parser.add_argument('--optimizer', type=str, default='adam', help='choice of optimizer')
parser.add_argument('--seed', type=int, default=2019, help='random seed (default: 1)')
parser.add_argument('--enc_drop', type=float, default=0.0, help='dropout rate on encoder')
parser.add_argument('--eta_dropout', type=float, default=0.0, help='dropout rate on rnn for eta')
parser.add_argument('--clip', type=float, default=0.0, help='gradient clipping')
parser.add_argument('--nonmono', type=int, default=10, help='number of bad hits allowed')
parser.add_argument('--wdecay', type=float, default=1.2e-6, help='some l2 regularization')
parser.add_argument('--anneal_lr', type=int, default=0, help='whether to anneal the learning rate or not')
parser.add_argument('--bow_norm', type=int, default=1, help='normalize the bows or not')

### evaluation, visualization, and logging-related arguments
parser.add_argument('--num_words', type=int, default=20, help='number of words for topic viz')
parser.add_argument('--log_interval', type=int, default=10, help='when to log training')
parser.add_argument('--visualize_every', type=int, default=1, help='when to visualize results')
parser.add_argument('--eval_batch_size', type=int, default=1000, help='input batch size for evaluation')
parser.add_argument('--load_from', type=str, default='', help='the name of the ckpt to eval from')
parser.add_argument('--tc', type=int, default=0, help='whether to compute tc or not')
""".split('\n')

In [3]:
class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self

In [4]:
keys = [x.strip("parser.add_argument('").split(',')[0].strip('--').strip("'") for x in arg_str if (len(x) > 0) and (not x.startswith('#'))]
values = [x.strip("parser.add_argument('").split(',')[2].strip(" default=").strip("'") for x in arg_str if (len(x) > 0) and (not x.startswith('#'))]
tmp_dict = dict(zip(keys, values))

for k, v in tmp_dict.items():
    if v.isnumeric():
        tmp_dict[k] = int(v)
    elif ('.' in v) and (v[0].isnumeric()):
        tmp_dict[k] = float(v)    

args = AttrDict()
args.update(tmp_dict)

args.train_embeddings = 0
args.num_topics = 10
args.batch_size = 100
args.epochs = 150

In [6]:
train_rnn_inp, train_tokens, train_counts, train_times, vocab, embeddings, args = process_data(
    file='test_data_glove.npz',
    args=args
)

args.rho_size = embeddings.shape[1]

idx: 0/2


In [11]:
ckpt = prep_files(args, 'glove')

In [12]:
model, optimizer, args = get_model(args, embeddings)

In [13]:
%%time
## train model on data by looping through multiple epochs
best_epoch = 0
best_val_ppl = 1e9
all_val_ppls = []
for epoch in range(1, args.epochs):
    train(
        epoch,
        model, 
        optimizer, 
        train_tokens, 
        train_counts, 
        train_times, 
        train_rnn_inp,
        args
    )
    ## check whether to anneal lr
    lr = optimizer.param_groups[0]['lr']
    if args.anneal_lr and (len(all_val_ppls) > args.nonmono and val_ppl > min(all_val_ppls[:-args.nonmono]) and lr > 1e-5):
        optimizer.param_groups[0]['lr'] /= args.lr_factor
model.eval()
with torch.no_grad():
    print('saving topic matrix beta...')
    alpha = model.mu_q_alpha
    beta = model.get_beta(alpha).cpu().numpy()
    scipy.io.savemat(ckpt+'_beta.mat', {'values': beta}, do_compression=True)
    if args.train_embeddings:
        print('saving word embedding matrix rho...')
        rho = model.rho.weight.cpu().numpy()
        scipy.io.savemat(ckpt+'_rho.mat', {'values': rho}, do_compression=True)

Epoch: 1 .. batch: 10/17 .. LR: 0.005 .. KL_theta: 6455.87 .. KL_eta: 2210.52 .. KL_alpha: 4596614.05 .. Rec_loss: 31871246.91 .. NELBO: 36476526.55
****************************************************************************************************
Epoch----->1 .. LR: 0.005 .. KL_theta: 6244.14 .. KL_eta: 1502.6 .. KL_alpha: 4546031.62 .. Rec_loss: 31692327.53 .. NELBO: 36246105.41
****************************************************************************************************
Epoch: 2 .. batch: 10/17 .. LR: 0.005 .. KL_theta: 8862.29 .. KL_eta: 137.9 .. KL_alpha: 4296428.34 .. Rec_loss: 31092908.73 .. NELBO: 35398337.27
****************************************************************************************************
Epoch----->2 .. LR: 0.005 .. KL_theta: 8028.14 .. KL_eta: 135.28 .. KL_alpha: 4260052.24 .. Rec_loss: 31649762.24 .. NELBO: 35917977.53
****************************************************************************************************
Epoch: 3 .. batch: 10/17 .. 

## topic analysis

In [7]:
id2word = Dictionary([vocab])
df = pd.read_parquet('data/combined_clean.parquet')
split_text = df['filtered_text'].str.split().values

In [44]:
args.num_words = 10

with torch.no_grad():
    alpha = model.mu_q_alpha
    beta = model.get_beta(alpha) 
    print('beta: ', beta.size())
    print('\n')
    print('#'*100)
    print('Visualize topics...')
    times = [0, 2]
    topics_words = []
    for k in range(args.num_topics):
        for t in times:
            gamma = beta[k, t, :]
            top_words = list(gamma.cpu().numpy().argsort()[-args.num_words+1:][::-1])
            topic_words = [id2word[a] for a in top_words]
            topics_words.append(' '.join(topic_words))
            print('Topic {} .. Time: {} ===> {}'.format(k, t, topic_words)) 

beta:  torch.Size([10, 4, 4938])


####################################################################################################
Visualize topics...
Topic 0 .. Time: 0 ===> ['bank', 'financial', 'banking', 'credit', 'asset', 'monetary', 'loan', 'fund', 'management']
Topic 0 .. Time: 2 ===> ['interest', 'text', 'continue', 'paper', 'view', 'financial', 'important', 'need', 'note']
Topic 1 .. Time: 0 ===> ['time', 'recent', 'change', 'growth', 'example', 'new', 'business', 'financial', 'need']
Topic 1 .. Time: 2 ===> ['financial', 'system', 'risk', 'crisis', 'institution', 'new', 'hold', 'stress', 'large']
Topic 2 .. Time: 0 ===> ['policy', 'monetary', 'action', 'rate', 'issue', 'reserve', 'financial', 'interest', 'fund']
Topic 2 .. Time: 2 ===> ['important', 'will', 'example', 'include', 'financial', 'rate', 'take', 'significant', 'growth']
Topic 3 .. Time: 0 ===> ['rate', 'inflation', 'growth', 'percent', 'economy', 'low', 'high', 'term', 'year']
Topic 3 .. Time: 2 ===> ['policy

In [45]:
coherences = []
for t in tqdm(range(args.num_times)):
    coherences.append(
        CoherenceModel(
            topics=get_detm_topics(model=model, time=t, num_words=20, vocab=id2word, num_topics=args.num_topics), # use 20 words to standardize with DTM
            texts=split_text, 
            dictionary=id2word, 
            coherence='c_v'
        ).get_coherence()
    )

coherences = np.array(coherences)

100%|██████████| 4/4 [00:46<00:00, 11.57s/it]


In [37]:
def topic_diversity(topics):
    all_words = set(np.concatenate(topics))
    return len(all_words) / (len(topics[0]) * len(topics))

In [46]:
diversities = []
for t in tqdm(range(args.num_times)):
    diversities.append(
        topic_diversity(topics=get_detm_topics(model=model, time=t, num_words=20, vocab=id2word, num_topics=args.num_topics))
    )
    
diversities = np.array(diversities)

100%|██████████| 4/4 [00:00<00:00, 200.79it/s]


In [47]:
qualities = diversities * coherences
qualities.mean(), qualities.std()

(0.1699976422907841, 0.011359057839255672)

In [58]:
coherences, diversities

(array([0.32740801, 0.31958769, 0.32274133, 0.34633701]),
 array([0.5  , 0.49 , 0.535, 0.54 ]))

In [48]:
np.savez_compressed(
    'detm_stats_glove.npz',
    coherence=coherences,
    diversity=diversities
)

In [56]:
id2word.token2id['economy']

1424

## topic evolution

In [9]:
beta = scipy.io.loadmat('results/glove_detm_un_K_10_Htheta_800_Optim_adam_Clip_0.0_ThetaAct_relu_Lr_0.005_Bsz_100_RhoSize_300_L_3_minDF_100_trainEmbeddings_0_beta.mat')['values']

In [10]:
import matplotlib.pyplot as plt

In [11]:
def get_word_probs(word, topic, beta):
    word_id = id2word.token2id[word]
    probs = []
    for t in range(4):
        gamma = beta[topic, t, :]
        probs.append(gamma[word_id])
    return probs

In [12]:
for k in range(args.num_topics):
    gamma = beta[k, 0, :]
    top_words = list(gamma.argsort()[-20+1:][::-1])
    topic_words = [id2word[a] for a in top_words]
    print(k, topic_words)

0 ['bank', 'financial', 'banking', 'credit', 'asset', 'monetary', 'loan', 'fund', 'management', 'investment', 'mortgage', 'finance', 'committee', 'ensure', 'debt', 'transaction', 'liquidity', 'lending', 'equity']
1 ['time', 'recent', 'change', 'growth', 'example', 'new', 'business', 'financial', 'need', 'significant', 'future', 'year', 'great', 'important', 'include', 'reason', 'job', 'large', 'will']
2 ['policy', 'monetary', 'action', 'rate', 'issue', 'reserve', 'financial', 'interest', 'fund', 'asset', 'crisis', 'investment', 'decision', 'raise', 'management', 'risk', 'credit', 'bank', 'price']
3 ['rate', 'inflation', 'growth', 'percent', 'economy', 'low', 'high', 'term', 'year', 'period', 'unemployment', 'past', 'long', 'policy', 'interest', 'increase', 'short', 'average', 'current']
4 ['risk', 'capital', 'price', 'high', 'cost', 'low', 'reduce', 'rate', 'demand', 'investment', 'government', 'debt', 'higher', 'value', 'percent', 'increase', 'growth', 'inflation', 'exposure']
5 ['yea

In [30]:
for t in range(4):
    probs = []
    for k in range(10):
        gamma = beta[k,t,:]
        probs.append(gamma[id2word.token2id['foreclosure']])
    print(np.argmax(probs))

6
1
2
7


In [27]:
plot_df = pd.DataFrame({
    'credit': get_word_probs('credit', 0, beta),
    'security': get_word_probs('security', 8, beta),
    'foreclosure': get_word_probs('foreclosure', 7, beta),
    'subprime': get_word_probs('subprime', 7, beta),
    'labor': get_word_probs('labor', 9, beta),
    'technology': get_word_probs('technology', 7, beta),
    #'vol': get_word_probs('vol', 3, model),
    'demand': get_word_probs('demand', 4, beta),
    'regulation': get_word_probs('regulation', 4, beta),
    'requirement': get_word_probs('requirement', 4, beta),
    'inflation': get_word_probs('inflation', 3, beta),
    'unemployment': get_word_probs('unemployment', 3, beta),
    'stress': get_word_probs('stress', 1, beta),
})

plot_df.to_csv('glove_evolve.csv', index=False)

In [28]:
plot_df

Unnamed: 0,credit,security,foreclosure,subprime,labor,technology,demand,regulation,requirement,inflation,unemployment,stress
0,0.00467,0.001684,6.937785e-08,5.105341e-07,0.002416,0.015236,2e-06,2.846482e-13,6.970997e-12,0.386578,0.000278,0.000181
1,0.013119,0.004237,8.391542e-06,1.76211e-05,0.00534,0.000431,0.000831,1.421601e-08,1.055798e-07,0.007673,0.000419,0.001158
2,0.002345,0.000816,4.121818e-05,6.561859e-05,0.008876,0.001302,0.007674,0.01364912,0.01096065,0.008292,0.004303,0.013721
3,0.003765,0.000721,0.0003004296,0.0001491934,0.000158,0.000522,0.002667,0.01039362,0.0007503133,0.001354,0.000472,0.013455
