In [1]:
from run import get_args, process_data, prep_files, get_model, train
import torch
import scipy
import pandas as pd
import numpy as np

from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from tqdm import tqdm
from analysis_utils import get_detm_topics, topic_diversity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### data and file related arguments
arg_str = """
parser.add_argument('--dataset', type=str, default='un', help='name of corpus')
parser.add_argument('--data_path', type=str, default='un/', help='directory containing data')
parser.add_argument('--emb_path', type=str, default='skipgram/embeddings.txt', help='directory containing embeddings')
parser.add_argument('--save_path', type=str, default='./results', help='path to save results')
parser.add_argument('--batch_size', type=int, default=1000, help='number of documents in a batch for training')
parser.add_argument('--min_df', type=int, default=100, help='to get the right data..minimum document frequency')

### model-related arguments
parser.add_argument('--num_topics', type=int, default=50, help='number of topics')
parser.add_argument('--rho_size', type=int, default=300, help='dimension of rho')
parser.add_argument('--emb_size', type=int, default=300, help='dimension of embeddings')
parser.add_argument('--t_hidden_size', type=int, default=800, help='dimension of hidden space of q(theta)')
parser.add_argument('--theta_act', type=str, default='relu', help='tanh, softplus, relu, rrelu, leakyrelu, elu, selu, glu)')
parser.add_argument('--train_embeddings', type=int, default=1, help='whether to fix rho or train it')
parser.add_argument('--eta_nlayers', type=int, default=3, help='number of layers for eta')
parser.add_argument('--eta_hidden_size', type=int, default=200, help='number of hidden units for rnn')
parser.add_argument('--delta', type=float, default=0.005, help='prior variance')

### optimization-related arguments
parser.add_argument('--lr', type=float, default=0.005, help='learning rate')
parser.add_argument('--lr_factor', type=float, default=4.0, help='divide learning rate by this')
parser.add_argument('--epochs', type=int, default=100, help='number of epochs to train')
parser.add_argument('--mode', type=str, default='train', help='train or eval model')
parser.add_argument('--optimizer', type=str, default='adam', help='choice of optimizer')
parser.add_argument('--seed', type=int, default=2019, help='random seed (default: 1)')
parser.add_argument('--enc_drop', type=float, default=0.0, help='dropout rate on encoder')
parser.add_argument('--eta_dropout', type=float, default=0.0, help='dropout rate on rnn for eta')
parser.add_argument('--clip', type=float, default=0.0, help='gradient clipping')
parser.add_argument('--nonmono', type=int, default=10, help='number of bad hits allowed')
parser.add_argument('--wdecay', type=float, default=1.2e-6, help='some l2 regularization')
parser.add_argument('--anneal_lr', type=int, default=0, help='whether to anneal the learning rate or not')
parser.add_argument('--bow_norm', type=int, default=1, help='normalize the bows or not')

### evaluation, visualization, and logging-related arguments
parser.add_argument('--num_words', type=int, default=20, help='number of words for topic viz')
parser.add_argument('--log_interval', type=int, default=10, help='when to log training')
parser.add_argument('--visualize_every', type=int, default=1, help='when to visualize results')
parser.add_argument('--eval_batch_size', type=int, default=1000, help='input batch size for evaluation')
parser.add_argument('--load_from', type=str, default='', help='the name of the ckpt to eval from')
parser.add_argument('--tc', type=int, default=0, help='whether to compute tc or not')
""".split('\n')

In [3]:
class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self

In [4]:
keys = [x.strip("parser.add_argument('").split(',')[0].strip('--').strip("'") for x in arg_str if (len(x) > 0) and (not x.startswith('#'))]
values = [x.strip("parser.add_argument('").split(',')[2].strip(" default=").strip("'") for x in arg_str if (len(x) > 0) and (not x.startswith('#'))]
tmp_dict = dict(zip(keys, values))

for k, v in tmp_dict.items():
    if v.isnumeric():
        tmp_dict[k] = int(v)
    elif ('.' in v) and (v[0].isnumeric()):
        tmp_dict[k] = float(v)    

args = AttrDict()
args.update(tmp_dict)

args.train_embeddings = 0
args.num_topics = 10
args.batch_size = 100
args.epochs = 150
args.num_words = 10

In [9]:
train_rnn_inp, train_tokens, train_counts, train_times, vocab, embeddings, args = process_data(
    file='test_data_t5_pooled.npz',
    args=args
)

args.rho_size = embeddings.shape[1]

idx: 0/2


In [10]:
ckpt = prep_files(args, 't5_pool')

In [11]:
model, optimizer, args = get_model(args, embeddings)

In [12]:
%%time
## train model on data by looping through multiple epochs
best_epoch = 0
best_val_ppl = 1e9
all_val_ppls = []
for epoch in range(1, args.epochs):
    train(
        epoch,
        model, 
        optimizer, 
        train_tokens, 
        train_counts, 
        train_times, 
        train_rnn_inp,
        args
    )
    ## check whether to anneal lr
    lr = optimizer.param_groups[0]['lr']
    if args.anneal_lr and (len(all_val_ppls) > args.nonmono and val_ppl > min(all_val_ppls[:-args.nonmono]) and lr > 1e-5):
        optimizer.param_groups[0]['lr'] /= args.lr_factor
model.eval()
with torch.no_grad():
    print('saving topic matrix beta...')
    alpha = model.mu_q_alpha
    beta = model.get_beta(alpha).cpu().numpy()
    scipy.io.savemat(ckpt+'_beta.mat', {'values': beta}, do_compression=True)
    if args.train_embeddings:
        print('saving word embedding matrix rho...')
        rho = model.rho.weight.cpu().numpy()
        scipy.io.savemat(ckpt+'_rho.mat', {'values': rho}, do_compression=True)

Epoch: 1 .. batch: 10/17 .. LR: 0.005 .. KL_theta: 2732.82 .. KL_eta: 1732.29 .. KL_alpha: 11792052.64 .. Rec_loss: 19354495.09 .. NELBO: 31151013.64
****************************************************************************************************
Epoch----->1 .. LR: 0.005 .. KL_theta: 1821.78 .. KL_eta: 1137.41 .. KL_alpha: 11589896.47 .. Rec_loss: 19703368.94 .. NELBO: 31296225.29
****************************************************************************************************
Epoch: 2 .. batch: 10/17 .. LR: 0.005 .. KL_theta: 37.83 .. KL_eta: 56.91 .. KL_alpha: 10663851.73 .. Rec_loss: 19751912.73 .. NELBO: 30415858.91
****************************************************************************************************
Epoch----->2 .. LR: 0.005 .. KL_theta: 32.31 .. KL_eta: 60.34 .. KL_alpha: 10512682.88 .. Rec_loss: 19716936.94 .. NELBO: 30229711.76
****************************************************************************************************
Epoch: 3 .. batch: 10/17 .. 

## topic analysis

In [13]:
id2word = Dictionary([vocab])
df = pd.read_parquet('data/combined_clean.parquet')
split_text = df['filtered_text'].str.split().values

In [14]:
with torch.no_grad():
    alpha = model.mu_q_alpha
    beta = model.get_beta(alpha) 
    print('beta: ', beta.size())
    print('\n')
    print('#'*100)
    print('Visualize topics...')
    times = [0, 2]
    topics_words = []
    for k in range(args.num_topics):
        for t in times:
            gamma = beta[k, t, :]
            top_words = list(gamma.cpu().numpy().argsort()[-args.num_words+1:][::-1])
            topic_words = [id2word[a] for a in top_words]
            topics_words.append(' '.join(topic_words))
            print('Topic {} .. Time: {} ===> {}'.format(k, t, topic_words)) 

beta:  torch.Size([10, 4, 4938])


####################################################################################################
Visualize topics...
Topic 0 .. Time: 0 ===> ['commensurate', 'careful', 'hourly', 'culture', 'cumulate', 'conjecture', 'make', 'majority', 'major']
Topic 0 .. Time: 2 ===> ['reasonable', 'circle', 'flag', 'fix', 'milder', 'mile', 'milestone', 'military', 'choose']
Topic 1 .. Time: 0 ===> ['zone', 'focused', 'flourish', 'flow', 'fluctuate', 'fluctuation', 'fly', 'focus', 'fold']
Topic 1 .. Time: 2 ===> ['zone', 'focused', 'flourish', 'flow', 'fluctuate', 'fluctuation', 'fly', 'focus', 'fold']
Topic 2 .. Time: 0 ===> ['zone', 'expend', 'expansive', 'expect', 'expectancy', 'expectation', 'expected', 'expedite', 'expeditiously']
Topic 2 .. Time: 2 ===> ['reasonable', 'false', 'sharply', 'shed', 'sheer', 'sheet', 'shelter', 'shield', 'falter']
Topic 3 .. Time: 0 ===> ['zone', 'focused', 'flourish', 'flow', 'fluctuate', 'fluctuation', 'fly', 'focus', 'fold']

In [21]:
get_detm_topics(beta=beta, time=0, num_words=10, vocab=id2word, num_topics=args.num_topics)

[['commensurate',
  'careful',
  'hourly',
  'culture',
  'cumulate',
  'conjecture',
  'make',
  'majority',
  'major',
  'payroll'],
 ['zone',
  'focused',
  'flourish',
  'flow',
  'fluctuate',
  'fluctuation',
  'fly',
  'focus',
  'fold',
  'footprint'],
 ['zone',
  'expend',
  'expansive',
  'expect',
  'expectancy',
  'expectation',
  'expected',
  'expedite',
  'expeditiously',
  'expenditure'],
 ['zone',
  'focused',
  'flourish',
  'flow',
  'fluctuate',
  'fluctuation',
  'fly',
  'focus',
  'fold',
  'footprint'],
 ['integral',
  'churning',
  'nonbank',
  'nonbanke',
  'nondepository',
  'steady',
  'steadily',
  'stay',
  'statutory',
  'choose'],
 ['zone',
  'fade',
  'failure',
  'fair',
  'fairly',
  'fairness',
  'faith',
  'fall',
  'fallout',
  'false'],
 ['zone',
  'focused',
  'flourish',
  'flow',
  'fluctuate',
  'fluctuation',
  'fly',
  'focus',
  'fold',
  'footprint'],
 ['turnaround',
  'investing',
  'mixed',
  'coal',
  'co',
  'cluster',
  'undercut',
  '

In [15]:
coherences = []
for t in tqdm(range(args.num_times)):
    coherences.append(
        CoherenceModel(
            topics=get_detm_topics(beta=beta, time=t, num_words=20, vocab=id2word, num_topics=args.num_topics), # use 20 words to standardize with DTM
            texts=split_text, 
            dictionary=id2word, 
            coherence='c_v'
        ).get_coherence()
    )

coherences = np.array(coherences)

100%|██████████| 4/4 [00:30<00:00,  7.73s/it]


In [16]:
diversities = []
for t in tqdm(range(args.num_times)):
    diversities.append(
        topic_diversity(topics=get_detm_topics(beta=beta, time=t, num_words=20, vocab=id2word, num_topics=args.num_topics))
    )
    
diversities = np.array(diversities)

100%|██████████| 4/4 [00:00<00:00, 1031.17it/s]


In [17]:
qualities = diversities * coherences
qualities.mean(), qualities.std()

(0.22504547048468504, 0.05153975511103828)

In [18]:
coherences, diversities

(array([0.46402107, 0.47910279, 0.47041304, 0.48076037]),
 array([0.665, 0.395, 0.375, 0.47 ]))

In [19]:
np.savez_compressed(
    'detm_stats_t5_pooled.npz',
    coherence=coherences,
    diversity=diversities
)