In [1]:
import pandas as pd
# Load the processed speeches and keep a reproducible 50,000-row subsample.
df = pd.read_csv(
    '../data/us_congressional_record/us_congress_speeches_processed.csv'
).sample(n=50000, random_state=42)

# 90/10 split: draw the training rows first, then keep every remaining row for testing.
train = df.sample(frac=0.9, random_state=42)
held_out = ~df.index.isin(train.index)  # True for rows that were not drawn into `train`
test = df[held_out].reset_index(drop=True)
train = train.reset_index(drop=True)

In [2]:
import sys
sys.path.append('../gtm/')
from corpus import GTMCorpus
from gtm import GTM

# Build two GTMCorpus objects: one training set and one test set
# (the separate test set helps avoid overfitting the supervised learning algorithm).

# Options shared by both corpora.
corpus_options = {
    "labels": "~party-1",  # The features to predict. Would be "~ gdp" if the df had a column 'gdp'.
    "content": "~1",
}

# `train` must contain a column 'doc' with the text of each document
# and a column 'doc_clean' with the cleaned text of each document.
train_dataset = GTMCorpus(train, **corpus_options)

# Pass on the vectorizer fitted for the training set: this ensures the
# document-term matrices of both sets have the same number of dimensions.
test_dataset = GTMCorpus(
    test,
    vectorizer=train_dataset.vectorizer,
    **corpus_options,
)

  from .autonotebook import tqdm as notebook_tqdm
  self.log_word_frequencies = torch.FloatTensor(np.log(np.array(self.M_bow.sum(axis=0)).flatten()))


In [3]:
import numpy as np
# Remove the first column (index 0) from the label matrix of both corpora.
# NOTE(review): this overwrites M_labels in place, so re-running the cell
# removes one more column each time; rebuild the corpora before re-running.
for dataset in (train_dataset, test_dataset):
    dataset.M_labels = np.delete(dataset.M_labels, 0, axis=1)

In [4]:
# Train the model

# All hyper-parameters in one place so they are easy to scan and tweak.
gtm_settings = {
    "n_topics": 20,
    "doc_topic_prior": 'dirichlet',  # other option is "logistic_normal"
    "update_prior": False,  # no prevalence covariates, so no need to update the prior
    "alpha": 0.1,  # sparsity of the dirichlet prior
    "encoder_hidden_layers": [],  # structure of the encoder neural net
    "decoder_hidden_layers": [300],  # structure of the decoder neural net
    "predictor_hidden_layers": [],
    "predictor_type": 'classifier',  # 'regressor' for continuous variables such as GDP
    "num_epochs": 5,  # no need to run many epochs; 5-10 worked well on 50 000 speeches
    "w_pred_loss": 1000,  # weight given to the prediction task in the likelihood
    "w_prior": None,  # weight given to the prior in the likelihood
    "print_every": 100000,  # print progress every x batches
    "log_every": 1,  # print topic-word dist every x epochs
    "batch_size": 250,
}

# Training data first, test data second, then the settings above.
tm = GTM(train_dataset, test_dataset, **gtm_settings)


Epoch   1	Mean Training Loss:18.8664830


Epoch   1	Mean Validation Loss:13.5782588

['gentleman', 'bill', 'amendment', 'time', 'yield', 'think', 'say', 'make']
['amendment', 'gentleman', 'bill', 'committee', 'time', 'member', 'think', 'say']
['gentleman', 'bill', 'amendment', 'make', 'time', 'want', 'year', 'say']
['bill', 'amendment', 'gentleman', 'think', 'make', 'yield', 'year', 'time']
['time', 'year', 'gentleman', 'bill', 'want', 'amendment', 'make', 'say']
['gentleman', 'bill', 'time', 'amendment', 'say', 'yield', 'make', 'think']
['gentleman', 'time', 'amendment', 'yield', 'member', 'bill', 'think', 'people']
['amendment', 'gentleman', 'bill', 'committee', 'yield', 'time', 'think', 'rule']
['gentleman', 'bill', 'amendment', 'time', 'yield', 'make', 'want', 'say']
['program', 'provide', 'legislation', 'need', 'bill', 'support', 'year', 'area']
['amendment', 'bill', 'gentleman', 'time', 'make', 'yield', 'say', 'member']
['year', 'make', 'say', 'bill', 'program', 'take', 'time', 

In [5]:
# Assess the quality of the learned word embeddings 
# Top 8 closest words to a specific word

import torch
import torch.nn.functional as F

# Query word whose closest neighbours we want to inspect.
specific_word = 'tax'

# Position of the query word in the training vocabulary; fail with an explicit
# message instead of an opaque IndexError when the word is absent.
matches = [i for i, w in enumerate(train_dataset.vocab) if w == specific_word]
if not matches:
    raise ValueError(f"'{specific_word}' is not in the training vocabulary")
word_id = matches[0]

# Switch to evaluation mode before reading anything from the network.
tm.AutoEncoder.eval()

# Pure inspection of learned weights: no gradients are needed.
with torch.no_grad():
    # Transposed weight of the 'dec_1' decoder layer. The code indexes this
    # weight by a vocabulary position, i.e. it treats column `word_id` of
    # `words` as the vector of that word.
    # NOTE(review): assumes 'dec_1' has one weight row per vocabulary entry - confirm.
    words = tm.AutoEncoder.decoder['dec_1'].weight.T

    # Dot product of the query word's vector with every column of `words`.
    logit = torch.matmul(words[:, word_id], words)

    # `logit` is 1-D, so normalise over its only axis. Passing `dim` explicitly
    # removes the "implicit dimension choice for softmax" deprecation warning.
    beta = F.softmax(logit, dim=0)

    # Keep the 8 highest-scoring entries.
    vals, indices = torch.topk(beta, 8)

vals = vals.cpu().tolist()
indices = indices.cpu().tolist()

# Map the winning indices back to tokens; as the last expression, this is the cell's output.
[tm.id2token[idx] for idx in indices]

  beta = F.softmax(logit)


['tax', 'budget', 'taxis', 'spending', 'cut', 'deficit', 'income', 'debt']