In [1]:
import numpy as np
import pandas as pd

# Tunable settings for this cell, gathered in one place.
DATA_PATH = '../data/us_congressional_record/us_congress_speeches_processed.csv'
N_ROWS = 1000   # only the first N_ROWS rows of the CSV are read
MIN_WORDS = 30  # keep documents with strictly more whitespace-separated tokens than this
SEED = 42       # seeds both the synthetic 'random' column and the train/test split

# A dedicated, seeded generator makes the 'random' column identical on every
# fresh run; drawing from the unseeded global np.random state would not be.
rng = np.random.default_rng(SEED)

# Build the working frame in a single chain. Every step returns a new object,
# so no column is ever assigned onto a filtered slice (which would raise
# SettingWithCopyWarning) and re-running the cell always gives the same frame.
df = (
    pd.read_csv(DATA_PATH, nrows=N_ROWS)
    # Rows without cleaned text cannot be tokenised, so drop them first.
    .dropna(subset=['doc_clean'])
    # Number of whitespace-separated tokens in the cleaned text.
    .assign(word_count=lambda d: d['doc_clean'].apply(lambda text: len(text.split())))
    .loc[lambda d: d['word_count'] > MIN_WORDS]
    # One standard-normal draw per remaining row. The values are independent of
    # the text by construction.
    .assign(random=lambda d: rng.normal(size=len(d)))
)

# 90/10 split: sample the training rows, take the complement as the test set,
# then give both frames a clean 0..n-1 index.
train = df.sample(frac=0.9, random_state=SEED)
test = df.drop(train.index).reset_index(drop=True)
train = train.reset_index(drop=True)

In [2]:
import sys
sys.path.append('../gtm/')
from corpus import GTMCorpus
from gtm import GTM

# Build one GTMCorpus per split; a separate test corpus is kept to avoid
# overfitting the supervised learning algorithm.
#
# Formula arguments shared by both corpora:
#   labels  - the features to predict (it would be "~ gdp" if the frame had a 'gdp' column)
#   content - left at "~1" here
corpus_formulas = dict(labels="~random-1", content="~1")

# The input frame must contain a column 'doc' with the text of each document
# and a column 'doc_clean' with the cleaned text of each document.
train_dataset = GTMCorpus(train, **corpus_formulas)

# Pass on the vectorizer of the training corpus: this ensures the document-term
# matrices of both corpora have the same number of dimensions.
test_dataset = GTMCorpus(
    test,
    vectorizer=train_dataset.vectorizer,
    **corpus_formulas,
)

  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  np.log(np.array(self.M_bow.sum(axis=0)).flatten())


In [3]:
# Train the model.
# Hyperparameters are collected in one mapping so they can be read (and tuned)
# at a glance before being handed to GTM.
gtm_settings = {
    "n_topics": 20,
    # Prior over document-topic proportions; the other option is "logistic_normal".
    "doc_topic_prior": 'dirichlet',
    # There are no prevalence covariates, so the prior does not need updating.
    "update_prior": False,
    # Sparsity of the dirichlet prior.
    "alpha": 0.1,
    # Layer structure of the encoder, decoder and predictor neural nets.
    "encoder_hidden_layers": [],
    "decoder_hidden_layers": [64],
    "predictor_hidden_layers": [10],
    # 'regressor' is for continuous variables such as GDP.
    "predictor_type": 'regressor',
    # Weight given to the prediction task in the likelihood.
    "w_pred_loss": 1000,
}

# The training and test corpora are passed positionally, in that order.
tm = GTM(train_dataset, test_dataset, **gtm_settings)


Epoch   1	Mean Training Loss:7133.3489118

Topic_0: ['thinking', 'citrus', 'supsenate', 'decline', 'foundation', 'invention', 'magnificant', 'tough']
Topic_1: ['manufacture', 'plague', 'patriotism', 'definitive', 'congratulate', 'consist', 'slaughter', 'anaylsis']
Topic_2: ['way', 'grandparent', 'write', 'detail', 'immortality', 'extra', 'stiff', 'concern']
Topic_3: ['mirror', 'pull', 'thew', 'large', 'arise', 'propriety', 'comprise', 'desk']
Topic_4: ['suppress', 'implementation', 'appreciable', 'theft', 'selfcare', 'lence', 'standard', 'clear']
Topic_5: ['target', 'southeastern', 'enshrine', 'volunteering', 'kid', 'people', 'draft', 'wholesale']
Topic_6: ['fullblown', 'taxdeferre', 'govern', 'btu', 'deal', 'chafer', 'spring', 'tragic']
Topic_7: ['propriate', 'abrogate', 'detain', 'aim', 'uninvestigated', 'discrete', 'swimmer', 'autocracy']
Topic_8: ['nonphysician', 'circuit', 'final', 'nature', 'substance', 'advice', 'andthe', 'context']
Topic_9: ['trauma', 'believethat', 'productiv


Epoch   3	Mean Training Loss:5197.8691755

Topic_0: ['language', 'decline', 'last', 'tough', 'testimony', 'day', 'line', 'way']
Topic_1: ['unanimous', 'plague', 'manufacture', 'disclosure', 'congratulate', 'lose', 'recognition', 'motion']
Topic_2: ['way', 'write', 'care', 'add', 'benefit', 'develop', 'concern', 'make']
Topic_3: ['desk', 'large', 'continue', 'consider', 'arise', 'presidential', 'international', 'title']
Topic_4: ['clear', 'today', 'measure', 'implementation', 'review', 'do', 'standard', 'begin']
Topic_5: ['target', 'people', 'know', 'effect', 'bill', 'draft', 'procedure', 'debate']
Topic_6: ['move', 'deal', 'govern', 'raise', 'bill', 'legitimate', 'establish', 'pleased']
Topic_7: ['aim', 'propriate', 'application', 'abrogate', 'alternative', 'rational', 'discrete', 'uninvestigated']
Topic_8: ['gentleman', 'question', 'final', 'state', 'stand', 'independent', 'nature', 'historic']
Topic_9: ['make', 'believe', 'certain', 'cut', 'motion', 'today', 'specific', 'republican'

In [4]:
# Predictions for the training corpus; to_numpy=False keeps the result as a
# torch tensor. Displaying only its shape avoids dumping every value.
train_predictions = tm.get_predictions(train_dataset, to_numpy=False)
train_predictions.shape

torch.Size([886, 1])