In [1]:
import numpy as np
import pandas as pd

# One seed for every stochastic step in this cell, so that Restart & Run All
# reproduces the same synthetic target and the same train/test split.
SEED = 42
rng = np.random.default_rng(SEED)

# Load the speech sample, keep only rows that have cleaned text, and attach a
# synthetic standard-normal column 'random' (used later as the prediction target).
# Built as one chained expression so re-running the cell always starts from the CSV.
df = (
    pd.read_csv('../data/us_congressional_record/us_congress_speeches_sample_1000.csv')
    .dropna(subset=['doc_clean'])
    .assign(random=lambda frame: rng.normal(size=len(frame.index)))
)

# 90% of the rows go to train; the remaining rows form the test set.
# Both frames get a fresh 0..n-1 index.
train = df.sample(frac=0.9, random_state=SEED)
test = df.drop(train.index).reset_index(drop=True)
train = train.reset_index(drop=True)

In [2]:
import sys
sys.path.append('../gtm/')
from corpus import GTMCorpus
from gtm import GTM

# Formula arguments shared by the train and test corpora, declared once so the
# two corpora cannot drift apart.
corpus_formulas = dict(
    labels="~random-1",  # the features to predict; would be "~ gdp" if the df had a column 'gdp'
    content="~1",        # an intercept to "absorb" frequent non-topical words
)

# Training corpus. The dataframe must contain a column 'doc' with the text of each
# document and a column 'doc_clean' with the cleaned text of each document.
train_dataset = GTMCorpus(train, **corpus_formulas)

# Separate test corpus (kept apart from the training data to avoid overfitting the
# supervised learning algorithm). It reuses the training vectorizer so that both
# document-term matrices have the same number of dimensions.
test_dataset = GTMCorpus(
    test,
    **corpus_formulas,
    vectorizer=train_dataset.vectorizer,
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Fit a 20-topic model on the training corpus; the test corpus is passed alongside it.
# predictor_type='regressor' is the setting for continuous targets such as GDP.
tm = GTM(train_dataset, test_dataset, n_topics=20, predictor_type='regressor')


Epoch   1	Mean Training Loss:1396.3007568

Topic_0: ['single', 'political', 'time', 'want', 'matter', 'fake_nostrum', 'face', 'body']
Topic_1: ['departure', 'use', 'other', 'coal', 'favor', 'several', 'amend', 'sex_discrimination']
Topic_2: ['serve', 'mention', 'know', 'waste', 'testimony', 'employ', 'open', 'ask']
Topic_3: ['tract_land', 'inner_city', 'practice', 'accommodation', 'less', 'town', 'transport', 'post_office']
Topic_4: ['accept', 'able', 'manner', 'cut', 'face', 'community', 'fix', 'relief']
Topic_5: ['controlledprice', 'awkward', 'hereinafter', 'apprenticeship', 'breadth', 'gap', 'conformity', 'hasto']
Topic_6: ['vote', 'eye', 'customer', 'table', 'result', 'affect', 'intention', 'department']
Topic_7: ['face', 'favor', 'quality', 'include', 'body', 'prevent', 'crop', 'address']
Topic_8: ['concern', 'money', 'industry', 'press_release', 'total', 'submarginal', 'bad', 'varied']
Topic_9: ['try', 'advocate', 'suggest', 'press', 'qualified', 'get', 'know', 'floor']
Topic_10

In [5]:
# Show the first ten predictions for the training corpus
# (to_numpy=False leaves the result as a tensor rather than a NumPy array).
tm.get_predictions(train_dataset, to_numpy=False)[:10]

tensor([[0.0783],
        [0.1100],
        [0.1853],
        [0.0830],
        [0.1303],
        [0.0993],
        [0.1969],
        [0.1044],
        [0.1141],
        [0.0922]], device='cuda:0')