In [1]:
import pandas as pd 
# Load the pre-processed congressional speeches and keep a fixed-seed random
# subset of 50,000 rows so the rest of the notebook runs on a manageable sample.
df = (
    pd.read_csv('../data/us_congressional_record/us_congress_speeches_processed.csv')
    .sample(n=50000, random_state=42)
)
df

Unnamed: 0.1,Unnamed: 0,doc,party,speech_year,doc_clean
691903,691903,"Speaker, I would like to express my strong su...",Republican,2005,like express support convey land use heliport ...
351367,351367,"Speaker, this bill, H R. 2277 authorizes $75,...",Democrat,1979,bill authorize airport airway trust fund year ...
250310,250310,"Speaker, I appreciate having this opportunity...",Democrat,1972,appreciate have opportunity discuss colleague ...
644446,644446,The gentleman is absolutely correct. I rememb...,Democrat,2000,gentleman remember friend say start road life ...
803736,803736,I thank the gentleman from Florida for some o...,Republican,2002,thank gentleman opportunity share passion view...
...,...,...,...,...,...
79650,79650,"Chairman, I rise in opposition to the amendme...",Democrat,1958,rise opposition amendment offer gentleman answ...
544086,544086,"Speaker, there is one central question that m...",Republican,1993,question answer people issue health care trust...
856936,856936,"Speaker, I very much appreciate the gentleman...",Republican,2003,appreciate gentleman come discuss tonight brin...
811229,811229,"Chairman, will the gentlewoman yield? Chairm...",Democrat,1997,gentlewoman yield like reemphasize gentlewoman...


In [2]:
import sys
sys.path.append('../gtm/')
from corpus import GTMCorpus
from gtm import GTM

# Build the GTMCorpus from the sampled speeches. The same covariate formula
# (party plus a categorical speech year) is used for both prevalence and content.
covariate_formula = "~ party + C(speech_year)"
train_dataset = GTMCorpus(
    df,
    prevalence=covariate_formula,
    content=covariate_formula,
)

# Quick sanity check on the size of the prevalence covariate matrix.
train_dataset.M_prevalence_covariates.shape

  from .autonotebook import tqdm as notebook_tqdm


(50000, 95)

In [3]:
# Display the prevalence covariate matrix held by the corpus (its shape is shown
# by the previous cell); the notebook repr abbreviates large arrays.
train_dataset.M_prevalence_covariates

array([[1., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [4]:
# List the labels of the prevalence covariates (intercept, party dummy and
# speech-year dummies, per the recorded output) to interpret the matrix columns.
train_dataset.prevalence_colnames

['Intercept',
 'party[T.Republican]',
 'C(speech_year)[T.1921]',
 'C(speech_year)[T.1922]',
 'C(speech_year)[T.1923]',
 'C(speech_year)[T.1924]',
 'C(speech_year)[T.1925]',
 'C(speech_year)[T.1926]',
 'C(speech_year)[T.1928]',
 'C(speech_year)[T.1929]',
 'C(speech_year)[T.1930]',
 'C(speech_year)[T.1931]',
 'C(speech_year)[T.1932]',
 'C(speech_year)[T.1933]',
 'C(speech_year)[T.1934]',
 'C(speech_year)[T.1935]',
 'C(speech_year)[T.1936]',
 'C(speech_year)[T.1937]',
 'C(speech_year)[T.1938]',
 'C(speech_year)[T.1939]',
 'C(speech_year)[T.1940]',
 'C(speech_year)[T.1941]',
 'C(speech_year)[T.1942]',
 'C(speech_year)[T.1943]',
 'C(speech_year)[T.1944]',
 'C(speech_year)[T.1945]',
 'C(speech_year)[T.1946]',
 'C(speech_year)[T.1947]',
 'C(speech_year)[T.1948]',
 'C(speech_year)[T.1949]',
 'C(speech_year)[T.1950]',
 'C(speech_year)[T.1951]',
 'C(speech_year)[T.1952]',
 'C(speech_year)[T.1953]',
 'C(speech_year)[T.1954]',
 'C(speech_year)[T.1955]',
 'C(speech_year)[T.1956]',
 'C(speech_year)[

In [5]:
# Train the model.
# Hyperparameters are gathered in one mapping so they can be scanned and tuned
# in a single place; they are passed to GTM unchanged as keyword arguments.
gtm_hyperparams = {
    "n_topics": 50,
    "doc_topic_prior": 'logistic_normal',  # other option noted by the author: dirichlet
    "alpha": 0.1,
    "prevalence_covariates_regularization": 0.1,
    "update_prior": True,
    "encoder_hidden_layers": [],     # structure of the encoder neural net
    "decoder_hidden_layers": [300],  # structure of the decoder neural net
    "num_epochs": 5,
    "print_every": 10000,
    "log_every": 1,
    "w_prior": None,
    "batch_size": 250,
}

tm = GTM(train_dataset, **gtm_hyperparams)


Epoch   1	Mean Training Loss:4.3714911

Topic_0: ['bill', 'year', 'go', 'people', 'gentleman']
Topic_1: ['year', 'bill', 'people', 'time', 'go']
Topic_2: ['bill', 'year', 'people', 'go', 'make']
Topic_3: ['bill', 'year', 'go', 'people', 'time']
Topic_4: ['bill', 'go', 'people', 'time', 'year']
Topic_5: ['bill', 'year', 'go', 'people', 'gentleman']
Topic_6: ['year', 'bill', 'go', 'people', 'program']
Topic_7: ['bill', 'year', 'make', 'say', 'go']
Topic_8: ['bill', 'year', 'people', 'make', 'time']
Topic_9: ['year', 'bill', 'people', 'go', 'make']
Topic_10: ['bill', 'year', 'go', 'time', 'people']
Topic_11: ['bill', 'year', 'gentleman', 'go', 'say']
Topic_12: ['bill', 'go', 'year', 'people', 'time']
Topic_13: ['people', 'bill', 'go', 'year', 'gentleman']
Topic_14: ['year', 'bill', 'people', 'go', 'program']
Topic_15: ['bill', 'people', 'year', 'go', 'make']
Topic_16: ['go', 'bill', 'year', 'people', 'make']
Topic_17: ['bill', 'go', 'year', 'time', 'people']
Topic_18: ['year', 'bill', 't


Epoch   2	Mean Training Loss:3.5065520

Topic_0: ['committee', 'purpose', 'report', 'authorize', 'power']
Topic_1: ['bill', 'gentleman', 'year', 'time', 'amendment']
Topic_2: ['bill', 'gentleman', 'make', 'year', 'amendment']
Topic_3: ['bill', 'gentleman', 'amendment', 'year', 'go']
Topic_4: ['go', 'say', 'people', 'get', 'bill']
Topic_5: ['bill', 'go', 'year', 'child', 'time']
Topic_6: ['program', 'year', 'tax', 'bill', 'percent']
Topic_7: ['gentleman', 'man', 'make', 'time', 'say']
Topic_8: ['bill', 'gentleman', 'make', 'amendment', 'year']
Topic_9: ['people', 'say', 'go', 'time', 'make']
Topic_10: ['program', 'provide', 'bill', 'committee', 'provision']
Topic_11: ['gentleman', 'bill', 'say', 'go', 'make']
Topic_12: ['bill', 'amendment', 'year', 'program', 'provide']
Topic_13: ['go', 'get', 'tax', 'people', 'job']
Topic_14: ['program', 'year', 'problem', 'time', 'make']
Topic_15: ['bill', 'gentleman', 'people', 'go', 'amendment']
Topic_16: ['go', 'get', 'tax', 'talk', 'cut']
Topic_1


Epoch   3	Mean Training Loss:3.4580904

Topic_0: ['court', 'establish', 'conduct', 'right', 'action']
Topic_1: ['gentleman', 'amendment', 'bill', 'yield', 'vote']
Topic_2: ['gentleman', 'amendment', 'think', 'bill', 'vote']
Topic_3: ['gentleman', 'amendment', 'bill', 'vote', 'yield']
Topic_4: ['go', 'tax', 'budget', 'cut', 'get']
Topic_5: ['child', 'care', 'bill', 'family', 'thank']
Topic_6: ['program', 'tax', 'income', 'increase', 'percent']
Topic_7: ['right', 'resolution', 'freedom', 'peace', 'man']
Topic_8: ['gentleman', 'amendment', 'bill', 'committee', 'yield']
Topic_9: ['people', 'say', 'go', 'know', 'gentleman']
Topic_10: ['program', 'provide', 'funding', 'provision', 'fund']
Topic_11: ['gentleman', 'say', 'amendment', 'vote', 'think']
Topic_12: ['bill', 'amendment', 'provision', 'legislation', 'gentleman']
Topic_13: ['price', 'oil', 'market', 'energy', 'company']
Topic_14: ['world', 'nation', 'program', 'industry', 'development']
Topic_15: ['gentleman', 'amendment', 'yield', '


Epoch   4	Mean Training Loss:3.4609587

Topic_0: ['land', 'treaty', 'aggression', 'establish', 'peace']
Topic_1: ['gentleman', 'amendment', 'yield', 'bill', 'rule']
Topic_2: ['gentleman', 'amendment', 'yield', 'vote', 'rule']
Topic_3: ['gentleman', 'amendment', 'yield', 'vote', 'rule']
Topic_4: ['tax', 'go', 'budget', 'taxis', 'cut']
Topic_5: ['child', 'woman', 'care', 'ensure', 'thank']
Topic_6: ['program', 'income', 'education', 'tax', 'funding']
Topic_7: ['peace', 'freedom', 'resolution', 'war', 'right']
Topic_8: ['gentleman', 'amendment', 'yield', 'bill', 'committee']
Topic_9: ['people', 'world', 'say', 'war', 'freedom']
Topic_10: ['funding', 'provision', 'program', 'fund', 'ensure']
Topic_11: ['gentleman', 'right', 'yield', 'vote', 'rule']
Topic_12: ['amendment', 'bill', 'rule', 'provision', 'gentleman']
Topic_13: ['price', 'oil', 'gas', 'drill', 'trade']
Topic_14: ['world', 'nation', 'development', 'community', 'trade']
Topic_15: ['gentleman', 'amendment', 'yield', 'rule', 'vote


Epoch   5	Mean Training Loss:3.4948150

Topic_0: ['aggression', 'peace', 'land', 'treaty', 'official']
Topic_1: ['gentleman', 'amendment', 'yield', 'rule', 'vote']
Topic_2: ['gentleman', 'yield', 'amendment', 'rule', 'vote']
Topic_3: ['gentleman', 'yield', 'rule', 'amendment', 'vote']
Topic_4: ['tax', 'taxis', 'budget', 'deficit', 'spending']
Topic_5: ['child', 'abortion', 'woman', 'parent', 'care']
Topic_6: ['program', 'education', 'income', 'child', 'health']
Topic_7: ['peace', 'aggression', 'freedom', 'regime', 'war']
Topic_8: ['gentleman', 'amendment', 'yield', 'rule', 'bill']
Topic_9: ['world', 'war', 'people', 'freedom', 'peace']
Topic_10: ['funding', 'provision', 'authorize', 'project', 'water']
Topic_11: ['honor', 'tribute', 'love', 'hero', 'prayer']
Topic_12: ['amendment', 'rule', 'bill', 'motion', 'gentleman']
Topic_13: ['trade', 'oil', 'price', 'market', 'drill']
Topic_14: ['world', 'trade', 'nation', 'development', 'technology']
Topic_15: ['gentleman', 'rule', 'yield', 'am

In [6]:
# Assess the quality of the learned word embeddings:
# list the 8 vocabulary words that score highest against a chosen query word,
# using the weights of the decoder layer 'dec_1' as word embeddings.

import torch
import torch.nn.functional as F

specific_word = 'terrorist'

# Locate the query word in the vocabulary; fail with a readable message instead
# of a bare IndexError when the word is not part of the training vocabulary.
word_id = next(
    (i for i, w in enumerate(train_dataset.vocab) if w == specific_word),
    None,
)
if word_id is None:
    raise ValueError(f"{specific_word!r} is not in the training vocabulary")

# Leave the model in evaluation mode (this does not alter the weights read below).
tm.AutoEncoder.eval()

# Transposed decoder weights; `words.T[word_id]` below selects the query word's vector.
words = tm.AutoEncoder.decoder['dec_1'].weight.T

# Pure inspection: no gradients are needed, so skip building an autograd graph.
with torch.no_grad():
    # Dot product of the query word's vector with every word's vector.
    logit = torch.matmul(words.T[word_id], words)

    # `logit` is one-dimensional, so normalise explicitly over dim 0. This is the
    # dimension PyTorch picked implicitly before, without the deprecation warning.
    beta = F.softmax(logit, dim=0)

    vals, indices = torch.topk(beta, 8)

vals = vals.cpu().tolist()
indices = indices.cpu().tolist()
[tm.id2token[idx] for idx in indices]

  beta = F.softmax(logit)


['terrorist',
 'terrorism',
 'terror',
 'ensure',
 'hero',
 'bravery',
 '11th',
 'bless']

In [7]:
import pyLDAvis

# Export the fitted model in the keyword layout that pyLDAvis.prepare accepts,
# then build the visualisation data and render it inline.
ldavis_format = tm.get_ldavis_data_format(train_dataset)
gtm_vis_data = pyLDAvis.prepare(**ldavis_format)
pyLDAvis.display(gtm_vis_data)

In [8]:
# Estimate covariate effects for topics 18 and 44 using 25 samples.
# NOTE(review): in the recorded run this call failed with
# "TypeError: LogisticNormalPrior.sample() missing 1 required positional argument: 'epoch'".
# The cause presumably lies inside the gtm package rather than in these
# arguments. Confirm and fix it there; until then `dfc` is never assigned.
dfc = tm.estimate_effect(
    train_dataset,
    n_samples=25,
    topic_ids=[18, 44],
)

  """


TypeError: LogisticNormalPrior.sample() missing 1 required positional argument: 'epoch'

In [None]:
# Show only the estimated effects attached to the 2009 speech-year dummy.
is_year_2009 = dfc['covariate'] == "C(speech_year)[T.2009]"
dfc[is_year_2009]