In [1]:
import pandas as pd
# Read the processed Hansard speeches and keep a reproducible 50,000-row subsample.
df = pd.read_csv('../data/hansard_speeches_processed.csv').sample(n=50000, random_state=42)

# 90% of the subsample goes to training; the rows that were not drawn form the test set.
# Both frames are renumbered from 0 once the split is done.
train = df.sample(frac=0.9, random_state=42)
test = df.drop(index=train.index).reset_index(drop=True)
train = train.reset_index(drop=True)

In [2]:
import sys
sys.path.append('../gtm/')
from corpus import GTMCorpus
from gtm import GTM

# Create two GTMCorpus objects (one train and one test set, to avoid overfitting the supervised learning algorithm).
# The formulas are identical for both corpora, so they are declared once.
corpus_formulas = dict(
    labels="~party-1",  # The features to predict. Would be "~ gdp" if the df has a column 'gdp'.
    content="~1",
)

# `train` must contain a column 'doc' with the text of each document
# and a column 'doc_clean' with the cleaned text of each document.
train_dataset = GTMCorpus(train, **corpus_formulas)

# Pass on the same vectorizer as for the training set
# (this ensures the document term matrices have the same number of dimensions).
test_dataset = GTMCorpus(
    test,
    **corpus_formulas,
    vectorizer=train_dataset.vectorizer,
)

  from .autonotebook import tqdm as notebook_tqdm
  self.log_word_frequencies = torch.FloatTensor(np.log(np.array(self.M_bow.sum(axis=0)).flatten()))


In [3]:
import numpy as np
# Remove column 0 (the first column) from the label matrix of both corpora.
# NOTE: the result is written back onto the same attribute, so re-running this
# cell strips one more column each time; rebuild the corpora before re-running.
for corpus in (train_dataset, test_dataset):
    corpus.M_labels = np.delete(corpus.M_labels, 0, axis=1)

In [12]:
# Train the model (keyword arguments grouped by what they control).
tm = GTM(
    train_dataset,
    test_dataset,
    # --- topics and document-topic prior ---
    n_topics=20,
    doc_topic_prior='dirichlet',  # other option is "logistic_normal"
    alpha=0.1,  # sparsity of the dirichlet prior
    update_prior=False,  # no prevalence covariates so no need to update the prior
    # --- network architecture ---
    encoder_hidden_layers=[],  # structure of the encoder neural net
    decoder_hidden_layers=[50],  # structure of the decoder neural net
    predictor_hidden_layers=[],
    predictor_type='classifier',  # 'regressor' for continuous variables such as GDP
    # --- loss weights ---
    w_pred_loss=1000,  # how much weight should we give to the prediction task in the likelihood?
    w_prior=None,  # how much weight should we give to the prior in the likelihood?
    # --- optimisation and logging ---
    num_epochs=10,  # No need to run many epochs. I found 10 to work well on 50 000 speeches.
    batch_size=256,
    print_every=50,  # print progress every x batches
    log_every=1,  # print topic-word dist every x epochs
)

Epoch   1	Iter   50	Training Loss:15.1167202
Rec Loss:2.7431283
MMD Loss:9.7257586
Sparsity Loss:0.0000000
Pred Loss:2.6478331

Epoch   1	Iter  100	Training Loss:14.6502352
Rec Loss:2.6080754
MMD Loss:9.3702087
Sparsity Loss:0.0000000
Pred Loss:2.6719515

Epoch   1	Iter  150	Training Loss:14.4637280
Rec Loss:2.5625768
MMD Loss:9.2773342
Sparsity Loss:0.0000000
Pred Loss:2.6238170


Epoch   1	Mean Training Loss:15.4359754


Epoch   1	Mean Validation Loss:15.5909480

['year', 'hon', 'people', 'friend', 'give', 'make', 'say', 'way']
['friend', 'hon', 'make', 'give', 'say', 'people', 'member', 'support']
['hon', 'friend', 'make', 'member', 'people', 'say', 'work', 'time']
['hon', 'say', 'friend', 'make', 'people', 'take', 'year', 'time']
['friend', 'hon', 'make', 'say', 'time', 'take', 'people', 'year']
['hon', 'say', 'make', 'take', 'friend', 'year', 'give', 'time']
['hon', 'year', 'say', 'people', 'make', 'give', 'friend', 'time']
['hon', 'friend', 'year', 'say', 'people', 'make', 'membe

Epoch   6	Iter   50	Training Loss:5.2708597
Rec Loss:2.5292442
MMD Loss:0.2705612
Sparsity Loss:0.0000000
Pred Loss:2.4710546

Epoch   6	Iter  100	Training Loss:5.1750774
Rec Loss:2.5467181
MMD Loss:0.2497677
Sparsity Loss:0.0000000
Pred Loss:2.3785913

Epoch   6	Iter  150	Training Loss:5.1840124
Rec Loss:2.5548675
MMD Loss:0.2301888
Sparsity Loss:0.0000000
Pred Loss:2.3989558


Epoch   6	Mean Training Loss:5.0649729


Epoch   6	Mean Validation Loss:5.9430970

['hospital', 'patient', 'health', 'service', 'care', 'rail', 'road', 'transport']
['tax', 'unemployment', 'income', 'rate', 'wage', 'pensioner', 'cent', 'inflation']
['police', 'crime', 'service', 'health', 'hospital', 'officer', 'drug', 'policing']
['friend', 'hon', 'thank', 'issue', 'agree', 'work', 'people', 'support']
['trade', 'country', 'hon', 'industry', 'friend', 'world', 'make', 'negotiation']
['industry', 'investment', 'hon', 'manufacturing', 'friend', 'export', 'job', 'economy']
['hon', 'question', 'debate', 'matter', 

Epoch  10	Iter   50	Training Loss:4.1466627
Rec Loss:2.1698108
MMD Loss:0.1156736
Sparsity Loss:0.0000000
Pred Loss:1.8611782

Epoch  10	Iter  100	Training Loss:4.4587851
Rec Loss:2.4270387
MMD Loss:0.1970768
Sparsity Loss:0.0000000
Pred Loss:1.8346696

Epoch  10	Iter  150	Training Loss:4.4065566
Rec Loss:2.4575858
MMD Loss:0.1081774
Sparsity Loss:0.0000000
Pred Loss:1.8407934


Epoch  10	Mean Training Loss:4.4149794


Epoch  10	Mean Validation Loss:5.1640263

['hospital', 'patient', 'care', 'health', 'service', 'rail', 'bus', 'road']
['tax', 'pensioner', 'unemployment', 'pension', 'wage', 'income', 'benefit', 'poverty']
['police', 'crime', 'prison', 'policing', 'hospital', 'drug', 'officer', 'service']
['child', 'friend', 'school', 'parent', 'hon', 'family', 'thank', 'question']
['trade', 'treaty', 'agreement', 'union', 'country', 'negotiation', 'peace', 'export']
['industry', 'steel', 'manufacturing', 'hon', 'friend', 'investment', 'tourism', 'export']
['question', 'hon', 'statement'

In [5]:
# Assess the quality of the learned word embeddings:
# list the top 8 closest words to a specific word.

import torch
import torch.nn.functional as F


def closest_words(embeddings, vocab, id2token, word, k=8):
    """Return the `k` tokens whose embedding scores highest against `word`.

    The score is the dot product between the embedding row of `word` and every
    row of `embeddings`, passed through a softmax over the vocabulary axis.

    Parameters
    ----------
    embeddings : torch.Tensor
        2-D tensor indexed by word id along its first axis.
    vocab : sequence of str
        Vocabulary used to find the row index of `word` (first match wins).
    id2token : mapping or sequence
        Maps a row index back to its token.
    word : str
        The query word.
    k : int, default 8
        Number of tokens to return; capped at the vocabulary size.

    Returns
    -------
    list
        Tokens ordered from highest to lowest score.

    Raises
    ------
    ValueError
        If `word` is not present in `vocab`.
    """
    try:
        word_id = list(vocab).index(word)
    except ValueError:
        raise ValueError(f"{word!r} is not in the vocabulary") from None

    # Pure inspection: no gradients are needed.
    with torch.no_grad():
        logits = torch.matmul(embeddings, embeddings[word_id])
        # `dim` is given explicitly; the implicit form is deprecated and warns.
        beta = F.softmax(logits, dim=0)
        _, indices = torch.topk(beta, min(k, beta.numel()))

    return [id2token[idx] for idx in indices.cpu().tolist()]


specific_word = 'tax'

# Switch to evaluation mode before reading anything from the network.
tm.AutoEncoder.eval()

# Assumes the 'dec_1' weight has one row per vocabulary entry, which is what
# indexing it by word id relies on -- TODO confirm against the GTM decoder.
# NOTE(review): raw dot products are not length-normalised, so rows with a
# large norm can dominate the ranking; if the result is mostly generic words,
# consider comparing L2-normalised rows (cosine similarity) instead.
word_embeddings = tm.AutoEncoder.decoder['dec_1'].weight

closest_words(word_embeddings, train_dataset.vocab, tm.id2token, specific_word, k=8)

  beta = F.softmax(logit)


['hon', 'people', 'say', 'make', 'take', 'work', 'year', 'friend']