In [1]:
import pandas as pd
df = pd.read_csv('../data/hansard_speeches_processed.csv')
df = df.sample(n=50000, random_state = 42)

import sys
sys.path.append('../gtm/')
from corpus import GTMCorpus
from gtm import GTM

# Create a GTMCorpus object
train_dataset = GTMCorpus(
    df, 
    prevalence = "~ party", 
    content = "~ party"
)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Train the model
tm = GTM(
    train_dataset, 
    n_topics=10,
    doc_topic_prior='dirichlet',
    alpha=0.1,
    update_prior=False,
    num_epochs=10,
    print_every=10000,
    log_every=1,
    w_prior=None,
    batch_size=256
)


Epoch   1	Mean Training Loss:3.0027141

['year', 'make', 'increase', 'need', 'service', 'cent', 'say', 'take']
['hon', 'make', 'take', 'say', 'friend', 'point', 'member', 'need']
['hon', 'make', 'friend', 'take', 'year', 'say', 'need', 'country']
['hon', 'friend', 'make', 'give', 'member', 'say', 'year', 'know']
['hon', 'friend', 'make', 'say', 'take', 'agree', 'point', 'member']
['hon', 'make', 'take', 'say', 'year', 'clause', 'amendment', 'people']
['hon', 'agree', 'friend', 'make', 'take', 'year', 'people', 'give']
['year', 'need', 'say', 'make', 'people', 'take', 'hon', 'country']
['hon', 'agree', 'make', 'friend', 'take', 'point', 'say', 'people']
['hon', 'people', 'friend', 'make', 'year', 'say', 'point', 'agree']
['hon', 'friend', 'make', 'people', 'say', 'year', 'take', 'give']
['austerity', 'poverty', 'dole', 'poll', 'sack', 'miner', 'unionist', 'worker']



Epoch   2	Mean Training Loss:2.6908663

['year', 'cent', 'service', 'problem', 'need', 'say', 'make', 'property']
['hon


Epoch   9	Mean Training Loss:2.4825752

['freight', 'rail', 'road', 'motorway', 'journey', 'train', 'traffic', 'passenger']
['economy', 'apprenticeship', 'unemployment', 'inflation', 'pensioner', 'wage', 'university', 'apprentice']
['childminder', 'fundholde', 'fiddling', 'miner', 'recapitalisation', 'bluster', 'g8', 'coursing']
['hon', 'say', 'friend', 'know', 'victim', 'ask', 'prison', 'people']
['defendant', 'jury', 'miscarriage', 'soca', 'innocence', 'disclosure', 'offence', 'referendum']
['subsection', 'petitioner', 'houseboat', '3a', 'page', 'no', 'insert', 'section']
['offence', 'sentence', 'miscarriage', 'defendant', 'convict', 'jury', 'extradition', 'murder']
['tenant', 'rent', 'house', 'housing', 'property', 'nurse', 'flat', 'cent']
['inflation', 'cent', 'investment', 'price', 'tax', 'borrowing', 'industry', 'output']
['apprenticeship', 'economy', 'unemployment', 'pensioner', 'inflation', 'wage', 'university', 'apprentice']
['hon', 'friend', 'make', 'people', 'say', 'year', 

In [9]:
# Assess the quality of the learned word embeddings 
# Top 8 closest words to a specific word

import torch
import torch.nn.functional as F

specific_word = 'crime'

word_id = [i for i,w in enumerate(train_dataset.vocab) if w == specific_word][0]

words = tm.AutoEncoder.decoder['dec_1'].weight.T

logit = torch.matmul(words.T[word_id], words)

beta = F.softmax(logit)

tm.AutoEncoder.eval()
topic_words = []
vals, indices = torch.topk(beta, 8)
vals = vals.cpu().tolist()
indices = indices.cpu().tolist()
[tm.id2token[idx] for idx in indices]

  beta = F.softmax(logit)


['crime', 'police', 'hon', 'agree', 'dispute', 'party', 'year', 'say']

In [None]:
# Visualize the learned embeddings space 

# Topic, covariates, and word embeddings
topics_and_covs = tm.AutoEncoder.decoder['dec_0'].weight.T
words = tm.AutoEncoder.decoder['dec_1'].weight.T

# Dimension reduction (PCA 50 components + UMAP 2 components)
all_vecs = torch.cat([words.T, topics_and_covs], dim=0)
all_vecs = all_vecs.cpu().detach().numpy()

from sklearn.decomposition import PCA
pca_args={"n_components": 50, "svd_solver": "full"}
pca_model = PCA(**pca_args).fit(all_vecs)
all_vecs = pca_model.transform(all_vecs)

import umap
umap_args={"n_neighbors": 15, "n_components": 2, "random_state": 0}
umap_model = umap.umap_.UMAP(**umap_args).fit(all_vecs)

# Plot resulting vectors
import matplotlib.pyplot as plt
plt.figure(dpi=80)

words = words.T.cpu().detach().numpy()
words = pca_model.transform(words)
words = umap_model.transform(words)
plt.scatter(
    words[:,0],
    words[:,1],
    color=(0.5, 0.5, 1),
    alpha=0.5,
)

topics = topics_and_covs.cpu().detach().numpy()
topics = pca_model.transform(topics)
topics = umap_model.transform(topics)
plt.scatter(
    topics[:,0],
    topics[:,1],
    color=(0, 0.5, 0.5),
    alpha=1,
)

In [None]:
tm.prior.lambda_

In [None]:
# tm.get_topic_correlations()