## Preliminaries

In [37]:
import os
import pandas as pd
import numpy as np
from collections import Counter

import gensim
from gensim.models import Word2Vec

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.preprocessing import normalize

In [38]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [39]:
# File paths
VOCAB_COL = 'content-inclusion-only'
DOCUMENT_COL = 'content-cleaned'
EMBED_FILE = 'test-embedMatrix-cognitive.txt'
COMMENTS_FILE = 'comments-inclusion-bigrams20.json'
MODEL_SAVE_PATH = 'test-ETM-cognitive.pth'
BEST_MODEL_PATH = 'test-ETM-cognitive.pth'

# Model hyperparameters
T_HIDDEN_SIZE = 800

# Training settings
NUM_TOPICS = 10
BATCH_SIZE = 64

## Helper functions

In [61]:
def read_embedding_file(file_path):
    """
    Create an embedding matrix for the vocabulary from a text file.

    Args:
        file_path (str) : path to file
    Returns:
        numpy.array : VxD matrix, where V is vocab size and D is
            dimension of the embedding space.
    """
    vocab = []
    vecs = []
    with open(file_path, 'r') as f:
        for line in f:
            temp = line.strip().split()
            vocab.append(temp[0])
            vecs.append(list(map(float, temp[1:])))
    matrix = np.array(vecs)
    key_to_index = {v : i for i, v in enumerate(vocab)}
    return matrix, key_to_index

def process_docs_for_training(df, vocab_col, embed_dict):
    docs = df[vocab_col].tolist()
    doc_tokens = [doc.split() for doc in docs]
    vocabulary = embed_dict.keys()

    bag_of_words_docs = []
    for token in doc_tokens:
        bag_of_words = [0] * len(vocabulary)
        word_counts = Counter(token)
        for word, count in word_counts.items():
            if word in embed_dict:
                bag_of_words[embed_dict[word]] = count
        bag_of_words_docs.append(bag_of_words)

    normalized_bag_of_words = normalize(bag_of_words_docs, norm='l1', axis=1)
    return bag_of_words_docs, normalized_bag_of_words

def process_docs_for_inference(df, vocab_col, embed_dict):
    docs = df[vocab_col].tolist()
    doc_tensors = []
    for doc in docs:
        tensor = torch.zeros(len(embed_dict.keys()))
        for word in doc.split():
            if word in embed_dict:
                tensor[embed_dict[word]] += 1
        doc_tensors.append(tensor)
    return torch.stack(doc_tensors)

def get_top_words(model, vocab, top_n=10):
    """
    Get the n top words for each topic.
    """
    with torch.no_grad():
        beta = model.get_beta().numpy()
    top_words = {}
    for topic_id in range(beta.shape[0]):
        top_word_indices = np.argsort(beta[topic_id])[-top_n:]  # Get indices of top_n words
        top_words[topic_id] = [vocab[i] for i in top_word_indices]
    return top_words

def get_top_doc_indices(model, bow_tensor, normalized_bow_tensor, n=10):
    with torch.no_grad():
        thetas, _ = model.get_theta(normalized_bow_tensor)
        thetas = thetas.numpy()
    top_docs = {}
    num_topics = thetas.shape[1]
    for topic_id in range(num_topics):
        topic_proportion = thetas[:, topic_id]
        top_doc_indices = np.argsort(topic_proportion)[-n:]
        top_docs[topic_id] = top_doc_indices
    return top_docs

def get_top_docs(df, top_docs_dict):
    for topic_id, indices in top_docs_dict.items():
        df_topic = df.iloc[indices]
        count = 1
        print(f'\nTopic: {topic_id}')
        for i, row in df_topic.iterrows():
            print(f'{count}. {row["content-no-tex"]}')
            print(f'{count}. {row[VOCAB_COL]}')
            count += 1

## Model definition

In [41]:
# Try simple ETM
class ETM(nn.Module):
    # TO DO: encoder dropout? default 0.5
    def __init__(self, num_topics, embedding_matrix, t_hidden_size=100):
        """
        Neural topic model using variational autoencoder.

        Args:
            num_topics (int) : number of topics
            embedding_matrix (numpy.array) : VxD matrix, where V is the vocabulary
                size and D is the embedding dimension
            t_hidden_size (int) : dimension of hidden space q(theta)
        """
        super(ETM, self).__init__()

        # Model parameters
        self.num_topics = num_topics
        self.vocab_size = embedding_matrix.shape[0]
        self.rho_size = embedding_matrix.shape[1]
        self.t_hidden_size = t_hidden_size
        # emsize?
        # training?

        # Optimization hyperparameters
        self.enc_drop = 0.5
        self.t_dropout = nn.Dropout(self.enc_drop)
        self.theta_act = nn.ReLU()

        # Define word embedding
        self.rho = nn.Embedding(self.vocab_size, self.rho_size)
        self.rho.weight.data.copy_(torch.from_numpy(embedding_matrix).float())
        self.rho.weight.requires_grad = False # freeze weights

        # Define topic embedding
        self.alphas = nn.Linear(self.rho_size, num_topics, bias=False)

        # Variational distribution for untransformed topic proportion
        self.q_theta = nn.Sequential(
            nn.Linear(self.vocab_size, self.t_hidden_size),
            self.theta_act,
            nn.Linear(self.t_hidden_size, self.t_hidden_size),
            self.theta_act
        )
        self.mu_q_theta = nn.Linear(self.t_hidden_size, num_topics, bias=True)
        self.log_sigma_q_theta = nn.Linear(self.t_hidden_size, num_topics, bias=True)

    def reparameterize(self, mu, log_sigma):
        """
        Returns a sample from the reparametrized Gaussian distribution TO DO

        Args:
            mu : mean
            log_sigma : variance
        """
        std = torch.exp(0.5 * log_sigma)
        eps = torch.randn_like(std)
        return eps.mul_(std).add_(mu)

    def encode(self, bag_of_words):
        """
        Get parameters of variational distribution for \theta.

        Args:
            bag_of_words (torch.Tensor) : tensor of shape [batch_size] x [vocab_size]
        """
        q_theta = self.q_theta(bag_of_words)
        if self.enc_drop > 0:
            q_theta = self.t_dropout(q_theta)
        mu_theta = self.mu_q_theta(q_theta)
        log_sigma_theta = self.log_sigma_q_theta(q_theta)
        kl_theta = -0.5 * torch.sum(1 + log_sigma_theta - mu_theta.pow(2) - log_sigma_theta.exp())
        return mu_theta, log_sigma_theta, kl_theta

    def get_beta(self):
        """
        Get 'traditional' topic (i.e., distribution over words) induced by word
        embeddings \rho and topic embedding \alpha_k.
        """
        logit = self.alphas(self.rho.weight) # torch.mm(self.rho.weight, self.alphas)
        return F.softmax(logit, dim=0).transpose(1, 0) # softmax over vocab dimension

    def get_theta(self, normalized_bag_of_words):
        """
        Get topic proportion for the normalized "bag of words" document.
        """
        mu_theta, log_sigma_theta, kl_theta = self.encode(normalized_bag_of_words)
        z = self.reparameterize(mu_theta, log_sigma_theta) # topic assignment
        theta = F.softmax(z, dim=1)
        return theta, kl_theta

    def decode(self, theta, beta):
        """
        Get log-probabilities of a topic given a document, assigning small
        values to avoid computing log of 0.

        Args:
            theta (torch.Tensor)
            beta (torch.Tensor)
        """
        results = torch.mm(theta, beta)
        almost_zeros = torch.full_like(results, 1e-6)
        results_without_zeros = results.add(almost_zeros)
        return torch.log(results_without_zeros)

    def forward(self, bag_of_words, normalized_bag_of_words, theta=None):
        # Get topic proportions
        if theta is None:
            theta, kl_theta = self.get_theta(normalized_bag_of_words)
        else:
            kl_theta = None

        # Get topic as a distribution over words
        beta = self.get_beta()

        # Get prediction loss
        preds = self.decode(theta, beta)
        reconstruction_loss = -(preds * bag_of_words).sum(1)
        reconstruction_loss = reconstruction_loss.mean() # aggregate
        return reconstruction_loss, kl_theta

## Import data

In [42]:
embed_matrix, embed_dict = read_embedding_file(EMBED_FILE)
vocabulary = embed_dict.keys()
embed_matrix.shape

(652, 300)

In [43]:
df_comments = pd.read_json(COMMENTS_FILE)
df = df_comments
df.columns

Index(['author', 'author-href', 'time', 'comment-href', 'comment-id',
       'content', 'child-ids', 'blog', 'post-id', 'id', 'in-reply-to',
       'in-reply-to-href', 'project-id', 'content-no-tex', 'tex-count',
       'content-cleaned', 'content-inclusion-only', 'content-bigrams'],
      dtype='object')

In [44]:
bow, normalized_bow = process_docs_for_training(df, VOCAB_COL, embed_dict)

In [45]:
bow_tensor = torch.tensor(bow, dtype=torch.float32)
normalized_bow_tensor = torch.tensor(normalized_bow, dtype=torch.float32)

print(bow_tensor.shape, normalized_bow_tensor.shape)

torch.Size([15924, 652]) torch.Size([15924, 652])


## Train model

In [46]:
model = ETM(num_topics=NUM_TOPICS, embedding_matrix=embed_matrix, t_hidden_size=T_HIDDEN_SIZE)
optimizer = optim.Adam(model.parameters(), lr=0.002)

In [47]:
def train_for_epochs(model, optimizer, num_epochs, bow_tensor, normalized_bow_tensor,
              batch_size=BATCH_SIZE, model_save_path=MODEL_SAVE_PATH):
    best_loss = float('inf')
    for epoch in range(num_epochs):

        model.train()
        total_loss = 0
        num_batches = len(bow_tensor) // batch_size
        print(f'Training for epoch {epoch+1} with {num_batches} batches...')

        for i in range(num_batches):
            batch_bow = bow_tensor[i*batch_size:(i+1)*batch_size]
            batch_normalized_bow = normalized_bow_tensor[i*batch_size:(i+1)*batch_size]
            optimizer.zero_grad()

            # Forward pass
            reconstruction_loss, kl_theta = model(batch_bow, batch_normalized_bow)

            # Backward pass
            loss = reconstruction_loss + (kl_theta if kl_theta is not None else 0)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / num_batches
        print(f'Epoch {epoch+1} complete.')
        print(f'\tAverage loss: {avg_loss}')

        # Check if the current epoch loss is the best
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save(model.state_dict(), model_save_path)


train_for_epochs(
    model, optimizer, num_epochs=50,
    bow_tensor=bow_tensor, normalized_bow_tensor=normalized_bow_tensor
)

Training for epoch 1 with 248 batches...
Epoch 1 complete.
	Average loss: 109.40112630782589
Training for epoch 2 with 248 batches...
Epoch 2 complete.
	Average loss: 105.86757279980567
Training for epoch 3 with 248 batches...
Epoch 3 complete.
	Average loss: 104.9123694204515
Training for epoch 4 with 248 batches...
Epoch 4 complete.
	Average loss: 104.31167884026804
Training for epoch 5 with 248 batches...
Epoch 5 complete.
	Average loss: 103.89229049990254
Training for epoch 6 with 248 batches...
Epoch 6 complete.
	Average loss: 103.60667559408373
Training for epoch 7 with 248 batches...
Epoch 7 complete.
	Average loss: 103.48274450917398
Training for epoch 8 with 248 batches...
Epoch 8 complete.
	Average loss: 103.3148743875565
Training for epoch 9 with 248 batches...
Epoch 9 complete.
	Average loss: 103.22208962901946
Training for epoch 10 with 248 batches...
Epoch 10 complete.
	Average loss: 103.12413995496688
Training for epoch 11 with 248 batches...
Epoch 11 complete.
	Average 

## Make inferences

In [48]:
best_model = ETM(num_topics=NUM_TOPICS, embedding_matrix=embed_matrix, t_hidden_size=T_HIDDEN_SIZE)
best_model.load_state_dict(torch.load(BEST_MODEL_PATH))
model.eval()

ETM(
  (t_dropout): Dropout(p=0.5, inplace=False)
  (theta_act): ReLU()
  (rho): Embedding(652, 300)
  (alphas): Linear(in_features=300, out_features=10, bias=False)
  (q_theta): Sequential(
    (0): Linear(in_features=652, out_features=800, bias=True)
    (1): ReLU()
    (2): Linear(in_features=800, out_features=800, bias=True)
    (3): ReLU()
  )
  (mu_q_theta): Linear(in_features=800, out_features=10, bias=True)
  (log_sigma_q_theta): Linear(in_features=800, out_features=10, bias=True)
)

In [49]:
doc_tensors = process_docs_for_inference(df, VOCAB_COL, embed_dict)
normalized_doc_tensors = F.normalize(doc_tensors, p=1, dim=1) # TO DO: check args
print(doc_tensors.shape, normalized_doc_tensors.shape)

torch.Size([15924, 652]) torch.Size([15924, 652])


In [50]:
# Get top words
get_top_words(model, list(embed_dict.keys()))

{0: ['need',
  'enough',
  'know',
  'right',
  'find',
  'case',
  'work',
  'see',
  'get',
  'think'],
 1: ['seems',
  'could',
  'find',
  'should',
  'think',
  'case',
  'see',
  'get',
  'will',
  'would'],
 2: ['case',
  'could',
  'work',
  'think',
  'using',
  'should',
  'use',
  'will',
  'would',
  'also'],
 3: ['know',
  'example',
  'use',
  'using',
  'think',
  'work',
  'case',
  'see',
  'get',
  'also'],
 4: ['need',
  'seems',
  'different',
  'could',
  'should',
  'think',
  'see',
  'would',
  'will',
  'also'],
 5: ['example',
  'seems',
  'case',
  'should',
  'think',
  'could',
  'get',
  'would',
  'will',
  'also'],
 6: ['most',
  'using',
  'example',
  'case',
  'could',
  'use',
  'should',
  'will',
  'also',
  'would'],
 7: ['case',
  'use',
  'actually',
  'find',
  'using',
  'think',
  'work',
  'see',
  'get',
  'also'],
 8: ['seems',
  'need',
  'using',
  'work',
  'could',
  'different',
  'use',
  'get',
  'think',
  'should'],
 9: ['made',
 

In [62]:
# Get top documents
top_docs = get_top_doc_indices(model, bow_tensor, normalized_bow_tensor)
get_top_docs(df, top_docs)


Topic: 0
1. Here’s another suggestion, just using the usual Riemann zeta function: our explicit formula for  is  . Now maybe we can work with a small set of values  , specifically chosen to nullify the effect of a large proportion of the zeros up to height, say,  or so. How? Well here is a first attempt: consider  (and of course we will want to show, say, that  is “large”, so that one of the intervals  has loads of primes). Applying the explicit formula, we find that  . Notice here that  is a fragment of the Riemann zeta function, and its closeness to  can be determined through the usual integration-by-parts analytic continuation  \{t\}tRe(s) > 0\zeta(\rho) = 01/(\rho-1)\rhoF(x) = x(1 + \cdots + 1/n) - \sum_\rho x^\rho/\rho(\rho-1) + EE\rho^2$ in the denominator is good news, assuming I haven’t made an error. Assuming this is all correct, the skeptic in me says that when the dust has settled we get no gain whatsoever.
1.     using       explicit        maybe   work         specificall