In [1]:
# Load libraries
from itertools import permutations

import numpy as np

from LDA_AandB.test_data_generator import simulate_corpus
from LDA_AandB.lda_code import lda, group_docs
from LDA_AandB.lda_code_opt import lda_optimized

In the LDA framework, documents are assumed to be generated under the following stochastic process:

For each document $m$, sample a topic distribution $\theta_m \sim \mathrm{Dirichlet}(\alpha)$

For each topic $k$, sample a word distribution $\phi_k \sim \mathrm{Dirichlet}(\beta)$

For each word $n$ in each document,

1) Sample a topic $z_n \sim \mathrm{Cat}(\theta_m)$

2) Sample a word $w_n \sim \mathrm{Cat}(\phi_{z_n})$

To assess the correctness of our LDA algorithm, we simulate data under this stochastic process. We then fit our algorithm to the simulated data and compare its parameter estimates to the true parameters.

We simulate a corpus of 10 documents containing 100 unique "words". Documents in the corpus are composed of 2 different topics and contain between 150 and 200 words.

In [2]:
# Seed NumPy's global RNG so every random draw below is repeatable
np.random.seed(seed=101)

In [3]:
# Dimensions of the simulated corpus
V = 100                    # vocabulary size (number of unique words)
K = 2                      # number of topics
M = 10                     # number of documents
N_min, N_max = 150, 200    # lower / upper bound on words per document

In [4]:
# Ground-truth Dirichlet hyperparameters, drawn as random integers:
# one alpha entry per topic (values 1-14) and one beta entry per vocabulary word (values 1-9)
alpha_true = np.random.randint(low=1, high=15, size=K)
beta_true = np.random.randint(low=1, high=10, size=V)

In [5]:
# Show the sampled hyperparameters, one labelled line each
for label, values in (("alpha:", alpha_true), ("beta:", beta_true)):
    print(label, values)

alpha: [12  2]
beta: [7 8 9 5 9 6 1 6 9 2 4 9 4 4 3 9 4 8 1 8 9 5 4 4 8 5 9 8 7 5 3 8 8 8 1 5 2
 9 4 2 9 5 4 3 4 4 8 5 9 7 4 8 7 6 7 3 3 2 4 4 4 5 6 6 9 4 7 9 8 6 8 6 5 5
 8 4 3 4 3 4 6 7 7 6 6 2 8 5 8 5 6 2 7 3 7 4 9 2 8 5]


In [6]:
# Simulate the corpus from the true hyperparameters and corpus dimensions.
# The call returns three values, unpacked here as the corpus (`bow`) and the
# ground-truth `theta_true` / `phi_true` used for comparison below.
bow, theta_true, phi_true = simulate_corpus(
    alpha_true,
    beta_true,
    M,
    N_min,
    N_max,
)

In [9]:
# Fit the optimized LDA implementation to the simulated corpus.
# Pass K instead of a literal 2 so the fit stays tied to the value used when
# simulating the data (the second argument is presumably the number of topics,
# as in the lda(bow, K, ...) call further down — confirm against lda_optimized).
theta, phi = lda_optimized(bow, K)

In [10]:
# Estimated document-topic proportions from the fit above
# (one row per document, one column per topic)
theta

array([[0.51685393, 0.48314607],
       [0.48108108, 0.51891892],
       [0.56060606, 0.43939394],
       [0.09210526, 0.90789474],
       [0.74712644, 0.25287356],
       [0.62983425, 0.37016575],
       [0.53005464, 0.46994536],
       [0.47126437, 0.52873563],
       [0.64044944, 0.35955056],
       [0.48387097, 0.51612903]])

In [11]:
# Ground-truth document-topic proportions returned by simulate_corpus, for comparison
theta_true

array([[0.78265292, 0.21734708],
       [0.94563303, 0.05436697],
       [0.79502595, 0.20497405],
       [0.95955196, 0.04044804],
       [0.9598253 , 0.0401747 ],
       [0.80936626, 0.19063374],
       [0.93771192, 0.06228808],
       [0.85612763, 0.14387237],
       [0.89547549, 0.10452451],
       [0.92376433, 0.07623567]])

In [None]:
# Group the documents using the estimated proportions and K
# (presumably by dominant topic — confirm against group_docs)
group_docs(theta, K)

In [None]:
# Same grouping applied to the ground-truth proportions, for comparison
# (presumably by dominant topic — confirm against group_docs)
group_docs(theta_true, K)

In [None]:
# Mean squared error between estimated and true document-topic proportions.
# LDA topic labels are arbitrary (column 0 of `theta` need not correspond to
# column 0 of `theta_true`), so score every column ordering and report the
# best match. This enumerates all orderings, which is fine for a small topic count.
min(
    np.mean((theta[:, list(order)] - theta_true) ** 2)
    for order in permutations(range(theta.shape[1]))
)

In [None]:
# Mean absolute error between estimated and true document-topic proportions.
# LDA topic labels are arbitrary, so a direct column-by-column comparison can
# be dominated by label switching; score every column ordering of `theta` and
# report the best match (fine for a small topic count).
min(
    np.mean(np.abs(theta[:, list(order)] - theta_true))
    for order in permutations(range(theta.shape[1]))
)

In [None]:
# Index of the highest-weight topic for each document (estimated proportions)
theta.argmax(axis=1)

In [None]:
# Index of the highest-weight topic for each document (ground-truth proportions)
theta_true.argmax(axis=1)

In [None]:
# Fraction of documents whose dominant topic matches the ground truth.
# Topic indices from LDA are arbitrary, so comparing raw argmax values can
# report disagreement that is only a relabelling; take the best agreement
# over all column orderings of `theta` (fine for a small topic count).
true_labels = np.argmax(theta_true, axis=1)
max(
    np.mean(np.argmax(theta[:, list(order)], axis=1) == true_labels)
    for order in permutations(range(theta.shape[1]))
)

The accuracy of our LDA depends on the choice of the hyperparameters $\alpha$ and $\beta$. The closer these hyperparameters are to the true values of the dataset, the better the algorithm's estimates of the topic and word distributions. 

When the hyperparameters $\alpha$ and $\beta$ are chosen to be the true values, our LDA algorithm estimates the true topic and word distributions very well:

In [None]:
# Fit the `lda` routine from lda_code to the simulated corpus with K topics.
# NOTE(review): the surrounding text says alpha and beta are set to their true
# values, but the literals 1, 1 are passed here rather than alpha_true / beta_true —
# presumably these are the alpha and beta arguments; confirm against lda's signature.
# NOTE(review): 10000 is presumably the number of iterations — TODO confirm.
theta, phi = lda(bow, K, 1, 1, 10000)

In [None]:
# Document-topic estimates from the lda(...) fit in the previous cell
theta

In [None]:
# Ground-truth document-topic proportions again, for side-by-side comparison
theta_true

In [None]:
# Mean squared error of the lda(...) estimates against the true proportions.
# Topic labels are arbitrary, so score every column ordering of `theta` and
# report the best match rather than comparing columns position by position
# (enumerating orderings is fine for a small topic count).
min(
    np.mean((theta[:, list(order)] - theta_true) ** 2)
    for order in permutations(range(theta.shape[1]))
)

However, in real-world scenarios we don't know what the true values of $\alpha$ and $\beta$ are. In the case where the chosen hyperparameters are not the true values from the data, our LDA algorithm's estimates are less accurate.

In [None]:
# Redundant re-import: lda_optimized is already imported in the first cell of
# this notebook, so this cell only matters when run out of order.
from LDA_AandB.lda_code_opt import lda_optimized

In [None]:
# Re-fit with the optimized implementation; note this overwrites the `theta` / `phi`
# produced by the earlier lda(...) fit.
# Pass K instead of a literal 2 so the fit stays tied to the value used when
# simulating the data (the second argument is presumably the number of topics —
# confirm against lda_optimized).
theta, phi = lda_optimized(bow, K)

In [None]:
# Document-topic estimates from the lda_optimized fit in the previous cell
theta