In [11]:
# Load libraries
import numpy as np
from LDA_AandB.test_data_generator import simulate_corpus
from LDA_AandB.lda_code import lda, group_docs
from LDA_AandB.lda_code_opt import lda_optimized

In the LDA framework, documents are assumed to be generated under the following stochastic process:

For each document $m$, sample topic distribution $ \theta_m \sim Dirichlet(\alpha)$

For each topic $k$, sample word distribution $ \phi_k \sim Dirichlet(\beta)$

For each word $n$ in each document,

1) Sample topic $z_n \sim Cat(\theta_m)$

2) Sample word $w_n \sim Cat(\phi_{z_n})$

To assess the correctness of our LDA algorithm, we simulate data under this stochastic process. We then fit our algorithm to the simulated data and compare the parameter estimates to the true parameters.

We simulate a corpus of 10 documents containing 100 unique "words". Documents in the corpus are composed of 2 different topics and contain between 150 and 200 words.

In [2]:
# Fix NumPy's global random state so the simulated corpus is reproducible
SEED = 101
np.random.seed(SEED)

In [3]:
# Corpus dimensions used by the simulation:
#   V      - vocabulary size (number of unique "words")
#   K      - number of topics
#   M      - number of documents
#   N_min, N_max - bounds on the number of words per document
V, K, M = 100, 2, 10
N_min, N_max = 150, 200

In [4]:
# Draw the "true" Dirichlet hyperparameters as random integers.
# alpha is drawn before beta so the seeded random stream is consumed in the
# same order on every run.
alpha_true = np.random.randint(low=1, high=15, size=K)  # K values in [1, 14]
beta_true = np.random.randint(low=1, high=10, size=V)  # V values in [1, 9]

In [5]:
# Show the sampled hyperparameters, one labelled line each
for label, values in (("alpha:", alpha_true), ("beta:", beta_true)):
    print(label, values)

alpha: [12  2]
beta: [7 8 9 5 9 6 1 6 9 2 4 9 4 4 3 9 4 8 1 8 9 5 4 4 8 5 9 8 7 5 3 8 8 8 1 5 2
 9 4 2 9 5 4 3 4 4 8 5 9 7 4 8 7 6 7 3 3 2 4 4 4 5 6 6 9 4 7 9 8 6 8 6 5 5
 8 4 3 4 3 4 6 7 7 6 6 2 8 5 8 5 6 2 7 3 7 4 9 2 8 5]


In [6]:
# Generate simulated dataset
# simulate_corpus returns three values: the corpus `bow` and the theta / phi
# it was generated from, kept here as ground truth for later comparisons.
# presumably `bow` is a bag-of-words representation — verify against simulate_corpus
bow, theta_true, phi_true = simulate_corpus(alpha_true, beta_true, M, N_min, N_max)

In [7]:
# Train data on LDA implementation, passing the true hyperparameters
# (lda_optimized is imported in the notebook's first cell).
# The final argument (1000) is presumably the number of sampling iterations —
# TODO confirm against lda_optimized.
theta, phi = lda_optimized(bow, K, alpha_true, beta_true, 1000)

NameError: name 'lda_optimized' is not defined

In [8]:
# Display theta as returned by the lda_optimized call in the previous cell
theta

NameError: name 'theta' is not defined

In [9]:
# Display the ground-truth theta returned by simulate_corpus (one row per document)
theta_true

array([[0.78265292, 0.21734708],
       [0.94563303, 0.05436697],
       [0.79502595, 0.20497405],
       [0.95955196, 0.04044804],
       [0.9598253 , 0.0401747 ],
       [0.80936626, 0.19063374],
       [0.93771192, 0.06228808],
       [0.85612763, 0.14387237],
       [0.89547549, 0.10452451],
       [0.92376433, 0.07623567]])

In [10]:
# Print the documents assigned to each of the K groups according to the
# estimated theta (group_docs is a project helper; its grouping rule is not
# visible in this notebook).
group_docs(theta, K)

NameError: name 'theta' is not defined

In [132]:
# Same grouping, applied to the ground-truth theta for comparison
group_docs(theta_true, K)

Documents labeled in group 1 : [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]
Documents labeled in group 2 : []


In [133]:
# Mean squared error between estimated and true theta
theta_error = theta - theta_true
np.mean(np.square(theta_error))

0.16912950840187382

In [134]:
# Mean absolute error between estimated and true theta
theta_error = theta - theta_true
np.mean(np.abs(theta_error))

0.3716375518415733

In [135]:
# Index of the largest entry in each row of the estimated theta
theta.argmax(axis=1)

array([0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1])

In [136]:
# Index of the largest entry in each row of the true theta
theta_true.argmax(axis=1)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [137]:
# Fraction of rows where the estimated and true theta peak at the same column.
# NOTE(review): this assumes estimated topic k corresponds to true topic k; LDA
# topic labels are generally only identified up to a permutation — confirm the
# columns are aligned before reading this as accuracy.
estimated_top = theta.argmax(axis=1)
true_top = theta_true.argmax(axis=1)
np.mean(estimated_top == true_top)

0.46

The accuracy of our LDA implementation depends on the choice of the hyperparameters $\alpha$ and $\beta$. The closer these hyperparameters are to the true values used to generate the dataset, the better the algorithm's estimates of the topic and word distributions.

When the hyperparameters $\alpha$ and $\beta$ are chosen to be the true values, our LDA algorithm estimates the true topic and word distributions very well:

In [None]:
# Train data on LDA implementation
# NOTE(review): the preceding text says this run uses the true hyperparameters,
# but the third and fourth arguments are the constant 1 rather than
# alpha_true / beta_true — confirm which is intended.
# The last argument (10000) is presumably the iteration count — TODO confirm.
theta, phi = lda(bow, K, 1, 1, 10000)

In [None]:
# Display theta as returned by the lda call in the previous cell
theta

In [None]:
# Display the ground-truth theta from simulate_corpus for side-by-side comparison
theta_true

However, in real-world scenarios we don't know what the true values of $\alpha$ and $\beta$ are. In the case where the chosen hyperparameters are not the true values from the data, our LDA algorithm's estimates are less accurate.

In [12]:
# Load libraries
from LDA_AandB.lda_code_opt import lda_optimized

In [13]:
# Fit LDA without supplying hyperparameters, so lda_optimized uses whatever
# defaults it defines. The topic count comes from the corpus setting K instead
# of a repeated literal, so it cannot drift from the simulation.
theta, phi = lda_optimized(bow, K)

In [14]:
# Display theta from the default-hyperparameter fit (one row per document)
theta

array([[0.88383838, 0.11616162],
       [0.91414141, 0.08585859],
       [0.98484848, 0.01515152],
       [0.99494949, 0.00505051],
       [0.96969697, 0.03030303],
       [0.88383838, 0.11616162],
       [0.98989899, 0.01010101],
       [0.92929293, 0.07070707],
       [0.97979798, 0.02020202],
       [0.99494949, 0.00505051]])

In [15]:
# Display phi from the default-hyperparameter fit
# presumably one row per topic and one column per vocabulary word — verify
# against lda_optimized
phi

array([[0.01197263, 0.02280502, 0.01767389, 0.01254276, 0.01482326,
        0.01083238, 0.00114025, 0.01311288, 0.01824401, 0.00570125,
        0.00285063, 0.01311288, 0.00741163, 0.00171038, 0.0022805 ,
        0.02565564, 0.00342075, 0.01539339, 0.00057013, 0.01311288,
        0.02166477, 0.00627138, 0.01140251, 0.00798176, 0.01026226,
        0.01083238, 0.01140251, 0.01425314, 0.01368301, 0.01083238,
        0.00798176, 0.00741163, 0.00855188, 0.01425314, 0.004561  ,
        0.00855188, 0.00114025, 0.01482326, 0.00798176, 0.0022805 ,
        0.02109464, 0.00912201, 0.00627138, 0.004561  , 0.004561  ,
        0.00741163, 0.00855188, 0.00684151, 0.01539339, 0.01368301,
        0.01311288, 0.01824401, 0.00798176, 0.00627138, 0.01824401,
        0.00342075, 0.01197263, 0.00342075, 0.00627138, 0.00570125,
        0.004561  , 0.00285063, 0.00513113, 0.01425314, 0.01596351,
        0.00285063, 0.01368301, 0.02109464, 0.00969213, 0.01083238,
        0.00855188, 0.01026226, 0.00855188, 0.01

In [16]:
# Mean squared error between estimated and true theta for this fit
theta_error = theta - theta_true
np.mean(np.square(theta_error))

0.007441361120289558

In [20]:
# Share of documents whose estimated and true theta peak at the same column.
# NOTE(review): assumes estimated and true topic columns are in the same order;
# LDA topic labels are generally only identified up to a permutation — confirm.
dominant_est = np.argmax(theta, axis=1)
dominant_true = np.argmax(theta_true, axis=1)
np.mean(dominant_est == dominant_true)  # Complete agreement

1.0

In [17]:
# Repeat the default-hyperparameter fit; the random state has advanced since
# the previous fit, so the estimates can differ. The topic count is taken from
# K rather than a hard-coded literal.
theta, phi = lda_optimized(bow, K)

In [18]:
# Display theta from the second default-hyperparameter fit
theta

array([[0.92929293, 0.07070707],
       [0.79292929, 0.20707071],
       [0.91414141, 0.08585859],
       [0.99494949, 0.00505051],
       [0.97979798, 0.02020202],
       [0.86363636, 0.13636364],
       [0.97474747, 0.02525253],
       [0.83838384, 0.16161616],
       [0.97474747, 0.02525253],
       [0.98484848, 0.01515152]])

In [19]:
# Third default-hyperparameter fit, again using the shared topic count K
# instead of a hard-coded literal, followed by the resulting theta.
theta, phi = lda_optimized(bow, K)

theta

array([[0.96969697, 0.03030303],
       [0.94444444, 0.05555556],
       [0.85858586, 0.14141414],
       [0.98989899, 0.01010101],
       [0.96464646, 0.03535354],
       [0.86363636, 0.13636364],
       [0.93939394, 0.06060606],
       [0.84343434, 0.15656566],
       [0.99494949, 0.00505051],
       [0.98989899, 0.01010101]])