In [53]:
# Load libraries
import numpy as np
from LDA_AandB.test_data_generator import simulate_corpus
from LDA_AandB.lda_code import lda, group_docs

In the LDA framework, documents are assumed to be generated under the following stochastic process:

For each document $m$, sample topic distribution $ \theta_m \sim Dirichlet(\alpha)$

For each topic $k$, sample word distribution $ \phi_k \sim Dirichlet(\beta)$

For each word $n$ in each document,

1) Sample topic $z_n \sim Cat(\theta_m)$

2) Sample word $w_n \sim Cat(\phi_{z_n})$

To assess the correctness of our LDA algorithm, we simulate data under this stochastic process. We then train this data on our algorithm and compare the parameter estimates to the true parameters.

We simulate a corpus of 10 documents containing 100 unique "words". Documents in the corpus are composed of 2 different topics and contain between 150 and 200 words.

In [100]:
# Set seed
np.random.seed(101)

In [101]:
# Set corpus parameters
V = 100
N_min = 150
N_max = 200
K = 3
M = 200

In [102]:
# Set true parameters
alpha_true = np.random.randint(1, 10, K)
beta_true = np.random.randint(1, 10, V)

In [103]:
print("alpha:", alpha_true)
print("beta:", beta_true)

alpha: [2 7 8]
beta: [9 5 9 6 1 6 9 2 4 9 4 4 3 9 4 8 1 8 9 5 4 4 8 5 9 8 7 5 3 8 8 8 1 5 2 9 4
 2 9 5 4 3 4 4 8 5 9 7 4 8 7 6 7 3 3 2 4 4 4 5 6 6 9 4 7 9 8 6 8 6 5 5 8 4
 3 4 3 4 6 7 7 6 6 2 8 5 8 5 6 2 7 3 7 4 9 2 8 5 2 9]


In [104]:
# Generate simulated dataset
bow, theta_true, phi_true = simulate_corpus(alpha_true, beta_true, M, N_min, N_max)

In [105]:
# Train data on LDA implementation
theta, phi = lda(bow, K, 1, 1, 100)

In [106]:
theta

array([[0.14772727, 0.53409091, 0.31818182],
       [0.32919255, 0.25465839, 0.41614907],
       [0.59585492, 0.11398964, 0.29015544],
       [0.08383234, 0.50299401, 0.41317365],
       [0.19375   , 0.15      , 0.65625   ],
       [0.54594595, 0.1027027 , 0.35135135],
       [0.43243243, 0.21081081, 0.35675676],
       [0.675     , 0.21      , 0.115     ],
       [0.37172775, 0.14136126, 0.48691099],
       [0.01183432, 0.50887574, 0.47928994],
       [0.77987421, 0.11949686, 0.10062893],
       [0.05847953, 0.77192982, 0.16959064],
       [0.48717949, 0.32051282, 0.19230769],
       [0.3625731 , 0.26315789, 0.37426901],
       [0.16410256, 0.75897436, 0.07692308],
       [0.2606383 , 0.2606383 , 0.4787234 ],
       [0.3989071 , 0.21311475, 0.38797814],
       [0.14917127, 0.18232044, 0.66850829],
       [0.4691358 , 0.16049383, 0.37037037],
       [0.02538071, 0.65989848, 0.31472081],
       [0.53333333, 0.0969697 , 0.36969697],
       [0.34444444, 0.34444444, 0.31111111],
       [0.

In [107]:
theta_true

array([[0.3524457 , 0.28015502, 0.36739928],
       [0.15586289, 0.54676451, 0.2973726 ],
       [0.11131409, 0.4063943 , 0.4822916 ],
       [0.07528973, 0.54600846, 0.3787018 ],
       [0.19372131, 0.37756367, 0.42871502],
       [0.08346642, 0.56031089, 0.35622269],
       [0.05451921, 0.23500161, 0.71047918],
       [0.07800252, 0.4689704 , 0.45302708],
       [0.10155864, 0.39310845, 0.50533291],
       [0.04577643, 0.66094479, 0.29327878],
       [0.00900421, 0.51716167, 0.47383412],
       [0.13498435, 0.27926605, 0.5857496 ],
       [0.25588522, 0.4350384 , 0.30907637],
       [0.05551627, 0.43746505, 0.50701868],
       [0.10071926, 0.15802962, 0.74125112],
       [0.18732816, 0.3238325 , 0.48883934],
       [0.06980619, 0.45818851, 0.4720053 ],
       [0.06463122, 0.80823071, 0.12713808],
       [0.13924761, 0.3128047 , 0.54794768],
       [0.15960374, 0.31710924, 0.52328702],
       [0.07810016, 0.44232687, 0.47957297],
       [0.18609133, 0.49111519, 0.32279348],
       [0.

In [108]:
group_docs(theta, K)

Documents labeled in group 1 : [  2   5   6   7  10  12  16  18  20  21  22  24  28  29  32  34  35  38
  39  40  42  44  45  46  47  49  50  52  53  55  56  57  58  62  63  64
  67  70  74  78  82  83  84  92  95  97  98 101 103 110 118 121 123 124
 129 140 142 144 146 151 152 153 156 158 159 160 164 170 171 173 176 183
 184 187 188 189 190 194 195]
Documents labeled in group 2 : [  0   3   9  11  14  19  26  27  31  33  37  51  60  61  66  69  77  79
  80  81  88  89  93  99 100 105 106 109 112 113 114 115 116 117 120 128
 136 137 141 143 145 147 148 149 150 154 155 161 166 169 172 177 178 180
 181 185 186 191 193 196]
Documents labeled in group 3 : [  1   4   8  13  15  17  23  25  30  36  41  43  48  54  59  65  68  71
  72  73  75  76  85  86  87  90  91  94  96 102 104 107 108 111 119 122
 125 126 127 130 131 132 133 134 135 138 139 157 162 163 165 167 168 174
 175 179 182 192 197 198 199]


In [109]:
group_docs(theta_true, K)

Documents labeled in group 1 : [157]
Documents labeled in group 2 : [  1   3   5   7   9  10  12  17  21  22  28  29  30  31  34  38  39  43
  44  45  49  52  58  59  65  68  70  74  75  81  82  85  86  87  90  93
  94  95  97  98 102 104 108 109 110 111 118 119 124 125 126 128 130 131
 132 134 137 143 149 151 153 154 156 160 166 170 171 175 179 180 188 190
 191 193 194 196]
Documents labeled in group 3 : [  0   2   4   6   8  11  13  14  15  16  18  19  20  23  24  25  26  27
  32  33  35  36  37  40  41  42  46  47  48  50  51  53  54  55  56  57
  60  61  62  63  64  66  67  69  71  72  73  76  77  78  79  80  83  84
  88  89  91  92  96  99 100 101 103 105 106 107 112 113 114 115 116 117
 120 121 122 123 127 129 133 135 136 138 139 140 141 142 144 145 146 147
 148 150 152 155 158 159 161 162 163 164 165 167 168 169 172 173 174 176
 177 178 181 182 183 184 185 186 187 189 192 195 197 198 199]


In [110]:
np.mean((theta - theta_true)**2)

0.08832143365391604

In [111]:
np.mean(abs(theta - theta_true))

0.24425016196645377

In [112]:
np.argmax(theta, axis = 1)

array([1, 2, 0, 1, 2, 0, 0, 0, 2, 1, 0, 1, 0, 2, 1, 2, 0, 2, 0, 1, 0, 0,
       0, 2, 0, 2, 1, 1, 0, 0, 2, 1, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 0, 2,
       0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 2, 1, 1, 0, 0, 0, 2,
       1, 0, 2, 1, 0, 2, 2, 2, 0, 2, 2, 1, 0, 1, 1, 1, 0, 0, 0, 2, 2, 2,
       1, 1, 2, 2, 0, 1, 2, 0, 2, 0, 0, 1, 1, 0, 2, 0, 2, 1, 1, 2, 2, 1,
       0, 2, 1, 1, 1, 1, 1, 1, 0, 2, 1, 0, 2, 0, 0, 2, 2, 2, 1, 0, 2, 2,
       2, 2, 2, 2, 1, 1, 2, 2, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 2, 0, 0, 0, 1, 2, 2, 0, 2, 1, 2, 2, 1, 0, 0, 1, 0, 2, 2,
       0, 1, 1, 2, 1, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, 1, 2, 1, 0, 0, 1, 2,
       2, 2])

In [113]:
np.argmax(theta_true, axis = 1)

array([2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1,
       1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1,
       1, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1,
       2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1,
       2, 2, 1, 2, 2, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1,
       1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1, 2, 1, 1,
       1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1,
       1, 2, 1, 0, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1,
       2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2,
       2, 2])

In [89]:
np.mean(np.argmax(theta, axis = 1) == np.argmax(theta_true, axis = 1))

0.5

The accuracy of our LDA depends on the choice of the hyperparameters $\alpha$ and $\beta$. The closer these hyperparameters are to the true values of the dataset, the better the algorithm's estimates of the topic and word distributions. 

When the hyperparameters $\alpha$ and $\beta$ are chosen to be the true values, our LDA algorithm estimates the true topic and word distributions very well:

In [None]:
# Train data on LDA implementation
theta, phi = lda(bow, K, 1, 1, 10000)

In [None]:
theta

In [None]:
theta_true

In [None]:
np.mean((theta - theta_true)**2)

However, in real-world scenarios we don't know what the true values of $\alpha$ and $\beta$ are. In the case where the chosen hyperparameters are not the true values from the data, our LDA algorithm's estimates are less accurate.