In [17]:
from LDA_AandB.lda_code import initialize, gibbs
from LDA_AandB.test_data_generator import generate_matrices

In [18]:
from LDA_AandB.lda_optimized import Fastest_Gibbs, Full_Gibbs

In [19]:
# Load libraries
import numpy as np
from LDA_AandB.test_data_generator import simulate_corpus
from LDA_AandB.lda_code import lda, group_docs

In [20]:
# Seed NumPy's legacy global RNG so the np.random.randint draws below repeat
# on a fresh Restart & Run All.
# NOTE(review): whether the LDA_AandB samplers also draw from this global
# stream is not visible here — confirm before relying on it for them.
np.random.seed(10)

In [21]:
# Corpus configuration for the simulated data set.
# The roles noted below are read off how each value is used later in this
# notebook — confirm the exact meaning against simulate_corpus / lda.
M, V = 10, 10          # bow.shape is unpacked into (M, V) further down
K = 2                  # passed to lda() and used as the length of alpha
N_min, N_max = 10, 20  # forwarded to simulate_corpus() as its last two arguments

In [22]:
# Draw the "true" hyper-parameters that the corpus is simulated from.
# randint's upper bound is exclusive, so alpha_true takes integer values in
# 1..14 (length K) and beta_true in 1..9 (length V).
# alpha_true is drawn before beta_true so the seeded stream is consumed in the
# same order as before.
alpha_true = np.random.randint(low=1, high=15, size=K)
beta_true = np.random.randint(low=1, high=10, size=V)

In [23]:
# Generate the simulated dataset from the true hyper-parameters.
# simulate_corpus hands back three objects: `bow`, a 2-D array whose shape is
# later unpacked as (M, V), plus `theta_true` and `phi_true`.
# NOTE(review): presumably bow holds per-document word counts and
# theta_true / phi_true are the distributions the corpus was drawn from —
# confirm in LDA_AandB.test_data_generator.
bow, theta_true, phi_true = simulate_corpus(alpha_true, beta_true, M, N_min, N_max)

In [24]:
# Fit the packaged LDA implementation to the simulated corpus, passing K as
# its second argument (presumably the number of topics — confirm in lda_code).
# NOTE(review): `theta` is assigned again by a later cell (the estimate built
# from A_fast), so this fitted value is not what that cell displays.
theta, phi = lda(bow, K)

In [25]:
## Corpus dimensions and prior values
# Read the dimensions from the data first, so every array built below matches
# `bow` even if its width ever disagrees with the configured V.
M, V = bow.shape
doc_lens = np.sum(bow, axis=1, dtype=int)  # integer row sums of bow, one per row

# Flat priors: all-ones vectors of length K and V
alpha = np.ones(K)
beta = np.ones(V)

# Expand each row of bow into an explicit token list:
# w[m] holds column index v repeated int(bow[m, v]) times, in increasing v.
w = {
    doc: [word for word in range(V) for _ in range(int(bow[doc, word]))]
    for doc in range(M)
}

# Initialize values for the Gibbs samplers.
# initialize() returns four objects, unpacked as z, N_1, N_2, N_3;
# generate_matrices() converts w and z into W and Z, which are what
# Fastest_Gibbs / Full_Gibbs are called with below.
# NOTE(review): presumably z holds the starting topic assignments and
# N_1..N_3 the matching count arrays — confirm in LDA_AandB.lda_code.
z, N_1, N_2, N_3 = initialize(w, K, M, V, doc_lens)
W, Z = generate_matrices(w,z)

In [26]:
# Baseline timing: the reference sampler, called with 10 as its last argument
# (presumably the number of sweeps — confirm); -r10 asks for 10 timing runs.
# NOTE(review): %timeit invokes this repeatedly on the same N_1/N_2/N_3/z
# objects. If gibbs updates them in place, every later cell starts from
# drifted state — confirm, and pass copies if so.
%timeit -r10 gibbs(w, K, M, V, doc_lens, alpha, beta, N_1, N_2, N_3, z, 10)

34 ms ± 1.61 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)


In [27]:
# Timing for Fastest_Gibbs on the matrix inputs W / Z, same trailing argument
# (10) and the same -r10 repeat setting as the baseline.
# NOTE(review): N_1/N_2/N_3 are the very objects already handed to gibbs, while
# Z is a separate object from z. If the samplers mutate their inputs, the
# counts and Z may no longer be consistent here — confirm.
%timeit -r10 Fastest_Gibbs(W, K, M, V, doc_lens, alpha, beta, N_1, N_2, N_3, Z, 10)

29.9 ms ± 469 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)


In [28]:
# Timing for Full_Gibbs with the same arguments as the Fastest_Gibbs timing.
# NOTE(review): shares N_1/N_2/N_3 and Z with the previous timing cells, so
# the three measurements do not start from identical state if the samplers
# mutate their inputs — confirm.
%timeit -r10 Full_Gibbs(W, K, M, V, doc_lens, alpha, beta, N_1, N_2, N_3, Z, 10)

29.6 ms ± 1.48 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)


In [13]:
# Ratio of two hand-typed timings (presumably ms, slow / fast).
# NOTE(review): 36.1 and 28.9 do not match any %timeit output recorded above
# (34, 29.9 and 29.6 ms) — presumably left over from an earlier run; recompute
# from the current measurements.
36.1/28.9

1.2491349480968859

In [29]:
# Run the reference sampler once and keep its three return values.
# NOTE(review): the recorded output of this cell is
# "ValueError: sum(pvals[:-1]) > 1.0", the message NumPy's multinomial sampler
# raises for an invalid probability vector. N_1/N_2/N_3 and z were already
# passed to the samplers by the timing cells, so they may no longer be in a
# valid state — confirm, and re-run initialize() before this call if so.
A,B,C = gibbs(w, K, M, V, doc_lens, alpha, beta, N_1, N_2, N_3, z, 10)

ValueError: sum(pvals[:-1]) > 1.0

In [16]:
# Run Full_Gibbs with 100 as its last argument; A_fast feeds the theta
# estimate computed further down.
# NOTE(review): the recorded output is the tail of a warning pointing at
# `p = p/(np.sum(p))`, which suggests a zero or invalid normaliser inside the
# sampler — confirm. This cell's execution count (16) also predates the import
# cells (17-19), so it does not reflect a top-to-bottom run.
A_fast, B_fast, C_fast = Full_Gibbs(W, K, M, V, doc_lens, alpha, beta, N_1, N_2, N_3, Z, 100)

  p = p/(np.sum(p))


In [None]:
# np.array_equal(A, A_fast), np.array_equal(B, B_fast), np.array_equal(C, C_fast)

In [None]:
# Smoothed, normalised estimate built from the Full_Gibbs output:
#   theta[m, k] = (A_fast[m, k] + alpha[k]) / (doc_lens[m] + sum(alpha))
# Written as one broadcast expression: doc_lens[:, None] is a column that
# divides every entry of row m, and sum(alpha) is evaluated once instead of
# once per (m, k) pair.
# The [:M, :K] slice keeps exactly the entries the element-wise loop read.
# NOTE: this rebinds `theta`, replacing the value returned by lda(bow, K).
theta = (A_fast[:M, :K] + alpha) / (doc_lens[:, None] + sum(alpha))
np.round(theta, 2)