In [21]:
from LDA_AandB.lda_code import initialize, gibbs
from LDA_AandB.test_data_generator import generate_matrices

In [23]:
from LDA_AandB.lda_optimized import Fastest_Gibbs, Full_Gibbs

In [15]:
# Load libraries
import numpy as np
from LDA_AandB.test_data_generator import simulate_corpus
from LDA_AandB.lda_code import lda, group_docs

In [16]:
# Set seed
np.random.seed(10)

In [17]:
# Set corpus parameters
V = 10
N_min = 10
N_max = 20
K = 2
M = 10

In [18]:
# Set true parameters
alpha_true = np.random.randint(1, 15, K)
beta_true = np.random.randint(1, 10, V)

In [19]:
# Generate simulated dataset
bow, theta_true, phi_true = simulate_corpus(alpha_true, beta_true, M, N_min, N_max)

In [20]:
# Train data on LDA implementation
theta, phi = lda(bow, K)

In [22]:
## Create the initial matrices and prior values
alpha = np.ones(K)
beta  = np.ones(V) 

# Get corpus parameters
M, V = bow.shape
doc_lens = np.sum(bow, axis = 1, dtype = 'int')

# Create word dictionary
w = {}
for m in range(M):
    w[m] = []
    for v in range(V):
        for n in range(int(bow[m, v])):
            w[m].append(v)

# Initialize values for Gibbs sampler   
z, N_1, N_2, N_3 = initialize(w, K, M, V, doc_lens)
W, Z = generate_matrices(w,z)

In [11]:
%timeit -r10 gibbs(w, K, M, V, doc_lens, alpha, beta, N_1, N_2, N_3, z, 10)

36.1 ms ± 1.75 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)


In [13]:
%timeit -r10 Fastest_Gibbs(W, K, M, V, doc_lens, alpha, beta, N_1, N_2, N_3, Z, 10)

29.1 ms ± 1.66 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [14]:
%timeit -r10 Full_Gibbs(W, K, M, V, doc_lens, alpha, beta, N_1, N_2, N_3, Z, 10)

28.9 ms ± 853 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)


In [2]:
36.1/28.9

1.2491349480968859

In [24]:
A,B,C = gibbs(w, K, M, V, doc_lens, alpha, beta, N_1, N_2, N_3, z, 10000)

ValueError: not enough values to unpack (expected 3, got 2)

In [26]:
A_fast, B_fast, C_fast = Full_Gibbs(W, K, M, V, doc_lens, alpha, beta, N_1, N_2, N_3, Z, 100)

In [25]:
# np.array_equal(A, A_fast), np.array_equal(B, B_fast), np.array_equal(C, C_fast)

NameError: name 'A' is not defined

In [28]:
theta = np.zeros((M, K))
for m in range(M):
    for k in range(K):
        theta[m , k] = (A_fast[m, k] + alpha[k])/(doc_lens[m] + sum(alpha))
np.round(theta, 2)

array([[ 1.07, -0.07],
       [ 1.36, -0.36],
       [ 1.22, -0.22],
       [ 1.  ,  0.  ],
       [-0.18,  1.18],
       [ 0.  ,  1.  ],
       [ 1.27, -0.27],
       [-0.11,  1.11],
       [ 1.18, -0.18],
       [ 1.  ,  0.  ]])