In [1]:
# Load libraries
import numpy as np
from numba import jit
import numba


In [3]:
# from lda_code import initialize, gibbs
# from test_data_generator import generate_matrices

In [4]:
from lda_optimized import Fastest_Gibbs, Full_Gibbs

In [13]:
# Load libraries
import numpy as np
from test_data_generator import simulate_corpus
from lda_code import lda, group_docs, gibbs

In [6]:
# Set seed
np.random.seed(10)

In [7]:
# Set corpus parameters
V = 10
N_min = 10
N_max = 20
K = 2
M = 10

In [8]:
# Set true parameters
alpha_true = np.random.randint(1, 15, K)
beta_true = np.random.randint(1, 10, V)

In [9]:
# Generate simulated dataset
bow, theta_true, phi_true = simulate_corpus(alpha_true, beta_true, M, N_min, N_max)

In [10]:
from lda_optimized import Fastest_Gibbs
from lda_code import topic_dist, word_dist

In [11]:
from lda_code_opt import lda_optimized

In [12]:
lda_optimized(bow, K)

(array([[0.94444444, 0.05555556],
        [0.88888889, 0.11111111],
        [0.77777778, 0.22222222],
        [0.88888889, 0.11111111],
        [0.83333333, 0.16666667],
        [0.77777778, 0.22222222],
        [0.72222222, 0.27777778],
        [0.94444444, 0.05555556],
        [0.83333333, 0.16666667],
        [0.88888889, 0.11111111]]),
 array([[0.12403101, 0.0620155 , 0.04651163, 0.00775194, 0.02325581,
         0.20930233, 0.01550388, 0.23255814, 0.24031008, 0.03875969],
        [0.11111111, 0.03703704, 0.03703704, 0.03703704, 0.11111111,
         0.18518519, 0.03703704, 0.11111111, 0.11111111, 0.22222222]]))

In [14]:
lda(bow, K)

(array([[0.46666667, 0.57142857],
        [0.66666667, 0.28571429],
        [1.13333333, 0.07142857],
        [0.4       , 0.42857143],
        [0.86666667, 0.28571429],
        [0.8       , 0.07142857],
        [0.26666667, 0.78571429],
        [0.86666667, 0.35714286],
        [0.66666667, 0.5       ],
        [0.66666667, 0.5       ]]),
 array([[0.11764706, 0.00980392, 0.02941176, 0.00980392, 0.01960784,
         0.30392157, 0.01960784, 0.17647059, 0.25490196, 0.05882353],
        [0.12962963, 0.14814815, 0.07407407, 0.01851852, 0.07407407,
         0.01851852, 0.01851852, 0.27777778, 0.14814815, 0.09259259]]))

In [36]:
def initialize(W, K, M, V, doc_lens):
    """Initializes values for collapsed gibbs sampler"""
    
    # Set initial z randomly
    max_len = np.max(doc_lens)
    Z = np.zeros((M, max_len), dtype = 'int')
    for m in range(M):
        for n in range(doc_lens[m]):
            Z[m,n] = get_multinom(np.ones(K)/K)
    
    # Create count matrices
    N_1 = np.zeros((M, K))
    for m in range(M):
        for k in range(K):
            N_1[m, k] = np.sum(np.array(Z[m,:]) == k)
            
    N_2 = np.zeros((K, V))
    for m in range(M):
        for n in range(doc_lens[m]):
            WordTopic = Z[m,n]
            TopicWord = W[m,n]
            N_2[WordTopic, TopicWord] += 1
            
    N_3 = np.zeros(K)
    for m in range(M):
        for n in range(doc_lens[m]):
            WordTopic = Z[m,n]
            N_3[Z[m,n]] += 1
            
    return((Z, N_1, N_2, N_3))


In [41]:
def lda_optimized(bow, K, alpha = 1, beta = 1, n_iter = 1000):
    """LDA implementation using collapsed Gibbs sampler"""
    
    # Get corpus parameters
    M, V = bow.shape
    doc_lens = np.sum(bow, axis = 1, dtype = 'int')
    
    # Create word dictionary
    max_len = np.max(doc_lens)
    W = np.zeros((M, max_len), dtype='int')
    
    for m in range(M):
        d = 0
        for v in range(V):
            for n in range(int(bow[m, v])):
                W[m,d] = v
                d+=1
    
    # Initialize values for Gibbs sampler   
    Z, N_1, N_2, N_3 = initialize(W, K, M, V, doc_lens)
    
    
    # Set symmetric hyperparameters
    alpha = np.ones(K) * alpha
    beta  = np.ones(V) * beta
        
    
    # Run Gibbs sampler
    A, B, C= Full_Gibbs(W, K, M, V, doc_lens, alpha, beta, N_1, N_2, N_3, Z, n_iter)
    
    
    # Estimate topic and word distributions
    theta = topic_dist(N_1, doc_lens, alpha, M, K)
    phi   = word_dist(N_2, beta, V, K)
    return Z, W 


In [42]:
Z, W = lda(bow, K)

In [44]:
W

array([[0, 1, 2, 5, 5, 5, 7, 7, 7, 7, 8, 8, 8, 0, 0, 0],
       [0, 0, 0, 1, 5, 5, 5, 8, 8, 8, 8, 9, 0, 0, 0, 0],
       [0, 0, 5, 5, 5, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8],
       [1, 5, 7, 7, 8, 8, 8, 8, 8, 9, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 2, 2, 5, 5, 5, 5, 7, 7, 7, 7, 8, 8, 0],
       [0, 0, 5, 5, 5, 5, 5, 7, 7, 9, 9, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 4, 7, 7, 7, 7, 8, 8, 8, 9, 0, 0, 0],
       [1, 2, 2, 5, 5, 5, 5, 5, 5, 7, 7, 7, 7, 8, 9, 9],
       [0, 0, 1, 4, 4, 5, 5, 6, 7, 8, 8, 8, 8, 8, 9, 0],
       [0, 0, 4, 5, 5, 5, 7, 7, 7, 7, 8, 8, 8, 8, 9, 0]])

In [17]:
Z, N_1, N_2, N_3 = initialize(W, K, M, V, doc_lens)

TypingError: Failed at nopython (nopython frontend)
Invalid usage of Function(<built-in function array>) with parameters (array(float64, 1d, C))
 * parameterized
In definition 0:
    TypingError: array(float64, 1d, C) not allowed in a homogenous sequence
    raised from /opt/conda/lib/python3.6/site-packages/numba/typing/npydecl.py:434
[1] During: resolving callee type: Function(<built-in function array>)
[2] During: typing of call at <ipython-input-11-891193b58666> (16)

In [16]:
#     M, V = bow.shape
#     doc_lens = np.sum(bow, axis = 1, dtype = 'int')
    
#     # Create word dictionary
#     max_len = np.max(doc_lens)
#     W = np.zeros((M, max_len), dtype='int')
    
#     for m in range(M):
#         d = 0
#         for v in range(V):
#             for n in range(int(bow[m, v])):
#                 W[m,d] = v
#                 d+=1

In [19]:
@jit(nopython=True, cache = False)
def test_fun(a):
    return a

In [20]:
def test_fun2(a):
    return(test_fun(a))