In [10]:
# Load libraries
import numpy as np

In [17]:
def generate_dists(alpha, beta, M, K, V):
    """Draw the per-topic word distributions and per-document topic distributions.

    Parameters
    ----------
    alpha : sequence of float
        Dirichlet concentration parameters used for every row of ``theta``.
    beta : sequence of float
        Dirichlet concentration parameters used for every row of ``phi``.
    M : int
        Number of rows in ``theta`` (one topic distribution per document).
    K : int
        Number of rows in ``phi`` and number of columns in ``theta``.
    V : int
        Number of columns in ``phi``.

    Returns
    -------
    tuple
        ``(phi, theta)`` where ``phi`` has shape ``(K, V)`` and ``theta`` has
        shape ``(M, K)``; every row is an independent Dirichlet draw.

    Notes
    -----
    Draws come from NumPy's global random state (``np.random``); all rows of
    ``phi`` are drawn before any row of ``theta``.
    """
    # Word distributions: each row view of phi is filled with one
    # Dirichlet(beta) sample.
    phi = np.zeros((K, V))
    for word_dist in phi:
        word_dist[:] = np.random.dirichlet(beta)

    # Topic distributions: each row view of theta is filled with one
    # Dirichlet(alpha) sample.
    theta = np.zeros((M, K))
    for topic_dist in theta:
        topic_dist[:] = np.random.dirichlet(alpha)

    return phi, theta

In [18]:
def generate_words(phi, theta, M, N_min, N_max):
    """Sample the word ids of every document in the corpus.

    For each document a length is drawn, then for each token a topic is
    sampled from that document's row of ``theta`` and a word is sampled from
    that topic's row of ``phi``.

    Parameters
    ----------
    phi : ndarray
        Word distributions, indexed as ``phi[topic, :]``.
    theta : ndarray
        Topic distributions, indexed as ``theta[document, :]``.
    M : int
        Number of documents to generate.
    N_min, N_max : int
        Bounds for the document lengths, passed to ``np.random.randint``;
        lengths therefore lie in the half-open range ``[N_min, N_max)``
        (``N_max`` itself is never drawn).

    Returns
    -------
    dict
        Maps each document index ``0 .. M-1`` to a list of sampled word ids
        (NumPy integers). The sampled topic assignments are not returned.

    Notes
    -----
    Uses NumPy's global random state (``np.random``).
    """
    lengths = np.random.randint(N_min, N_max, M)

    corpus = {}
    for doc in range(M):
        topic_probs = theta[doc, :]
        tokens = []
        for _ in range(lengths[doc]):
            # A single multinomial trial is a one-hot vector; the position of
            # its only nonzero entry is the sampled category.
            topic = np.nonzero(np.random.multinomial(1, topic_probs))[0][0]
            one_hot_word = np.random.multinomial(1, phi[topic, :])
            tokens.extend(np.nonzero(one_hot_word)[0])
        corpus[doc] = tokens

    return corpus

In [19]:
def make_bow(w, M, V):
    """Build the document-by-word count (bag-of-words) matrix for a corpus.

    Parameters
    ----------
    w : mapping
        ``w[m]`` is the sequence of integer word ids of document ``m``, for
        every ``m`` in ``0 .. M-1``.
    M : int
        Number of documents (rows of the result).
    V : int
        Vocabulary size (columns of the result).

    Returns
    -------
    ndarray
        Float array of shape ``(M, V)`` where entry ``[m, v]`` is the number
        of times word id ``v`` occurs in ``w[m]``. Word ids outside
        ``[0, V)`` are not counted.
    """
    bow = np.zeros((M, V))
    for m in range(M):
        # Explicit integer dtype so an empty document becomes an empty integer
        # array (np.bincount rejects the default float dtype of np.array([])).
        ids = np.asarray(w[m], dtype=np.intp)
        # Drop ids that have no column; otherwise bincount would raise on
        # negatives or return more than V bins.
        ids = ids[(ids >= 0) & (ids < V)]
        # One pass over the document instead of one scan per vocabulary entry.
        bow[m, :] = np.bincount(ids, minlength=V)

    return bow

In [20]:
def simulate_corpus(alpha, beta, M, N_min, N_max):
    """Simulate a bag-of-words corpus to use as test data for LDA.

    Parameters
    ----------
    alpha : sequence of float
        Concentration parameters passed to ``generate_dists`` for the topic
        distributions; its length is taken as the number of topics.
    beta : sequence of float
        Concentration parameters passed to ``generate_dists`` for the word
        distributions; its length is taken as the vocabulary size.
    M : int
        Number of documents.
    N_min, N_max : int
        Document-length bounds forwarded unchanged to ``generate_words``.

    Returns
    -------
    tuple
        ``(bow, phi, theta)``: the count matrix from ``make_bow`` followed by
        the word and topic distributions from ``generate_dists``.
    """
    # The corpus dimensions are implied by the lengths of the two priors.
    n_topics, vocab_size = len(alpha), len(beta)

    # Sample distributions, then tokens, then collapse the tokens to counts.
    phi, theta = generate_dists(alpha, beta, M, n_topics, vocab_size)
    tokens = generate_words(phi, theta, M, N_min, N_max)
    counts = make_bow(tokens, M, vocab_size)

    return counts, phi, theta