In [18]:
# Load libraries
import numpy as np

In [19]:
# Set seed
np.random.seed(10)

In [20]:
def initialize(w, K, M, V, doc_lens):
    """Initializes values for collapsed gibbs sampler"""
    
    # Set initial z randomly
    z = {}
    for m in range(M):
        z[m] = []
        for n in range(doc_lens[m]):
            z[m].extend(np.nonzero(np.random.multinomial(1, np.ones(K)/K))[0])
    
    # Create count matrices
    N_1 = np.zeros((M, K))
    for m in range(M):
        for k in range(K):
            N_1[m, k] = sum(np.array(z[m]) == k)
            
    N_2 = np.zeros((K, V))
    for m in range(M):
        for n in range(doc_lens[m]):
            N_2[z[m][n], w[m][n]] += 1
            
    N_3 = np.zeros(K)
    for m in range(M):
        for n in range(doc_lens[m]):
            N_3[z[m][n]] += 1
            
    return((z, N_1, N_2, N_3))

In [21]:
def gibbs(w, K, M, V, doc_lens, alpha, beta, N_1, N_2, N_3, z, n_iter):
    """Runs gibbs sampler to get estimated latent topics"""
    
    for i in range(n_iter):
        for m in range(M):
            for n in range(doc_lens[m]):
                N_1[m, z[m][n]] -= 1
                N_2[z[m][n], w[m][n]] -= 1
                N_3[z[m][n]] -= 1
                p = np.zeros(K)
                for k in range(K):
                    p[k] = (N_1[m, k] + alpha[k])*((N_2[k, w[m][n]] + beta[w[m][n]])/(N_3[k] + sum(beta)))
                p /= sum(p)
                z[m][n] = np.nonzero(np.random.multinomial(1, p))[0][0]
                N_1[m, z[m][n]] += 1
                N_2[z[m][n], w[m][n]] += 1
                N_3[z[m][n]] += 1
                
    return((N_1, N_2))

In [22]:
def topic_dist(N_1, doc_lens, alpha, M, K):
    """Calculates MC estimates for topic distributions using results from Gibbs sampler"""
    
    theta = np.zeros((M, K))
    for m in range(M):
        for k in range(K):
            theta[m, k] = (N_1[m, k] + alpha[k])/(doc_lens[m] + sum(alpha))
            
    return theta

In [23]:
def word_dist(N_2, beta, V, K):
    """Calculates MC estimates for word distributions using results from Gibbs sampler"""
    
    phi = np.zeros((K, V))
    for k in range(K):
        for v in range(V):
            phi[k, v] = (N_2[k, v] + beta[v]) / (sum(N_2[k, :]) + sum(beta))
            
    return phi

In [24]:
def lda(bow, K, alpha = 1, beta = 1, n_iter = 1000):
    """LDA implementation using collapsed Gibbs sampler"""
    
    # Get corpus parameters
    M, V = bow.shape
    doc_lens = np.sum(bow, axis = 1, dtype = 'int')
    
    # Create word dictionary
    w = {}
    for m in range(M):
        w[m] = []
        for v in range(V):
            for n in range(int(bow[m, v])):
                w[m].append(v)
    
    # Initialize values for Gibbs sampler   
    z, N_1, N_2, N_3 = initialize(w, K, M, V, doc_lens)
    
    
    # Set symmetric hyperparameters
    alpha = np.ones(K) * alpha
    beta  = np.ones(V) * beta
    
    # Run Gibbs sampler
    N_1, N_2 = gibbs(w, K, M, V, doc_lens, alpha, beta, N_1, N_2, N_3, z, n_iter)
    
    # Estimate topic and word distributions
    theta = topic_dist(N_1, doc_lens, alpha, M, K)
    phi   = word_dist(N_2, beta, V, K)
    
    return((theta, phi))

In [25]:
# Doc params
V = 10
N_min = 10
N_max = 20
K = 3
M = 2

In [26]:
# Set true params
alpha_true = np.array([2, 1, 2])
beta_true = np.random.randint(1, 2, V)

In [27]:
# Generate data
phi_true = np.zeros((K, V))
for k in range(K):
    phi_true[k, :] = np.random.dirichlet(beta_true)

In [28]:
theta_true = np.zeros((M, K))
for m in range(M):
    theta_true[m,:] = np.random.dirichlet(alpha_true)

In [29]:
doc_lens = np.random.randint(N_min, N_max, M)
z_true = {}
w = {}
for m in range(M):
    z_true[m] = []
    w[m] = []
    for n in range(doc_lens[m]):
        z_true[m].extend(np.nonzero(np.random.multinomial(1, theta_true[m,:]))[0])
        w[m].extend(np.nonzero(np.random.multinomial(1, phi_true[z_true[m][n], :]))[0])

In [30]:
bow = np.zeros((M, V))
for m in range(M):
    for v in range(V):
        bow[m, v] = len(np.where(np.array(w[m]) == v)[0])

In [36]:
theta, phi = lda(bow, K, 1, 1, 1000)

In [262]:
np.round(results[1][1,:], 2)

array([0.04, 0.02, 0.13, 0.07, 0.02, 0.02, 0.07, 0.22, 0.26, 0.15])

In [263]:
np.round(phi_true[1,:], 2)

array([0.09, 0.24, 0.  , 0.06, 0.13, 0.07, 0.1 , 0.03, 0.19, 0.1 ])

In [101]:
def get_key_words(phi, n_words, words = None):
    """Gets key words from each topic after LDA is performed"""
    K = len(phi)
    for k in range(K):
        biggest_probs = sorted(phi[k, :], reverse = False)[:n_words]
        key_words = [i for i in range(len(phi[k, :])) if phi[k, i] in biggest_probs]
        if words is not None:
            print("Key words for topic", k + 1, ": ", [words[i] for i in key_words])
        else:
            print("Key words for topic", k + 1, ": ", key_words)

In [102]:
get_key_words(phi, 5, ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"])

Key words for topic 1 :  ['one', 'two', 'four', 'five', 'six', 'seven', 'eight', 'ten']
Key words for topic 2 :  ['two', 'three', 'five', 'six', 'seven']
Key words for topic 3 :  ['one', 'three', 'five', 'six', 'ten']
