In [1]:
! pip install stop_words



You are using pip version 8.0.3, however version 8.1.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [1]:
import collections

s1 = "The quick brown fox"
s2 = "Brown fox jumps over the jumps jumps jumps"
s3 = "The the the lazy dog elephant."
s4 = "The the the the the dog peacock lion tiger elephant"

docs = collections.OrderedDict()
docs["s1"] = s1
docs["s2"] = s2
docs["s3"] = s3
docs["s4"] = s4
for k, v in docs.items():
    print(k,v)
    

s1 The quick brown fox
s2 Brown fox jumps over the jumps jumps jumps
s3 The the the lazy dog elephant.
s4 The the the the the dog peacock lion tiger elephant


In [2]:
len(docs)

4

**Function to make Corpus Matrix**

Function returns a list, 0th element is a 3 dimensional array, each element corresponds to a document in the corpus where the columns are the unique words in that document and the rows are the words in that document (stop words removed).  Second element in list is the column names for each document.

In [3]:
import string
import stop_words
import numpy as np
from stop_words import get_stop_words

stopWords = get_stop_words('english')

#Function to make document, word matricies for LDA#
def make_word_matrix(corpus):
    D = len(corpus)
    #Define list to store corpus data#
    c = []
    #Define list to store order of words for each document#
    wordOrder = []
    #Define table to remove punctuation
    table = dict.fromkeys(map(ord, string.punctuation))
   
    #For each document in docs, caculate frequency of the words#
    for i in corpus:
        #Remove punctuation 
        text = docs[i].translate(table)
        #Splits string by blankspace and goes to lower case#
        words = text.lower().split()
    
        #Remove stop words#
        text = [word for word in words if word not in stopWords]
        
        #Find total number of words in each document#
        N = len(text)
        
        #Find number of unique words in each document#
        V = len(set(text))
        Vwords = list(set(text))
        wordOrder.append(Vwords)
        #Create matrix to store words for each document#
        wordsMat = np.zeros((N, V))
        count = 0
        for word in text:
            v = Vwords.index(word)
            wordsMat[count, v] = 1
            count = count + 1
        c.append(wordsMat)

    return [c, wordOrder]



In [4]:
corpusMatrix = make_word_matrix(docs)
corpusMatrix[0][0].shape

(3, 3)

In [5]:
corpusMatrix

[[array([[ 1.,  0.,  0.],
         [ 0.,  1.,  0.],
         [ 0.,  0.,  1.]]), array([[ 0.,  1.,  0.],
         [ 0.,  0.,  1.],
         [ 1.,  0.,  0.],
         [ 1.,  0.,  0.],
         [ 1.,  0.,  0.],
         [ 1.,  0.,  0.]]), array([[ 0.,  1.,  0.],
         [ 0.,  0.,  1.],
         [ 1.,  0.,  0.]]), array([[ 0.,  0.,  0.,  0.,  1.],
         [ 1.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  1.,  0.,  0.],
         [ 0.,  1.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  1.,  0.]])],
 [['quick', 'brown', 'fox'],
  ['jumps', 'brown', 'fox'],
  ['elephant', 'lazy', 'dog'],
  ['peacock', 'tiger', 'lion', 'elephant', 'dog']]]

**Variational Inference**

In [6]:
import numpy as np
import scipy
from scipy import special

**E-Step:** This function uses variational inference to perform the E step in the EM algorithm to estimate the paramteters in the model.  The output of this function are the matricies gamma and phi, where gamma (k vector) is the Dirichlet paramteters and the matrix phi (N x k, where k is the number of topics) are the multinomial paramters.  See page 1004 of paper for derivation.

In [7]:
def Estep(k, d, alpha, beta, corpusMatrix, tol):    
    
    #storing the total number of words and the number of unique words
    N = corpusMatrix[0][d].shape[0]
    V = corpusMatrix[0][d].shape[1]
    
    #initialize phi and gamma
    oldPhi  = np.full(shape = (N,k), fill_value = 1/k)
    gamma = alpha + N/k
    newPhi = oldPhi
    converge = 0 
    
    
    count = 0
    
    while converge == 0:
        newPhi  = np.zeros(shape = (N,k))
        for n in range(0, N):
            for i in range(0,k):
                newPhi[n,i] = (beta[i, list(corpusMatrix[0][d][n,:]).index(1)])*np.exp(scipy.special.psi(gamma[i]))
        newPhi = newPhi/np.sum(newPhi, axis = 1)[:, np.newaxis] #normalizing the rows of new phi

        for i in range(0,k):
            gamma[i] = alpha[i] + np.sum(newPhi[:, i]) #updating gamma


        criteria = (1/(N*k)*np.sum((newPhi - oldPhi)**2))**0.5
        if criteria < tol:
            converge = 1
        else:
            oldPhi = newPhi
            count = count +1
            converge = 0
    return (newPhi, gamma)

**Parameter Estimation**

**M Step:** In the E step above, we maximized a lower bound with respect to gamma and phi, and in the M step, for fixed values of these variational parameters, we maximize the lower bound of the log likelihood with repsect to alpha and beta to update these values (combined, these two steps give approximate empirical Bayes estimates for the LDA model).  See pg. 1006 and appendix A.2 for derivation.  

The alphaUpdate() function uses the linear Newton-Rhapson method to update the Dirichlet parameters, alpha, while the Mstep() function maximizes for alpha and beta.

In [27]:
#Update alpha using linear Newton-Rhapson Method#
#Following derivation in #
#http://arxiv.org/pdf/1405.0099.pdf
def alphaUpdate(alphaOld, phiMat, tol):
    N = phiMat.shape[0]
    K = phiMat.shape[1]
    x = np.zeros(shape = (K, 1))
    d = np.zeros(shape = (K, 1))
    v = np.zeros(shape = (K, 1))
    g = np.zeros(shape = (K, 1))
    alphaNew = np.zeros(shape = (K, 1))
    alphaOld = alphaOld
    converge = 0
    while converge == 0: 
        Z = 0
        for k in range(0, K):
            d[k] = -scipy.special.polygamma(1, alphaOld[k])
            v[k] = 1/N * sum(np.log(phiMat[:, k]))
            g[k] = scipy.special.psi(sum(alphaOld)) - scipy.special.psi(alphaOld[k]) + v[k]
            x[k] = g[k] - alphaOld[k]*d[k]
            Z = Z + alphaOld[k]/x[k]
        c = scipy.special.polygamma(1,sum(alphaOld))
        Z = Z*c
        S = 0
        for k in range(0, K):
            S = S + 1/((1+Z)/x[k])
        for k in range(0, K):
            alphaNew[k] = S + x[k]*(1-c*alphaOld[k]*S)
        if np.linalg.norm(alphaOld - alphaNew) < tol:
            converge = 1
        else:
            converge = 0
            alphaOld = alphaNew
    return alphaNew

In [29]:
def Mstep(k, d, phi, alpha, corpusMatrix, tol):
    #Calculate beta#
    V = corpusMatrix[0][d].shape[1]
    beta = np.zeros(shape = (k,V))
    for i in range(0,k):
        for n in range(0,V):
            beta[i,n] = np.sum(phi[:,i][:, np.newaxis]*(corpusMatrix[0][d][n,:])[np.newaxis, :])
    #Normalize the columns of beta#
    beta = beta/np.sum(beta, axis = 0)[np.newaxis, :]
    
    
    ##Update ALPHA##
    alphaNew = alphaUpdate(alpha, phi, tol)
    return(alphaNew, beta)



**LDA Function:**
Finally, we implement the entire Latent Dirichlet Allocation method in the LDA function, which takes as its arguments k (the number of topics), D (the number of documents in the corpus), a corpus matrix (the output from make_word_matrix above) and a tolerance (which sets the convergence criteria for the while loops).  For each document d, the function runs until the alpha or beta parameters converge, by first running the E step and then the M step for each document separately.  The final values of phi, gamma, alpha and beta are returned for all D documents in a list.

In [30]:
#k = number of topics, D = number of documents#
#corpus matrix is output of make_word_matrix# 
def LDA(k, D, corpusMatrix, tol):

    
    output = []
    #looping through the number of documents
    for d in range(0,D): #D is the number of documents
        converge = 0
        #initialize alpha and beta for first iteration
        alphaOld = np.random.rand(k)
        V = corpusMatrix[0][d].shape[1]
        betaOld = np.random.rand(k, V)
        betaOld = betaOld/np.sum(betaOld, axis = 0)[np.newaxis, :]
        while converge == 0:
            phi, gamma = Estep(k, d, alphaOld, betaOld, corpusMatrix, tol)
            alphaNew, betaNew = Mstep(k, d, phi, alphaOld, corpusMatrix, tol)
            if np.linalg.norm(alphaOld - alphaNew) < tol or np.linalg.norm(betaOld - betaNew) < tol:
                converge =1
            else: 
                converge =0
                alphaOld = alphaNew
                betaOld = betaNew
        output.append([phi, gamma, alphaNew, betaNew])
        
    return output

In [31]:
LDA(5, 4, corpusMatrix, 1)

IndexError: index 2 is out of bounds for axis 0 with size 2

## Future Steps

- Compare output to Python package
- Test on corpus in paper
- Model topics on different corpus
- Time and optimize (use Cython, quite slow now)
- Run collaborative filtering
- Compare to Gibbs sampling

In [8]:
k=2
d = 0
alphaTest = np.random.rand(k)
alphaOld = 10*np.random.rand(k)
V = corpusMatrix[0][d].shape[1]
betaOld = np.random.rand(k, V)
betaOld = betaOld/np.sum(betaOld, axis = 0)[np.newaxis, :]
alphaOld
phiTest, gammaTest = Estep(k, 0, alphaOld, betaOld, corpusMatrix, 0.01)
print(phiTest)

[[ 0.49726696  0.50273304]
 [ 0.32909243  0.67090757]
 [ 0.04107347  0.95892653]]


In [80]:
k=2
d = 0
alphaTest = np.random.rand(k)
alphaOld = 10*np.random.rand(k)
V = corpusMatrix[0][d].shape[1]
betaOld = np.random.rand(k, V)
betaOld = betaOld/np.sum(betaOld, axis = 0)[np.newaxis, :]
alphaOld
phiTest, gammaTest = Estep(k, 0, alphaOld, betaOld, corpusMatrix, 0.01)

In [81]:
phiMat = phiTest
N = phiMat.shape[0]
K = phiMat.shape[1]

x = np.zeros(K)
v = np.zeros(K)
g = np.zeros(K)
h = np.zeros(K)
step = np.zeros(K)
count = 0
while count <10:
    z = float(scipy.special.polygamma(1, sum(alphaOld)))    
    for k in range(0, K):
        h[k] = float(scipy.special.polygamma(1, alphaOld[k]))
        v[k] = 1/N * np.sum(np.log(phiMat[:, k]))
        g[k] = scipy.special.psi(sum(alphaOld)) - scipy.special.psi(alphaOld[k]) + v[k]
    
#     print(h[0], v[0], g[0])
#     print("split")
#     print(h[1], v[1], g[1])
    c = np.sum(g/h)/(1/z + sum(1/h))
   
    for i in range(0, K):
        step[i] = (g[i]-c)/h[i]

    print(step)
    
    alphaNew = alphaOld - step
    #print(alphaNew)
    alphaOld = alphaNew
    count = count +1
alphaNew

[-0.90171323  0.90108287]
[-2.29559354  1.93060937]
[-6.28750117  4.25734708]
[-15.22861482   5.37680603]
[-29.81524248   0.37218972]
[ -6.30005019e+01  -4.55279351e-02]
[ -1.33610223e+02  -1.39137263e-02]
[ -2.83281205e+02  -1.02095567e-03]
[ -6.00580752e+02  -5.55058681e-06]
[ -1.27327363e+03  -1.85247373e-10]


array([ 2410.72014456,    -3.        ])

In [47]:
float(scipy.special.polygamma(1, sum(alphaOld))) 

2.573261725155857

In [None]:
def alphaUpdate(alphaOld, phiMat, tol):
    N = phiMat.shape[0]
    K = phiMat.shape[1]
    x = np.zeros(K)
    h = np.zeros(K)
    v = np.zeros(K)
    g = np.zeros(K)
    step = np.zeros(K)
    alphaNew = np.zeros(K)
    alphaOld = alphaOld
    converge = 0
    while converge == 0: 
  
     
        z = float(scipy.special.polygamma(1, sum(alphaOld)))    
        for k in range(0, K):
            h[k] = -N*float(scipy.special.polygamma(1, alphaOld[k]))
            v[k] = 1/N * np.sum(np.log(phiMat[:, k]))
            g[k] = N*scipy.special.psi(sum(alphaOld)) - N*scipy.special.psi(alphaOld[k]) + N*v[k]
        c = np.sum(g/h)/(1/z + sum(1/h))
        for i in range(0, K):
            step[i] = (g[i]-c)/h[i]
        alphaNew = alphaOld -step    
        if np.linalg.norm(alphaOld - alphaNew) < tol:
            converge = 1
        else:
            converge = 0
            alphaOld = alphaNew
    return alphaNew

In [None]:
k = 2
alphaOld = np.random.rand(k)
alphaOld
alphaUpdate(alphaOld, phiTest, .1)