In [1]:
! pip install stop_words



You are using pip version 8.0.3, however version 8.1.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [1]:
import collections

s1 = "The quick brown fox"
s2 = "Brown fox jumps over the jumps jumps jumps"
s3 = "The the the lazy dog elephant."
s4 = "The the the the the dog peacock lion tiger elephant"

docs = collections.OrderedDict()
docs["s1"] = s1
docs["s2"] = s2
docs["s3"] = s3
docs["s4"] = s4
for k, v in docs.items():
    print(k,v)
    

s1 The quick brown fox
s2 Brown fox jumps over the jumps jumps jumps
s3 The the the lazy dog elephant.
s4 The the the the the dog peacock lion tiger elephant


In [2]:
len(docs)

4

**Function to make Corpus Matrix**

Function returns a list, 0th element is a 3 dimensional array, each element corresponds to a document in the corpus where the columns are the unique words in that document and the rows are the words in that document (stop words removed).  Second element in list is the column names for each document.

In [3]:
import string
import stop_words
import numpy as np
from stop_words import get_stop_words

stopWords = get_stop_words('english')

#Function to make document, word matricies for LDA#
def make_word_matrix(corpus):
    D = len(corpus)
    #Define list to store corpus data#
    c = []
    #Define list to store order of words for each document#
    wordOrder = []
    #Define table to remove punctuation
    table = dict.fromkeys(map(ord, string.punctuation))
   
    #For each document in docs, caculate frequency of the words#
    for i in corpus:
        #Remove punctuation 
        text = docs[i].translate(table)
        #Splits string by blankspace and goes to lower case#
        words = text.lower().split()
    
        #Remove stop words#
        text = [word for word in words if word not in stopWords]
        
        #Find total number of words in each document#
        N = len(text)
        
        #Find number of unique words in each document#
        V = len(set(text))
        Vwords = list(set(text))
        wordOrder.append(Vwords)
        #Create matrix to store words for each document#
        wordsMat = np.zeros((N, V))
        count = 0
        for word in text:
            v = Vwords.index(word)
            wordsMat[count, v] = 1
            count = count + 1
        c.append(wordsMat)

    return [c, wordOrder]



In [4]:
corpusMatrix = make_word_matrix(docs)
corpusMatrix[0][0].shape

(3, 3)

In [8]:
corpusMatrix

[[array([[ 0.,  1.,  0.],
         [ 0.,  0.,  1.],
         [ 1.,  0.,  0.]]), array([[ 0.,  0.,  1.],
         [ 1.,  0.,  0.],
         [ 0.,  1.,  0.],
         [ 0.,  1.,  0.],
         [ 0.,  1.,  0.],
         [ 0.,  1.,  0.]]), array([[ 0.,  0.,  1.],
         [ 0.,  1.,  0.],
         [ 1.,  0.,  0.]]), array([[ 0.,  1.,  0.,  0.,  0.],
         [ 0.,  0.,  1.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  1.],
         [ 0.,  0.,  0.,  1.,  0.],
         [ 1.,  0.,  0.,  0.,  0.]])],
 [['fox', 'quick', 'brown'],
  ['fox', 'jumps', 'brown'],
  ['elephant', 'dog', 'lazy'],
  ['elephant', 'dog', 'peacock', 'tiger', 'lion']]]

**Variational Inference**

In [5]:
import numpy as np
import scipy
from scipy import special

In [44]:
def Estep(k, d, corpusMatrix, tol):    
    
    #storing the total number of words and the number of unique words
    N = corpusMatrix[0][d].shape[0]
    V = corpusMatrix[0][d].shape[1]
    
    #initialize phi and gamma
    alpha = np.random.rand(k)
    phi = np.full(shape = (N,k), fill_value = 1/k)
    beta = np.random.rand(k, V)
    beta = beta/np.sum(beta, axis = 0)[np.newaxis, :]
    gamma = alpha + N/k
    converge = 0 
    
    
    count = 0
    
    while converge == 0:
        #creating a place to store the updated phi
        newPhi  = np.full(shape = (N,k), fill_value = 1/k)
        for n in range(0, N):
            for i in range(0,k):
                newPhi[n,i] = (beta[i, list(corpusMatrix[0][d][n,:]).index(1)])*scipy.special.psi(gamma[i])
        newPhi = newPhi/np.sum(newPhi, axis = 1)[:, np.newaxis] #normalizing the rows of new phi
        newGamma = alpha + np.sum(newPhi) #updating gamma
        
        #checking for convergence
        if np.linalg.norm(newGamma - gamma) + np.linalg.norm(newPhi - phi) < tol:
            converge = 1
        else:
            converge = 0
            gamma = newGamma
            phi = newPhi
            count = count +1
    return (newPhi, newGamma)

In [49]:
phi1, gamma1 = Estep(3, 0, corpusMatrix, 0.0001)
print(phi1)
print(gamma1)

[[ 0.40277735  0.5761788   0.02104385]
 [ 0.56060093  0.3454057   0.09399336]
 [ 0.31682687  0.51244966  0.17072347]]
[ 3.23088311  3.83187632  3.62542694]


In [53]:
def Mstep(k, phi, corpusMatrix):
    #Calculate beta#
    V = corpusMatrix[0][d].shape[1]
    beta = np.zeros(shape = (k,V))
    for i in range(0,k):
        for n in range(0,V):
            beta[i,n] = sum(phi[:,i]*list(corpusMatrix[0][d][n,:]))
    #Normalize the columns of beta#
    beta = beta/np.sum(beta, axis = 0)[np.newaxis, :]
    
    
    ##Update ALPHA, see Apendix A.2##
    
    return(beta)



In [54]:
Mstep(3, phi1, corpusMatrix)

array([[ 0.56060093,  0.31682687,  0.40277735],
       [ 0.3454057 ,  0.51244966,  0.5761788 ],
       [ 0.09399336,  0.17072347,  0.02104385]])

In [None]:
#lengths and number of topics will go here

k = 2
D = 1
tol = 0.1

#need to define number of documents and number of topics here, 

#looping through the number of documents
for d in range(0,D): #D is the number of documents
    phi, gamma = Estep(k, d, corpusMatrix, tol)