In [158]:
import numpy as np
from scipy.sparse import csr_matrix
#from sklearn.metrics.pairwise import pairwise_distances
import os

FolderToParse = "BagsOfWords/"
DocList = []
for document in os.listdir( FolderToParse ):
    # load documents
    FileToLoad = FolderToParse + document
    f = open(FileToLoad,'rb')
    words = f.read().decode('ascii', 'ignore')
    f.close()
    words = words.split()
    DocList.append(words)
X_words = DocList

corpus=np.unique(np.concatenate(X_words),axis=0)
N_D = len(X_words) # number of documents
N_V = len(corpus)  # number of vocabulary - all available words
N_K = 10 # set the number of topics
print("We have {0} documents and {1} words in total!".format(N_D, N_V))

We have 101 documents and 3409 words in total!


In [111]:
# initialize term - document matrix
#TDM = csr_matrix((N_V,N_D))
TDM = np.zeros((N_V,N_D)) # we start by a full matrix and then transform it
     
# create doc-word list of lists
for doc in range(N_D):
    temp = np.unique( X_words[doc] ) # get the different words on this document
    for i in range( len(temp) ):
        word = temp[i] 
        count = len([ x for x in X_words[doc] if x == word])
        # we must get the index of this word in the (total) corpus
        TDM[ np.where(corpus == word) , doc] = count
        
# sanity check
for doc in range(N_D):
    if sum(TDM[:,doc])!=len(X_words[doc]):
        print("Doc-{0} has a problem!".format(i))
TDM = csr_matrix(TDM)
print("The Term-Doc matrix is {0:.2f}% dense.".format(csr_matrix.count_nonzero(TDM)/np.prod(TDM.shape)*100))
#print("The Term-Doc matrix is {0:.2f}% dense.".format(np.count_nonzero(TDM)/np.prod(TDM.shape)*100))

The Term-Doc matrix is 2.57% dense.


In [159]:
# Implement LDA
# Number matrix (Replacing words in documents with word IDs)
N_K = 10 # set the number of topics
N_W = N_V
X_number = np.copy(X_words)
for doc_number in range(X_number.shape[0]):
    for doc_length in range(len(X_number[doc_number])):
        X_number[doc_number][doc_length]=  np.where(corpus==X_number[doc_number][doc_length])[0][0]
        
# Dirichlet priors
alpha = 1 # Choice of alpha affects document clustering 
gamma = 1

Z = np.copy(X_number)
for doc_number in range(Z.shape[0]):
    for doc_length in range(len(Z[doc_number])):
        Z[doc_number][doc_length]= np.random.randint(N_K)
        
# Pi := document topic distribution
Pi = np.zeros([N_D, N_K])
for i in range(N_D):
    Pi[i] = np.random.dirichlet(alpha*np.ones(N_K))
    
# B := word topic distribution
B = np.zeros([N_K, N_W])

for k in range(N_K):
    B[k] = np.random.dirichlet(gamma*np.ones(N_W))
    
for iterations in range(10):  #Need at least 1000 iterations for Gibbs sampling to work!
    
    
    # Updating Z matrix
    for doc_number in range(Z.shape[0]):     
        for doc_length in range(len(Z[doc_number])):
            
             # Calculate params for Z
            p_iv = np.exp(np.log(Pi[i]) + np.log(B[:, X_number[doc_number][ doc_length]]))
            p_iv /= np.sum(p_iv)

             # Resample word topic assignment Z
            Z[doc_number][doc_length] = np.random.multinomial(1, p_iv).argmax()
     
    # Updating Pi   
    for i in range(N_D):
        m = np.zeros(N_K)

        # Gather sufficient statistics
        for k in range(N_K):
            m[k] = np.sum(Z[i] == k)

        # Resample doc topic dist.
        Pi[i, :] = np.random.dirichlet(alpha + m)
        


    
    #Updating B
    
    for k in range(N_K):
        print(k)
        n = np.zeros(N_W)
    
        #Gather statistics
        
        for v in range(N_W):
            for doc_number in range(Z.shape[0]):
                for doc_length in range(len(Z[doc_number])):
                    n[v] += (X_number[doc_number][ doc_length]==v) and (Z[doc_number][doc_length] ==k)
        
        # Resample word topic distribution
        
        B[k,:] = np.random.dirichlet(gamma+n)

0


KeyboardInterrupt: 

In [134]:
# calculate distances 
Distances = np.zeros((N_D,N_D))
for i in range(N_D):
    for j in range(i+1,N_D):
        # we use the standard euclidean distance == norm-2 for vectors
        Distances[i,j] = np.linalg.norm( csr_matrix.todense(TDM[:,i] - TDM[:,j]),2 ) #GetDistance
        
# Now compare the distances for different clusters....