### Initialisations

In [1]:
from gensim.corpora.dictionary import Dictionary
import os
import numpy as np

# select the number of documents to parse
Subset = 100 #10769
filewithTFIDF = "TFIDF_"+str(Subset)+"docs.npy" # this is used later
Distances = np.load(str(Subset)+'New_Doc_Distances.npy')

N_Topics = 5

FolderToParse = "BoW_100random/"#"BagsOfWords/"
DocList = []
for document in os.listdir( FolderToParse ):
    # load documents
    FileToLoad = FolderToParse + document
    f = open(FileToLoad,'rb')
    words = f.read().decode('ascii', 'ignore')
    f.close()
    words = words.split() # tokenize
    DocList.append(words)
    
DocList = DocList[:Subset]
# Load the names of species
with open('list_of_species.txt', encoding='utf-8', errors='ignore') as f:
    Names = f.readlines()
Names = [x.strip() for x in Names]

# Create dictionary [ID to word]
common_dictionary = Dictionary(DocList)
# Create text to words mappings & count
common_corpus = [common_dictionary.doc2bow(text) for text in DocList]
print("We have loaded {0} documents and {1} words in total!".format(len(DocList), len(common_dictionary)))



We have loaded 100 documents and 4893 words in total!


### Other utilities

In [2]:
def ComputeDistances(D, C1, C2):
    """
        Function that, given two sets of indices C1, C2 and a matrix D with  
        distances calculated for every pair, it computes the average distance.
    """
    S=0
    for i in range(len(C1)):
        for j in range(len(C2)):
            S += D[ C1[i], C2[j] ]
    return S/(len(C1)*len(C2))
     
def EvalClustering(D, Clustering):
    """
        Function that, given a set clusters and a matrix D with distances calculated 
        for every pair of points, evaluates the accuracy of the partition.
        Intra : the average distance for points within one cluster
        Inter : the average distance between points from different clusters.
    """
    N_K = len(Clustering)
    ClusterDist = np.zeros( (N_K,N_K) )
    for c1 in range(N_K):
        if len(Clustering[c1])>0:
            for c2 in range(c1,N_K):
                #first we compute the intra-cluster distance
                if len(Clustering[c2])>0:
                    ClusterDist[c1,c2] = ComputeDistances(D, list(Clustering[c1]), list(Clustering[c2]))
    # evaluate
    intra = np.mean(np.diag(ClusterDist))
    print('Mean Within-Cluster distance = {0:.3f}.'.format(intra))
    inter = np.sum(np.triu(ClusterDist,1))*2/(N_K-1)/N_K
    print('Mean Inter-Cluster distance = {0:.3f}.'.format(inter))
    return intra, inter, ClusterDist

### 1. LDA by gensim

In [4]:
from gensim.models import LdaModel
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=N_Topics, alpha='asymmetric', random_state=1)
# Now produce probabilities based on the Corpus
LDAvectors = []
for i in range(len(DocList)):
    # first we translate using the dictionary that we have already
    temp = [ common_dictionary.doc2bow(text.split()) for text in DocList[i] ]
    vector = lda[temp[0]]
    LDAvectors.append( vector )
print('LDA is complete!')

LDA is complete!


#### 1b. Evaluate - gensim LDA with BW

In [5]:
#LDAvectors = np.load("LDAvectors_gensim_"+str(Subset)+".npy")

# Clustering #
ClustersLDA = { i:[] for i in range(N_Topics)}# initialize the dictictionary of clusters
ClustersNames = { i:[] for i in range(N_Topics)} 
Labels = []
for i in range(len(DocList)):
    distr = [ x[1] for x in LDAvectors[i]]
    # find the argmax{distr} - ATTENTION: ties ???
    label = distr.index(max(distr))
    ClustersLDA[label].append(i)
    ClustersNames[label].append(Names[i])
    Labels.append( label )
    
# Evaluate
t = EvalClustering(Distances, ClustersLDA)

Mean Within-Cluster distance = 4.882.
Mean Inter-Cluster distance = 6.838.


#### 1c. Evaluate - gensim LDA with KMeans

In [6]:
from sklearn.cluster import KMeans
# load data (again)
#X =  np.load("LDAvectors_gensim_"+str(Subset)+".npy")

# possibly needs transform
X = LDAvectors
X = np.array([[P[1] for P in Z] for Z in X])
#print("n_samples: %d, n_features: %d" % X.shape)

km = KMeans(n_clusters=N_Topics, init='k-means++', max_iter=100, n_init=1, random_state=10)
print("Clustering sparse data with %s" % km)
km.fit(X)

# create the clusters
Clusters = { i:[] for i in range(N_Topics)}
Labels = list(km.labels_)
for i in range(len(DocList)):
    Clusters[Labels[i]].append(i)
    
# Evaluate
t = EvalClustering(Distances, Clusters)

Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=5, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=10, tol=0.0001, verbose=0)
Mean Within-Cluster distance = 4.882.
Mean Inter-Cluster distance = 5.061.


### 2. LDA by Scikit-learn

In [13]:
from sklearn.decomposition import LatentDirichletAllocation

X = np.load(filewithTFIDF)

print("n_samples: %d, n_features: %d" % X.shape)
LDA_SKL = LatentDirichletAllocation(n_topics = N_Topics, max_iter=500, random_state=50)
LDA_SKL.fit(X)
print("Clustering sparse data with %s" % LDA_SKL)
# get doc-topic distributions
LDA_SKLvectors = LDA_SKL.transform(X)

n_samples: 100, n_features: 4893




Clustering sparse data with LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=500,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=5,
             perp_tol=0.1, random_state=50, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)


#### 2b. Evaluate scikit LDA with BW

In [8]:
# Clustering - Black and white approach, as before
ClustersSKL = { i:[] for i in range(N_Topics)}
LabelsSKL = []
for i in range(len(DocList)):
    distr = list(LDA_SKLvectors[i])
    # find the argmax{distr} - ATTENTION: ties ???
    label = distr.index(max(distr))
    ClustersSKL[label].append(i)
    LabelsSKL.append( label )
    
# Evaluate
t = EvalClustering(Distances, ClustersSKL)

Mean Within-Cluster distance = 6.444.
Mean Inter-Cluster distance = 6.020.


#### 2c. Evaluate scikit LDA with K-Means

In [10]:
X = LDA_SKLvectors

km = KMeans(n_clusters=N_Topics, init='k-means++', max_iter=100, n_init=1, random_state=10)
print("Clustering sparse data with %s" % km)
km.fit(X)

# create the clusters
Clusters = { i:[] for i in range(N_Topics)}
Labels = list(km.labels_)
for i in range(len(DocList)):
    Clusters[Labels[i]].append(i)
    
# Evaluate
t = EvalClustering(Distances, Clusters)

Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=5, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=10, tol=0.0001, verbose=0)
Mean Within-Cluster distance = 6.444.
Mean Inter-Cluster distance = 6.692.


### 3. Benchmark technique: K-Means on TFIDF

In [11]:
from sklearn.cluster import KMeans

X =  np.load(filewithTFIDF)

print("n_samples: %d, n_features: %d" % X.shape)
km = KMeans(n_clusters=N_Topics, init='k-means++', max_iter=100, n_init=1, random_state=10)
print("Clustering sparse data with %s" % km)
km.fit(X)

# create the clusters
ClustersKM = { i:[] for i in range(N_Topics)}
LabelsKM = list(km.labels_)
for i in range(len(DocList)):
    ClustersKM[LabelsKM[i]].append(i)
    
# Evaluate
t = EvalClustering(Distances, ClustersKM)

n_samples: 100, n_features: 4893
Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=5, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=10, tol=0.0001, verbose=0)
Mean Within-Cluster distance = 1.307.
Mean Inter-Cluster distance = 9.162.


#### 3b. Advanced approach: SVD + K-Means

In [12]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
# Dimensionality reduction
svd = TruncatedSVD(50)
# we use the same X as before
X = svd.fit_transform(X)
print("n_samples: %d, n_features: %d" % X.shape)
km = KMeans(n_clusters=N_Topics, init='k-means++', max_iter=100, n_init=1, random_state=10)
print("Clustering sparse data with %s" % km)
km.fit(X)

# create the clusters
ClustersKM = { i:[] for i in range(N_Topics)}
LabelsKM = list(km.labels_)
for i in range(len(DocList)):
    ClustersKM[LabelsKM[i]].append(i)
    
# Evaluate
t = EvalClustering(Distances, ClustersKM)

n_samples: 100, n_features: 50
Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=5, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=10, tol=0.0001, verbose=0)
Mean Within-Cluster distance = 1.309.
Mean Inter-Cluster distance = 13.452.


### 4. Our LDA Implementation

In [14]:
# Implement LDA
# Number matrix (Replacing words in documents with word IDs)
from time import time
N_K = 5 # N_Topics # set the number of topics
N_D = len(DocList)
corpus=np.unique(np.concatenate(DocList),axis=0)
N_W = corpus.shape[0] # words in the vocabulary

# SELECT #iterations
T=100

X_number = np.copy(DocList)
for doc_number in range(X_number.shape[0]):
    for doc_length in range(len(X_number[doc_number])):
        X_number[doc_number][doc_length]=  np.where(corpus==X_number[doc_number][doc_length])[0][0]
        
# Dirichlet priors
alpha = 1 # Choice of alpha affects document clustering 
gamma = 1

#Z = np.copy(X_number)
#for doc_number in range(Z.shape[0]):
#    for doc_length in range(len(Z[doc_number])):
#        Z[doc_number][doc_length]= np.random.randint(N_K)
        
Z = []#[np.array(N_D, dtype=object)]
for doc in range(N_D):
    Z.append( np.random.randint(N_K, size=len(DocList[doc])) )
        
# Pi := document topic distribution
Pi = np.zeros([N_D, N_K])
for i in range(N_D):
    Pi[i] = np.random.dirichlet(alpha*np.ones(N_K))

A = Pi
#print(A)

# B := word topic distribution
B = np.zeros([N_K, N_W])
for k in range(N_K):
    B[k] = np.random.dirichlet(gamma*np.ones(N_W))
t0 = time()    
print("Starting the big loop...")    
for iterations in range(T):  #Need at least 1000 iterations for Gibbs sampling to work!

    # Updating Z matrix
    for doc_number in range(N_D):     
        for doc_length in range(len(Z[doc_number])):     
            # Calculate params for Z
            p_iv = np.exp(np.log(Pi[doc_number]) + np.log( B[:, X_number[doc_number][ doc_length]] ))
            p_iv /= np.sum(p_iv)

             # Resample word topic assignment Z
            Z[doc_number][doc_length] = np.random.multinomial(1, p_iv).argmax()
    # Updating Pi   
    for i in range(N_D):
        # Gather sufficient statistics
        ###m = np.zeros(N_K)
        ###for k in range(N_K):
        ###    m[k] = np.sum(Z[i] == k)
        
        m = np.array( [np.sum(Z[i] == k) for k in range(N_K)] )
        # Resample doc topic dist.
        Pi[i, :] = np.random.dirichlet(alpha + m)
        
    #Updating B
    for k in range(N_K):
        #print(k)
        n = np.zeros(N_W) 
    
        #Gather statistics       
        for v in range(N_W):
            for doc_number in range(N_D):
                n[v] = len([ x for x in np.where(X_number[doc_number] == v) if Z[doc_number][x] ==k ])
                ###for doc_length in range(len(Z[doc_number])):
                ###    n[v] += (X_number[doc_number][ doc_length]==v) and (Z[doc_number][doc_length] ==k)
        
        # Resample word topic distribution
        B[k,:] = np.random.dirichlet(gamma+n)
    #progress check
    if (iterations-1)%10==0:
        print("More than {0} % is completed! Rate = {1}".format(100*iterations/T, (time()-t0)/iterations))
print('LDA is complete! Total time = {0}'.format(time()-t0))

Starting the big loop...


KeyboardInterrupt: 

#### 4b. Our LDA with BW

In [17]:
Pi = np.load("prob100it5topgeorge.npy")
# Clustering #
ClustersOUR = { i:[] for i in range(N_Topics)}# initialize the dictictionary of clusters
for i in range(len(DocList)):
    #distr = [ x[1] for x in Pi[i]]
    # find the argmax{distr} - ATTENTION: ties ???
    label = np.argmax(Pi[i])
    ClustersOUR[label].append(i)
    
# Evaluate
t = EvalClustering(Distances, ClustersOUR)

Mean Within-Cluster distance = 6.307.
Mean Inter-Cluster distance = 7.007.


#### 4c. Our LDA with KMeans

In [19]:
X = Pi
km = KMeans(n_clusters=N_Topics, init='k-means++', max_iter=100, n_init=1, random_state=1)
print("Clustering LDA data with %s" % km)
km.fit(X)

# create the clusters
ClustersXX = { i:[] for i in range(N_Topics)}
LabelsXX = list(km.labels_)
for i in range(len(DocList)):
    ClustersXX[LabelsXX[i]].append(i)
    
# Evaluate
t = EvalClustering(Distances, ClustersXX)

Clustering LDA data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=5, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=1, tol=0.0001, verbose=0)
Mean Within-Cluster distance = 5.132.
Mean Inter-Cluster distance = 5.799.


### 6. Clustering with random assigning

In [20]:
ClustersRAND = { i:[] for i in range(N_Topics)}# initialize the dictictionary of clusters
for i in range(len(DocList)):
    #distr = [ x[1] for x in Pi[i]]
    # find the argmax{distr} - ATTENTION: ties ???
    label = np.random.randint(N_Topics)
    ClustersRAND[label].append(i)
    
# Evaluate
t = EvalClustering(Distances, ClustersRAND)

Mean Within-Cluster distance = 6.283.
Mean Inter-Cluster distance = 7.337.


## Comparison of different LDA results

In [None]:
def KLdivergence(P,Q):
    #import numpy as np
    n = P.shape[0]
    total = []
    for subject in range(n):
        D = 0
        for x in range(P.shape[1]):
            D +=P[subject][x]*np.log( P[subject][x]/Q[subject][x] )
        total.append(D)
    return total #np.mean(total)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

X = np.load(filewithTFIDF)

print("n_samples: %d, n_features: %d" % X.shape)
N_Topics = 5
LDA_SKL = LatentDirichletAllocation(n_topics = N_Topics, max_iter=1100, random_state=0)
LDA_SKL.fit(X)
LDA_skl1 = LDA_SKL.transform(X)

LDA_SKL = LatentDirichletAllocation(n_topics = N_Topics, max_iter=1100, random_state=30)
LDA_SKL.fit(X)
LDA_skl2 = LDA_SKL.transform(X)

In [None]:
LDAour1 = np.load("prob100it5topgeorge.npy")
LDAour2 = np.load("prob100it5topcristiana.npy")

#LDAgen1 = LDAvectors
#LDAgen2 = LDAvectors
#LDAgen1 = np.array([[P[1] for P in Z] for Z in LDAgen1])
#LDAgen2 = np.array([[P[1] for P in Z] for Z in LDAgen2])

compare = [LDAour1, LDAour2, LDAgen1, LDAgen2, LDA_skl1, LDA_skl2]

DL_All = np.zeros((len(compare), len(compare)))
for i in range(len(compare)):
    for j in range(len(compare)):
        DL_All[i,j] = np.around(np.mean( KLdivergence(compare[i], compare[j])),decimals=3 )
DL_All

In [None]:
# print for latex
#f = open('Table_KL.txt','w')
names = ["ourLDA-1", "ourLDA-2", "gensimLDA-1" ,"gensimLDA-2", "skLDA-1", "skLDA-2"]
for i in range(len(compare)):
    line = "\\textit{"+ names[i] +"}"
    for j in range(len(compare)):
        line += " & " + str(DL_All[i,j])
    line += "\\\\ "
#     f.write("$ {0} $ & {1} & {2} & {3:.2f} & {4} & {5:.3f} & {6} \\\\ \n".format( shortname, N, M, np.mean(degrees), concomps, largest, diam ))
    print(line )