### Initialisations

In [9]:
from gensim.corpora.dictionary import Dictionary
import os
import numpy as np

# select which documents to parse
FolderToParse = "BoW_100random/" #"BoW_all/" #
Subset = 100 #10769 #
N_Topics = 10

DocList = []
for document in os.listdir( FolderToParse ):
    # load documents
    FileToLoad = FolderToParse + document
    f = open(FileToLoad,'rb')
    words = f.read().decode('ascii', 'ignore')
    f.close()
    words = words.split() # tokenize
    DocList.append(words)
    
DocList = DocList[:Subset]
# Load the names of species
with open('list_of_species.txt', encoding='utf-8', errors='ignore') as f:
    Names = f.readlines()
Names = [x.strip() for x in Names]

# Create dictionary [ID to word]
common_dictionary = Dictionary(DocList)
# Create text to words mappings & count
common_corpus = [common_dictionary.doc2bow(text) for text in DocList]
print("We have loaded {0} documents and {1} words in total!".format(len(DocList), len(common_dictionary)))

We have loaded 100 documents and 4893 words in total!


### Other utilities

In [10]:
#Distances = np.load('100New_Doc_Distances.npy')
#Distances = np.load('10769_Doc_Distances.npy')
#TF_IDF = np.load('10769_Doc_TFIDF.npy')
TF_IDF = np.load('TFIDF_100Docs.npy')
def ComputeDistances(D, C1, C2):
    """
        Function that, given two sets of indices C1, C2 and a matrix D with  
        distances calculated for every pair, it computes the average distance.
    """
    S=0
    for i in range(len(C1)):
        for j in range(len(C2)):
            S += D[ C1[i], C2[j] ]
    return S/(len(C1)*len(C2))
     
def EvalClustering(D, Clustering):
    """
        Function that, given a set clusters and a matrix D with distances calculated 
        for every pair of points, evaluates the accuracy of the partition.
        Intra : the average distance for points within one cluster
        Inter : the average distance between points from different clusters.
    """
    N_K = len(Clustering)
    ClusterDist = np.zeros( (N_K,N_K) )
    for c1 in range(N_K):
        if len(Clustering[c1])>0:
            for c2 in range(c1,N_K):
                #first we compute the intra-cluster distance
                if len(Clustering[c2])>0:
                    ClusterDist[c1,c2] = ComputeDistances(D, list(Clustering[c1]), list(Clustering[c2]))
    # evaluate
    intra = np.mean(np.diag(ClusterDist))
    print('Mean Within-Cluster distance = {0:.3f}.'.format(intra))
    inter = np.sum(np.triu(ClusterDist,1))*2/(N_K-1)/N_K
    print('Mean Inter-Cluster distance = {0:.3f}.'.format(inter))
    return intra, inter, ClusterDist

### 1. LDA by gensim

In [11]:
from gensim.models import LdaModel
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=N_Topics,alpha='asymmetric', random_state=1)
# Now produce probabilities based on the Corpus
LDAvectors = []
for i in range(len(DocList)):
    # first we translate using the dictionary that we have already
    temp = [ common_dictionary.doc2bow(text.split()) for text in DocList[i] ]
    vector = lda[temp[0]]
    LDAvectors.append( vector )
print('LDA is complete!')

# Clustering #
ClustersLDA = { i:[] for i in range(N_Topics)}# initialize the dictictionary of clusters
ClustersNames = { i:[] for i in range(N_Topics)} 
Labels = []
for i in range(len(DocList)):
    distr = [ x[1] for x in LDAvectors[i]]
    # find the argmax{distr} - ATTENTION: ties ???
    label = distr.index(max(distr))
    ClustersLDA[label].append(i)
    ClustersNames[label].append(Names[i])
    Labels.append( label )
    
# Evaluate
t = EvalClustering(Distances, ClustersLDA)

LDA is complete!
Mean Within-Cluster distance = 3.271.
Mean Inter-Cluster distance = 3.811.


### 2. LDA by Scikit-learn

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#vectorizer = TfidfVectorizer(max_df=0.3, max_features=60000, min_df=1, stop_words='english', use_idf=True)
#X = vectorizer.fit_transform(RawTexts)
print("n_samples: %d, n_features: %d" % X.shape)
LDA_SKL = LatentDirichletAllocation(n_topics = N_Topics, max_iter=100, random_state=20)
LDA_SKL.fit(TF_IDF)
print("Clustering sparse data with %s" % LDA_SKL)
# get doc-topic distributions
LDA_SKLvectors = LDA_SKL.transform(TF_IDF)

# Clustering - Black and white approach, as before
ClustersSKL = { i:[] for i in range(N_Topics)}
LabelsSKL = []
for i in range(len(DocList)):
    distr = list(LDA_SKLvectors[i])
    # find the argmax{distr} - ATTENTION: ties ???
    label = distr.index(max(distr))
    ClustersSKL[label].append(i)
    LabelsSKL.append( label )
    
# Evaluate
t = EvalClustering(Distances, ClustersSKL)

n_samples: 100, n_features: 4746




Clustering sparse data with LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=100,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=10,
             perp_tol=0.1, random_state=20, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)
Mean Within-Cluster distance = 6.037.
Mean Inter-Cluster distance = 7.015.


### 3. K-Means with SVD

In [None]:
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
# Dimensionality reduction
svd = TruncatedSVD(1000)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
# we use the same X as before
vectorizer = TfidfVectorizer(max_df=0.3, max_features=60000, min_df=1, stop_words='english', use_idf=True)
X = vectorizer.fit_transform(RawTexts)
X = lsa.fit_transform(X)
print("n_samples: %d, n_features: %d" % X.shape)
km = KMeans(n_clusters=N_Topics, init='k-means++', max_iter=100, n_init=1, random_state=10)
print("Clustering sparse data with %s" % km)
km.fit(X)

# create the clusters
ClustersKM = { i:[] for i in range(N_Topics)}
LabelsKM = list(km.labels_)
for i in range(len(DocList)):
    ClustersKM[LabelsKM[i]].append(i)
    
# Evaluate
t = EvalClustering(Distances, ClustersKM)

### 4. Our LDA Implementation

In [3]:
# Implement LDA
# Number matrix (Replacing words in documents with word IDs)
from time import time
N_K = 10 # set the number of topics
N_D = len(DocList)
corpus=np.unique(np.concatenate(DocList),axis=0)
N_W = corpus.shape[0] # words in the vocabulary

# SELECT #iterations
T=200

X_number = np.copy(DocList)
for doc_number in range(X_number.shape[0]):
    for doc_length in range(len(X_number[doc_number])):
        X_number[doc_number][doc_length]=  np.where(corpus==X_number[doc_number][doc_length])[0][0]
        
# Dirichlet priors
alpha = 1 # Choice of alpha affects document clustering 
gamma = 1

#Z = np.copy(X_number)
#for doc_number in range(Z.shape[0]):
#    for doc_length in range(len(Z[doc_number])):
#        Z[doc_number][doc_length]= np.random.randint(N_K)
        
Z = []#[np.array(N_D, dtype=object)]
for doc in range(N_D):
    Z.append( np.random.randint(N_K, size=len(DocList[doc])) )
        
# Pi := document topic distribution
Pi = np.zeros([N_D, N_K])
for i in range(N_D):
    Pi[i] = np.random.dirichlet(alpha*np.ones(N_K))

A = Pi
#print(A)

# B := word topic distribution
B = np.zeros([N_K, N_W])
for k in range(N_K):
    B[k] = np.random.dirichlet(gamma*np.ones(N_W))
t0 = time()    
print("Starting the big loop...")    
for iterations in range(T):  #Need at least 1000 iterations for Gibbs sampling to work!

    # Updating Z matrix
    for doc_number in range(N_D):     
        for doc_length in range(len(Z[doc_number])):     
            # Calculate params for Z
            p_iv = np.exp(np.log(Pi[doc_number]) + np.log( B[:, X_number[doc_number][ doc_length]] ))
            p_iv /= np.sum(p_iv)

             # Resample word topic assignment Z
            Z[doc_number][doc_length] = np.random.multinomial(1, p_iv).argmax()
    # Updating Pi   
    for i in range(N_D):
        # Gather sufficient statistics
        ###m = np.zeros(N_K)
        ###for k in range(N_K):
        ###    m[k] = np.sum(Z[i] == k)
        
        m = np.array( [np.sum(Z[i] == k) for k in range(N_K)] )
        # Resample doc topic dist.
        Pi[i, :] = np.random.dirichlet(alpha + m)
        
    #Updating B
    for k in range(N_K):
        #print(k)
        n = np.zeros(N_W) 
    
        #Gather statistics       
        for v in range(N_W):
            for doc_number in range(N_D):
                n[v] = len([ x for x in np.where(X_number[doc_number] == v) if Z[doc_number][x] ==k ])
                ###for doc_length in range(len(Z[doc_number])):
                ###    n[v] += (X_number[doc_number][ doc_length]==v) and (Z[doc_number][doc_length] ==k)
        
        # Resample word topic distribution
        B[k,:] = np.random.dirichlet(gamma+n)
    #progress check
    if (iterations-1)%10==0:
        print("More than {0} % is completed! Rate = {1}".format(100*iterations/T, (time()-t0)/iterations))
print('LDA is complete! Total time = {0}'.format(time()-t0))

Starting the big loop...
More than 0.5 % is completed! Rate = 15.583176136016846
More than 5.5 % is completed! Rate = 8.467772375453603
More than 10.5 % is completed! Rate = 8.127799102238246
More than 15.5 % is completed! Rate = 7.995456357156077
More than 20.5 % is completed! Rate = 7.928807020187378
More than 25.5 % is completed! Rate = 7.885160441492118
More than 30.5 % is completed! Rate = 7.858097189762553
More than 35.5 % is completed! Rate = 7.840304519089175
More than 40.5 % is completed! Rate = 7.810828244244611
More than 45.5 % is completed! Rate = 7.789841837935395
More than 50.5 % is completed! Rate = 7.773011214662306
More than 55.5 % is completed! Rate = 7.759479709573694
More than 60.5 % is completed! Rate = 7.74611644705465
More than 65.5 % is completed! Rate = 7.735399602933694
More than 70.5 % is completed! Rate = 7.7316522175538624
More than 75.5 % is completed! Rate = 7.724011069102003
More than 80.5 % is completed! Rate = 7.721369192466973
More than 85.5 % is comp

In [11]:
Pi

array([[  1.25163001e-01,   7.45902832e-03,   2.18166674e-01,
          9.39012946e-02,   2.03020445e-01,   3.30213475e-02,
          9.74024380e-02,   1.10195251e-01,   4.21572959e-02,
          6.95132240e-02],
       [  1.25730488e-01,   8.62988484e-02,   1.07052894e-01,
          4.43372664e-02,   5.75517753e-02,   2.32861093e-01,
          1.49164860e-02,   1.10148593e-01,   4.95088245e-02,
          1.71593732e-01],
       [  6.55129902e-02,   3.31137958e-02,   1.55299114e-01,
          3.82462844e-03,   1.40547186e-01,   1.42811152e-01,
          1.93331860e-01,   2.09379986e-01,   1.40927725e-03,
          5.47700114e-02],
       [  2.30510511e-02,   1.53281808e-01,   5.35784829e-03,
          1.13053684e-01,   1.10534329e-01,   1.35126979e-02,
          2.86771815e-01,   6.94185105e-02,   1.84845773e-01,
          4.01724833e-02],
       [  9.02751446e-02,   2.20174004e-01,   2.27261182e-02,
          6.73949588e-02,   9.51389977e-02,   1.89224407e-01,
          1.48331274e-02

In [None]:
print('LDA is complete! Total time = {0}'.format(time()-t0))

In [None]:
# save it!
LDAdistr = Pi
#np.save('100New_LDAdistr.npy',LDAdistr)

In [6]:
# Clustering #
ClustersOUR = { i:[] for i in range(N_Topics)}# initialize the dictictionary of clusters
for i in range(len(DocList)):
    #distr = [ x[1] for x in Pi[i]]
    # find the argmax{distr} - ATTENTION: ties ???
    label = np.argmax(Pi[i])
    ClustersOUR[label].append(i)
    
# Evaluate
t = EvalClustering(Distances, ClustersOUR)

Mean Within-Cluster distance = 5.943.
Mean Inter-Cluster distance = 7.034.


### 5. LDA and Improved Clustering

In [None]:
# create doc-word list of lists
"""
corpus=np.unique(np.concatenate(DocList))
TDM = np.zeros((len(common_dictionary),N_D)) # we start by a full matrix and then transform it
for doc in range(N_D):
    temp = np.unique( DocList[doc] ) # get the different words on this document
    #temp2 = np.zeros((N_V,1))
    for i in range( len(temp) ):
        word = temp[i]
        count = len([ x for x in DocList[doc] if x == word])
        # we must get the index of this word in the (total) corpus
        TDM[ np.where(corpus == word) , doc] = count

# sanity check
for doc in range(N_D):
    if sum(TDM[:,doc])!=len(DocList[doc]):
        print("Doc-{0} has a problem!".format(doc))
  
TDM = np.transpose(TDM)
"""
#print("n_samples: %d, n_features: %d" % TDM.shape)
#LDA_SKL = LatentDirichletAllocation(n_topics = 100, max_iter=50, random_state=1)
#LDA_SKL.fit( TDM )
# get doc-topic distributions
#LDA_SKLvectors = LDA_SKL.transform(TDM)
km = KMeans(n_clusters=3, init='k-means++', max_iter=100, n_init=1, random_state=1)
print("Clustering LDA data with %s" % km)
km.fit(LDA_SKLvectors)

# create the clusters
ClustersXX = { i:[] for i in range(N_Topics)}
LabelsXX = list(km.labels_)
for i in range(len(DocList)):
    ClustersXX[LabelsXX[i]].append(i)
    
# Evaluate
t = EvalClustering(Distances, ClustersXX)

### 6. Clustering with random assigning

In [None]:
ClustersRAND = { i:[] for i in range(N_Topics)}# initialize the dictictionary of clusters
for i in range(len(DocList)):
    #distr = [ x[1] for x in Pi[i]]
    # find the argmax{distr} - ATTENTION: ties ???
    label = np.random.randint(N_K)
    ClustersRAND[label].append(i)
    
# Evaluate
t = EvalClustering(Distances, ClustersRAND)