In [1]:
import numpy as np
from scipy.sparse import csr_matrix, save_npz
#from sklearn.metrics.pairwise import pairwise_distances
import os

FolderToParse = "BagsOfWords/"
DocList = []
for document in os.listdir( FolderToParse ):
    # load documents
    FileToLoad = FolderToParse + document
    f = open(FileToLoad,'rb')
    words = f.read().decode('ascii', 'ignore')
    f.close()
    words = words.split()
    DocList.append(words)
X_words = DocList#[:100]

corpus=np.unique(np.concatenate(X_words))
N_D = len(X_words) # number of documents
N_V = len(corpus)  # number of vocabulary - all available words
N_K = 10 # set the number of topics
print("We have {0} documents and {1} words in total!".format(N_D, N_V))

We have 10769 documents and 62959 words in total!


In [31]:
from time import time
# initialize term - document matrix
#TDM = csr_matrix((N_V,N_D))
TDM = np.zeros((N_V,N_D)) # we start by a full matrix and then transform it
#TDM = [] 
t0 = time()
# create doc-word list of lists
for doc in range(N_D):
    temp = np.unique( X_words[doc] ) # get the different words on this document
    #temp2 = np.zeros((N_V,1))
    for i in range( len(temp) ):
        word = temp[i] 
        count = len([ x for x in X_words[doc] if x == word])
        # we must get the index of this word in the (total) corpus
        TDM[ np.where(corpus == word) , doc] = count
        #temp2[ np.where(corpus == word) ] = count
    #TDM.append(temp2)
    # progress check
    if ((doc+1) % 500) == 0 :
        print('More than {0} documents have been processed! Rate = {1}'.format(doc+1, (time()-t0)/doc))

# sanity check
for doc in range(N_D):
    if sum(TDM[:,doc])!=len(X_words[doc]):
        print("Doc-{0} has a problem!".format(doc))
TDM = csr_matrix(TDM)
print("The Term-Doc matrix is {0:.2f}% dense.".format(csr_matrix.count_nonzero(TDM)/np.prod(TDM.shape)*100))
#print("The Term-Doc matrix is {0:.2f}% dense.".format(np.count_nonzero(TDM)/np.prod(TDM.shape)*100))

More than 500 documents have been processed! Rate = 0.10709242782516326
More than 1000 documents have been processed! Rate = 0.10519153530055934
More than 1500 documents have been processed! Rate = 0.1092377464480206
More than 2000 documents have been processed! Rate = 0.11986428573764879
More than 2500 documents have been processed! Rate = 0.12131917853506148
More than 3000 documents have been processed! Rate = 0.12405843629802056
More than 3500 documents have been processed! Rate = 0.13286942493987103
More than 4000 documents have been processed! Rate = 0.13615808775497096
More than 4500 documents have been processed! Rate = 0.13429439096987103
More than 5000 documents have been processed! Rate = 0.13676846191915995
More than 5500 documents have been processed! Rate = 0.13730150293189627
More than 6000 documents have been processed! Rate = 0.13681055792452276
More than 6500 documents have been processed! Rate = 0.13575175905763268
More than 7000 documents have been processed! Rate = 

In [33]:
# SAVE IT
import scipy.sparse
#scipy.sparse.save_npz('Term_Doc_Matrix_All.npz',TDM)

In [None]:
# 1. Black and white clustering - the number of clusters is N_K
# initialize the dictictionary of clusters
Clusters = { i:[] for i in range(N_K)}
ClustersNames = { i:[] for i in range(N_K)} 
Labels = []
for i in range(len(DocList)):
    distr = [ x[1] for x in Pi[i]]
    # find the argmax{distr} - ATTENTION: ties ???
    label = distr.index(max(distr))
    Clusters[label].append(i)
    ClustersNames[label].append(Names[i])
    Labels.append( label )

In [None]:
# calculate distances 
Distances = np.zeros((N_D,N_D))
for i in range(N_D):
    for j in range(i+1,N_D):
        # we use the standard euclidean distance == norm-2 for vectors
        Distances[i,j] = np.linalg.norm( csr_matrix.todense(TDM[:,i] - TDM[:,j]),2 ) #GetDistance
# Now compare the distances for different clusters....
ClusterDist = np.zeros( (N_K,N_K))
for c1 in range(N_K):
    if len(Clusters[c1])>0:
        for c2 in range(c1,N_K):
            #first we compute the intra-cluster distance
            if len(Clusters[c2])>0:
                ClusterDist[c1,c2] = ComputeDistances(Distances, list(Clusters[c1]), list(Clusters[c2]))

def ComputeDistances(D, C1, C2):
    """
        Function that, given two sets of indices C1, C2 and a matrix D with  
        distances calculated for every pair, it computes the average distance.
    """
    S=0
    for i in range(len(C1)):
        for j in range(len(C2)):
            S += S + D[ C1[i], C2[j] ]
    return S/(len(C1)*len(C2))

In [None]:
scipy.sparse.save_npz('Doc_Doc_Distances_All.npz',Distances)