In [131]:
import numpy as np
from scipy.sparse import csr_matrix, save_npz
#from sklearn.metrics.pairwise import pairwise_distances
import os

FolderToParse = "BoW_100random/"# "BagsOfWords/" # 
DocList = []
totalwords = 0
for document in os.listdir( FolderToParse ):
    # load documents
    FileToLoad = FolderToParse + document
    f = open(FileToLoad,'rb')
    words = f.read().decode('ascii', 'ignore')
    f.close()
    words = words.split()
    DocList.append(words)
    # counter for sanity checks
    totalwords += len(words)
X_words = DocList#[:100]

corpus=np.unique(np.concatenate(X_words))
N_D = len(X_words) # number of documents
N_V = len(corpus)  # number of vocabulary - all available words
N_K = 10 # set the number of topics
print("We have {0} documents and {1} words in total!".format(N_D, N_V))
print("Total number of words found= %d" % totalwords )
def ComputeDistances(D, C1, C2):
    """
        Function that, given two sets of indices C1, C2 and a matrix D with  
        distances calculated for every pair, it computes the average distance.
    """
    S=0
    for i in range(len(C1)):
        for j in range(len(C2)):
            S += D[ C1[i], C2[j] ]
    return S/(len(C1)*len(C2))
     
def EvalClustering(D, Clustering):
    """
        Function that, given a set clusters and a matrix D with distances calculated 
        for every pair of points, evaluates the accuracy of the partition.
        Intra : the average distance for points within one cluster
        Inter : the average distance between points from different clusters.
    """
    N_K = len(Clustering)
    ClusterDist = np.zeros( (N_K,N_K) )
    for c1 in range(N_K):
        if len(Clustering[c1])>0:
            for c2 in range(c1,N_K):
                #first we compute the intra-cluster distance
                if len(Clustering[c2])>0:
                    ClusterDist[c1,c2] = ComputeDistances(D, list(Clustering[c1]), list(Clustering[c2]))
    # evaluate
    intra = np.mean(np.diag(ClusterDist))
    print('Mean Within-Cluster distance = {0:.3f}.'.format(intra))
    inter = np.sum(np.triu(ClusterDist,1))*2/(N_K-1)/N_K
    print('Mean Inter-Cluster distance = {0:.3f}.'.format(inter))
    return intra, inter, ClusterDist

We have 100 documents and 4893 words in total!
Total number of words found= 12557


In [2]:
from time import time
# initialize term - document matrix
#TDM = csr_matrix((N_V,N_D))
TDM = np.zeros((N_V,N_D)) # we start by a full matrix and then transform it
#TDM = [] 
t0 = time()
# create doc-word list of lists
for doc in range(N_D):
    temp = np.unique( X_words[doc] ) # get the different words on this document
    #temp2 = np.zeros((N_V,1))
    for i in range( len(temp) ):
        word = temp[i] 
        count = len([ x for x in X_words[doc] if x == word])
        # we must get the index of this word in the (total) corpus
        TDM[ np.where(corpus == word) , doc] = count
        #temp2[ np.where(corpus == word) ] = count
    #TDM.append(temp2)
    # progress check
    if ((doc+1) % 500) == 0 :
        print('More than {0} documents have been processed! Rate = {1}'.format(doc+1, (time()-t0)/doc))

# sanity check
for doc in range(N_D):
    if sum(TDM[:,doc])!=len(X_words[doc]):
        print("Doc-{0} has a problem!".format(doc))
TDM = csr_matrix(TDM)
print("The Term-Doc matrix is {0:.2f}% dense.".format(csr_matrix.count_nonzero(TDM)/np.prod(TDM.shape)*100))
#print("The Term-Doc matrix is {0:.2f}% dense.".format(np.count_nonzero(TDM)/np.prod(TDM.shape)*100))
# SAVE IT
# scipy.sparse
#scipy.sparse.save_npz('Term_Doc_Matrix_All.npz',TDM)

The Term-Doc matrix is 2.57% dense.


In [124]:
# create tf-idf matrix
TF_IDF = np.zeros((N_V,N_D))
# first compute the frequencies of each word among documents - sum of rows in TDM
IDF = np.transpose(TDM.sum(axis=1))
IDF = np.array( [np.log(N_D/tf) for tf in IDF] )[0][0] # trick to get rid of the (1,N_V) dimension thing
TF_IDF = np.transpose(TDM)*np.diag(IDF)

filetosave = 'TFIDF_'+str(N_D)+'docs.npy'
np.save(filetosave,TF_IDF)
#TF_IDF = csr_matrix(TF_IDF)
#save_npz(filetosave,TF_IDF)

In [127]:
# calculate distances 
Distances = np.zeros((N_D,N_D))
for i in range(N_D):
    for j in range(i+1,N_D):
        # we use the standard euclidean distance == norm-2 for vectors
        Distances[i,j] = np.linalg.norm( csr_matrix.todense(TDM[:,i] - TDM[:,j]),2 ) #GetDistance

In [136]:
from sklearn.cluster import KMeans

print("n_samples: %d, n_features: %d" % TF_IDF.shape)
km = KMeans(n_clusters=N_K, init='k-means++', max_iter=100, n_init=1, random_state=10)
print("Clustering sparse data with %s" % km)
km.fit(TF_IDF)

# create the clusters
ClustersKM = { i:[] for i in range(N_K)}
LabelsKM = list(km.labels_)
for i in range(len(DocList)):
    ClustersKM[LabelsKM[i]].append(i)
    
# Evaluate
t = EvalClustering(Distances, ClustersKM)

n_samples: 100, n_features: 4893
Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=10, n_init=1, n_jobs=None, precompute_distances='auto',
    random_state=10, tol=0.0001, verbose=0)
Mean Within-Cluster distance = 1.078.
Mean Inter-Cluster distance = 15.525.


In [144]:
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD

# Dimensionality reduction
svd = TruncatedSVD(50)
X = svd.fit_transform(TF_IDF)
print("n_samples: %d, n_features: %d" % X.shape)
km = KMeans(n_clusters=N_K, init='k-means++', max_iter=100, n_init=1, random_state=10)
print("Clustering sparse data with %s" % km)
km.fit(X)

# create the clusters
ClustersKM = { i:[] for i in range(N_K)}
LabelsKM = list(km.labels_)
for i in range(len(DocList)):
    ClustersKM[LabelsKM[i]].append(i)
    
# Evaluate
t = EvalClustering(Distances, ClustersKM)

n_samples: 100, n_features: 50
Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=10, n_init=1, n_jobs=None, precompute_distances='auto',
    random_state=10, tol=0.0001, verbose=0)
Mean Within-Cluster distance = 0.562.
Mean Inter-Cluster distance = 18.173.


In [150]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# we need the raw texts as input for this method
FolderToParse = "BoW_100random/"#"Data_Part/" #pagecontent_as_text/"
RawTexts = []; N=0
Subset=100
N_Topics=10
for document in os.listdir( FolderToParse ):
    # load document
    FileToLoad = FolderToParse + document
    # Load text file of wikipedia entry on bird species
    f = open(FileToLoad,'rb')
    # we ignore non-printable (strange) letters and symbols !
    text = f.read().decode('ascii', 'ignore')
    f.close()
    RawTexts.append(text)
RawTexts = RawTexts[:Subset] # just in case we need a subset of the dataset
    
vectorizer = TfidfVectorizer(max_df=0.3, max_features=60000, min_df=1, stop_words='english', use_idf=True)
X = vectorizer.fit_transform(RawTexts)

#X = TF_IDF
print("n_samples: %d, n_features: %d" % X.shape)
LDA_SKL = LatentDirichletAllocation(n_topics = N_Topics, max_iter=100, random_state=20)
LDA_SKL.fit(X)
print("Clustering sparse data with %s" % LDA_SKL)
# get doc-topic distributions
LDA_SKLvectors = LDA_SKL.transform(X)

# Clustering - Black and white approach, as before
ClustersSKL = { i:[] for i in range(N_Topics)}
LabelsSKL = []
for i in range(len(DocList)):
    distr = list(LDA_SKLvectors[i])
    # find the argmax{distr} - ATTENTION: ties ???
    label = distr.index(max(distr))
    ClustersSKL[label].append(i)
    LabelsSKL.append( label )
    
# Evaluate
t = EvalClustering(Distances, ClustersSKL)

n_samples: 100, n_features: 4746




Clustering sparse data with LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=100, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=10, perp_tol=0.1,
             random_state=20, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)
Mean Within-Cluster distance = 5.966.
Mean Inter-Cluster distance = 7.173.


In [132]:
ClustersRAND = { i:[] for i in range(N_K)}# initialize the dictictionary of clusters
for i in range(len(DocList)):
    #distr = [ x[1] for x in Pi[i]]
    # find the argmax{distr} - ATTENTION: ties ???
    label = np.random.randint(N_K)
    ClustersRAND[label].append(i)
    
# Evaluate
t = EvalClustering(Distances, ClustersRAND)

Mean Within-Cluster distance = 6.175.
Mean Inter-Cluster distance = 6.522.


In [11]:
#save
#from scipy.sparse import save_npz
np.save('100New_Doc_Distances.npy',Distances)

In [130]:
# Now compare the distances for different clusters....
ClusterDist = np.zeros( (N_K,N_K))
for c1 in range(N_K):
    if len(Clusters[c1])>0:
        for c2 in range(c1,N_K):
            #first we compute the intra-cluster distance
            if len(Clusters[c2])>0:
                ClusterDist[c1,c2] = ComputeDistances(Distances, list(Clusters[c1]), list(Clusters[c2]))

def ComputeDistances(D, C1, C2):
    """
        Function that, given two sets of indices C1, C2 and a matrix D with  
        distances calculated for every pair, it computes the average distance.
    """
    S=0
    for i in range(len(C1)):
        for j in range(len(C2)):
            S += S + D[ C1[i], C2[j] ]
    return S/(len(C1)*len(C2))

NameError: name 'Clusters' is not defined

In [11]:
TDM = np.load("Term_Doc_Matrix_All.npy")
type(TDM)

numpy.ndarray

In [22]:
TDM.sum()

<62959x10769 sparse matrix of type '<class 'numpy.float64'>'
	with 1301308 stored elements in Compressed Sparse Row format>

In [18]:
print("The Term-Doc matrix is {0:.2f}% dense.".format(1301308/(62959*10769)*100))

The Term-Doc matrix is 0.19% dense.


In [None]:
scipy.sparse.save_npz('Doc_Doc_Distances_All.npz',Distances)

In [35]:
with open('list_of_species.txt', encoding='utf-8', errors='ignore') as f:
    Names = f.readlines()
Names = [x.strip() for x in Names]
Names_100New = []
for document in os.listdir( "BoW_100random/" ):
    temp = Names[ int(document[:-8])-1 ]
    #print( temp )
    Names_100New.append( temp )

White-rimmed brush finch
White-throated jacamar
Willard's sooty boubou
Yellow-bellied tit
Yellow-eyed starling
Black-faced dacnis
Black-crowned tchagra
Ahanta francolin
Black-hooded sierra finch
Black-tipped cotinga
Blue-black kingfisher
Blue-winged teal
Brasília tapaculo
Brown sicklebill
Brush cuckoo
Bush blackcap
Abbott's babbler
Caquetá seedeater
Cherry-throated tanager
Andaman masked owl
Chestnut-hooded laughingthrush
Chubut steamer duck
Cocoi heron
Common ringed plover
Crested barbet
Cuban gnatcatcher
Dieffenbach's rail
Dusky-faced tanager
Elegant crested tinamou
European stonechat
Arctic tern
Flores green pigeon
Franklin's gull
Gillett's lark
Golden-naped woodpecker
Great parrotbill
Green racket-tail
Grey heron
Grey-collared oriole
Grey-throated tit-flycatcher
Harwood's francolin
Australasian gannet
Holub's golden weaver
Ihering's antwren
Jacobin cuckoo
Kadavu fantail
Labrador duck
Lemon-bellied white-eye
Line-fronted canastero
Long-billed white-eye
MacGregor's honeyeater
Malayan

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# we need the raw texts as input for this method
FolderToParse = "BoW_100random/"#"Data_Part/" #pagecontent_as_text/"
RawTexts = []; N=0
for document in os.listdir( FolderToParse ):
    # load document
    FileToLoad = FolderToParse + document
    # Load text file of wikipedia entry on bird species
    f = open(FileToLoad,'rb')
    # we ignore non-printable (strange) letters and symbols !
    text = f.read().decode('ascii', 'ignore')
    f.close()
    RawTexts.append(text)
RawTexts = RawTexts[:Subset] # just in case we need a subset of the dataset
    
vectorizer = TfidfVectorizer(max_df=0.3, max_features=60000, min_df=1, stop_words='english', use_idf=True)
X = vectorizer.fit_transform(RawTexts)
print("n_samples: %d, n_features: %d" % X.shape)
LDA_SKL = LatentDirichletAllocation(n_topics = N_Topics, max_iter=100, random_state=20)
LDA_SKL.fit(X)
print("Clustering sparse data with %s" % LDA_SKL)
# get doc-topic distributions
LDA_SKLvectors = LDA_SKL.transform(X)

# Clustering - Black and white approach, as before
ClustersSKL = { i:[] for i in range(N_Topics)}
LabelsSKL = []
for i in range(len(DocList)):
    distr = list(LDA_SKLvectors[i])
    # find the argmax{distr} - ATTENTION: ties ???
    label = distr.index(max(distr))
    ClustersSKL[label].append(i)
    LabelsSKL.append( label )
    
# Evaluate
t = EvalClustering(Distances, ClustersSKL)