In [54]:
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans
from scipy.io import loadmat
import numpy as np
import math
import time

In [59]:
def load_data(mat_name):
    data_set = loadmat(mat_name)
    x = data_set['fea']
    y_true = data_set['gnd'].flatten()
    return x, y_true

def filter_data(x, y_true, top_or_last, count): #top: True , last: False
    classes = np.unique(y_true)
    classes_count={i:np.sum(y_true==i) for i in classes}

    if top_or_last:
        classes_ind = np.argpartition(list(classes_count.values()), -count)[-count:]
    else:
        classes_ind = np.argpartition(list(classes_count.values()), +count)[:+count]
    classes = [list(classes_count.keys())[i] for i in classes_ind]

    x, y_true = x[np.where(np.isin(y_true, classes))], y_true[np.where(np.isin(y_true, classes))]
    
    return x, y_true, classes

In [63]:
def calculate_input_class(classes, model_labels, y_true):
    input_class = {i:[] for i in classes}
    cluster_class = {i:[] for i in classes}
    
    for predicted_y, group in zip(model_labels, y_true):
        input_class[group].append(predicted_y)
        cluster_class[predicted_y].append(group)
        
    return input_class, cluster_class

def calculate_purity_and_entropy(input_class, cluster_class, classes, n, m):
    purity = 0
    for i in classes:
        maxn = 0
        for j in classes:
            nij = np.sum(input_class[j]==i)
            maxn = max(maxn, nij)
        purity = purity+maxn/n
    
    entropy = 0
    for i in classes:
        for j in classes:
            nij = np.sum(input_class[j]==i)
            ni = len(
                [i])
            entropy = entropy + (nij*math.log((nij+1)/(ni+1)))

    entropy = entropy*(-1/(n*math.log(m)))
        
    return purity, entropy

In [61]:
def run(case, title):
    print('case {} ({}) has called.'.format(case, title))
    if case==1:
        x, y_true = load_data("Reuters21578.mat")
        x, y_true, classes = filter_data(x, y_true, True, 20)
    elif case==2:
        x, y_true = load_data("Reuters21578.mat")
        x, y_true, classes = filter_data(x, y_true, True, 10)
    elif case==3:
        x, y_true = load_data("TDT2_all.mat")
        x, y_true, classes = filter_data(x, y_true, True, 30)
        x, y_true, classes = filter_data(x, y_true, True, 10)
    elif case==4:
        x, y_true = load_data("TDT2_all.mat")
        x, y_true, classes = filter_data(x, y_true, True, 30)
        x, y_true, classes = filter_data(x, y_true, False, 10)
    print('case {}: data loaded!'.format(case))
    
    start_time = time.time()
    print('case {}: KMeans started...'.format(case))
    model = KMeans(n_clusters=len(classes))
    model.fit(x)
    model_lables = np.array(classes)[model.labels_]
    print('case {}: KMeans ended in {:.2f} seconds!'.format(case, time.time()-start_time))
    
    input_class, cluster_class = calculate_input_class(classes, model_lables, y_true)
    purity, entropy = calculate_purity_and_entropy(input_class, cluster_class, classes, x.shape[0], len(classes))
    print('case {}: KMeans purity = {:.4}  KMeans entropy = {:.4f}.'.format(case,purity, entropy))
    
    start_time = time.time()
    print('case {}: NMF started...'.format(case))
    model = NMF(n_components =len(classes), init='nndsvda')
    W = model.fit_transform(x)
    model_lables_order = np.argmax(W, axis=1)
    model_lables = np.array(classes)[model_lables_order]
    print('case {}: NMF ended in {:.2f} seconds!'.format(case, time.time()-start_time))
        
    input_class, cluster_class = calculate_input_class(classes, model_lables, y_true)
    purity, entropy = calculate_purity_and_entropy(input_class, cluster_class, classes, x.shape[0], len(classes))
    print('case {}: NMF purity = {:.4f}  NMF entropy = {:.4f}. \n'.format(case,purity, entropy))
    

In [62]:
run(1, 'Reuters-Top10')
run(2, 'Reuters-Top20')
run(3, 'TDT2-Top10')
run(4, 'TDT2-Last10')

case 1 (Reuters-Top10) has called.
case 1: data loaded!
case 1: KMeans started...
case 1: KMeans ended in 22.51 seconds!
case 1: KMeans purity = 0.6313  KMeans entropy = 0.3811.
case 1: NMF started...




case 1: NMF ended in 5.00 seconds!
case 1: NMF purity = 0.7642  NMF entropy = 0.2499. 

case 2 (Reuters-Top20) has called.
case 2: data loaded!
case 2: KMeans started...
case 2: KMeans ended in 12.71 seconds!
case 2: KMeans purity = 0.7261  KMeans entropy = 0.3943.
case 2: NMF started...
case 2: NMF ended in 1.26 seconds!
case 2: NMF purity = 0.7863  NMF entropy = 0.3015. 

case 3 (TDT2-Top10) has called.
case 3: data loaded!
case 3: KMeans started...
case 3: KMeans ended in 14.40 seconds!
case 3: KMeans purity = 0.4843  KMeans entropy = 0.6152.
case 3: NMF started...
case 3: NMF ended in 1.64 seconds!
case 3: NMF purity = 0.8332  NMF entropy = 0.2466. 

case 4 (TDT2-Last10) has called.
case 4: data loaded!
case 4: KMeans started...
case 4: KMeans ended in 0.74 seconds!
case 4: KMeans purity = 0.415  KMeans entropy = 0.6529.
case 4: NMF started...
case 4: NMF ended in 0.26 seconds!
case 4: NMF purity = 0.8178  NMF entropy = 0.2210. 

