In [2]:
%matplotlib inline
from __future__ import print_function
import numpy as np
import numpy.linalg as LA
import matplotlib.pyplot as plt
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
import time
from numpy import matmul as MM
from sklearn.metrics import normalized_mutual_info_score as nmi
import pycuda.driver as drv
import time


# Module-level CUDA event pair used as a stopwatch; random_V() records its
# timing with these two objects, so this cell must run before random_V() is called.
start = drv.Event()
end = drv.Event()

In [3]:
def load_dataset(name='seeds'):
    """Load one of the text datasets under ``datasets/`` as NumPy arrays.

    Supported names: 'seeds', 'wine', 'soy', 'hand', 'olive', 'symbol',
    'plane', 'mnist'.  Each file holds one sample per line; the label sits in
    either the first or the last field, depending on the dataset.

    Parameters
    ----------
    name : str
        Key of the dataset to load.

    Returns
    -------
    X : numpy.ndarray of float64
        Feature rows, one per non-blank line of the file.
    Y : numpy.ndarray of uint8
        Labels.  For 'hand', 'olive', 'symbol', 'plane' and 'mnist' the labels
        are shifted down by one when their minimum is not already 0; 'seeds'
        and 'wine' labels are returned as stored; 'soy' uses the final
        character of the label field minus one.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported keys.
    """
    # name -> (path, field separator, label is first field?, label parser, rebase labels to 0?)
    # A separator of None means "split on any whitespace".
    specs = {
        'seeds': ('datasets/seeds_dataset.txt', None, False, None, False),
        'wine': ('datasets/wine_dataset.txt', ',', True, None, False),
        'soy': ('datasets/soybean_dataset.txt', ',', False,
                lambda label: int(label[-1]) - 1, False),
        'hand': ('datasets/handjob_dataset.txt', ',', True, None, True),
        'olive': ('datasets/olive_dataset.txt', ',', True, None, True),
        'symbol': ('datasets/symbol_dataset.txt', ',', True, None, True),
        'plane': ('datasets/plane_dataset.txt', ',', True, None, True),
        'mnist': ('datasets/mnist_dataset.txt', ',', True,
                  lambda label: int(float(label)), True),
    }
    if name not in specs:
        raise ValueError(
            'unknown dataset %r; expected one of %s' % (name, sorted(specs))
        )
    path, sep, label_first, parse_label, rebase = specs[name]

    features = []
    labels = []
    with open(path) as ds:
        for line in ds:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing empty line at end of file).
                continue
            fields = stripped.split(sep)
            if label_first:
                label, row = fields[0], fields[1:]
            else:
                label, row = fields[-1], fields[:-1]
            features.append(row)
            labels.append(parse_label(label) if parse_label is not None else label)

    # np.float was only an alias of the builtin float and is gone since
    # NumPy 1.24; np.float64 is the same dtype spelled explicitly.
    X = np.array(features, dtype=np.float64)
    Y = np.array(labels, dtype=np.uint8)
    if rebase:
        Y = Y if np.min(Y) == 0 else Y - 1
    return X, Y

In [4]:
def normalize_dataset(X):
    """Standardise every column of ``X`` in place and return ``X``.

    Each column has its mean subtracted and is divided by its standard
    deviation plus 1e-10 (the small constant keeps constant columns from
    dividing by zero).  The input array itself is overwritten.
    """
    _, n_features = X.shape
    eps = 1e-10
    for col in range(n_features):
        column = X[:, col]
        centered = column - np.mean(column)
        scale = np.std(column) + eps
        X[:, col] = centered / scale
    return X

In [45]:
def random_V(d):
    """Build a random ``d x d`` orthogonal matrix and time the QR step.

    A matrix of uniform(0, 1) samples is factorised with a complete QR
    decomposition and its Q factor is returned.  The factorisation is
    bracketed by the module-level ``start``/``end`` events.

    Returns
    -------
    (V, QR_time) : the Q factor and the elapsed time between the two events,
        scaled by 1e-3 (PyCUDA reports milliseconds, so this is seconds).
    """
    seed_matrix = np.random.uniform(0, 1, [d, d])

    start.record()
    start.synchronize()
    q_factor, _ = LA.qr(seed_matrix, mode='complete')
    end.record()
    end.synchronize()

    elapsed = start.time_till(end) * 1e-3
    return q_factor, elapsed

In [46]:
def projection_matrices(d, m):
    """Return the ``d x m`` uint8 selector whose top ``m x m`` block is the identity.

    Right-multiplying by this matrix keeps the first ``m`` coordinates and
    drops the remaining ``d - m``.
    """
    selector = np.zeros((d, m), dtype=np.uint8)
    selector[:m, :m] = np.identity(m, dtype=np.uint8)
    return selector

In [61]:
def sub_kmeans(X, k):
    """Cluster the rows of ``X`` into ``k`` groups with SubKmeans, timing each stage.

    The procedure alternates between (a) assigning every point to the nearest
    centroid inside the first ``m`` dimensions of the rotated space, (b)
    recomputing centroids and per-cluster scatter matrices, and (c) updating
    the rotation ``V`` and the subspace size ``m`` from the eigendecomposition
    of ``sum_i S_i - S_D``.  Assignments are compared on every even iteration;
    the loop stops once they were found unchanged twice.

    Parameters
    ----------
    X : numpy.ndarray, shape (n, d)
        Data matrix, one sample per row.
    k : int
        Number of clusters.

    Returns
    -------
    tuple
        ``(C, V, m, QR_time, mu_d_time, S_d_time, argmin_times, mu_Si_times,
        eig_times, itr)`` where ``C`` is the cluster index of every row,
        ``V`` the (d, d) rotation, ``m`` the clustered-subspace dimension,
        the ``*_time`` scalars / ``*_times`` lists are stage timings (event
        milliseconds scaled by 1e-3) and ``itr`` the final iteration counter.

    NOTE(review): the timed sections are host-side NumPy calls; the CUDA
    events only act as a stopwatch here - confirm that is intended.
    """
    n, d = X.shape
    V, QR_time = random_V(d)
    # Floor division: on Python 3 `d/2` is a float and cannot be used as an
    # array dimension in projection_matrices(). Keep at least one dimension.
    m = max(1, d // 2)

    # Algorithm line 5
    #################################################################################
    start = drv.Event()
    end = drv.Event()
    start.record()
    start.synchronize()

    mu_D = np.mean(X, axis=0, keepdims=True)

    end.record()
    end.synchronize()
    mu_d_time = start.time_till(end)*1e-3
    #################################################################################

    # Algorithm line 6
    #################################################################################
    start.record()
    start.synchronize()

    S_D = MM((X - mu_D).T, (X - mu_D))

    end.record()
    end.synchronize()
    S_d_time = start.time_till(end)*1e-3
    #################################################################################

    argmin_times = []
    mu_Si_times = []
    eig_times = []

    # Draw distinct rows so two clusters never start on the same centroid
    # (duplicates leave one of them empty); only fall back to sampling with
    # replacement when more clusters than points are requested.
    mu_is = X[np.random.choice(n, k, replace=k > n)]
    itr = 1
    assignment_unchanged = 0
    while True:

        Pc = projection_matrices(d, m)

        # Algorithm line 11
        #################################################################################
        start.record()
        start.synchronize()

        PcV = MM(Pc.T, V.T)[None, :, :] # 1,m,d
        PcVmu_is = MM(PcV, mu_is[:, :, None]) # k,m,1

        X_trans = MM(PcV.squeeze(0), X.T).T # n, m
        Mus_trans = PcVmu_is.squeeze(-1) # k, m
        sq_diff = np.square(X_trans[:, None, :] - Mus_trans[None, :, :]) # n, k, m
        sq_diff = np.sum(sq_diff, axis=-1)

        if itr % 2 == 0:
            # Convergence is only checked on even iterations; the first
            # (odd) iteration guarantees C exists before it is compared.
            Cnew = np.argmin(sq_diff, axis=1)
            points_changed = np.sum(1 - np.equal(C, Cnew).astype(np.uint8))
            if points_changed == 0:
                assignment_unchanged += 1
            if assignment_unchanged >= 2:
                break
            C = Cnew
        else:
            C = np.argmin(sq_diff, axis=1)

        end.record()
        end.synchronize()
        argmin_times.append(start.time_till(end)*1e-3)
        #################################################################################

        # Algorithm line 14
        #################################################################################
        start.record()
        start.synchronize()

        counts = {i: 0 for i in range(k)}
        mu_sums = np.zeros([k, d])
        S_is = np.zeros([k, d, d])
        for i, x in enumerate(X):
            c_id = C[i]
            mu_sums[c_id] += x
            counts[c_id] += 1

        # An empty cluster keeps its previous centroid instead of becoming
        # NaN through a division by zero.
        mu_is = np.array([
            mu_sums[i] / counts[i] if counts[i] else mu_is[i]
            for i in range(k)
        ])

        C_matrics = {i: [] for i in range(k)}
        for i, x in enumerate(X):
            c_id = C[i]
            C_matrics[c_id].append(x - mu_is[c_id])
        for ki in C_matrics:
            # reshape(-1, d) keeps the array 2-D for empty and single-member
            # clusters, so the product below is always (d, d).
            CX_m = np.array(C_matrics[ki]).reshape(-1, d)
            S_is[ki] = MM(CX_m.T, CX_m)

        end.record()
        end.synchronize()
        mu_Si_times.append(start.time_till(end)*1e-3)
        #################################################################################

        # Algorithm line 16
        #################################################################################
        start.record()
        start.synchronize()

        # The matrix is symmetric, so use eigh: it returns real eigenvalues in
        # ascending order with orthonormal eigenvectors, whereas eig may hand
        # back complex, unsorted pairs because of rounding.
        Evals, Evecs = LA.eigh(np.sum(S_is, axis=0) - S_D)

        end.record()
        end.synchronize()
        eig_times.append(start.time_till(end)*1e-3)
        #################################################################################

        # eigh already sorts ascending, so the most negative directions come first.
        V = Evecs
        maxVal = Evals.min()
        if itr > 2:
            # Keep the eigenvalues whose ratio to the smallest one exceeds
            # 1e-8; cast to a plain int so m stays usable as a dimension.
            m = max(1, int(np.sum(Evals / maxVal > 1e-8)))

        itr += 1

    return C, V, m, QR_time, mu_d_time, S_d_time, argmin_times, mu_Si_times, eig_times, itr

In [56]:
def times(QR_time, mu_d_time, S_d_time, argmin_times, mu_Si_times, eig_times, overall_all, itr, m, nmi):
    """Print a summary of the collected runs.

    Reports the mean iteration count, the ``m`` and NMI of the run with the
    highest NMI, the mean of each one-off stage timing, the mean and standard
    deviation of the per-iteration stage timings, and the mean overall time.
    All arguments are sequences gathered across runs; ``nmi`` here is the
    list of scores, not the scoring function.
    """
    print('itr: ', np.mean(itr))
    best_run = np.argmax(nmi)
    print('m: ', m[best_run])
    print('nmi: ', nmi[best_run])

    one_off_stages = (
        ("Orthogonal time gpu: ", QR_time),
        ("mu_d_time gpu: ", mu_d_time),
        ("S_d_time gpu:", S_d_time),
    )
    for label, samples in one_off_stages:
        print(label, np.mean(samples))

    argmin_arr, mu_Si_arr, eig_arr = (
        np.asarray(samples) for samples in (argmin_times, mu_Si_times, eig_times)
    )
    per_iteration_stages = (
        ("Argim_time_avg gpu: ", "Argim_time_std gpu: ", argmin_arr),
        ("mu_Si_time_avg gpu: ", "mu_Si_time_std gpu: ", mu_Si_arr),
        ("eig_times_avg gpu: ", "eig_times_std gpu: ", eig_arr),
    )
    for avg_label, std_label, samples in per_iteration_stages:
        print(avg_label, np.mean(samples))
        print(std_label, np.std(samples))

    print("Total time: ", np.mean(overall_all))

In [67]:
# load_dataset() registers this dataset under the key 'hand'; 'hands' matches
# no dataset, so the unpacking below could never succeed with it.
X, Y = load_dataset('hand')
X = normalize_dataset(X)

# Per-run accumulators handed to times() at the end of the cell.
l_QR_time, l_mu_d_time, l_S_d_time, l_argmin_times, l_mu_Si_times, l_eig_times, l_overall_time = [],[],[],[],[],[], []
l_itr = []
l_nmi, l_m = [], []


for i in range(1):
    print(i)
    # Overall time
    #################################################################################
    start_overall = drv.Event()
    end_overall = drv.Event()
    start_overall.record()
    start_overall.synchronize()

    C, V,  m, QR_time, mu_d_time, S_d_time, argmin_times, mu_Si_times, eig_times, \
    itr = sub_kmeans(X, 10)

    end_overall.record()
    end_overall.synchronize()
    overall_time = start_overall.time_till(end_overall)*1e-3
    #################################################################################

    l_QR_time.append(QR_time)
    l_mu_d_time.append(mu_d_time)
    l_S_d_time.append(S_d_time)
    # Per-iteration timings are pooled across runs rather than nested per run.
    l_argmin_times+=argmin_times
    l_mu_Si_times+=mu_Si_times
    l_eig_times+=eig_times
    l_overall_time.append(overall_time)

    # Rotate the data into the learned basis. The result has shape (d, n):
    # row j holds coordinate j of every sample, which is what the plotting
    # cell further down indexes.
    X_rotated = MM(V.T, X.T)
    acc = nmi(Y, C)
    l_nmi.append(acc)
    l_m.append(m)
    l_itr.append(itr)

times(l_QR_time, l_mu_d_time, l_S_d_time, l_argmin_times, l_mu_Si_times, l_eig_times, l_overall_time, l_itr, l_m, l_nmi)

0
itr:  134.0
m:  9
nmi:  0.41308760441309406
Orthogonal time gpu:  0.09209814453125
mu_d_time gpu:  0.007203231811523438
S_d_time gpu: 0.3998182373046875
Argim_time_avg gpu:  0.15583019927749062
Argim_time_std gpu:  0.13483156629988047
mu_Si_time_avg gpu:  0.30643791256094344
mu_Si_time_std gpu:  0.015315673194303115
eig_times_avg gpu:  2.493979132401316
eig_times_std gpu:  0.05070027096532897
Total time:  394.650875


In [None]:
# Pairwise scatter plots of the leading rotated dimensions, coloured by cluster.
# X_rotated is (d, n), so each row is one rotated coordinate. Cap the count at
# the number of rows available so datasets with fewer than 9 features do not
# raise an IndexError.
n_plot_dims = min(9, X_rotated.shape[0])
for i in range(n_plot_dims):
    for j in range(i+1, n_plot_dims):
        plt.scatter(X_rotated[i], X_rotated[j], c=C)
        plt.xlabel('rotated dimension %d' % i)
        plt.ylabel('rotated dimension %d' % j)
        plt.show()

In [None]:
# Sanity check: display the shape of the rotated data computed in the experiment cell.
X_rotated.shape

In [None]:
# Load a semicolon-separated result file (presumably written by the Scala
# implementation, judging by the variable name - confirm).
# NOTE(review): absolute, machine-specific path; this cell fails on any other
# machine. Consider a path relative to the notebook.
X_scala = np.loadtxt('/home/abdul/Documents/adm-project/SubKmeans/result/sample_result.dat', delimiter=';')

In [None]:
# Display the shape of the loaded result matrix.
X_scala.shape

In [None]:
# Column 0 is taken as the cluster assignment, the remaining columns as the data.
# NOTE(review): this cell is not idempotent - it overwrites X_scala, so running
# it a second time strips another column and turns a feature column into C.
# It also overwrites the C produced by the experiment cell.
C = X_scala[:, 0]
X_scala = X_scala[:, 1:]

In [None]:
# Normalised mutual information between the labels Y and the assignment C currently in scope.
nmi(Y, C)

In [None]:
# Scatter the first two columns of the loaded result, coloured by assignment C.
plt.scatter(X_scala[:, 0], X_scala[:, 1], c=C)
plt.show()

In [None]:
# Display the full assignment vector (NOTE(review): dumps every entry; a slice
# such as C[:10] would keep the output short).
C

In [None]:
# Replace X and Y with the standardised wine dataset; the export cell below writes these.
X, Y = load_dataset('wine')
X = normalize_dataset(X)


In [None]:
# Write X and Y as "label;f1;f2;..." rows, one sample per line, with the
# features formatted to 15 decimal places.
with open('/home/abdul/Documents/adm-project/SubKmeans/datasets/sample.dat', 'w') as out_file:
    for row_idx, features in enumerate(X):
        formatted = ';'.join(['%.15f' % value for value in features])
        out_file.write(str(Y[row_idx]) + ';' + formatted + '\n')

In [None]:
def sub_kmeans_loopy(X, k):
    """Reference SubKmeans implementation written with explicit Python loops.

    Runs a fixed 200 iterations of: assign each point to the nearest centroid
    in the first ``m`` rotated dimensions, recompute centroids and per-cluster
    scatter matrices, then update the rotation ``V`` and subspace size ``m``
    from the eigendecomposition of ``sum_i S_i - S_D``.

    Parameters
    ----------
    X : numpy.ndarray, shape (n, d)
        Data matrix, one sample per row.
    k : int
        Number of clusters.

    Returns
    -------
    (ass, V, m) : the cluster index of every row, the (d, d) rotation and the
        final clustered-subspace dimension.
    """
    n, d = X.shape
    # random_V returns (matrix, elapsed_time); only the matrix is needed here.
    V, _ = random_V(d)
    # Floor division: `d/2` is a float on Python 3 and cannot size an array.
    m = max(1, d // 2)
    mu_D = np.mean(X, axis=0, keepdims=True)
    S_D = np.zeros([d, d])
    for x in X:
        x_minus_muD = (x - mu_D).T  # (d, 1)
        S_D += MM(x_minus_muD, x_minus_muD.T)

    # Distinct starting rows; with replacement only if k exceeds n.
    mu_is = X[np.random.choice(n, k, replace=k > n)]
    itr = 0
    while itr < 200:
        C = {i: [] for i in range(k)}
        # Signed integer assignments: uint8 would wrap the -1 placeholder to
        # 255 and cannot represent cluster ids above 255.
        ass = np.full(n, -1, dtype=np.int64)
        Pc = projection_matrices(d, m)
        trans = MM(Pc.T, V.T)
        # Centroid projections do not depend on the point, so compute them
        # once per iteration instead of once per (point, centroid) pair.
        projected_mus = [MM(trans, mu_i[:, None]) for mu_i in mu_is]
        for k_num, x in enumerate(X):
            projected_x = MM(trans, x[:, None])
            j = -1
            # Infinity instead of a fixed 1e10 so a centroid is always picked,
            # however large the distances are.
            min_dist = np.inf
            for i, projected_mu in enumerate(projected_mus):
                vec_dist = projected_x - projected_mu
                dist = MM(vec_dist.T, vec_dist)[0, 0]
                if dist < min_dist:
                    min_dist = dist
                    j = i
            C[j].append(x)
            ass[k_num] = j

        S = np.zeros([k, d, d])
        for i in range(k):
            if not C[i]:
                # Empty cluster: keep the previous centroid and a zero scatter
                # matrix rather than taking the mean of nothing (NaN).
                continue
            X_in_Ci = np.array(C[i])
            mu_is[i] = np.mean(X_in_Ci, axis=0)
            for x in X_in_Ci:
                x_minus_mu_isi = (x - mu_is[i])[:, None]
                S[i] += MM(x_minus_mu_isi, x_minus_mu_isi.T)
        sum_S = np.sum(S, axis=0)

        # Symmetric matrix: eigh yields real eigenvalues sorted ascending,
        # avoiding the complex/unsorted output eig can produce.
        Evals, Evecs = LA.eigh(sum_S - S_D)
        V = Evecs
        maxVal = Evals.min()
        if itr > 1:
            m = max(1, int(np.sum(Evals / maxVal > 1e-8)))
        itr += 1
    return ass, V, m
        
            

In [None]:
# Fetch MNIST through TensorFlow's contrib dataset helper.
# NOTE(review): this import sits at the bottom of the notebook rather than in
# the import cell, and tf.contrib only exists in TensorFlow 1.x - confirm the
# environment before re-running.
import tensorflow as tf
mnist = tf.contrib.learn.datasets.load_dataset("mnist")

In [None]:
# Put the test labels in column 0 in front of the test images and save them as
# CSV. This writes the 'datasets/mnist_dataset.txt' file that
# load_dataset('mnist') reads, so it has to run before that loader is used.
data = np.concatenate([mnist.test.labels[:, None], mnist.test.images], axis=1)
np.savetxt('datasets/mnist_dataset.txt', data, delimiter=',', fmt='%.8f')