In [None]:
%matplotlib inline
from __future__ import print_function
import numpy as np
import numpy.linalg as LA
import matplotlib.pyplot as plt
from numpy import matmul as MM

In [None]:
def load_dataset(name='seeds', data_dir='datasets'):
    """Load one of the supported text datasets as NumPy arrays.

    Parameters
    ----------
    name : str
        'seeds' reads ``seeds_dataset.txt`` (whitespace-separated, class
        label in the last column); 'wine' reads ``wine_dataset.txt``
        (comma-separated, class label in the first column).
    data_dir : str
        Directory that contains the dataset files.

    Returns
    -------
    X : ndarray of float64, shape (n_samples, n_features)
        Feature matrix.
    Y : ndarray of uint8, shape (n_samples,)
        Class labels.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported datasets.
    """
    import os

    # name -> (file name, field separator, index of the label column).
    # A separator of None makes str.split() split on runs of whitespace.
    formats = {
        'seeds': ('seeds_dataset.txt', None, -1),
        'wine': ('wine_dataset.txt', ',', 0),
    }
    if name not in formats:
        raise ValueError('unknown dataset %r; expected one of %s'
                         % (name, sorted(formats)))
    filename, sep, label_idx = formats[name]

    X = []
    Y = []
    with open(os.path.join(data_dir, filename)) as ds:
        for line in ds:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing on an empty record.
                continue
            fields = line.split(sep)
            Y.append(int(fields.pop(label_idx)))
            X.append([float(v) for v in fields])
    # np.float was removed in NumPy 1.24; use the explicit float64 dtype.
    X = np.array(X, dtype=np.float64)
    Y = np.array(Y, dtype=np.uint8)
    return X, Y

In [None]:
def normalize_dataset(X):
    """Standardize each column of X to zero mean and unit variance, in place.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Data matrix. It is overwritten with the standardized values.

    Returns
    -------
    ndarray
        The same array object ``X``, now standardized column by column
        using the population standard deviation (``np.std`` default).

    Notes
    -----
    A constant column has zero standard deviation; it is mapped to all
    zeros instead of NaN.
    """
    mu = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # Avoid 0/0 for constant columns: dividing the (all-zero) centred
    # column by 1 leaves it at zero.
    std = np.where(std == 0, 1.0, std)
    # Slice assignment keeps the in-place contract of the original loop.
    X[:] = (X - mu) / std
    return X

In [None]:
def random_V(d):
    """Return a random d x d orthogonal matrix.

    A matrix with entries drawn uniformly from [-1, 1) is QR-decomposed
    and the Q factor is returned.
    """
    samples = np.random.uniform(low=-1, high=1, size=[d, d])
    q_factor, _ = LA.qr(samples, mode='complete')
    return q_factor

In [None]:
def projection_matrices(d, m):
    """Build the pair of complementary d x d selector matrices.

    Returns ``(Pc, Pn)`` as uint8 arrays: ``Pc`` has ones on the first
    ``m`` diagonal entries and zeros elsewhere; ``Pn`` has ones on the
    remaining ``d - m`` diagonal entries, so ``Pc + Pn`` is the identity.
    """
    identity = np.eye(d, dtype=np.uint8)
    clustered = np.zeros([d, d], dtype=np.uint8)
    clustered[:m, :m] = np.eye(m, dtype=np.uint8)
    # XOR with the identity flips exactly the diagonal entries.
    noise = clustered ^ identity
    return clustered, noise

In [None]:
def sub_kmeans(X, k):
    """Cluster X with SubKmeans (k-means in a learned linear subspace).

    The algorithm alternates between (1) assigning points to the nearest
    centroid in the subspace spanned by the first ``m`` columns of ``V``,
    (2) recomputing centroids and per-cluster scatter, and (3) updating
    ``V`` and ``m`` from the eigendecomposition of
    ``sum_i S_i - S_D``.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Data matrix, one sample per row.
    k : int
        Number of clusters. Must not exceed ``n_samples``.

    Returns
    -------
    C : ndarray of int, shape (n_samples,)
        Cluster index of every sample.
    V : ndarray, shape (n_features, n_features)
        Orthogonal matrix whose columns are eigenvectors sorted by
        ascending eigenvalue; a sample ``x`` is rotated as ``V.T @ x``.
    m : int
        Dimensionality of the clustered subspace (at least 1).

    Notes
    -----
    Convergence is only checked every 5th iteration; the loop stops once
    two such checks have seen no change in the assignment. A cluster
    that becomes empty keeps its previous centroid and contributes no
    scatter. Uses the global ``np.random`` state for initialisation.
    """
    n, d = X.shape
    V = random_V(d)
    # Integer division: m is used as a slice bound, and d / 2 is a float
    # on Python 3.
    m = max(d // 2, 1)
    mu_D = np.mean(X, axis=0, keepdims=True)
    S_D = MM((X - mu_D).T, (X - mu_D))
    # Sample without replacement so no two initial centroids are the
    # same row of X.
    mu_is = np.array(X[np.random.choice(n, k, replace=False)], dtype=float)
    itr = 1
    assignment_unchanged = 0
    C = None
    while True:
        Pc, _ = projection_matrices(d, m)
        PcV = MM(Pc.T, V.T)
        # Project samples and centroids: row j of X_proj is PcV @ X[j].
        X_proj = MM(X, PcV.T)
        mu_proj = MM(mu_is, PcV.T)
        sq_diff = np.sum(
            np.square(X_proj[:, None, :] - mu_proj[None, :, :]), axis=-1)
        Cnew = np.argmin(sq_diff, axis=-1)
        if itr % 5 == 0:
            points_changed = int(np.sum(C != Cnew))
            if points_changed == 0:
                assignment_unchanged += 1
            if assignment_unchanged >= 2:
                break
            print('Iter %d: %d points changed' % (itr, points_changed))
        C = Cnew

        # Update centroids and accumulate the within-cluster scatter.
        S_sum = np.zeros([d, d])
        for c_id in range(k):
            members = X[C == c_id]
            if members.shape[0] == 0:
                # Empty cluster: keep the old centroid rather than
                # dividing by zero and producing a NaN centroid.
                continue
            mu_is[c_id] = np.mean(members, axis=0)
            centered = members - mu_is[c_id]
            # Sum of outer products (x - mu)(x - mu)^T over the cluster.
            S_sum += MM(centered.T, centered)

        # The matrix is symmetric, so eigh applies: it returns real
        # eigenvalues in ascending order with eigenvectors as columns.
        Evals, Evecs = LA.eigh(S_sum - S_D)
        V = Evecs
        # Clustered-subspace dimension = number of negative eigenvalues.
        m = max(int(np.sum(Evals < -1e-10)), 1)
        itr += 1
    return C, V, m

In [None]:
# Run SubKmeans 10 times on the normalized wine data and plot each result.
X, Y = load_dataset('wine')
X = normalize_dataset(X)
for i in range(10):
    C, V, m = sub_kmeans(X, 3)
    # sub_kmeans assigns clusters using V.T applied to each sample, so the
    # data must be rotated with V.T (not V) for the plot to show that space.
    # X_rotated has one row per rotated feature and one column per sample.
    X_rotated = MM(V.T, X.T)
    print(m)
    plt.scatter(X_rotated[0], X_rotated[1], c=C)
    plt.show()

In [None]:
# Scatter the first two columns of X, coloured by the cluster labels in C
# (whatever the most recent sub_kmeans call in this kernel produced).
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=C)
plt.show()