In [1]:
import numpy as np
import scipy
from scipy.stats import norm
import math
import random
import csv
import pandas as pd

In [2]:
class mykMeans:

    def __init__(self,k,n,d):
        # k is the number of clusters to form
        self.k = k
        # centroid points
        self.centroid = np.zeros(k*d).reshape(k,d)
        # weights are the how much each feature matters
        self.weights = np.ones(d)
        # d is the dimension of the feature vector
        self.d = d
        # n is the number of the samples we have
        self.n = n
        #cluster is the stored value of the data
        self.cluster = np.zeros(n)

    def showValues(self):
        print("k:",self.k)
        print("n:",self.n)
        print("d:",self.d)

    def weights(self,data):
        for i in range (0,self.d):
            # for now the weights are set as 1
            self.weights[i]=1

    def dis(self,data,centroidIndex):
        distance =0
        #using euclidean distance with weights in this case
        for i in range (0,self.d):
            distance = distance + (self.weights[i]*((data[i]-self.centroid[centroidIndex][i])**2))
        return distance

    def updateClass(self,data):
#         print('n in update class: ',self.n)
        for i in range (0,self.n):
            minDistance = math.inf
#             print("i in updateClass: ",i)
            for j in range (0,self.k):
                tempDist = self.dis(data[i],j)
                if(tempDist<minDistance):
                    self.cluster[i] = j
                    minDistance = tempDist
                    # print('cluster: ',j)
        # self.printAllClasses()

    def updateCentroids(self,data):
        converged = True
        tempCentroid = np.zeros(self.k*self.d).reshape(self.k,self.d)
        tempCnumbers = np.zeros(self.k)
        for i in range(0,self.n):
            clusIndex = (int)(self.cluster[i])
            # print('clusindex for i:',i,clusIndex)
            tempCnumbers[clusIndex]+=1
            tempCentroid[clusIndex]+=data[i]
        for i in range(0,self.k):
            t = (tempCentroid[i]/tempCnumbers[i])==self.centroid[i]
            for j in range(0,self.d):
                if(t[j]==False):
                    converged = False
                    break
            self.centroid[i] = tempCentroid[i]/tempCnumbers[i]
        return converged

    def initializeCentroids(self,data):
        #right now it takes only the first k elements
        for i in range(0,self.k):
            self.centroid[i] = data[random.randint(0,self.n-1)]
    
    def initKMeansPlusPlus(self,data):
        ##KMeans++ implementation
        self.centroid[0] = data[random.randint(0,self.n-1)]
        for i in range(1,self.k):
            distArr = []
            for j in range(self.n):
                mindist = math.inf
                for k in range(i):
                    tempDist = self.dis(data[j],k)
                    if mindist>tempDist:
                        mindist=tempDist
                distArr.append(mindist)
            distArr = np.array(distArr)
            probs = distArr/(np.sum(distArr))
            cumprobs = np.cumsum(probs)
            r = scipy.rand()
            for s,t in enumerate(cumprobs):
                if r<t:
                    self.centroid[i] = data[s]
                    break
            
                
    def classify(self,data,max_iter,kmplusplus=0):
        convergence = False
        if(kmplusplus==0):
            self.initializeCentroids(data)
        else:
            print("KMEANS++")
            self.initKMeansPlusPlus(data)
        p=0
        while((convergence==False) and (p!=max_iter)):
            print(p)
            p+=1
            self.updateClass(data)
            convergence = self.updateCentroids(data)
        print("Classification Done")
#         print("Now printing cluster values:")
#         self.printAllClasses()

    def getClass(self,data):
        minDistance = math.inf
        clusterNumber = -1
        for j in range (0,self.k):
            tempDist = self.dis(data,j)
            if(tempDist<minDistance):
                clusterNumber = j
                minDistance = tempDist
        return clusterNumber
    
    def getClassForAll(self,data):
        n = data.shape[0]
        ansList=[]
        for i in range(n):
            ans = self.getClass(data[i])
            ansList.append(ans)
#             print(ans)
        return ansList
        
    def printAllClasses(self):
        for i in range(0,self.n):
            print(self.cluster[i])
            

In [3]:
class PCA:
    def __init__(self,m):
        self.m = m
        
    def dimensionReduction(self,data):
        # data is of the form n X d , with d dimensions
        self.d = data.shape[1]
        self.n = data.shape[0]
        #data -= np.mean(data,axis=0)
        S = np.cov(data.T)
        w,v = np.linalg.eig(S)
        #w is the array of eigen values and v's are the corresponding eigen vectors
        # for w[i] -> v[:,i] is the corresponding eigen vectors
        tempList = []
        for i in range(self.d):
            tempList.append((w[i],i))
        tempList.sort(reverse=True)
        tempVecList = []
        for i in range(self.m):
            tempVecList.append(v[:,tempList[i][1]])
        finalVecMat = np.array(tempVecList) # m X d
        reducedMat = np.matmul(finalVecMat,data.T) # m X n
        return reducedMat.T # n X m , dimension reduced from d -> m
            

In [4]:
###############################################################################################################################
#                                                    Testing kMeans
###############################################################################################################################

In [5]:
###############################################################################################################################
#  for medical data

In [6]:
with open('Medical_data.csv','r') as csv_file:
    csv_reader = list(csv.reader(csv_file,delimiter=","))
    csv_dicReader = csv.DictReader(csv_file)
    my_data = np.array(csv_reader)
    my_data = my_data[1:,:]
    np.random.shuffle(my_data)
    input_data = np.array(my_data[:,1:],dtype=np.float)
    tag_data = np.array(my_data[:, 0],dtype=np.string_)

In [16]:
obj = mykMeans(3,3000,3)
trainingData = input_data[:,:]
#pass 1 as third parameter for kmeans++
obj.classify(trainingData,100,1)

KMEANS++
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
Classification Done


In [17]:
ansList = obj.getClassForAll(input_data[:,:])

In [19]:
import itertools
permu = list(itertools.permutations([0,1,2]))
minall = math.inf
ansClust = []
c1=2
c2=1
c3=0
for i in range(0,3000):
    if(tag_data[i]==b'HEALTHY'):
        ansClust.append(c1)
    elif(tag_data[i]==b'SURGERY'):
        ansClust.append(c2)
    elif(tag_data[i]==b'MEDICATION'):
        ansClust.append(c3)
for i in range(len(permu)):
#     print(i)
    ansListDup= ansList
    errcount =0
    for j in range(len(ansListDup)):
        ansListDup[j] = permu[i][ansListDup[j]]
#         print('permu: '+str(permu[i]))
        if ansListDup[j]!=ansClust[j]:
            errcount+=1
    minall = min(minall,errcount)
print(minall)

1467


In [10]:
### below is the scikit one===================================================================================================

In [12]:
from sklearn.cluster import KMeans
skmeans =KMeans(n_clusters=3)
skmeans.fit(input_data)
ans = skmeans.labels_
ansList = skmeans.labels_
tl = ans==ansClust
errcount=0
for p in range(3000):
    if tl[p]==False:
        errcount+=1
print(errcount)

1541


In [12]:
###############################################################################################################################
#         Fashion -Mnist dataset
###############################################################################################################################

In [20]:
import os
import sys
scriptpath = "./fashion-mnist/utils"

# Add the directory containing your module to the Python path (wants absolute paths)
sys.path.append(os.path.abspath(scriptpath))
import mnist_reader
X_train, Y_train = mnist_reader.load_mnist('./fashion-mnist/data/fashion', kind='train')
X_test, Y_test = mnist_reader.load_mnist('./fashion-mnist/data/fashion', kind='t10k')

In [87]:
    pcaObj = PCA(10)
    reducedTrain = pcaObj.dimensionReduction(X_train)
    reducedTest = pcaObj.dimensionReduction(X_test)

In [89]:
obj = mykMeans(10,10000,10)
# give 1 as third parameter for kmeans
obj.classify(reducedTest,300,1)

KMEANS++
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
Classification Done


In [78]:
ansList = obj.getClassForAll(reducedTest)

In [79]:
###############################Checking answer for fashion Mnist 
maparr =np.zeros(100).reshape(10,10)
for i in range(10000):
    maparr[int(ansList[i])][Y_test[i]]+=1
maplist= np.zeros(10).reshape(10,1)
for i in range(10):
    maxi = -1*math.inf
    index=-1
    for j in range(10):
        if max(maxi,maparr[i][j])!=maxi:
            maxi=maparr[i][j]
            index=j
    maplist[i]=index
for i in range(10000):
    ansList[i] = maplist[int(ansList[i])]
ansList = np.array(ansList)
ansClust = np.array(Y_test)
errcount=0
for p in range(10000):
    if ansClust[p]!=ansList[p]:
        errcount+=1
print(errcount)

5740


In [72]:
##########################Scikit learn for fashion mnist

In [73]:
skmeans =KMeans(n_clusters=10)
skmeans.fit(reducedTest)
# ans = skmeans.labels_

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [74]:
#############################Checking answer for scikit learn for fashion mnist
ansList = skmeans.labels_
maparr =np.zeros(100).reshape(10,10)
for i in range(10000):
    maparr[int(ansList[i])][Y_test[i]]+=1
maplist= np.zeros(10).reshape(10,1)
for i in range(10):
    maxi = -1*math.inf
    index=-1
    for j in range(10):
        if max(maxi,maparr[i][j])!=maxi:
            maxi=maparr[i][j]
            index=j
    maplist[i]=index
for i in range(10000):
    ansList[i] = maplist[int(ansList[i])]
ansList = np.array(ansList)
ansClust = np.array(Y_test)
errcount=0
for p in range(10000):
    if ansClust[p]!=ansList[p]:
        errcount+=1
print(errcount)

4417
