                                           **K-means Clustering** 

---



# Part 1

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import cv2

%matplotlib inline

from sklearn.preprocessing import MinMaxScaler

from google.colab.patches import cv2_imshow
from keras.datasets import cifar10
from sklearn.metrics import silhouette_score
! pip install validclust
from validclust import dunn
from sklearn.metrics import pairwise_distances


Collecting validclust
  Downloading validclust-0.1.1-py2.py3-none-any.whl (8.1 kB)
Installing collected packages: validclust
Successfully installed validclust-0.1.1


In [None]:
# Load the CIFAR-10 dataset. `cifar10.load_data()` returns two tuples:
# (train images, train labels) and (test images, test labels).
# The Part 1 clustering cells work on the test images (`testX`).

(trainX, trainy), (testX, testy) = cifar10.load_data()

In [None]:
# Preprocess the CIFAR-10 test images for clustering:
#   1. convert each colour image to grayscale using cv2,
#   2. normalize the pixel values to the [0, 1] range,
#   3. flatten each image, i.e. reshape (10000, 32, 32) -> (10000, 1024).

# keras' CIFAR-10 loader yields RGB images, so use the RGB (not BGR) conversion;
# otherwise the red and blue luminance weights are swapped.
testX = np.array([cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) for image in testX])

# Pixels are uint8 values in 0..255.
testX = testX / 255

# KmeansClustering.predict unpacks X.shape into (points, features), so the
# images must be flattened to a 2-D array before clustering.
testX = testX.reshape(len(testX), -1)






In [None]:
#Kmeans Clustering algorithm

# Steps performed in the algorithm:
"""
1) Take random points from the data, one per cluster, as the initial cluster centroids.
2) Find the closest centroid for each point and assign the point to that centroid's cluster.
3) Take the mean of each cluster as its new centroid.
4) Repeat steps 2 and 3 until the difference between the new and old centroids is zero or the epochs are complete.
"""
#function for euclidean distance between two data points 
def euclidean_distance(d1, d2):
    """Return the Euclidean (L2) distance between data points `d1` and `d2`.

    The element-wise difference `d1 - d2` is squared, summed over every
    element and square-rooted.
    """
    delta = d1 - d2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)


class KmeansClustering:
    """K-means clustering (Lloyd's algorithm) with random-point initialisation.

    Parameters
    ----------
    no_of_cluster : int
        Number of clusters (and centroids) to form.
    epochs : int
        Maximum number of assign/update iterations.
    random_state : int or None, optional
        Seed for the initial centroid draw. ``None`` (the default) draws from
        NumPy's global random state, as before.

    Attributes
    ----------
    clusters : list of list of int
        For each cluster, the row indices of the points assigned to it.
    centroids : numpy.ndarray of shape (no_of_cluster, no_of_features)
        Cluster centres after ``predict`` (an empty list before the first call).
    """

    def __init__(self, no_of_cluster, epochs, random_state=None):
        self.no_of_cluster = no_of_cluster
        self.epochs = epochs
        self.random_state = random_state

        # Indices of the points in each cluster
        self.clusters = [[] for _ in range(self.no_of_cluster)]
        # Cluster centres
        self.centroids = []

    def predict(self, X):
        """Cluster the rows of `X` and return one cluster index per row.

        Parameters
        ----------
        X : numpy.ndarray of shape (no_of_points, no_of_features)

        Returns
        -------
        numpy.ndarray of shape (no_of_points,)
            Float array holding the index of the cluster of each row.

        Raises
        ------
        ValueError
            If `X` is not 2-D, or if `no_of_cluster` exceeds the number of rows
            (raised by the sampling without replacement).
        """
        self.X = X
        self.no_of_points, self.no_of_features = X.shape

        # Initialise the centroids with distinct, randomly chosen data points.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        random_points = rng.choice(self.no_of_points, self.no_of_cluster, replace=False)
        # Rebuild the state from scratch so that predict() can be called repeatedly;
        # float centroids also avoid integer wrap-around for integer-typed X.
        self.centroids = np.array(self.X[random_points], dtype=float)
        self.clusters = [[] for _ in range(self.no_of_cluster)]

        for _ in range(self.epochs):
            # Assign every point to its closest centroid.
            self.clusters = self.cluster_create(self.centroids)

            # Move each centroid to the mean of the points assigned to it.
            old_centroids = self.centroids
            centroids = old_centroids.copy()
            for i, cluster in enumerate(self.clusters):
                # An empty cluster keeps its previous centroid; taking the mean
                # of no points would yield NaN and poison later assignments.
                if cluster:
                    centroids[i] = np.mean(self.X[cluster], axis=0)
            self.centroids = centroids

            # Stop once no centroid moved (old centroid == new centroid).
            if np.array_equal(old_centroids, self.centroids):
                print("Cluster centroids have converged")
                break

        # Label every sample with the index of its cluster.
        labels = np.empty(self.no_of_points)
        for i, cluster in enumerate(self.clusters):
            if cluster:
                labels[cluster] = i
        return labels

    def cluster_create(self, centroids):
        """Assign each point of `self.X` to its closest centroid.

        Returns a list with one entry per cluster, each holding the (ascending)
        row indices of the points whose nearest centroid is that cluster's.
        Ties go to the centroid with the lowest index.
        """
        # One vectorised pass per centroid -> distance matrix of shape (k, n).
        distances = np.stack([
            np.sqrt(((self.X - centroid) ** 2).sum(axis=1))
            for centroid in centroids
        ])
        closest_index = np.argmin(distances, axis=0)
        return [
            np.flatnonzero(closest_index == i).tolist()
            for i in range(self.no_of_cluster)
        ]
    





In [None]:
# Build the custom k-means model: 10 clusters, at most 100 assign/update iterations.
model=KmeansClustering(no_of_cluster=10,epochs=100)


In [None]:
# predict() draws its initial centroids from NumPy's global random state, so
# seed it first; otherwise the labels (and the scores computed from them)
# change on every run.
np.random.seed(42)
labels=model.predict(testX)

Cluster centroids have converged


In [None]:
# Evaluate the clustering with the Dunn index and the silhouette score (ASC).

# dunn() is given the pairwise distance matrix of the samples.
dist = pairwise_distances(testX)
dunn_value = dunn(dist, labels)
print("Dunn score is: ", dunn_value)

# silhouette_score() is given the samples themselves.
asc_value = silhouette_score(testX, labels)
print("ASC score is: ", asc_value)

Dunn score is:  0.0902795391612921
ASC score is:  0.05360154438360176


# Part 2

Steps followed:

1. use autoencoders to fit data
2. use the encoder part to generate a sparse dataset
3. use k-means to generate the labels on the sparse dataset and perform ASC (silhouette score) analysis to evaluate the clustering

In [None]:
#loading libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2

%matplotlib inline

from tensorflow import keras
from sklearn.metrics import silhouette_score

from keras.layers import Dense, Input
from tensorflow.keras.optimizers import  Adam
from keras.models import Model
from keras.datasets import cifar10



In [None]:
# Load the CIFAR-10 dataset for Part 2. `cifar10.load_data()` returns two tuples:
# (train images, train labels) and (test images, test labels).
(train_x,train_y),(test_x,test_y)= cifar10.load_data()

In [None]:
# Transform the training images for the autoencoder:
# grayscale -> float32 scaled to [0, 1] -> one flat vector per image.

# keras' CIFAR-10 loader yields RGB images, so use the RGB (not BGR) conversion;
# otherwise the red and blue luminance weights are swapped.
train_x = np.array([cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) for image in train_x])
train_x = train_x.astype('float32') / 255.
# Flatten each 32x32 image to 1024 features without hard-coding the number of images.
train_x = train_x.reshape(len(train_x), -1)

print(train_x.shape)

(50000, 1024)


In [None]:
# Encoder: 1024 input features -> 128 -> 64 units (`encoded` is the 64-unit output)
input_img = keras.Input(shape=(1024,))
enc_hidden = Dense(128, activation='relu')(input_img)
encoded = Dense(64, activation='relu')(enc_hidden)

# Decoder: 64 -> 128 -> 1024 units, starting from `encoded`; the final sigmoid
# keeps every reconstructed value between 0 and 1
dec_first = Dense(64, activation='relu')(encoded)
dec_second = Dense(128, activation='relu')(dec_first)
decoded = Dense(1024, activation='sigmoid')(dec_second)

In [None]:
# Autoencoder model: maps the input image vector to its reconstruction (`decoded`).
autoencoder = keras.Model(input_img, decoded)

# Encoder model: maps the same input to the `encoded` tensor. It is built from
# layer outputs that are also part of the autoencoder graph.
# NOTE(review): this uses `Model` from `keras.models`, while the autoencoder uses
# `keras.Model` from `tensorflow` — confirm both resolve to the same class.
encoder = Model(input_img, encoded)


In [None]:
# Configure training: Adam optimizer with a binary cross-entropy loss.
autoencoder.compile(loss='binary_crossentropy', optimizer='adam')

# Fit the autoencoder; train_x is passed as both input and target.
training_options = dict(epochs=100, batch_size=128, shuffle=True)
autoencoder.fit(train_x, train_x, **training_options)

# Run the training images through the encoder to get the dataset to cluster.
sparse_x = encoder.predict(train_x)



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Performing K-Means on the sparse dataset received

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Fit k-means on sparse_x (the encoder output).
# max_iter=1 with n_init=1 stopped after a single iteration of a single
# initialisation, so the centroids never had a chance to converge. Use
# scikit-learn's documented defaults (n_init=10, max_iter=300) and fix
# random_state so the labels and the score below are reproducible.
model = KMeans(n_clusters=10, n_init=10, max_iter=300, random_state=42)
model.fit(sparse_x)


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=1,
       n_clusters=10, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [None]:
# Silhouette score (ASC) of the fitted k-means labels on the encoded data
asc_value = silhouette_score(sparse_x, model.labels_)
print("ASC score is: ", asc_value)

ASC score is:  0.06722549


In [None]:
import cv2

TypeError: ignored

(1024,)