In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import cifar10
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from tensorflow.keras.applications import VGG16

In [2]:
import ssl
import urllib.request

# Bypass SSL certificate verification
# ssl._create_default_https_context = ssl._create_unverified_context

# # Download CIFAR-10 dataset
# url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
# urllib.request.urlretrieve(url, "cifar-10-python.tar.gz")

(X_train, y_train), (X_test, y_test) = cifar10.load_data()

In [3]:
# Preprocess the data (normalize to [0, 1])
x_train = X_train.astype('float32') / 255.
x_test = X_test.astype('float32') / 255.
print(x_train.shape)
print(x_test.shape)
print(x_train)


(50000, 32, 32, 3)
(10000, 32, 32, 3)
[[[[0.23137255 0.24313726 0.24705882]
   [0.16862746 0.18039216 0.1764706 ]
   [0.19607843 0.1882353  0.16862746]
   ...
   [0.61960787 0.5176471  0.42352942]
   [0.59607846 0.49019608 0.4       ]
   [0.5803922  0.4862745  0.40392157]]

  [[0.0627451  0.07843138 0.07843138]
   [0.         0.         0.        ]
   [0.07058824 0.03137255 0.        ]
   ...
   [0.48235294 0.34509805 0.21568628]
   [0.46666667 0.3254902  0.19607843]
   [0.47843137 0.34117648 0.22352941]]

  [[0.09803922 0.09411765 0.08235294]
   [0.0627451  0.02745098 0.        ]
   [0.19215687 0.10588235 0.03137255]
   ...
   [0.4627451  0.32941177 0.19607843]
   [0.47058824 0.32941177 0.19607843]
   [0.42745098 0.28627452 0.16470589]]

  ...

  [[0.8156863  0.6666667  0.3764706 ]
   [0.7882353  0.6        0.13333334]
   [0.7764706  0.6313726  0.10196079]
   ...
   [0.627451   0.52156866 0.27450982]
   [0.21960784 0.12156863 0.02745098]
   [0.20784314 0.13333334 0.07843138]]

  [[0.7

In [4]:
base_model = VGG16(weights='imagenet', include_top=False, input_shape=x_train.shape[1:])

# Extract features from the pre-trained VGG16 model
features_train = base_model.predict(x_train)
features_test = base_model.predict(x_test)


[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 103ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 112ms/step


In [5]:
features_train = features_train.reshape(features_train.shape[0], -1)
print(features_train.shape)
print(features_train)


(50000, 512)
[[0.1017662  0.         0.84405106 ... 0.         0.39529693 0.        ]
 [0.9722147  0.         0.58577275 ... 0.         0.7981688  0.        ]
 [0.3126762  0.         0.22860837 ... 0.28975034 1.1120024  0.        ]
 ...
 [0.23091513 0.         0.118855   ... 0.2488248  0.5913635  0.        ]
 [0.         0.         0.2835029  ... 0.         0.59139776 0.        ]
 [0.         0.         0.7414491  ... 0.         0.98388207 0.        ]]


In [6]:
features_test = features_test.reshape(features_test.shape[0], -1)
print(features_test.shape)
print(features_test)


(10000, 512)
[[0.2865308  0.         1.2196426  ... 0.09188334 0.50850433 0.        ]
 [0.43598855 0.         0.68447256 ... 0.22535783 0.50218326 0.        ]
 [0.29467416 0.         0.         ... 0.31027997 0.62160414 0.        ]
 ...
 [0.         0.         1.1950839  ... 0.18938634 0.32439792 0.        ]
 [0.7446521  0.         1.1791041  ... 0.17728972 0.50125146 0.        ]
 [0.9902093  0.         1.0467237  ... 0.0069061  0.801184   0.        ]]


In [7]:

num_clusters = 10  #10 clusters for CIFAR-10 dataset
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(features_train)

# Use KMeans to predict cluster assignments for test data
cluster_assignments_test = kmeans.predict(features_test)

  super()._check_params_vs_input(X, default_n_init=10)


In [16]:
centroids = kmeans.cluster_centers_
print(centroids)
print(centroids.shape)
# Reconstruct the image from the cluster centroids
reconstructed_images = centroids[cluster_assignments_test]
print(reconstructed_images.shape)
print(cluster_assignments_test.shape)
print(y_test.shape)
y_test=np.ravel(y_test)

[[2.92101890e-01 1.00408215e-09 4.13032919e-01 ... 2.02255398e-01
  7.94649422e-01 2.35741027e-09]
 [1.54785722e-01 7.31385080e-06 3.47467870e-01 ... 3.59863788e-02
  5.67595422e-01 3.92901711e-09]
 [7.76232898e-01 8.87666829e-10 6.60345018e-01 ... 1.00222364e-01
  7.33931065e-01 1.47515791e-04]
 ...
 [4.72731650e-01 2.59247608e-05 6.67954803e-01 ... 2.83946633e-01
  5.93189657e-01 2.00868992e-04]
 [6.50744200e-01 1.16807397e-03 8.28194618e-01 ... 1.41202673e-01
  4.11073864e-01 1.39591470e-03]
 [6.00492537e-01 7.05767889e-10 6.32232666e-01 ... 7.10819662e-02
  5.32743335e-01 1.64557248e-04]]
(10, 512)
(10000, 512)
(10000,)
(10000,)


In [25]:

from scipy.optimize import linear_sum_assignment
# Calculate cost matrix (negative of the confusion matrix)
confusion_matrix = np.zeros((num_clusters, num_clusters))
for i in range(num_clusters):
    for j in range(num_clusters):
        confusion_matrix[i, j] = np.sum((cluster_assignments_test == i) & (y_test == j))

cost_matrix = -confusion_matrix

# Use Hungarian algorithm to find best matching between clusters and true labels
row_indices, col_indices = linear_sum_assignment(cost_matrix)

# Extract matching indices
matching_indices = col_indices

# Match predicted cluster assignments to true labels
cluster_assignments_matched = np.array([matching_indices[cluster] for cluster in cluster_assignments_test])

# Calculate accuracy
accuracy = np.mean(cluster_assignments_matched == y_test)
print(f"\nAccuracy: {accuracy}")



Accuracy: 0.2223


In [27]:
np.random.seed(42)
random_indices = np.random.choice(len(x_test), 100, replace=False)

# Print predicted and true cluster assignments for 5 examples
for idx in random_indices:
    print(f"Example {idx}: Predicted Cluster - {cluster_assignments_test[idx]}, True Label - {y_test[idx]}")

Example 6252: Predicted Cluster - 0, True Label - 2
Example 4684: Predicted Cluster - 4, True Label - 1
Example 1731: Predicted Cluster - 4, True Label - 5
Example 4742: Predicted Cluster - 7, True Label - 8
Example 4521: Predicted Cluster - 1, True Label - 9
Example 6340: Predicted Cluster - 0, True Label - 3
Example 576: Predicted Cluster - 2, True Label - 8
Example 5202: Predicted Cluster - 1, True Label - 9
Example 6363: Predicted Cluster - 1, True Label - 0
Example 439: Predicted Cluster - 6, True Label - 1
Example 2750: Predicted Cluster - 9, True Label - 5
Example 7487: Predicted Cluster - 1, True Label - 1
Example 5272: Predicted Cluster - 3, True Label - 3
Example 5653: Predicted Cluster - 4, True Label - 5
Example 3999: Predicted Cluster - 3, True Label - 9
Example 6033: Predicted Cluster - 2, True Label - 2
Example 582: Predicted Cluster - 6, True Label - 8
Example 9930: Predicted Cluster - 3, True Label - 8
Example 7051: Predicted Cluster - 0, True Label - 3
Example 8158: P