In [None]:
# Mount Google Drive at /content/gdrive; the dataset and result paths used
# later in this notebook are resolved underneath this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [2]:
### Import necessary libraries ###
import os
import cv2
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, adjusted_rand_score, normalized_mutual_info_score
import random
# Seed both Python's and NumPy's global RNGs. scikit-learn draws from NumPy's
# global state when an estimator is given no random_state, so seeding the
# `random` module alone does not make the K-means initialisation reproducible.
random.seed(42)
np.random.seed(42)

In [3]:
### Hyperparameters ###
# Target size passed to cv2.resize for every image.
# NOTE(review): cv2.resize interprets this tuple as (width, height) - confirm
# before switching to the non-square value kept in the trailing comment.
re_size = (196, 196)  #(144, 260)
# Number of K-means clusters; matches the number of entries in str2int.
num_classes = 5

In [4]:
### Prepare dataset ###
# Lookup tables for converting between class names and integer labels
str2int = {
    'Crayon_Shin': 0,
    'Doraemon': 1,
    'Hua_Family': 2,
    'Ilu': 3,
    'Maruko': 4,
}
# Inverse table, built from str2int so both directions always agree
int2str = {index: name for name, index in str2int.items()}


# CustomDataset: load formatted dataset
def CustomDataset(directory):
  """Load the images stored as ``directory/<class_name>/<image file>``.

  Each readable image is converted from BGR to RGB, resized with the global
  ``re_size``, flattened to one dimension and scaled to the range [0, 1].

  Args:
    directory: Path of the folder that contains one sub-folder per class.
      Sub-folder names are translated to integer labels through the global
      ``str2int`` table.

  Returns:
    A tuple ``(data, labels)`` of NumPy arrays: one flattened image per row
    in ``data`` and the matching integer label in ``labels``. Both arrays are
    empty when no image could be loaded.

  Notes:
    * Sub-folders whose name is not a key of ``str2int`` (for example the
      ``.ipynb_checkpoints`` folder created by notebook front-ends) are
      skipped instead of raising ``KeyError``.
    * Entries that ``cv2.imread`` cannot decode (it returns ``None`` for
      them) are skipped instead of crashing inside ``cv2.cvtColor``.
    * Folders and files are visited in sorted order so that the sample order
      does not depend on the file system's listing order.
  """
  data = []
  labels = []

  # Only the first level of os.walk is needed: its `dirs` are the class
  # folders. A missing directory yields no level at all, in which case the
  # function falls through and returns empty arrays.
  _, class_dirs, _ = next(os.walk(directory), (directory, [], []))

  # Traverse the directories of each class
  for class_name in sorted(class_dirs):
    if class_name not in str2int:
      continue
    class_path = os.path.join(directory, class_name)

    # Image pre-processing: resize & normalize
    for file_name in sorted(os.listdir(class_path)):
      img = cv2.imread(os.path.join(class_path, file_name))
      if img is None:
        # Not a decodable image (sub-folder, text file, corrupt download, ...)
        continue
      img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
      img = cv2.resize(img, re_size).flatten()
      data.append(img/255.0)

      labels.append(str2int[class_name])

  # Return images along with corresponding labels
  return np.array(data), np.array(labels)


# Resolve the "AI_Project1/data" folder relative to the current working
# directory, then load every image together with its label
dataset_dir = os.path.join(os.getcwd(), 'gdrive/MyDrive/AI_Project1/data')
data, labels = CustomDataset(dataset_dir)

In [None]:
"""
### Apply PCA(optional) ###
from sklearn.decomposition import PCA

n_components = 300

# Define PCA model
pca = PCA(n_components=n_components)

# Fit the model and transform the dataset
data = pca.fit_transform(data)
"""

In [6]:
### Train KMeans ###
# Define K-means clustering model
# (random_state pins the centroid initialisation so that a rerun of the
#  notebook produces the same clusters and therefore the same metrics)
kmeans = KMeans(n_clusters=num_classes, n_init=10, random_state=42)

# Fit the model on the dataset
kmeans.fit(data)

# Fetch cluster centers and cluster labels
centers = kmeans.cluster_centers_
clusters = kmeans.labels_

In [None]:
### Evaluation ###
# I. External evaluation metrics
# Score the cluster assignment against the ground-truth labels; the raw
# cluster ids are passed as they are, without mapping them to class labels.
ari = adjusted_rand_score(labels, clusters)

# Show ARI
print("Adjusted Rand Index (ARI): {:.4f}".format(ari))


# II. Accuracy
# (To compute accuracy, we need to convert the cluster labels back to their original ones.)
# Mapping cluster labels to original labels
def get_reference_dict(clusters,labels):
    """Map every cluster id to the ground-truth label most frequent in it.

    Args:
        clusters: 1-D sequence of integer cluster ids, one per sample.
        labels: 1-D sequence of non-negative integer ground-truth labels,
            aligned with ``clusters`` (``np.bincount`` requires
            non-negative integers).

    Returns:
        dict mapping each cluster id that occurs in ``clusters`` to the label
        that appears most often among that cluster's samples. Ties go to the
        smallest label, which is how ``argmax`` resolves them. An empty
        input gives an empty dict.
    """
    clusters = np.asarray(clusters)
    labels = np.asarray(labels)

    reference_label = {}
    # Iterate over the ids that actually occur rather than range(k): ids are
    # not guaranteed to be the contiguous run 0..k-1, and a gap would select
    # no samples and make argmax fail on an empty count array.
    for cluster_id in np.unique(clusters):
        members = labels[clusters == cluster_id]
        reference_label[int(cluster_id)] = int(np.bincount(members).argmax())

    return reference_label

# Mapping predictions to original labels
def get_labels(clusters,refernce_labels):
    """Translate per-sample cluster ids into predicted class labels.

    Args:
        clusters: 1-D sequence of cluster ids, one per sample.
        refernce_labels: dict mapping a cluster id to its class label, as
            returned by ``get_reference_dict``.

    Returns:
        Integer NumPy array with the same length as ``clusters`` whose i-th
        entry is ``refernce_labels[clusters[i]]``.

    Raises:
        KeyError: if a cluster id has no entry in ``refernce_labels``.
    """
    # Build the result directly as integers. Pre-filling a buffer with
    # np.random.rand would consume global RNG state and yield float labels.
    return np.array([refernce_labels[cluster_id] for cluster_id in clusters], dtype=int)

# Labels conversion: cluster id -> majority class label -> per-sample prediction
reference_labels = get_reference_dict(clusters, labels)
pred_labels = get_labels(clusters, reference_labels)

# Show accuracy
accuracy = accuracy_score(pred_labels,labels)
print("Accuracy: {:.4f}".format(accuracy))

In [None]:
### Show results ###
import matplotlib.pyplot as plt

# savefig does not create missing folders, so make sure the target exists
result_dir = os.path.join(os.getcwd(), 'gdrive/MyDrive/AI_Project1/result/KMeans')
os.makedirs(result_dir, exist_ok=True)

# cv2.resize takes its size as (width, height), so each flattened image is
# laid out as (height, width, channels) = (re_size[1], re_size[0], 3).
# With a square re_size both orders coincide; with a non-square one the old
# (re_size[0], re_size[1], 3) reshape would scramble the picture.
image_shape = (re_size[1], re_size[0], 3)

# Show every 100th image (at most 10 of them) and its prediction; the upper
# bound keeps the index inside the dataset when it holds fewer than 901 images
for idx in range(0, min(len(data), 1000), 100):
  image = data[idx].reshape(image_shape)

  fig, ax = plt.subplots()
  ax.imshow(image)
  # int() makes the lookup independent of whether pred_labels holds ints or floats
  ax.text(0, -0.1, "Pred : {}\nLabel: {}".format(int2str[int(pred_labels[idx])], int2str[int(labels[idx])]), transform=ax.transAxes)
  ax.axis('off')

  fig.savefig(os.path.join(result_dir, f'prediction_{idx+1}.jpg'))
  plt.show()

  # Release the figure so the loop does not accumulate open figures
  plt.close(fig)