<a href="https://colab.research.google.com/github/bomiiisong/AI_Project/blob/master/Modeling/K_Means.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# K-Means Clustering

Reference : https://towardsdatascience.com/image-clustering-using-k-means-4a78478d2b83

* Google Drive Mount

In [2]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


* Dataset Check

In [3]:
!ls -l '/content/drive/My Drive/AI_team3/dataset_final/img_kmeans.zip'

-rw------- 1 root root 51734047 Apr 16 08:38 '/content/drive/My Drive/AI_team3/dataset_final/img_kmeans.zip'


* Unzip File

In [None]:
!unzip /content/drive/My\ Drive/AI_team3/dataset_final/img_kmeans.zip

* Import Packages

In [5]:
from keras.applications import VGG16
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import img_to_array
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import keras
import tensorflow
from tqdm import tqdm
import os
import shutil

* Function to Extract features from the images

In [6]:
# Function to Extract features from the images
def image_feature(direc):
    model = VGG16(weights='imagenet', include_top=False)
    features = [];
    img_name = [];
    for i in tqdm(direc):
        fname='/content/data'+'/'+i
        img=image.load_img(fname,target_size=(640, 640, 3))
        x = img_to_array(img)
        x=np.expand_dims(x,axis=0)
        x=preprocess_input(x)
        feat=model.predict(x)
        feat=feat.flatten()
        features.append(feat)
        img_name.append(i)
    return features,img_name

* Load image path for Extract Features

In [8]:
img_path=os.listdir('/content/data')
img_features,img_name=image_feature(img_path)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


100%|██████████| 544/544 [01:16<00:00,  7.15it/s]


* Cluster Model

In [9]:
#Creating Clusters
k = 2
clusters = KMeans(k, random_state = 827)
clusters.fit(img_features)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=827, tol=0.0001, verbose=0)

* Create dataframe with result of clustering

In [10]:
image_cluster = pd.DataFrame(img_name,columns=['image'])
image_cluster["clusterid"] = clusters.labels_
image_cluster # 0 denotes cat and 1 denotes dog

Unnamed: 0,image,clusterid
0,267.jpg,1
1,56.jpg,0
2,200.jpg,1
3,305.jpg,1
4,411.jpg,0
...,...,...
539,76.jpg,1
540,513.jpg,0
541,274.jpg,1
542,83.jpg,0


* Clustering 결과 저장

* Make folder to seperate images

In [None]:
# Made folder to seperate images
os.mkdir('/content/drive/MyDrive/best')
os.mkdir('/content/drive/MyDrive/worst')

In [12]:
# Images will be seperated according to cluster they belong
for i in range(len(image_cluster)):
    if image_cluster['clusterid'][i]==1:
        # print(image_cluster['image'])
        shutil.move(os.path.join('/content/data', image_cluster['image'][i]), '/content/drive/MyDrive/best')
    else:
        # print(image_cluster['image'])
        shutil.move(os.path.join('/content/data', image_cluster['image'][i]), '/content/drive/MyDrive/worst')

* Best Cluster Visualization

In [14]:
import pathlib
from glob import glob

data_root = pathlib.Path(glob('/content/drive/MyDrive/*')[7])
print(data_root)
# label = ['0','1']

/content/drive/MyDrive/best


In [18]:
import random

all_image_paths = list(data_root.glob('*'))
all_image_paths = [str(path) for path in all_image_paths]

# all_image_paths
random.choice(all_image_paths)

'/content/drive/MyDrive/best/100.jpg'

In [None]:
import PIL.Image as Image
import matplotlib.pyplot as plt

image_count = len(all_image_paths)
print('image_count:', image_count)

plt.figure(figsize=(12,12))
for n in range(15):
  image_path = random.choice(all_image_paths)
  plt.subplot(5,3, n+1)
  plt.imshow(plt.imread(image_path))
  idx = image_path.split('/')[-2]
  plt.title(str(idx)) # 1 -> Best , 0 -> Worst
  plt.axis('off')
plt.show()

* Worst Cluster Visualization

In [20]:
worst_data_root = pathlib.Path(glob('/content/drive/MyDrive/*')[8])
print(worst_data_root)

worst_image_paths = list(worst_data_root.glob('*'))
worst_image_paths = [str(path) for path in worst_image_paths]

random.choice(worst_image_paths)

/content/drive/MyDrive/worst


'/content/drive/MyDrive/worst/140.jpg'

In [None]:
import PIL.Image as Image
import matplotlib.pyplot as plt

image_num = len(worst_image_paths)
print('image_count:', image_num)

plt.figure(figsize=(12,12))
for n in range(15):
  img_path = random.choice(worst_image_paths)
  plt.subplot(5,3, n+1)
  plt.imshow(plt.imread(img_path))
  idx = img_path.split('/')[-2]
  plt.title(str(idx)) # 1 -> Best , 0 -> Worst
  plt.axis('off')
plt.show()