In [1]:
# for loading/processing the images 

#allows us to load an image from a file as a PIL object
from keras.preprocessing.image import load_img 

#allows us to convert the PIL object into a NumPy array
from keras.preprocessing.image import img_to_array 

#prepare your image into the format the model requires. You should load images with the Keras load_img function so that you guarantee the images you load are compatible with the preprocess_input function.
from keras.applications.vgg16 import preprocess_input 

# models 
#pre-trained model we’re going to use
from keras.applications.vgg16 import VGG16 
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans

#for reducing the dimensions of our feature vector
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle

In [2]:
path = r"/content/drive/MyDrive/Colab Notebooks/ClusteringImages/archive (1)/flower_images/flower_images"
# change the working directory to the path where the images are located
os.chdir(path)

# this list holds all the image filename
flowers = []

# creates a ScandirIterator aliased as files
with os.scandir(path) as files:
  # loops through each file in the directory
    for file in files:
        if file.name.endswith('.png'):
          # adds only the image files to the flowers list
            flowers.append(file.name)

FileNotFoundError: ignored

In [None]:
# view the first 10 flower entries
print(flowers[:10])


['0011.png', '0002.png', '0007.png', '0008.png', '0009.png', '0005.png', '0004.png', '0003.png', '0006.png', '0010.png']


In [None]:
# load the image as a 224x224 array
img = load_img(flowers[0], target_size=(224,224))
# convert from 'PIL.Image.Image' to numpy array
img = np.array(img)

print(img.shape)
(224, 224, 3)

(224, 224, 3)


(224, 224, 3)

In [None]:
reshaped_img = img.reshape(1,224,224,3)
print(reshaped_img.shape)
(1, 224, 224, 3)

(1, 224, 224, 3)


(1, 224, 224, 3)

In [None]:
x = preprocess_input(reshaped_img)


In [None]:
# load model
model = VGG16()
# remove the output layer
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5


In [None]:
# load the model first and pass as an argument
model = VGG16()
model = Model(inputs = model.inputs, outputs = model.layers[-2].output)

def extract_features(file, model):
    # load the image as a 224x224 array
    img = load_img(file, target_size=(224,224))
    # convert from 'PIL.Image.Image' to numpy array
    img = np.array(img) 
    # reshape the data for the model reshape(num_of_samples, dim 1, dim 2, channels)
    reshaped_img = img.reshape(1,224,224,3) 
    # prepare image for model
    imgx = preprocess_input(reshaped_img)
    # get the feature vector
    features = model.predict(imgx, use_multiprocessing=True)
    return features

In [None]:
data = {}
p = r"/content/drive/MyDrive/Colab Notebooks/ClusteringImages/archive (1)/flower_features.pkl"

# lop through each image in the dataset
for flower in flowers:
    # try to extract the features and update the dictionary
    try:
        feat = extract_features(flower,model)
        data[flower] = feat
    # if something fails, save the extracted features as a pickle file (optional)
    except:
        with open(p,'wb') as file:
            pickle.dump(data,file)
          
 
# get a list of the filenames
filenames = np.array(list(data.keys()))

# get a list of just the features
feat = np.array(list(data.values()))
feat.shape
(210, 1, 4096)

# reshape so that there are 210 samples of 4096 vectors
feat = feat.reshape(-1,4096)
feat.shape
(210, 4096)

# get the unique labels (from the flower_labels.csv)
df = pd.read_csv('flower_labels.csv')
label = df['label'].tolist()
unique_labels = list(set(label))


In [None]:
pca = PCA(n_components=100, random_state=22)
pca.fit(feat)
x = pca.transform(feat)

In [None]:
kmeans = KMeans(n_clusters=len(unique_labels), random_state=22)
kmeans.fit(x)

KMeans(n_clusters=10, random_state=22)

In [None]:
kmeans.labels_

array([7, 9, 7, 3, 9, 3, 3, 1, 4, 9, 9, 8, 7, 6, 0, 3, 7, 9, 1, 7, 3, 1,
       2, 1, 9, 5, 1, 1, 7, 0, 7, 7, 7, 9, 9, 6, 5, 5, 9, 9, 5, 9, 2, 7,
       1, 9, 9, 3, 7, 7, 0, 9, 6, 5, 2, 6, 4, 3, 6, 6, 5, 9, 9, 5, 4, 2,
       2, 0, 2, 7, 1, 3, 9, 8, 2, 4, 0, 7, 9, 3, 4, 6, 5, 8, 7, 7, 9, 6,
       1, 4, 2, 7, 3, 7, 6, 1, 2, 9, 3, 1, 4, 3, 7, 7, 4, 6, 6, 3, 7, 1,
       9, 4, 5, 7, 2, 1, 3, 6, 3, 1, 2, 0, 2, 2, 7, 6, 6, 7, 2, 1, 7, 3,
       9, 0, 1, 0, 1, 6, 6, 9, 9, 6, 6, 4, 7, 5, 6, 7, 1, 6, 3, 9, 5, 9,
       4, 7, 4, 4, 7, 2, 1, 1, 4, 5, 9, 6, 7, 2, 9, 6, 8, 3, 4, 7, 2, 6,
       3, 1, 9, 6, 7, 9, 9, 1, 3, 6, 7, 4, 6, 5, 3, 4, 1, 6, 2, 9, 3, 4,
       1, 2, 2, 9, 4, 7, 9, 5, 2, 7, 1, 0], dtype=int32)

In [None]:
# holds the cluster id and the images { id: [images] }
groups = {}
for file, cluster in zip(filenames,kmeans.labels_):
    if cluster not in groups.keys():
        groups[cluster] = []
        groups[cluster].append(file)
    else:
        groups[cluster].append(file)

In [None]:
# view the filenames in cluster 0
groups[0]

['0032.png',
 '0026.png',
 '0056.png',
 '0089.png',
 '0077.png',
 '0138.png',
 '0130.png',
 '0118.png',
 '0197.png']

In [None]:
groups[1]

['0003.png',
 '0039.png',
 '0029.png',
 '0021.png',
 '0024.png',
 '0027.png',
 '0069.png',
 '0087.png',
 '0061.png',
 '0106.png',
 '0102.png',
 '0094.png',
 '0132.png',
 '0139.png',
 '0122.png',
 '0127.png',
 '0113.png',
 '0164.png',
 '0151.png',
 '0145.png',
 '0190.png',
 '0184.png',
 '0179.png',
 '0206.png',
 '0199.png']