# May 29, 2022: Feature vector clustering with K-Means

In [1]:
import os
import pandas as pd
import tensorflow as tf
from sklearn.cluster import KMeans

We load the images.

In [2]:
# Pair every file in the dataset directory with its class label, taken from the
# part of the filename before the first dot (e.g. "dog.8254.jpg" -> "dog").
image_filenames = [
    (name, name.partition('.')[0])
    for name in os.listdir(f"{os.environ['SCRATCH']}/cats-vs-dogs")
]

In [3]:
# Two-column table of the listed files: the filename and its class label.
image_dataset = pd.DataFrame(data=image_filenames, columns=['filename', 'class'])
image_dataset  # bare last expression so the notebook renders the frame

Unnamed: 0,filename,class
0,dog.8254.jpg,dog
1,dog.10921.jpg,dog
2,cat.8024.jpg,cat
3,dog.9380.jpg,dog
4,dog.995.jpg,dog
...,...,...
24995,cat.8061.jpg,cat
24996,dog.9410.jpg,dog
24997,dog.6603.jpg,dog
24998,dog.4197.jpg,dog


In [4]:
# Generator configured with a rescale factor of 1/255 for the pixel values.
image_generator = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1.0 / 255)

In [5]:
# Stream the images listed in `image_dataset` from the dataset directory.
images = image_generator.flow_from_dataframe(
    dataframe=image_dataset,
    directory=f"{os.environ['SCRATCH']}/cats-vs-dogs",
    x_col='filename',
    y_col='class',
    target_size=(299, 299),  # same height/width as the model's input_shape
    shuffle=False,  # keep batches in dataframe row order
)

Found 25000 validated image filenames belonging to 2 classes.


We load a pre-trained model.

In [6]:
# ImageNet-pretrained InceptionResNetV2 without its classification head.
# pooling='avg' requests average pooling of the last feature map, so the model
# is expected to emit one flat feature vector per image.
model = tf.keras.applications.inception_resnet_v2.InceptionResNetV2(
    weights='imagenet',
    include_top=False,
    pooling='avg',
    input_shape=(299, 299, 3),
)

We generate feature vectors.

In [7]:
# Run every batch from the generator through the network and collect the outputs.
feature_vectors = model.predict(x=images)



We split the feature vectors into 2 clusters.

In [8]:
# Seed K-Means so the centroid initialisation, and with it the cluster numbering
# and the resulting table, is the same on every run.
# NOTE: this rebinds `model`, which held the Keras feature extractor up to here.
model = KMeans(n_clusters=2, random_state=0)

In [9]:
# Fit the centroids, then read back the label assigned to each feature vector.
clusters = model.fit(feature_vectors).labels_

In [10]:
# Record each image's cluster id for the 2-cluster run. `clusters` is presumably
# a plain array, so it is matched to rows by position; that relies on the image
# generator having been built with shuffle=False.
image_dataset['cluster_n2'] = clusters

In [11]:
# Count the images in every (cluster, class) pair; the count lands in a `size` column.
conclusion_dataset = image_dataset.groupby(by=['cluster_n2', 'class'], as_index=False).size()

In [12]:
# Total number of images in each cluster (one row per cluster, count in `size`).
cluster_size_dataset = image_dataset.groupby(['cluster_n2'], as_index=False).size()
# Repeat each cluster's row once per (cluster, class) row of `conclusion_dataset`
# so the two frames line up row for row. The per-cluster repeat count replaces a
# fixed 2, which misaligned the rows whenever a cluster held a single class.
# Both group-bys sort by cluster id, so the counts match the rows positionally.
cluster_size_dataset = cluster_size_dataset.loc[
    cluster_size_dataset.index.repeat(
        conclusion_dataset.groupby('cluster_n2').size().to_numpy()
    )
]
# Renumber rows 0..n-1 so they share `conclusion_dataset`'s default index.
cluster_size_dataset = cluster_size_dataset.reset_index()

In [13]:
# Keep the raw count as '# of class' and add its share of the owning cluster's
# total; the division pairs rows with `cluster_size_dataset` by index.
conclusion_dataset = (
    conclusion_dataset
    .rename(columns={'size': '# of class'}, errors='raise')
    .assign(**{
        '% of class': lambda counts: counts['# of class']
        .div(cluster_size_dataset['size'])
        .mul(100),
    })
)
conclusion_dataset

Unnamed: 0,cluster_n2,class,# of class,% of class
0,0,cat,11746,99.779137
1,0,dog,26,0.220863
2,1,cat,754,5.70003
3,1,dog,12474,94.29997


We split the feature vectors into 4 clusters.

In [14]:
# Seed K-Means so the centroid initialisation, and with it the cluster numbering
# and the resulting table, is the same on every run.
model = KMeans(n_clusters=4, random_state=0)

In [15]:
# Fit the centroids, then read back the label assigned to each feature vector.
clusters = model.fit(feature_vectors).labels_

In [16]:
# Record each image's cluster id for the 4-cluster run. `clusters` is presumably
# a plain array, so it is matched to rows by position; that relies on the image
# generator having been built with shuffle=False.
image_dataset['cluster_n4'] = clusters

In [17]:
# Count the images in every (cluster, class) pair; the count lands in a `size` column.
conclusion_dataset = image_dataset.groupby(by=['cluster_n4', 'class'], as_index=False).size()

In [18]:
# Total number of images in each cluster (one row per cluster, count in `size`).
cluster_size_dataset = image_dataset.groupby(['cluster_n4'], as_index=False).size()
# Repeat each cluster's row once per (cluster, class) row of `conclusion_dataset`
# so the two frames line up row for row. The per-cluster repeat count replaces a
# fixed 2, which misaligned the rows whenever a cluster held a single class.
# Both group-bys sort by cluster id, so the counts match the rows positionally.
cluster_size_dataset = cluster_size_dataset.loc[
    cluster_size_dataset.index.repeat(
        conclusion_dataset.groupby('cluster_n4').size().to_numpy()
    )
]
# Renumber rows 0..n-1 so they share `conclusion_dataset`'s default index.
cluster_size_dataset = cluster_size_dataset.reset_index()

In [19]:
# Keep the raw count as '# of class' and add its share of the owning cluster's
# total; the division pairs rows with `cluster_size_dataset` by index.
conclusion_dataset = (
    conclusion_dataset
    .rename(columns={'size': '# of class'}, errors='raise')
    .assign(**{
        '% of class': lambda counts: counts['# of class']
        .div(cluster_size_dataset['size'])
        .mul(100),
    })
)
conclusion_dataset

Unnamed: 0,cluster_n4,class,# of class,% of class
0,0,cat,187,2.774069
1,0,dog,6554,97.225931
2,1,cat,582,8.941466
3,1,dog,5927,91.058534
4,2,cat,5482,99.945305
5,2,dog,3,0.054695
6,3,cat,6249,99.744613
7,3,dog,16,0.255387


We split the feature vectors into 8 clusters.

In [20]:
# Seed K-Means so the centroid initialisation, and with it the cluster numbering
# and the resulting table, is the same on every run.
model = KMeans(n_clusters=8, random_state=0)

In [21]:
# Fit the centroids, then read back the label assigned to each feature vector.
clusters = model.fit(feature_vectors).labels_

In [22]:
# Record each image's cluster id for the 8-cluster run. `clusters` is presumably
# a plain array, so it is matched to rows by position; that relies on the image
# generator having been built with shuffle=False.
image_dataset['cluster_n8'] = clusters

In [23]:
# Count the images in every (cluster, class) pair; the count lands in a `size` column.
conclusion_dataset = image_dataset.groupby(by=['cluster_n8', 'class'], as_index=False).size()

In [24]:
# Total number of images in each cluster (one row per cluster, count in `size`).
cluster_size_dataset = image_dataset.groupby(['cluster_n8'], as_index=False).size()
# Repeat each cluster's row once per (cluster, class) row of `conclusion_dataset`
# so the two frames line up row for row. The per-cluster repeat count replaces a
# fixed 2, which misaligned the rows whenever a cluster held a single class.
# Both group-bys sort by cluster id, so the counts match the rows positionally.
cluster_size_dataset = cluster_size_dataset.loc[
    cluster_size_dataset.index.repeat(
        conclusion_dataset.groupby('cluster_n8').size().to_numpy()
    )
]
# Renumber rows 0..n-1 so they share `conclusion_dataset`'s default index.
cluster_size_dataset = cluster_size_dataset.reset_index()

In [25]:
# Keep the raw count as '# of class' and add its share of the owning cluster's
# total; the division pairs rows with `cluster_size_dataset` by index.
conclusion_dataset = (
    conclusion_dataset
    .rename(columns={'size': '# of class'}, errors='raise')
    .assign(**{
        '% of class': lambda counts: counts['# of class']
        .div(cluster_size_dataset['size'])
        .mul(100),
    })
)
conclusion_dataset

Unnamed: 0,cluster_n8,class,# of class,% of class
0,0,cat,151,5.634328
1,0,dog,2529,94.365672
2,1,cat,3620,99.917196
3,1,dog,3,0.082804
4,2,cat,4124,99.637594
5,2,dog,15,0.362406
6,3,cat,631,14.363761
7,3,dog,3762,85.636239
8,4,cat,8,0.415369
9,4,dog,1918,99.584631
