# Import Libraries

In [17]:
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
from sklearn.cluster import KMeans
from sklearn import metrics
import numpy as np
import pandas as pd
import os
from os import walk, listdir
import warnings
import sys

# Init Model

In [2]:
# Load VGG16 with ImageNet weights and without the fully connected classifier
# head (include_top=False), so the network outputs convolutional feature maps.
model = VGG16(weights='imagenet', include_top=False)
# Print the layer-by-layer architecture table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None, None, 3)     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0         
__________

## Load in image data / preprocessing test with predictions

In [3]:
# Smoke test: run one image through the network and inspect the shape of the
# feature map it produces.
img_path = '/Users/shawn/Developer/ML-Research/training_img-sets/randall/Acanthuridae/Acanthurus_Acanthurus achilles_-117788879.jpg'
img = image.load_img(img_path, target_size=(224, 224))

# Image -> array -> batch of one (leading axis) -> VGG16 input preprocessing.
img_data = preprocess_input(
    np.expand_dims(image.img_to_array(img), axis=0)
)

vgg16_feature = model.predict(img_data)

print(vgg16_feature.shape)

(1, 7, 7, 512)


# Unsupervised Image Clustering

## GetFishDirs Function

In [4]:
def getFishDirs(file_path):
    """Return the paths of the immediate subdirectories of ``file_path``.

    Args:
        file_path: Directory to scan. Only its direct children are examined;
            nested directories are not descended into.

    Returns:
        list of str: One path per child directory, sorted alphabetically.
        ``os.scandir`` yields entries in arbitrary order, so sorting is what
        makes the result (and anything derived from it) reproducible.

    Raises:
        OSError: If ``file_path`` cannot be scanned (propagated from
            ``os.scandir``).
    """
    return sorted(entry.path for entry in os.scandir(file_path) if entry.is_dir())

## Get Fish Paths per Subdir

In [18]:
# Root of the image set; its immediate subdirectories hold the image files.
randall = '/Users/shawn/Developer/ML-Research/training_img-sets/randall/'
fish_subdirs = getFishDirs(randall)

fish_paths = []

for subdir in fish_subdirs:
    # Only the top level of each subdirectory is wanted, so take the first
    # triple walk() yields. The default covers a directory walk() yields
    # nothing for (e.g. one it could not read).
    dirpath, dirnames, filenames = next(walk(subdir), (subdir, [], []))

    # walk() lists files in arbitrary order; sort so that the order of
    # fish_paths (and of every row derived from it) is reproducible.
    for filename in sorted(filenames):
        # Skip files whose name contains "Store" (presumably aimed at macOS
        # ".DS_Store" metadata files). The check is made on the file name only:
        # testing the full path would discard every image whenever a parent
        # directory happened to contain "Store".
        if "Store" not in filename:
            fish_paths.append(os.path.join(dirpath, filename))

## Define VGG16 Model

In [6]:
# Instantiate VGG16 (ImageNet weights, no classifier head) for feature extraction.
model = VGG16(weights='imagenet', include_top=False)

# Collects one flattened feature vector per image; the image-processing loop
# appends to it.
vgg16_feature_list = []

# Suppress all Python warnings from this point on (process-wide filter).
warnings.filterwarnings('ignore')

## VGG16 Image Processing

In [7]:
print("Beginning Image Processing...")

# Start from an empty list every time this cell runs, so that re-running it
# cannot append a second copy of every feature vector.
vgg16_feature_list = []
total_images = len(fish_paths)

for i, fname in enumerate(fish_paths):
    # Load the image at 224x224, add a leading batch axis and apply the VGG16
    # input preprocessing.
    img = image.load_img(fname, target_size=(224, 224))
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)
    img_data = preprocess_input(img_data)

    # Flatten the network output into a single feature vector for this image.
    vgg16_feature = model.predict(img_data)
    vgg16_feature_list.append(np.asarray(vgg16_feature).flatten())

    # In-place progress indicator. Count the image just finished (i + 1) so the
    # final update reads 100, and use a fixed-width format so each update fully
    # overwrites the previous one after the carriage return.
    sys.stdout.write("\r{0:6.2f}%".format(100.0 * (i + 1) / total_images))
    sys.stdout.flush()

print("\nImage Processing Completed!")

# One row per image, one column per feature; partition the rows into 10
# clusters with a fixed seed so the assignment is repeatable.
vgg16_feature_list_np = np.array(vgg16_feature_list)
kmeans = KMeans(n_clusters=10, random_state=0).fit(vgg16_feature_list_np)

print("VGG16 Feature List and KMeans Computed.")

Beginning Image Processing...
99.98894905514422676
Image Processing Completed!
VGG16 Feature List and KMeans Computed.


## Internal Cluster Validation Scoring

In [21]:
# Cluster index assigned to each feature vector by the fitted KMeans model.
predicted_labels = kmeans.labels_

# Mean silhouette coefficient over all samples, using Euclidean distance.
# The score lies in [-1, 1]; values near 0 indicate overlapping clusters.
ICV_Score = metrics.silhouette_score(
    X=vgg16_feature_list_np,
    labels=predicted_labels,
    metric='euclidean',
)

print("Internal Cluster Validation Score:", ICV_Score)
print("\nPredicted Clusters:", predicted_labels)


Internal Cluster Validation Score: 0.032346845

Predicted Clusters: [6 7 1 ... 4 7 7]


## Define write_list_to_file (for plain Python lists, not NumPy arrays)

In [22]:
def write_list_to_file(fishy_paths_list, filename):
    """Write each string in ``fishy_paths_list`` to ``filename``, one per line.

    The file is opened in text write mode, so any existing content is
    replaced. Every entry is followed by a newline, including the last one.

    Args:
        fishy_paths_list: Iterable of strings to write, in order.
        filename: Path of the output file.
    """
    with open(filename, "w") as outfile:
        outfile.writelines(entry + "\n" for entry in fishy_paths_list)

## Save Outputs to CSV

In [23]:
cluster_output_filename = "pred_clusters_randallfish.csv"
fish_paths_output_filename = "fish_paths_randallfish.csv"
concat_clusters_fishpaths_filename = "fish_predictions_output.csv"

# Every image path needs exactly one cluster label. A mismatch means the path
# list and the clustering were produced by different runs, and a column-wise
# concat would silently pad the shorter side with NaN.
if len(fish_paths) != len(predicted_labels):
    raise ValueError(
        "fish_paths has {0} entries but predicted_labels has {1}; "
        "re-run feature extraction and clustering on the current path list".format(
            len(fish_paths), len(predicted_labels)
        )
    )

# save separate CSVs
# Labels are integer cluster ids: write them as integers rather than with
# savetxt's default scientific-notation float format.
np.savetxt(cluster_output_filename, predicted_labels, fmt="%d", delimiter=",")
write_list_to_file(fish_paths, fish_paths_output_filename)

# Combine paths and cluster ids from the in-memory data instead of re-reading
# the header-less files above: pd.read_csv would take the first record of each
# file as the column name, leaving the first image out of the table and
# writing a data record as the header of the combined file.
df1 = pd.DataFrame({"cluster": predicted_labels})
df2 = pd.DataFrame({"fish_path": fish_paths})
df3 = pd.concat([df2, df1], axis=1)
df3.to_csv(concat_clusters_fishpaths_filename, index=False)