In [2]:
import cv2 # First install your OpenCV-Python if you haven't
import os, pickle
import numpy as np
import pandas as pd
import sklearn
from scipy.cluster.vq import vq, kmeans, whiten # not used
import matplotlib.pyplot as plt
from imutils import paths
from sklearn.cluster import KMeans as km # not used
from sklearn.cluster import MiniBatchKMeans as MBkm
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer # not used
from scipy.signal import convolve2d as conv # not used
from scipy import stats

In [3]:
def produce_model(paths):
    sift_vector = [] # append each sift vector to the list
    print('Performing SIFT on each image in the image categories...')
    for cat in range(len(paths)):
        print(f'working on category: {categories[cat]}', end='...')
        for image_path in paths[cat]:
            im = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE) # read images one by one, converting them to gray on read
            im = cv2.resize(im, (224, 224)) # resize image to standard 224x224 size
            _, des = sift.detectAndCompute(im, None) # SIFT algorithm to get descriptors
            sift_vector.append(des) # add the (keypoints, 128-d) matrix to the sift_vector list
        print('done')
    #print(len(sift_vector), len(sift_vector[0]), len(sift_vector[0][0])) # sift vector is images x key points x 128-d descriptors
    sift_vector = np.asarray(sift_vector, dtype=object) # convert the list to a numpy array for use of concatenate function
    sift_feature_matrix = np.concatenate(sift_vector, axis = 0) # concatenate along axis 0, ie the result will be (imagesxkeypoints, 128-d descriptors)
    print('Fitting SIFT matrix to MiniBatch KMeans', end='...')
    mbkm_model = MBkm(n_clusters=clusters, random_state=SEED).fit(sift_feature_matrix) # use cluster count, seed for reproducability and fit to the sift vector generated above!
    print('MiniBatch Kmeans Model Produced!\n\n')
    return mbkm_model, sift_feature_matrix

def generate_features(paths, mbkm_model):
    histograms_ = [] # append each histogram vector to the list
    truth_values = [] # append category index per image
    histogram_edges = [] # testing 
    print('Generating feature vectors for each image using sift descriptors, kmeans model predict and histogram...')
    for cat in range(len(paths)):
        print(f'working on category: {categories[cat]}', end='...')
        for image_path in paths[cat]:
            im = cv2.imread(image_path) # read image --- BGR
            im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
            im = cv2.resize(im, (224,224)) # resize the gray images
            #histo_edges, _ = np.histogram(cv2.pyrDown(conv(im, kern, mode='same')).flatten(), bins=int(clusters/2))
            histo_edges, _ = np.histogram(cv2.pyrDown(cv2.Canny(im,40,100)).flatten(), bins=int(clusters/2)) # canny edge detect, downsample, flatten and run through histogram
            mean_ = np.mean(histo_edges)
            max_ = np.max(histo_edges)
            min_ = np.min(histo_edges)
            histo_edges = (histo_edges - mean_) / (max_ - min_) # mean normalization of additional features from canny edge detection
            histogram_edges.append(histo_edges)
            _, des = sift.detectAndCompute(im, None) # SIFT algorithm to get descriptors
            preds = mbkm_model.predict(des) # use the model to predict the descriptor clusters
            histogram, _ = np.histogram(preds, bins=clusters) # only need histogram values as that is the feature vector for the image
            histograms_.append(histogram) # append the hist to the list of hists (ie feature vectors)
            truth_values.append(cat) # build the truth values
        print('done')
    histogram_edges =np.asarray(histogram_edges) 
    histograms_ = np.asarray(histograms_)
    truth_values = np.asarray(truth_values)
    return histograms_, truth_values, histogram_edges

In [4]:
'''kern = [[-1/9, -1/9, -1/9],
            [-1/9, -1/9, -1/9],
            [-1/9, -1/9, -1/9] ]''' # uncomment this is you wish to use conv layer instead of canny edge detector as extra features

SEED = 42 # seed kmeans with a set value -- removed for randomness :D
clusters = 81 # number of clusters - iteratively searched for in the cell at the bottom this notebook!
key_points = 400 # max number of key points per image
sift = cv2.SIFT_create(key_points) # limit the number of keypoints
categories = ['daisy','dandelion','rose','sunflower','tulip'] # classes found inside train
# train folders
train_folder = './data/train/' # location of train data
train_im_folders = [train_folder + categories[i] for i in range(len(categories))] # create list of paths for each category
train_im_paths = [list(paths.list_images(train_im_folders[i])) for i in range(len(train_im_folders))] # get names of all images indexed by class
# test folders
test_folder = './data/test/' # location of test data
test_im_folders = [test_folder + categories[i] for i in range(len(categories))] # create list of paths for each category
test_im_paths = [list(paths.list_images(test_im_folders[i])) for i in range(len(test_im_folders))] # get names of all images indexed by class

print('Generating MiniBatch Kmeans model:\n')
MBkmeans_model, descr = produce_model(train_im_paths)
print('Generating Train Features:\n')
train_features_hists, y_train_true, train_edges  = generate_features(train_im_paths, MBkmeans_model)
print('\nGenerating Test Features:\n')
X_test_features, y_test, test_edges = generate_features(test_im_paths, MBkmeans_model)

Generating MiniBatch Kmeans model:

Performing SIFT on each image in the image categories...
working on category: daisy...done
working on category: dandelion...done
working on category: rose...done
working on category: sunflower...done
working on category: tulip...done
Fitting SIFT matrix to MiniBatch KMeans...MiniBatch Kmeans Model Produced!


Generating Train Features:

Generating feature vectors for each image using sift descriptors, kmeans model predict and histogram...
working on category: daisy...done
working on category: dandelion...done
working on category: rose...done
working on category: sunflower...done
working on category: tulip...done

Generating Test Features:

Generating feature vectors for each image using sift descriptors, kmeans model predict and histogram...
working on category: daisy...done
working on category: dandelion...done
working on category: rose...done
working on category: sunflower...done
working on category: tulip...done


In [5]:
# stack the feature dimensions 
train_features = np.hstack((train_features_hists, train_edges))
X_test = np.hstack((X_test_features, test_edges))

X_train, X_val, y_train, y_val = tts(train_features, y_train_true, test_size=0.2, random_state=SEED)

In [6]:
# Data for training and testing!
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.linear_model import RidgeClassifier as LR_ridge
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
from sklearn.svm import SVC
print('----------------------------------------------------------')
print('Random Forest')
print('----------------------------------------------------------')
RF_model = RF(random_state=SEED, max_depth=12, n_estimators=1000).fit(X_train, y_train)
print('\nTrain Scores:')
pred_train = RF_model.predict(X_train)
print(f'Confusion Matrix: \n{confusion_matrix(y_train, pred_train)}')
print(f'F1-Score: {f1_score(y_train, pred_train, average="weighted")}')
print(f'Accuracy: {accuracy_score(y_train, pred_train)}')
print('\nValidation Scores:')
pred_val = RF_model.predict(X_val)
print(f'Confusion Matrix: \n{confusion_matrix(y_val, pred_val)}')
print(f'F1-Score: {f1_score(y_val, pred_val, average="weighted")}')
print(f'Accuracy: {accuracy_score(y_val, pred_val)}')
print('\nTest Scores:')
pred_test_rf = RF_model.predict(X_test)
print(f'Confusion Matrix: \n{confusion_matrix(y_test, pred_test_rf)}')
print(f'F1-Score: {f1_score(y_test, pred_test_rf, average="weighted")}')
print(f'Accuracy: {accuracy_score(y_test, pred_test_rf)}\n')

print('----------------------------------------------------------')
print('Multinomial Logistic Regression')
print('----------------------------------------------------------')
print('\nTrain Scores:')
LR_model = LR_ridge(max_iter=800, alpha=10e-3).fit(X_train, y_train)
pred_train = LR_model.predict(X_train)
print(f'Confusion Matrix: \n{confusion_matrix(y_train, pred_train)}')
print(f'F1-Score: {f1_score(y_train, pred_train, average="weighted")}')
print(f'Accuracy: {accuracy_score(y_train, pred_train)}')
print('\nValidation Scores:')
pred_val = LR_model.predict(X_val)
print(f'Confusion Matrix: \n{confusion_matrix(y_val, pred_val)}')
print(f'F1-Score: {f1_score(y_val, pred_val, average="weighted")}')
print(f'Accuracy: {accuracy_score(y_val, pred_val)}')
print('\nTest Scores:')
pred_test_lr = LR_model.predict(X_test)
print(f'Confusion Matrix: \n{confusion_matrix(y_test, pred_test_lr)}')
print(f'F1-Score: {f1_score(y_test, pred_test_lr, average="weighted")}')
print(f'Accuracy: {accuracy_score(y_test, pred_test_lr)}\n')

print('----------------------------------------------------------')
print('SVM')
print('----------------------------------------------------------')
print('\nTrain Scores:')
SVM_model = SVC().fit(X_train, y_train)
pred_train = SVM_model.predict(X_train)
print(f'Confusion Matrix: \n{confusion_matrix(y_train, pred_train)}')
print(f'F1-Score: {f1_score(y_train, pred_train, average="weighted")}')
print(f'Accuracy: {accuracy_score(y_train, pred_train)}')
print('\nValidation Scores:')
pred_val = SVM_model.predict(X_val)
print(f'Confusion Matrix: \n{confusion_matrix(y_val, pred_val)}')
print(f'F1-Score: {f1_score(y_val, pred_val, average="weighted")}')
print(f'Accuracy: {accuracy_score(y_val, pred_val)}')
print('\nTest Scores:')
pred_test_svm = SVM_model.predict(X_test)
print(f'Confusion Matrix: \n{confusion_matrix(y_test, pred_test_svm)}')
print(f'F1-Score: {f1_score(y_test, pred_test_svm, average="weighted")}')
print(f'Accuracy: {accuracy_score(y_test, pred_test_svm)}\n')

pred_test_ensemble = []
for i in range(pred_test_rf.shape[0]):
    mode = stats.mode([pred_test_rf[i], pred_test_lr[i], pred_test_svm[i],])[0][0]
    pred_test_ensemble.append(mode)

print('----------------------------------------------------------')
print('Ensemble Results of test set predictions from Random Forest, Ridge Regression and SVM')
print('----------------------------------------------------------')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, pred_test_ensemble)}')
print(f'\nF1-Score: {f1_score(y_test, pred_test_ensemble, average="weighted")}')
print(f'Accuracy: {accuracy_score(y_test, pred_test_ensemble)}\n')

----------------------------------------------------------
Random Forest
----------------------------------------------------------

Train Scores:
Confusion Matrix: 
[[462   0   0   0   0]
 [  0 678   0   0   0]
 [  0   2 501   0   0]
 [  0   1   0 474   0]
 [  0   0   1   0 642]]
F1-Score: 0.9985515765105095
Accuracy: 0.9985512495472655

Validation Scores:
Confusion Matrix: 
[[ 57  30  14  11  37]
 [  6 115   7   9  26]
 [  4  14  47  11  48]
 [  6  19  12  60  14]
 [ 14  14  14  13  89]]
F1-Score: 0.5276401645694381
Accuracy: 0.532561505065123

Test Scores:
Confusion Matrix: 
[[ 72  28  10  12  31]
 [ 19 130  10  20  32]
 [  9  27  65  14  42]
 [ 15  22  13  78  19]
 [  4  17  28  18 130]]
F1-Score: 0.5458432481268288
Accuracy: 0.5491329479768786

----------------------------------------------------------
Multinomial Logistic Regression
----------------------------------------------------------

Train Scores:
Confusion Matrix: 
[[202 124  42  37  57]
 [ 34 530  24  30  60]
 [ 23  93 

In [None]:
'''
Shown above are my train, validation and test results across three different ML classifiers. 
I used Random Forests, Ridge Regression and SVM. To achieve slighly higher results I used a type of ensemble
of these classifiers, by taking the mode of each prediction (if there is no mode than the smallest index is taken)
and generating a new prediction. I achieved an F1 Score of ~0.571 and Accuracy of ~0.575 with the ensemble
method!
'''

In [28]:
# =====================================================================
# Cluster Count Hyper-Parameter Iterative Testing
# Long Run Time Warning!

# variable 'descr' is from above code, it must be run first. 
# =====================================================================

def iterative_testing(clusties):
    print('Fitting SIFT matrix to MiniBatch KMeans')
    mbkm_model = MBkm(n_clusters=clusties, random_state=SEED).fit(descr)
    print('Generating Train Features:\n')
    train_features_hists, y_train_true, train_edges  = generate_features(train_im_paths, mbkm_model)
    print('\nGenerating Test Features:\n')
    X_test_features, y_test, test_edges = generate_features(test_im_paths, mbkm_model)
    
    train_features = np.hstack((train_features_hists, train_edges))
    X_test = np.hstack((X_test_features, test_edges))
    X_train, X_val, y_train, y_val = tts(train_features, y_train_true, test_size=0.2, random_state=SEED)

    RF_model = RF(random_state=SEED, max_depth=12, n_estimators=1000).fit(X_train, y_train)
    pred_test = RF_model.predict(X_test)
    RF_accuracy = f1_score(y_test, pred_test, average="weighted")
    
    LR_model = LR_ridge(max_iter=800, alpha=10e-3).fit(X_train, y_train)
    pred_test = LR_model.predict(X_test)
    LR_accuracy = f1_score(y_test, pred_test, average="weighted")
    
    SVM_model = SVC().fit(X_train, y_train)
    pred_test = SVM_model.predict(X_test)
    SVM_accuracy = f1_score(y_test, pred_test, average="weighted")
    
    return RF_accuracy, LR_accuracy, SVM_accuracy

clusters_range = np.arange(65,95, 2)
RF_acc_by_cluster = []
LR_acc_by_cluster = []
SVM_acc_by_cluster = []

for clustie in clusters_range:
    print(f'\ncluster: {clustie}')
    rf_acc, lr_acc, svm_acc = iterative_testing(clustie)
    print(f'RF f1: {rf_acc}')
    print(f'LR f1: {lr_acc}')
    print(f'SVM f1: {svm_acc}')
    RF_acc_by_cluster.append(rf_acc)
    LR_acc_by_cluster.append(lr_acc)
    SVM_acc_by_cluster.append(svm_acc)


cluster: 65
Fitting SIFT matrix to MiniBatch KMeans
Generating Train Features:

Generating feature vectors for each image using sift descriptors, kmeans model predict and histogram...
working on category: daisy...done
working on category: dandelion...done
working on category: rose...done
working on category: sunflower...done
working on category: tulip...done

Generating Test Features:

Generating feature vectors for each image using sift descriptors, kmeans model predict and histogram...
working on category: daisy...done
working on category: dandelion...done
working on category: rose...done
working on category: sunflower...done
working on category: tulip...done
RF f1: 0.5206279252397833
LR f1: 0.521714951522194
SVM f1: 0.5179768535281327

cluster: 67
Fitting SIFT matrix to MiniBatch KMeans
Generating Train Features:

Generating feature vectors for each image using sift descriptors, kmeans model predict and histogram...
working on category: daisy...done
working on category: dandelion..

KeyboardInterrupt: 

In [None]:
x_range = len(clusters_range)
plt.plot(x_range, RF_acc_by_cluster)
plt.plot(x_range, LR_acc_by_cluster)
plt.plot(x_range, SVM_acc_by_cluster)
plt.xlabel('f1-score')
plt.ylabel('cluster count')
plt.show()