Note: the data used in this project can be downloaded from: https://www.dropbox.com/s/cst9awcjpp08k33/50_categories.tar.gz

## Imports

In [297]:
from skimage import feature, filters
from skimage.io import imread
from skimage.segmentation import felzenszwalb
from itertools import combinations
import numpy as np
import os
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, zero_one_loss
import matplotlib.pyplot as plt
%matplotlib inline

## Compute Features from Data

In [260]:
def feature_extract(img_file):
    
    # read image
    im_arr = imread(img_file)
    
    # check if image has only one color channel and if so stack the image three times to ensure that there are three
    # (unfortunately) identical color channels
    if len(im_arr.shape) == 2:
        im_arr = np.dstack((im_arr, im_arr, im_arr))
    
    ## instatiate list (later to convert to array) to hold features
    features = []
    
    # first do some dumb features:
    
    # calculate the mean in each color
    color_means = im_arr.mean(axis=1).mean(axis=0)
    
    # use the ratio of maximum value in each color to the mean of each color
    color_max_div_mean = im_arr.max(axis=1).max(axis=0) / color_means
    features += list(color_max_div_mean)
     
    # use the ratio of standard deviations in each color to mean in each color as another set of features
    color_std_div_mean = im_arr.std(axis=1).mean(axis=0) / color_means
    features += list(color_std_div_mean)
    
    # use ratios of means, and correlation coefficients between flattened as additional features
    mean_ratios = []
    corr_coefs = []
    for idx_pair in combinations(range(3), 2):
        mean_ratios.append(color_means[idx_pair[0]] / color_means[idx_pair[1]])
        corr_coefs.append(np.corrcoef(im_arr[:,:,idx_pair[0]].flatten(), im_arr[:,:,idx_pair[1]].flatten())[0,1])
    features += mean_ratios + corr_coefs
    
    # encode edge information
    for i in range(3):
        features.append(np.mean(filters.sobel(im_arr[:,:,i])))
        features.append(np.mean(filters.sobel_v(im_arr[:,:,i])))
        features.append(np.mean(filters.sobel_h(im_arr[:,:,i])))
        
    # encode segmentation information
    features.append(felzenszwalb(im_arr).mean() / im_arr.mean())
    
    return np.array(features)

## Run Feature Extraction

In [261]:
path = '50_categories/'

# get labels
labels = np.array([dr for dr in os.listdir(path) if '.DS_Store' not in dr])

# process features
X = []
y = []
for label in labels:
    print('Processing label: {}'.format(label))
    for fl in [fl for fl in os.listdir(path + label) if '.DS_Store' not in fl]:
        X.append(feature_extract(path + label + '/' + fl))
        y.append(label)
        
X = np.array(X)
y = np.array(y)

Processing label: gorilla
Processing label: raccoon
Processing label: crab
Processing label: blimp
Processing label: snail
Processing label: airplanes
Processing label: dog
Processing label: dolphin
Processing label: goldfish
Processing label: giraffe
Processing label: bear
Processing label: killer-whale
Processing label: penguin
Processing label: zebra
Processing label: duck
Processing label: conch
Processing label: camel
Processing label: owl
Processing label: helicopter
Processing label: starfish
Processing label: saturn
Processing label: galaxy
Processing label: goat
Processing label: iguana
Processing label: elk
Processing label: hummingbird
Processing label: triceratops
Processing label: porcupine
Processing label: teddy-bear
Processing label: comet
Processing label: hot-air-balloon
Processing label: leopards
Processing label: toad
Processing label: mussels
Processing label: kangaroo
Processing label: speed-boat
Processing label: bat
Processing label: swan
Processing label: octop

## Perform Classification

In [262]:
# prepare training and testing data
X_tr_tmp, X_test_tmp, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# fit and scale training data
X_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_tr_tmp)

# use scaling from training data to transform testing data
X_test = X_scaler.transform(X_test_tmp)

In [263]:
# determine baseline
d_clf = DummyClassifier(strategy='prior')
d_clf.fit(X_train, y_train)
d_clf.score(X_test, y_test)

0.14487632508833923

In [265]:
# do random forest classification with default params
rf_clf = RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=100)
rf_clf.fit(X_train, y_train)
rf_clf.score(X_test, y_test)

0.23910482921083628

In [298]:
# do grid search over parameters with random forest classifier
parameters = {'n_estimators': [10, 50, 150, 200, 300], 'max_depth': [10, 50, 100], 'min_samples_split': [2, 3, 4, 5]}
cross_val = StratifiedKFold(n_splits=8, random_state = 100)
gs = GridSearchCV(RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=100), parameters, cv = cross_val, n_jobs=-1)
gs.fit(X_train, y_train)
gs.score(X_test, y_test)

0.28150765606595995

## Evaluate Classification

In [291]:
def eval_class(clf, X_test, y_test):
    
    pred = clf.predict(X_test)
    
    print('Classification Metrics, between 0 and 1\n')
    
    print('Accuracy Score: {:.3f}'.format(accuracy_score(y_test, pred)))
    print('proportion of correct classifications - higher better\n')
    
    print('Precision Score: {:.3f}'.format(precision_score(y_test, pred, average='weighted')))
    print('tp / (tp + fp), how good at not having fp - higher better\n')
    
    print('Recall Score: {:.3f}'.format(recall_score(y_test, pred, average='weighted')))
    print('tp / (tp + fn), how good at finding positives - higher better\n')
    
    print('Zero-One Loss: {:.3f}'.format(zero_one_loss(y_test, pred)))
    print('fraction of misclassifications - smaller better')
    
    print('\nFeature Importances: {}'.format(clf.feature_importances_))

In [292]:
eval_class(rf_clf, X_test, y_test)

Classification Metrics, between 0 and 1

Accuracy Score: 0.239
proportion of correct classifications - higher better

Precision Score: 0.198
tp / (tp + fp), how good at not having fp - higher better

Recall Score: 0.239
tp / (tp + fn), how good at finding positives - higher better

Zero-One Loss: 0.761
fraction of misclassifications - smaller better

Feature Importances: [0.04494804 0.04647429 0.04719095 0.04680405 0.04236601 0.04590378
 0.04984899 0.04678795 0.05229365 0.0479259  0.04438211 0.04630491
 0.04270987 0.03983158 0.04199778 0.04423784 0.041179   0.04270349
 0.04642514 0.03777494 0.04390861 0.05800111]


In [299]:
eval_class(gs.best_estimator_, X_test, y_test)

Classification Metrics, between 0 and 1

Accuracy Score: 0.282
proportion of correct classifications - higher better

Precision Score: 0.226
tp / (tp + fp), how good at not having fp - higher better

Recall Score: 0.282
tp / (tp + fn), how good at finding positives - higher better

Zero-One Loss: 0.718
fraction of misclassifications - smaller better

Feature Importances: [0.04553319 0.04703471 0.04502573 0.04430521 0.0451352  0.0462564
 0.05085417 0.05002449 0.05180339 0.04927877 0.04613386 0.04608677
 0.04404273 0.03835433 0.0393333  0.04533137 0.03802278 0.04018042
 0.04509183 0.03764597 0.04435475 0.06017063]
