Note: the data used in this project can be downloaded from: https://www.dropbox.com/s/cst9awcjpp08k33/50_categories.tar.gz

## Imports

In [220]:
from skimage import feature
from skimage.io import imread
from itertools import combinations
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
%matplotlib inline

## Compute Features from Data

In [206]:
def feature_extract(img_file):
    
    # read image
    im_arr = imread(img_file)
    
    # check if image has only one color channel and if so stack the image three times to ensure that there are three
    # (unfortunately) identical color channels
    if len(im_arr.shape) == 2:
        im_arr = np.dstack((im_arr, im_arr, im_arr))
    
    ## instatiate list (later to convert to array) to hold features
    features = []
    
    # first do some dumb features:
    
    # calculate the mean in each color
    color_means = im_arr.mean(axis=1).mean(axis=0)
    
    # use the ratio of maximum value in each color to the mean of each color
    color_max_div_mean = im_arr.max(axis=1).max(axis=0) / color_means
    features += list(color_max_div_mean)
     
    # use the ratio of standard deviations in each color to mean in each color as another set of features
    color_std_div_mean = im_arr.std(axis=1).mean(axis=0) / color_means
    features += list(color_std_div_mean)
    
    # use ratios of means, and correlation coefficients between flattened as additional features
    mean_ratios = []
    corr_coefs = []
    for idx_pair in combinations(range(3), 2):
        mean_ratios.append(color_means[idx_pair[0]] / color_means[idx_pair[1]])
        corr_coefs.append(np.corrcoef(im_arr[:,:,idx_pair[0]].flatten(), im_arr[:,:,idx_pair[1]].flatten())[0,1])
    features += mean_ratios + corr_coefs
    
    return np.array(features)

## Run Feature Extraction

In [207]:
path = '50_categories/'

# get labels
labels = np.array([dr for dr in os.listdir(path) if '.DS_Store' not in dr])

# process features
X = []
y = []
for label in labels:
    print('Processing label: {}'.format(label))
    for fl in [fl for fl in os.listdir(path + label) if '.DS_Store' not in fl]:
        X.append(feature_extract(path + label + '/' + fl))
        y.append(label)
        
X = np.array(X)
y = np.array(y)

Processing label: gorilla
Processing label: raccoon
Processing label: crab
Processing label: blimp
Processing label: snail
Processing label: airplanes
Processing label: dog
Processing label: dolphin
Processing label: goldfish
Processing label: giraffe
Processing label: bear
Processing label: killer-whale
Processing label: penguin
Processing label: zebra
Processing label: duck
Processing label: conch
Processing label: camel
Processing label: owl
Processing label: helicopter
Processing label: starfish
Processing label: saturn
Processing label: galaxy
Processing label: goat
Processing label: iguana
Processing label: elk
Processing label: hummingbird
Processing label: triceratops
Processing label: porcupine
Processing label: teddy-bear
Processing label: comet
Processing label: hot-air-balloon
Processing label: leopards
Processing label: toad
Processing label: mussels
Processing label: kangaroo
Processing label: speed-boat
Processing label: bat
Processing label: swan
Processing label: octop

## Perform Classification

In [218]:
# prepare training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [223]:
# determine baseline
d_clf = DummyClassifier(strategy='prior')
d_clf.fit(X_train, y_train)
d_clf.score(X_test, y_test)

0.127208480565371

In [226]:
# do random forest classification
rf_clf = RandomForestClassifier(class_weight='balanced', random_state=100)
rf_clf.fit(X_train, y_train)
rf_clf.score(X_test, y_test)

0.1696113074204947