In [123]:
%load_ext autoreload
%autoreload 2

import numpy as np
import os
import pandas as pd
from sklearn.metrics import accuracy_score

from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import load_img, img_to_array
from keras.models import load_model
from keras.preprocessing import image
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

import models.simpleConv as simpleConv
from imgaug import augmenters as iaa

import h5py
from imgaug import augmenters as iaa

import preprocessing as pp
# Run the preprocessing module's setup (presumably what prints the
# "Checking to make sure output directories are created.." message — confirm in pp.init).
pp.init()

# os.listdir() returns entries in arbitrary order, so sort them: later cells slice this
# list (e.g. allTrainingFolders[0:5]) and must select the same folders on every run and
# on every machine. Only entries whose name starts with 'n' are kept.
allTrainingFolders = sorted(
    folder for folder in os.listdir(pp.trainPath) if folder.startswith('n')
)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Checking to make sure output directories are created..
..done


# Parsing & Data Loading

In [68]:
encoder = LabelBinarizer()
# Fit the encoder on ALL training folders (allTrainingFolders), even when training on a
# subset/batch, so that every batch is encoded against the same set of classes.
encoder = encoder.fit(allTrainingFolders)

parse = False             # True: run pp.parseImages on trainingFolders before loading
includeAugmented = True   # True: generate augmented samples with pp.augmentData
onlyAugmented = False     # True: train on the augmented samples only
augments = [iaa.GaussianBlur(3.0)]

# onlyAugmented is only consulted inside the includeAugmented branch below. Without this
# check, onlyAugmented=True combined with includeAugmented=False would be silently ignored
# and the model would train on the un-augmented data.
if onlyAugmented and not includeAugmented:
    raise ValueError("onlyAugmented=True requires includeAugmented=True")

trainingFolders = allTrainingFolders[0:5]

if parse:
    pp.parseImages(trainingFolders)

#TODO: ADD BATCH SPLITTING HERE. Subset trainingfolders, load and fit.


#Load in data
x_train, y_train, x_val, y_val = pp.loadH5s(trainingFolders)

# x_aug / y_aug stay empty when augmentation is disabled.
x_aug = []
y_aug = []
if includeAugmented:
    x_aug, y_aug = pp.augmentData(x_train, y_train, augments = augments)
    if onlyAugmented:
        # Replace the loaded samples with their augmented counterparts.
        x_train = x_aug
        y_train = y_aug
    else:
        # Append the augmented samples to the loaded ones (in place).
        x_train.extend(x_aug)
        y_train.extend(y_aug)


y_train = encoder.transform(y_train) #call encoder.inverse_transform() to get real class labels
y_val = encoder.transform(y_val)


# MODEL

In [76]:
# Training configuration: a single epoch and no callbacks.
epochs = 1
callbacks = []

# NOTE(review): the literal 3 is presumably the number of image channels — confirm against
# simpleConv.build_model.
model = simpleConv.build_model(pp.img_width, pp.img_height, 3)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Inputs and labels are converted with np.array() before being handed to fit();
# the returned object is kept in `results`.
results = model.fit(
    np.array(x_train),
    np.array(y_train),
    validation_data=(np.array(x_val), np.array(y_val)),
    epochs=epochs,
    callbacks=callbacks,
    verbose=1,
)


Train on 4140 samples, validate on 519 samples
Epoch 1/1


In [78]:
# Run the trained model on the validation inputs; predictions[i] holds the model's output
# scores for validation sample i.
predictions = model.predict(np.array(x_val))

In [92]:
# Spot-check the raw model output for a single validation sample
# (index 500 appears to be an arbitrary pick; it requires at least 501 validation samples).
predictions[500]

array([0.03294557, 0.00764119, 0.5023323 , 0.26514718, 0.19193377],
      dtype=float32)

In [129]:
# NOTE(review): this expression is not the last statement of the cell, so its return value
# is not displayed; move it to its own cell if the output is wanted.
pp.topClasses(predictions[503], encoder.classes_)

def predictionsToDataframe(predictions, truth, encoder):
    """Tabulate the true label and the three top-ranked predictions per sample.

    Args:
        predictions: iterable of per-sample score vectors; each one is passed to
            ``pp.topClasses`` together with ``encoder.classes_``.
        truth: encoded ground-truth labels; they are decoded with
            ``encoder.inverse_transform`` and then mapped through ``pp.translateID``.
        encoder: fitted label encoder exposing ``classes_`` and ``inverse_transform``.

    Returns:
        pandas.DataFrame with the columns ``truth``, ``one``, ``two`` and ``three``.
        The last three hold element ``[0]`` of the first, second and third entries
        returned by ``pp.topClasses`` (presumably the class name, ranked best-first —
        confirm against ``pp.topClasses``).
    """
    rank_columns = ('one', 'two', 'three')

    table = pd.DataFrame()
    table['truth'] = [pp.translateID(label) for label in encoder.inverse_transform(truth)]

    # Collect the label of each ranked entry, one list per output column.
    ranked = {column: [] for column in rank_columns}
    for scores in predictions:
        best = pp.topClasses(scores, encoder.classes_)
        for position, column in enumerate(rank_columns):
            ranked[column].append(best[position][0])

    for column in rank_columns:
        table[column] = ranked[column]
    return table

# Build the per-sample table of true label vs. top-three predicted labels for the validation set.
df = predictionsToDataframe(predictions, y_val, encoder)

In [140]:
# Top-1: the true label equals the highest-ranked prediction.
top1_hit = df['truth'] == df['one']
acc = int(top1_hit.sum()) / len(df)
print("Top-1 accuracy: {0}".format(acc))

# Top-3: the true label appears among the three highest-ranked predictions.
# `acc` is re-used, so after this cell it holds the top-3 value.
top3_hit = top1_hit | (df['truth'] == df['two']) | (df['truth'] == df['three'])
acc = int(top3_hit.sum()) / len(df)
print("Top-3 accuracy: {0}".format(acc))


Top-1 accuracy: 0.605009633911368
Top-3 accuracy: 0.9460500963391136


Steps
1. Parse the image data for every folder that you want to train on.
2. Load each batch and, for each one:
    3. Apply the augments to the batch, if any.
    4. Train on the batch.
5. Evaluate the model.

array(['n01440764', 'n01443537', 'n01484850', 'n01491361', 'n01494475'],
      dtype='<U9')