## Imports and Paths

In [2]:
%matplotlib inline
import os

import bcolz
import numpy as np
from keras import backend as K
from keras import regularizers
from keras.applications import VGG16
#from keras.applications import InceptionResNetV2
from keras.applications.vgg16 import preprocess_input
from keras.layers import Dense, BatchNormalization, Activation, Flatten, Dropout, Conv2D, MaxPooling2D
from keras.models import Model, Sequential
from keras.optimizers import Adam, Nadam
from keras.preprocessing import image
from keras.utils.np_utils import to_categorical
from matplotlib import pyplot as plt
from PIL import Image
from scipy.misc import toimage

# Project root. Every other location is derived from it, so moving the project
# means changing one value. Set the STATE_FARM_ROOT environment variable to
# override the default without editing the notebook.
# os.path.join(root, '') guarantees a trailing separator, which matters because
# file names are later built with plain string concatenation on model_path.
model_path = os.path.join(
    os.environ.get('STATE_FARM_ROOT',
                   '/home/arman/deep-learning/kaggle-competitions/kaggle-state-farm-notebook/'),
    '')

# Image folders read by flow_from_directory (one sub-folder per class).
train_path = model_path + 'data/train/'
valid_path = model_path + 'data/valid/'
test_path = model_path + 'data/test/'

Using Theano backend.
Using cuDNN version 5103 on context None
Mapped name None to device cuda: Tesla K80 (30FC:00:00.0)


## Precompute VGG conv layer output

In [6]:
# Number of augmented passes over the training set whose conv features are
# precomputed, in addition to the single unaugmented pass. The label array is
# tiled (number_of_augmented_sets + 1) times to match.
number_of_augmented_sets = 2

In [16]:
# Single source of truth for the batch size: the generators and the step
# computation below must agree, otherwise predict_generator covers too few or
# too many images.
batch_size = 32

# Convolutional part of VGG16 (no dense top) with ImageNet weights and
# channels-first input.
base_model = VGG16(include_top=False, weights='imagenet', input_tensor=None,
                   input_shape=(3, 224, 224), pooling=None, classes=1000)

# NOTE(review): neither generator is given a preprocessing_function, so the
# imported `preprocess_input` is never applied to the images fed to VGG16 -
# confirm this is intended.
train_gen = image.ImageDataGenerator(rotation_range=15, width_shift_range=0.10, height_shift_range=0.05, # zoom_range=0.1,
                                     shear_range=0.1, channel_shift_range=20)
gen = image.ImageDataGenerator()

# shuffle=False keeps the images in directory order so that the predicted
# features line up with `<iterator>.classes`.
flow_kwargs = dict(target_size=(224, 224), shuffle=False, batch_size=batch_size, class_mode='categorical')
train_data = gen.flow_from_directory(train_path, **flow_kwargs)
train_data_augm = train_gen.flow_from_directory(train_path, **flow_kwargs)
valid_data = gen.flow_from_directory(valid_path, **flow_kwargs)

# Number of batches needed to see every image exactly once (last batch may be partial).
# float() keeps the division exact even under Python 2 integer semantics.
tsteps = int(np.ceil(train_data.samples / float(batch_size)))
vsteps = int(np.ceil(valid_data.samples / float(batch_size)))

# NOTE(review): with workers > 1 some Keras 2.0.x releases do not guarantee that
# batches come back in generator order - verify against the installed version.
train_output = base_model.predict_generator(train_data, tsteps, workers=3)            # VGG Conv layers output of unaltered train data
print('Normal train data conv features generated.')

Found 18533 images belonging to 10 classes.
Found 18533 images belonging to 10 classes.
Found 3891 images belonging to 10 classes.
Normal train data conv features generated.


In [8]:
# The augmented iterator is created in another cell and keeps its position
# between calls. Rewind it so that re-running this cell always starts at the
# first image; otherwise the features would no longer line up with the tiled
# label array built from train_data.classes.
train_data_augm.reset()

# Each pass of `tsteps` batches covers the training set once, so this yields
# `number_of_augmented_sets` differently-augmented copies of every image.
train_output_augm = base_model.predict_generator(train_data_augm, number_of_augmented_sets * tsteps, workers=3)   # VGG Conv layers output of augmented train data
print('Augmented train data conv features generated.')

Augmented train data conv features generated.


In [24]:
def _save_array(file_name, array):
    """Write `array` as a bcolz carray at model_path/file_name, replacing any existing copy."""
    carray = bcolz.carray(array, rootdir=os.path.join(model_path, file_name), mode='w')
    carray.flush()


# Append the augmented features only once: without this guard a second run of
# the cell would concatenate them again while y_train is rebuilt at its normal
# size, leaving features and labels with different lengths.
if train_output.shape[0] == train_data.samples:
    train_output = np.concatenate([train_output, train_output_augm])

# Labels follow directory order (shuffle=False); repeat them once per pass
# (1 unaugmented + number_of_augmented_sets augmented).
y_train = to_categorical(train_data.classes)
y_train = np.concatenate([y_train] * (number_of_augmented_sets + 1))

# Rewind the validation iterator (created in another cell) so its batches start
# at the first image and match valid_data.classes even when this cell is re-run.
valid_data.reset()
valid_output = base_model.predict_generator(valid_data, vsteps)                       # VGG Conv layers output of validation data
print('Validation data conv features generated.')
y_valid = to_categorical(valid_data.classes)

# Persist features and labels so the dense layers can be trained later without
# recomputing the conv features.
_save_array('VGGTrainOutput.bcolz', train_output)
_save_array('VGGValidOutput.bcolz', valid_output)
_save_array('VGGTrainClasses.bcolz', y_train)
_save_array('VGGValidClasses.bcolz', y_valid)
print('All conv features generated and saved.')

Validation data conv features generated.
All conv features generated and saved.


In [25]:
# Report the shapes of the precomputed tensors.
print('number of augments: ' + str(number_of_augmented_sets))
print('train_output:      ' + str(train_output.shape))
print('train_output_augm: ' + str(train_output_augm.shape))
print('valid_output:      ' + str(valid_output.shape))
print('y_train:           ' + str(y_train.shape))

# The combined feature tensor must hold one row per image per pass and must
# have exactly as many rows as the label tensor.
expected_rows = train_data.samples * (number_of_augmented_sets + 1)
if train_output.shape[0] == expected_rows and train_output.shape[0] == y_train.shape[0]:
    print('Training data VGG16 conv feature tensor with augmentation is of correct size and matches size of y tensor.')
else:
    # Previously a mismatch produced no output at all, which is easy to overlook.
    print('WARNING: size mismatch - train_output has ' + str(train_output.shape[0]) +
          ' rows, expected ' + str(expected_rows) +
          ', y_train has ' + str(y_train.shape[0]) + ' rows.')

number of augments: 2
train_output:      (55599, 512, 7, 7)
train_output_augm: (37066, 512, 7, 7)
valid_output:      (3891, 512, 7, 7)
y_train:           (55599, 10)
Training data VGG16 conv feature tensor with augmentation is of correct size and match size of y tensor.


## Fully Connected Layers

In [29]:
# Classifier head trained on the precomputed VGG16 conv features:
# flatten -> [BatchNorm -> Dense(512, relu) -> Dropout] x 2 -> BatchNorm -> softmax over 10 classes.
# (L2 kernel regularisation on the dense layers is left disabled.)
dropout_rate = 0.6
conv_feature_shape = base_model.layers[-1].output_shape[1:]   # output shape of the last conv-base layer, batch axis dropped

fc_model = Sequential([
    Flatten(input_shape=conv_feature_shape),
    BatchNormalization(),
    Dense(512, activation='relu'),
    Dropout(dropout_rate),
    BatchNormalization(),
    Dense(512, activation='relu'),
    Dropout(dropout_rate),
    BatchNormalization(),
    Dense(10, activation='softmax'),
])
fc_model.compile(
    optimizer=Adam(lr=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

## Training Code with Precomputed Data

In [None]:
# TODO: put the pre-computed data loading code here (the conv features and the
# label arrays are written as bcolz arrays under model_path by the precompute section).

In [30]:
# First training round of the dense head on the precomputed conv features,
# using the learning rate set when the model was compiled.
fc_model.fit(
    x=train_output,
    y=y_train,
    epochs=2,
    validation_data=(valid_output, y_valid),
)

Train on 55599 samples, validate on 3891 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fcd2274a940>

In [31]:
# Drop the learning rate by 10x for a second round of training.
# K.set_value goes through the Keras backend API instead of calling the Theano
# shared-variable method directly, so the cell does not depend on the backend.
K.set_value(fc_model.optimizer.lr, 0.00001)
fc_model.fit(train_output, y_train, epochs=2, validation_data=(valid_output,y_valid))

Train on 55599 samples, validate on 3891 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fcdd30257f0>

In [33]:
# Save the dense-head weights next to the notebook. os.path.join (as used for
# the bcolz arrays) yields a valid path whether or not model_path ends with a
# separator.
fc_model.save_weights(os.path.join(model_path, 'state_farm.h5'))

## Pseudo Labeling

In [34]:
# Fraction (0-1, despite the name) of the test batches to pseudo-label.
test_set_percent_use = 0.2
gen = image.ImageDataGenerator()
test_data = gen.flow_from_directory(test_path, target_size=(224,224), shuffle=False, batch_size=32, class_mode=None)

# Whole number of batches to pseudo-label. The step count must be an integer,
# and it gets its own name so the training-set `tsteps` is not overwritten
# (the augmented-feature cell depends on that value when re-run).
# With 79,726 test images this is 499 batches of 32 = 15,968 images.
pseudo_steps = int(np.ceil(test_set_percent_use * np.ceil(test_data.samples / 32.0)))
test_output = base_model.predict_generator(test_data, pseudo_steps, workers=3)    # conv features of the first ~20% of the test data

# Use the current model's predictions as soft labels for the validation and test features.
# NOTE(review): the pseudo-labelled validation features are added to the training
# data, so later validation scores on the same set are likely optimistic - confirm
# this is acceptable.
y_valid_pseudo = fc_model.predict(valid_output)
y_test_pseudo = fc_model.predict(test_output)
train_combined = np.concatenate([train_output, valid_output, test_output])
y_combined = np.concatenate([y_train, y_valid_pseudo, y_test_pseudo])
print('Partial test data loaded and pseudo-labeled.')

Found 79726 images belonging to 1 classes.
Partial test data loaded and pseudo-labeled.


In [36]:
# Train one more epoch on real + pseudo-labelled data at the reduced learning rate.
# K.set_value goes through the Keras backend API instead of calling the Theano
# shared-variable method directly, so the cell does not depend on the backend.
K.set_value(fc_model.optimizer.lr, 0.00001)
fc_model.fit(train_combined, y_combined, epochs=1, validation_data=(valid_output, y_valid))

Train on 75458 samples, validate on 3891 samples
Epoch 1/1


<keras.callbacks.History at 0x7fcd2274aa58>

## Kaggle submission code

In [40]:
# Un-augmented iterator over the full test set, in directory order (no shuffling),
# plus the number of batches needed to cover every image once.
submission_batch_size = 32
gen = image.ImageDataGenerator()
test_data = gen.flow_from_directory(
    test_path,
    target_size=(224, 224),
    batch_size=submission_batch_size,
    class_mode=None,
    shuffle=False,
)
steps = int(np.ceil(test_data.samples / submission_batch_size))

Found 79726 images belonging to 1 classes.


In [41]:
# The test iterator is created in another cell and keeps its position between
# calls. Rewind it so the predicted rows always start at the first file and
# stay aligned with test_data.filenames when this cell is re-run.
test_data.reset()

# NOTE(review): with workers > 1 some Keras 2.0.x releases do not guarantee that
# batches come back in generator order - verify against the installed version,
# since the submission pairs these rows with file names by position.
test_output = base_model.predict_generator(test_data, steps, workers=3)

# Class probabilities for every test image, one row per file.
pre_gen = fc_model.predict(test_output)

In [None]:
# Optional clipping of the predicted probabilities before submission (disabled).
# NOTE(review): the predictions are stored in `pre_gen`; `pred_gen` below is not
# defined anywhere in this notebook, so fix the name before enabling this line.
#probabilities = pred_gen.clip(min=0.05, max=0.95)

In [42]:
# Image ids for the submission: the bare file name of every test image.
# os.path.basename strips the class sub-folder whatever its name is, instead of
# assuming the prefix is exactly eight characters long.
ids = np.asarray([os.path.basename(name) for name in test_data.filenames]).reshape(-1, 1)

# Both arrays must have the same number of rows (one per test image).
print(pre_gen.shape)
print(ids.shape)

(79726, 10)
(79726, 1)


In [43]:
# Put the id column in front of the ten probability columns: one row per test image.
results = np.hstack((ids, pre_gen))
results.shape

(79726, 11)

In [44]:
# Write the Kaggle submission next to the notebook.
submission_file_name = os.path.join(model_path, 'submission2.csv')

# comments='' is required: by default np.savetxt prefixes the header with '# ',
# which would turn the first column name into '# img'.
# fmt='%s' writes each value as-is; '%5s' left-padded short values with spaces.
np.savetxt(submission_file_name, results, fmt='%s', delimiter=',',
           header='img,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9', comments='')