# Prepare Dataset

## Download Dataset

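Download the dataset from https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition and extract it next to this notebook. The cells below expect the training images in `train/`, the test images inside a subfolder of `dataset/test/`, and `sample_submission.csv` in the working directory.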

## Split Training and Validation Sets

In [None]:
import glob
# Collect the paths of every file in train/ whose name starts with "cat" / "dog".
cats = glob.glob('train/cat*.*')
dogs = glob.glob('train/dog*.*')

In [None]:
import random
from random import shuffle

# Seed for the train/validation split; change it to draw a different split.
SPLIT_SEED = 42

# glob returns paths in arbitrary order, so sort before shuffling: together with
# the fixed seed this makes the split reproducible across runs and machines.
cats.sort()
dogs.sort()
random.seed(SPLIT_SEED)
shuffle(cats)
shuffle(dogs)

In [None]:
# Fraction of each class assigned to the training set.
train_perc = 0.85
# int() truncates, so the training count is rounded down for each class.
n_train_cats = int(train_perc * len(cats))
n_train_dogs = int(train_perc * len(dogs))

In [None]:
# The first n_train_* paths of each list form the training set; the remainder is the validation set.
cats_train, cats_val = cats[:n_train_cats], cats[n_train_cats:]
dogs_train, dogs_val = dogs[:n_train_dogs], dogs[n_train_dogs:]

## Organize Images into Train and Validation Folders

In [None]:
import os
import pathlib
def move_files_to_folder(files, dest_folder):
    """Move every path in ``files`` into ``dest_folder``, keeping the file names.

    Args:
        files: Iterable of paths of the files to move.
        dest_folder: Target directory; created (including parents) if missing.
    """
    import shutil  # local import: shutil is not imported elsewhere in the notebook

    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(dest_folder, exist_ok=True)
    for file in files:
        dest = str(pathlib.Path(dest_folder) / pathlib.Path(file).name)
        # Unlike os.rename, shutil.move also works when source and destination
        # live on different filesystems.
        shutil.move(file, dest)

In [None]:
# Move the training images into one sub-folder per class under dataset/train/.
train_path = 'dataset/train/'
move_files_to_folder(cats_train, train_path + 'cat/')
move_files_to_folder(dogs_train, train_path + 'dog/')

In [None]:
# Move the validation images into one sub-folder per class under dataset/validation/.
val_path = 'dataset/validation/'
move_files_to_folder(cats_val, val_path + 'cat/')
move_files_to_folder(dogs_val, val_path + 'dog/')

In [None]:
# Count the files found one level below each split directory (i.e. in the class sub-folders).
ntrain_examples = len(glob.glob(train_path + '*/*.*'))
nval_examples = len(glob.glob(val_path + '*/*.*'))
print(ntrain_examples, nval_examples)

# Train Model

## Load a Model Pretrained on ImageNet

### ResNet50

In [None]:
import numpy as np
from keras.models import Sequential
from keras.applications import ResNet50
from keras.layers.core import Lambda

def preprocess(x):
    """Center the channels on fixed mean values and reverse the channel order.

    Subtracts one constant per channel (broadcast over the leading axes) from
    ``x`` and returns the result with its last axis reversed.

    NOTE(review): the constants look like the ImageNet RGB channel means, which
    would make this an RGB -> BGR conversion -- confirm against the input
    format the pretrained weights expect.
    """
    channel_means = np.array([123.68, 116.779, 103.939]).reshape((1, 1, 3))
    centered = x - channel_means
    return centered[..., ::-1]

# Feature extractor: the preprocessing step followed by ResNet50 without its
# classification top, initialised with ImageNet weights.
ptmodel = Sequential([
    Lambda(preprocess, input_shape=(256, 256, 3), output_shape=(256, 256, 3)),
    ResNet50(include_top=False, weights='imagenet', input_shape=(256, 256, 3))
])

# Mark every layer of the pretrained model as non-trainable.
for layer in ptmodel.layers:
    layer.trainable = False

ptmodel.summary()

## Precompute Pretrained Model's Outputs

In [None]:
from keras.preprocessing.image import ImageDataGenerator
# Image generator created with default settings (no options are passed).
generator = ImageDataGenerator()

In [None]:
# Batches of 50 images resized to 256x256, read from the class sub-folders.
# The training iterator is shuffled; the validation iterator is not.
train_generator = generator.flow_from_directory(train_path, target_size=(256, 256), batch_size=50, shuffle=True)
validation_generator = generator.flow_from_directory(val_path, target_size=(256, 256), batch_size=50, shuffle=False)

In [None]:
import numpy as np
import bcolz

def precompute_model_output(model, generator, steps):
    """Run ``model`` over ``steps`` batches drawn from ``generator``.

    Each batch is an ``(inputs, targets)`` pair obtained with ``next()``. The
    model's predictions and the targets are collected batch by batch and then
    joined along the first axis.

    Returns:
        A tuple ``(outputs, targets)`` of concatenated arrays.
    """
    outputs, targets = [], []
    for _ in range(steps):
        batch_inputs, batch_targets = next(generator)
        outputs.append(model.predict(batch_inputs))
        targets.append(batch_targets)
    return np.concatenate(outputs, axis=0), np.concatenate(targets)

def save_array(fname, arr):
    """Store ``arr`` on disk as a bcolz carray rooted at directory ``fname``.

    Args:
        fname: Directory used as the carray's ``rootdir``; created (including
            parents) if it does not exist yet.
        arr: Array-like data to persist.
    """
    # exist_ok replaces the try/except-pass: an existing directory is fine, while
    # a path that exists but is not a directory is reported instead of swallowed.
    os.makedirs(fname, exist_ok=True)
    c = bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()

def load_array(fname):
    """Open the bcolz container stored at ``fname`` and return its full ``[:]`` slice."""
    stored = bcolz.open(fname)
    return stored[:]

In [None]:
# Round up so the final, smaller batch is included when the number of examples
# is not a multiple of the batch size (floor division would silently drop it).
train_steps = (ntrain_examples + train_generator.batch_size - 1) // train_generator.batch_size
x_train, y_train = precompute_model_output(ptmodel, train_generator, train_steps)
print(x_train.shape, y_train.shape)

In [None]:
# Round up so the final, smaller batch is included when the number of examples
# is not a multiple of the batch size (floor division would silently drop it).
validation_steps = (nval_examples + validation_generator.batch_size - 1) // validation_generator.batch_size
x_val, y_val = precompute_model_output(ptmodel, validation_generator, validation_steps)
print(x_val.shape, y_val.shape)

In [None]:
# Persist the precomputed model outputs and their labels under .cache/.
save_array('.cache/x_train', x_train)
save_array('.cache/y_train', y_train)
save_array('.cache/x_val', x_val)
save_array('.cache/y_val', y_val)

## Load Precomputed ResNet's Outputs

In [None]:
# Read the training arrays back from .cache/ (rebinds x_train / y_train).
x_train = load_array('.cache/x_train')
y_train = load_array('.cache/y_train')
print(x_train.shape, y_train.shape)

In [None]:
# Read the validation arrays back from .cache/ (rebinds x_val / y_val).
x_val = load_array('.cache/x_val')
y_val = load_array('.cache/y_val')
print(x_val.shape, y_val.shape)

## Finetune Pretrained Model

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, Reshape
from keras.layers.core import Dropout

# Small classifier head: takes (1, 1, 2048) inputs, flattens them to 2048 values,
# applies dropout (rate 0.7) and ends in a 2-unit softmax layer.
# NOTE(review): the input shape presumably matches ptmodel's output -- confirm with ptmodel.summary().
input_layer = Input(shape=(1, 1, 2048,))
x = Reshape((2048,))(input_layer)
x = Dropout(0.7)(x)
output = Dense(2, activation='softmax')(x)

model = Model(input_layer, output)
model.summary()

## Train Model

In [None]:
# Categorical cross-entropy loss with the Adam optimizer; accuracy is reported as a metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# One epoch over the training arrays in batches of 256, validating on (x_val, y_val).
model.fit(x_train, y_train, batch_size=256, epochs=1, verbose=1, validation_data=(x_val, y_val))

In [None]:
from keras import backend as K

# Update the learning-rate variable in place. Rebinding `model.optimizer.lr` to a
# plain float replaces the attribute but may not reach the training function that
# Keras already built during the previous fit() call, so the new rate could be
# silently ignored.
K.set_value(model.optimizer.lr, 0.01)
model.fit(x_train, y_train, batch_size=256, epochs=1, verbose=1, validation_data=(x_val, y_val))

## Evaluate Model

In [None]:
# Evaluate on the training arrays; the result is displayed as the cell output.
model.evaluate(x_train, y_train)

In [None]:
# Evaluate on the validation arrays; the result is displayed as the cell output.
model.evaluate(x_val, y_val)

# Make Submission File

## Assemble Full Model

In [None]:
# Chain the pretrained feature extractor and the trained classifier head so the
# combined model maps images directly to class probabilities.
full_model = Sequential([
    ptmodel,
    model
])

full_model.summary()

## Compute Solution

In [None]:
# Test images are read from the sub-folders of dataset/test/ in batches of 50,
# resized to 256x256. class_mode=None and shuffle=False are passed so that no
# labels are produced and the iteration order is not shuffled.
test_path = 'dataset/test/'
generator = ImageDataGenerator()
test_generator = generator.flow_from_directory(test_path, target_size=(256, 256), batch_size=50, class_mode=None, shuffle=False)

In [None]:
ntest_examples = len(glob.glob(test_path + '*/*.*'))
# Round up so the final, smaller batch is predicted too; with floor division the
# trailing images would get no prediction whenever the count is not a multiple
# of the batch size.
test_steps = (ntest_examples + test_generator.batch_size - 1) // test_generator.batch_size
y_pred = full_model.predict_generator(test_generator, test_steps, verbose=1)

## Write Submission File

In [None]:
import pandas as pd

# The competition expects the probability that an image shows a dog. Look up the
# "dog" column from the training iterator's class mapping instead of assuming
# column 0 (class folders are indexed alphabetically, so column 0 is "cat").
dog_index = train_generator.class_indices['dog']

# Predictions come back in the iterator's file order, which is not guaranteed to
# be the numeric id order of sample_submission.csv (e.g. "1", "10", "100", ...).
# Recover each image id from its file name and join on it.
test_ids = [int(os.path.splitext(os.path.basename(name))[0]) for name in test_generator.filenames]
assert len(test_ids) == len(y_pred), 'number of predictions does not match number of test images'
dog_prob = pd.Series(y_pred[:, dog_index], index=test_ids)

submission = pd.read_csv('sample_submission.csv')
submission['label'] = submission['id'].map(dog_prob)
assert submission['label'].notnull().all(), 'some ids in sample_submission.csv have no prediction'
submission.to_csv('my_kaggle_submission.csv', index=False)

Cool!