### Assignment 3 - Invasive Species - Fine tuning

### Setup

In [1]:
%matplotlib inline

In [2]:
from __future__ import division, print_function

import os, json
from glob import glob
import pandas as pd
import numpy as np
np.set_printoptions(precision=4, linewidth=100)
from matplotlib import pyplot as plt

import utils; reload(utils)
from utils import plots

# Dataset formatting
from os import walk
import shutil

from utils import *

 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)
Using Theano backend.


In [3]:
path = 'data/invasivespecies/'

# NEVER ENABLE THIS WHEN CREATING VALIDATION SET (STEP 1)
# path = 'data/invasivespecies/sample/'

In [4]:
model_path = path+'models/'
if not os.path.exists(model_path):
    os.makedirs(model_path)

# VGG

In [None]:
batch_size=64
# batch_size=8

In [None]:
import vgg16; reload(vgg16)
from vgg16 import Vgg16

In [None]:
vgg = Vgg16()

In [None]:
vgg.model.summary()

#### Visualize data

In [None]:
batches = vgg.get_batches(path+'train', batch_size=4)
imgs,labels = next(batches)

In [None]:
print(imgs.shape)
print(labels.shape)

In [None]:
imgs[0, :, 0, 0]

In [None]:
plots(imgs, titles=labels)

In [None]:
vgg.predict(imgs, True)

In [None]:
vgg.classes[:4]

## Setup data

In [None]:
import bcolz
def save_array(fname, arr):
    """Store `arr` as a bcolz carray rooted at `fname` (opened with mode='w') and flush it."""
    carr = bcolz.carray(arr, rootdir=fname, mode='w')
    carr.flush()


def load_array(fname):
    """Open the bcolz array at `fname` and return its full contents via `[:]`."""
    stored = bcolz.open(fname)
    return stored[:]

In [None]:
val_data = get_data(path+'valid')
trn_data = get_data(path+'train')
save_array(model_path+'train_data.bc', trn_data)
save_array(model_path+'valid_data.bc', val_data)

In [None]:
trn_data = load_array(model_path+'train_data.bc')
val_data = load_array(model_path+'valid_data.bc')

In [None]:
def onehot(x):
    """One-hot encode `x` (reshaped into a single column) and return a dense numpy array."""
    column = x.reshape(-1, 1)
    encoded = OneHotEncoder().fit_transform(column)
    return np.array(encoded.todense())


# Generators are created with shuffle=False; validation first, then training.
val_batches = get_batches(path+'valid', shuffle=False, batch_size=1)
batches = get_batches(path+'train', shuffle=False, batch_size=1)

# Class ids as reported by each generator, and their one-hot encodings.
val_classes, trn_classes = val_batches.classes, batches.classes
val_labels, trn_labels = onehot(val_classes), onehot(trn_classes)

Look at the class ratio

In [None]:
# Per-class sample counts and proportions for each split.
# The labels are one-hot rows, so an explicit column-wise sum yields one count
# per class regardless of which `sum` the star-import from utils left in scope.
val_counts = val_labels.sum(axis=0)
print('Validation distribution: ', val_counts)
print('Percentages: ', val_counts / val_labels.shape[0])

trn_counts = trn_labels.sum(axis=0)
print('Training distribution: ', trn_counts)
print('Percentages: ', trn_counts / trn_labels.shape[0])

### Option 0: Use Batch Normalization and Data Augmentation

In [None]:
model = vgg.model

In [None]:
layers = model.layers

In [None]:
last_conv_idx = [index for index,layer in enumerate(layers) if type(layer) is Convolution2D][-1]

In [None]:
conv_layers = layers[:last_conv_idx+1]
conv_layers[-1].output_shape

In [None]:
print(last_conv_idx)
conv_layers

In [None]:
def get_bn_layers(p):
    """Build the layer list for the batch-normalised dense head.

    The list starts with a max-pooling layer whose input shape is the output
    shape (minus the batch axis) of the last layer in `conv_layers`, then a
    flatten, two Dense(4096, relu) -> BatchNormalization -> Dropout(p) groups,
    and a final Dense(1000, softmax).

    Args:
        p: dropout rate passed to both Dropout layers.
    """
    pool_input_shape = conv_layers[-1].output_shape[1:]
    head = [MaxPooling2D(input_shape=pool_input_shape), Flatten()]
    for _ in range(2):
        head += [Dense(4096, activation='relu'), BatchNormalization(), Dropout(p)]
    head.append(Dense(1000, activation='softmax'))
    return head

In [None]:
def load_fc_weights_from_vgg16bn(model):
    """Copy weights from a freshly built Vgg16BN into `model`'s layers.

    The Vgg16BN model is split with `split_at(..., Convolution2D)` and the
    second part is handed to `copy_weights` together with `model.layers`.
    NOTE(review): `split_at` and `copy_weights` are defined elsewhere;
    presumably the second part is the fully-connected section - confirm.
    """
    from vgg16bn import Vgg16BN
    _, fc_layers = split_at(Vgg16BN().model, Convolution2D)
    copy_weights(fc_layers, model.layers)

In [None]:
p=0.5

In [None]:
bn_model = Sequential(get_bn_layers(p))

In [None]:
load_fc_weights_from_vgg16bn(bn_model)

In [None]:
def proc_wgts(layer, prev_p, new_p):
    """Return `layer`'s weights, each multiplied by (1-prev_p)/(1-new_p).

    Args:
        layer: object exposing `get_weights()`.
        prev_p: first rate in the ratio (numerator uses 1-prev_p).
        new_p: second rate in the ratio (denominator uses 1-new_p).

    Returns:
        A new list with one rescaled entry per item of `layer.get_weights()`.
    """
    ratio = (1 - prev_p) / (1 - new_p)
    rescaled = []
    for weights in layer.get_weights():
        rescaled.append(weights * ratio)
    return rescaled

In [None]:
bn_model.summary()

In [None]:
for l in bn_model.layers:
    if type(l) is Dense:
        l.set_weights(proc_wgts(l, .5, p))

In [None]:
bn_model.save_weights(model_path+'batchnorm_original_vgg_model.bc')

### Final Model

In [None]:
# Start from the convolutional layers and freeze every one of them.
final_model = Sequential(conv_layers)
for layer in final_model.layers:
    layer.trainable = False

# Append each layer of the batch-norm head, freezing it right after it is added.
bn_layers = bn_model.layers
for layer in bn_layers:
    final_model.add(layer)
    layer.trainable = False

### Option 1: use existing bn_vgg model

In [5]:
from vgg16bn import Vgg16BN
vgg16_bn = Vgg16BN()
model = vgg16_bn.model

  mode='max')
  mode='max')
  mode='max')


## Option 2: Manual fine tuning

In [None]:
model.summary()

In [6]:
model.pop()
for layer in model.layers: layer.trainable=False

In [7]:
model.add(Dense(2, activation='softmax'))

In [8]:
opt = Adam()
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

In [9]:
batch_size=64
train_gen = image.ImageDataGenerator(rotation_range=10, width_shift_range=0.1, 
        height_shift_range=0.1, shear_range=0.15, zoom_range=0.1, 
        channel_shift_range=10., horizontal_flip=True)
batches = get_batches(path+'train', train_gen, batch_size=batch_size)
val_batches = get_batches(path+'valid', shuffle=False, batch_size=batch_size)

Found 1606 images belonging to 2 classes.
Found 1167 images belonging to 2 classes.


In [10]:
def fit_model(model, batches, val_batches, nb_epoch=1):
    """Train `model` from the `batches` generator, validating on `val_batches`.

    Args:
        model: object exposing `fit_generator`.
        batches: training generator; `batches.N` is used as samples per epoch.
        val_batches: validation generator; `val_batches.N` is used as the
            number of validation samples.
        nb_epoch: number of epochs to run (default 1).
    """
    fit_options = {
        'samples_per_epoch': batches.N,
        'nb_epoch': nb_epoch,
        'validation_data': val_batches,
        'nb_val_samples': val_batches.N,
    }
    model.fit_generator(batches, **fit_options)

In [11]:
fit_model(model, batches, val_batches, nb_epoch=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [12]:
model.save_weights(model_path+'bn_finetune_e4_1.h5')

In [None]:
model.load_weights(model_path+'bn_finetune_e4_1.h5')

In [19]:
fit_model(model, batches, val_batches, nb_epoch=2)

Epoch 1/2
Epoch 2/2


In [20]:
model.save_weights(model_path+'bn_finetune_e14_1.h5')

### Fine tuning previous layers

In [23]:
layers = model.layers
# Get the index of the first dense layer...
first_dense_idx = [index for index,layer in enumerate(layers) if type(layer) is Dense][0]
# ...and set this and all subsequent layers to trainable
for layer in layers[first_dense_idx:]: layer.trainable=True
# Keras only honours a changed `trainable` flag after the model is compiled
# again; the model has already been compiled and fitted above, so without this
# the dense layers would stay frozen during the next fit.
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

In [24]:
# `model.optimizer.lr = 0.01` would only rebind the attribute to a Python float;
# a training function that has already been built keeps reading the original
# backend variable. Update that variable in place so the new rate is really used.
# NOTE(review): `K` is assumed to be the Keras backend brought in by
# `from utils import *` - confirm.
K.set_value(model.optimizer.lr, 0.01)
fit_model(model, batches, val_batches, 4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [26]:
model.save_weights(model_path+'bn_finetune_dense_1.h5')

In [27]:
# NOTE(review): 12 is a hard-coded layer index - confirm it is the intended
# first layer to unfreeze for this architecture.
for layer in layers[12:]: layer.trainable=True
# A changed `trainable` flag and a new learning rate both need to reach the
# training function. Recompiling with a fresh Adam at the desired rate does
# both, and works regardless of what `model.optimizer.lr` currently holds.
model.compile(optimizer=Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
fit_model(model, batches, val_batches, 4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

In [None]:
model.save_weights(model_path+'manual_finetune_12th_layer_1.h5')

## Evaluating Results

In [None]:
# Evaluate the fine-tuned network (`model`). Nothing in this notebook replaces
# the output layer of `vgg.model`, so it cannot be scored against the 2-column
# `val_labels`.
model.evaluate(val_data, val_labels)

In [None]:
# Predict with the fine-tuned network (`model`) rather than `vgg.model`, which
# this notebook never fine-tunes.
preds = model.predict_classes(val_data, batch_size=batch_size)
# probs = model.predict_proba(val_data, batch_size=batch_size)[:,0]
# probs[:8]

In [None]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics` is loaded unless some other module imported it first.
import sklearn.metrics
cm = confusion_matrix(val_classes, preds)
print(cm)
print(sklearn.metrics.f1_score(val_classes, preds))

In [None]:
plot_confusion_matrix(cm, {'noninvasive':0, 'invasive':1})

### Predict results in test set

In [None]:
testFolder = path+'test/'

# another test directory embedded in the test folder
embeddedFolder = testFolder+'test/'
# Use the builtin next(): the generator's .next() method exists on Python 2 only.
_, _, files = next(os.walk(embeddedFolder))
num_test_images = len(files)
test_batch_size = 100

print(num_test_images)
# num_test_images = 8
# test_batch_size = 1

# NOTE: this rebinds `batches`, which previously held the training generator.
batches = vgg.get_batches(testFolder, batch_size=test_batch_size, shuffle=False, class_mode=None)


Let's look at our predictions...

We need to convert each test file name to its numeric id.

In [None]:
import re
filenames = batches.filenames

# Optional directory prefix ending in '/' or '\', a purely numeric file name,
# and a literal '.jpg' extension at the end of the string.
# Kept under its own name so it does not overwrite the dropout rate `p` set earlier.
_FILE_ID_PATTERN = re.compile(r'(?:.*[/\\])?([0-9]+)\.jpg$')
def find_file_id(filename):
    """Extract the numeric id from a file name such as 'test/123.jpg'.

    Args:
        filename: path whose last component is '<digits>.jpg'; directories may
            be separated by '/' or '\\', or be absent altogether.

    Returns:
        The id as an int, or -1 (after printing a message) when the name does
        not match the expected pattern.
    """
    m = _FILE_ID_PATTERN.match(filename)
    if m is None:
        print('Could not regex filename: ', filename)
        return -1
    return int(m.group(1))
# Build a real list: on Python 3 `map` is lazy, so len(file_ids) and repeated
# use further down would fail.
file_ids = [find_file_id(name) for name in filenames]

Option A: Predict values using the vgg test function (not recommended: it gives no progress feedback)

Option B: Predict values with batches and progress

In [None]:
import math
from tqdm import tqdm

predict_file = model_path + 'predict.bc'

def predict_test(net=None):
    """Predict column 1 of the model output for every test image, with a progress bar.

    Args:
        net: object whose `predict_on_batch` is called on each batch. Defaults
            to the notebook's global `model`, i.e. the fine-tuned network;
            `vgg.model` is never given the 2-way output layer in this notebook.

    Returns:
        numpy array of length `num_test_images` with one value per image in
        generator order. The array is also written to `predict_file`.
    """
    if net is None:
        net = model
    batches = vgg.get_batches(testFolder, batch_size=test_batch_size, shuffle=False, class_mode=None)

    p_results = np.zeros(num_test_images)
    current_index = 0
    # The generator is presumably endless, so the loop ends itself once every
    # slot of p_results has been filled.
    n_batches = int(math.ceil(num_test_images/test_batch_size))
    for batch in tqdm(batches, total=n_batches):
        if batch is None:
            break
        # Never write past the end of p_results, even if the generator yields
        # more images than were counted on disk.
        remaining = num_test_images - current_index
        p_true = net.predict_on_batch(batch)[:, 1][:remaining]
        new_index = current_index + p_true.shape[0]
        p_results[current_index:new_index] = p_true
        current_index = new_index
        if current_index >= num_test_images:
            break
    print(p_results)
    utils.save_array(predict_file, p_results)
    return p_results

if os.path.exists(predict_file):
    p_results = utils.load_array(predict_file)
    print('Loaded predictions from cache')
else:
    p_results = predict_test()

In [None]:
# Verify the arrays match
print(p_results.shape)
print(len(file_ids))
print(p_results)

In [None]:
rounded_results = np.rint(p_results)

In [None]:
# Clip results for better log loss
clipped_results = np.clip(p_results, 0.05, 0.95)

clipped_file = model_path + 'clip.bc'
utils.save_array(clipped_file, clipped_results)

In [None]:
# Load saved clip file (written under `model_path` by the cell above;
# `models_folder` is not defined anywhere in this notebook).
clipped_file = model_path + 'clip.bc'
clipped_results = load_array(clipped_file)

### Combine ids with labels and save

In [None]:
import pandas as pd

# Fix the column order explicitly. Reversing the dict-derived columns only
# gives (name, invasive) on pandas versions that sort dict keys; versions that
# keep insertion order would emit (invasive, name) instead.
agg = pd.DataFrame({'name': list(file_ids), 'invasive': clipped_results},
                   columns=['name', 'invasive'])
agg = agg.sort_values(['name'])
print(agg)
agg.to_csv(path + 'clipped.csv', index=False)


In [None]:
from IPython.display import FileLink
FileLink(path+'clipped.csv')

In [None]:
!kg config -g -c 'invasive-species-monitoring'
!kg submit {path+'clipped.csv'}

------