### Assignment 1 - Invasive Species

### Setup

In [1]:
%matplotlib inline

In [4]:
from __future__ import division, print_function

import os, json
from glob import glob
import pandas as pd
import numpy as np
np.set_printoptions(precision=4, linewidth=100)
from matplotlib import pyplot as plt

import utils; reload(utils)
from utils import plots

# Dataset formatting
from os import walk
import shutil

 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)
Using Theano backend.


In [5]:
path = 'data/invasivespecies/'

# NEVER ENABLE THIS WHEN CREATING VALIDATION SET (STEP 1)
# path = 'data/invasivespecies/sample/'

In [6]:
model_path = path+'models/'
if not os.path.exists(model_path):
    os.makedirs(model_path)

### Step 1: Create Validation set and sample

In [None]:
# Just testing out command line
%cd data
%cd ../

In [None]:
!kg config -g -c 'invasive-species-monitoring'

In [None]:
# Unzip training folder
originals_folder = path+'train_original/'
# !7z --help
if not os.path.exists(originals_folder):
    !7z e {path}train.7z -o{originals_folder}

In [None]:
# Unzip to test folder
test_folder = path+'test/test/'
if not os.path.exists(test_folder):
    os.makedirs(test_folder)
    !7z e {path}test.7z -o{test_folder}

# Create sample test folder
_, _, filenames = next(os.walk(test_folder))
shuf = np.random.permutation(filenames)
sample_test_folder = path+'sample/test/test/'
if not os.path.exists(sample_test_folder):
    os.makedirs(sample_test_folder)
for i in range(30):
    shutil.copyfile(test_folder+shuf[i], sample_test_folder+shuf[i])


In [None]:
# Unzip training labels
import zipfile
def unzip_file(filename, path):
    filepath = path+filename
    if os.path.isfile(filepath):
        zip_ref = zipfile.ZipFile(filepath, 'r')
        zip_ref.extractall(path)
        zip_ref.close()
        print('Unzipping file:', filepath)

unzip_file('train_labels.csv.zip', path)
unzip_file('sample_submission.csv.zip', path)

In [None]:
# Get training labels
training_labels_df = pd.read_csv(path + 'train_labels.csv')
size = training_labels_df.size
num_positives = training_labels_df['invasive'].sum()
print('Size:', size)
print('Positives: {} Percent: {}'.format(num_positives, num_positives/size))


In [None]:
# Get training files
import re

_, _, filenames = next(walk(originals_folder))


p = re.compile('^([0-9]+).jpg')
def find_file_id(filename):
    m = p.match(filename)
    if m is not None:
        return int(m.group(1))
    else:
        print('Could not regex filename: ', filename)
        return -1
file_ids = map(find_file_id, filenames)

# Merge data into one dataframe:
file_df = pd.DataFrame([file_ids, filenames], index=['name', 'file']).transpose()

In [None]:
# Label files and move to labeled folder


labeled_folder = path+'labeled/'
if not os.path.exists(labeled_folder):
    os.makedirs(labeled_folder)
    
labeled_df = pd.merge(training_labels_df, file_df, on='name')
for row in labeled_df.itertuples():
    label = 'invasive.' if row[2] == 1 else 'noninvasive.'
    file_name = row[3]
    new_file = labeled_folder+label+file_name;
    shutil.copyfile(originals_folder+file_name, new_file)
    

In [None]:
# Separate images into training and validation sets
train_folder = path+'train/'
shutil.copytree(labeled_folder, train_folder)
_, _, filenames = next(walk(train_folder))
shuf = np.random.permutation(filenames)
size = len(filenames)
print(size)

valid_folder = path+'valid/'
if not os.path.exists(valid_folder):
    os.makedirs(valid_folder)
    
sample_train_folder = path+'sample/train/'
if not os.path.exists(sample_train_folder):
    os.makedirs(sample_train_folder)
for i in range(200):
    copyfile(train_folder+shuf[i], sample_train_folder+shuf[i])
    
sample_valid_folder = path+'sample/valid/'
if not os.path.exists(sample_valid_folder):
    os.makedirs(sample_valid_folder)
for i in range(200):
    copyfile(train_folder+shuf[i], sample_valid_folder+shuf[i])
    
validation_size = int(round(size * .3))
for i in range(validation_size):
    os.rename(train_folder+shuf[i], valid_folder+shuf[i])



In [None]:
def classify_folders(folder):
    if not os.path.exists(folder+'noninvasive'):
        os.makedirs(folder+'noninvasive')
        os.makedirs(folder+'invasive')
    inv = glob(folder+'invasive.*.jpg')
    noninv = glob(folder+'noninvasive.*.jpg')
    move_files_to(inv, folder, folder+'invasive/')
    move_files_to(noninv, folder, folder+'noninvasive/')
    
def move_files_to(files, old_folder, new_folder):
    for fname in files:
        newf = fname.replace(old_folder, new_folder)
        os.rename(fname, newf)
        
classify_folders(train_folder)
classify_folders(valid_folder)
classify_folders(sample_train_folder)
classify_folders(sample_valid_folder)

# VGG

In [39]:
batch_size=64
# batch_size=8

In [40]:
import vgg16; reload(vgg16)
from vgg16 import Vgg16

In [41]:
vgg = Vgg16()

#### Visualize data

In [None]:
batches = vgg.get_batches(path+'train', batch_size=4)
imgs,labels = next(batches)

In [None]:
print(imgs.shape)
print(labels.shape)

In [None]:
imgs[0, :, 0, 0]

In [None]:
plots(imgs, titles=labels)

In [None]:
vgg.predict(imgs, True)

In [None]:
vgg.classes[:4]

## Setup data

In [10]:
import bcolz
from utils import *
def save_array(fname, arr): c=bcolz.carray(arr, rootdir=fname, mode='w'); c.flush()
def load_array(fname): return bcolz.open(fname)[:]

In [None]:
val_data = get_data(path+'valid')
trn_data = get_data(path+'train')
save_array(model_path+'train_data.bc', trn_data)
save_array(model_path+'valid_data.bc', val_data)

In [11]:
trn_data = load_array(model_path+'train_data.bc')
val_data = load_array(model_path+'valid_data.bc')

In [17]:
val_batches = get_batches(path+'valid', shuffle=False, batch_size=1)
batches = get_batches(path+'train', shuffle=False, batch_size=1)
def onehot(x): return np.array(OneHotEncoder().fit_transform(x.reshape(-1,1)).todense())
val_classes = val_batches.classes
trn_classes = batches.classes
val_labels = onehot(val_classes)
trn_labels = onehot(trn_classes)

Found 1167 images belonging to 2 classes.
Found 1606 images belonging to 2 classes.


Look at the class ratio

In [38]:
print('Valuation distribution: ', sum(val_labels))
print('Percentages: ', sum(val_labels) / val_labels.shape[0])

print('Training distribution: ', sum(trn_labels))
print('Percentages: ', sum(trn_labels) / trn_labels.shape[0])

Valuation distribution:  [ 752.  415.]
Percentages:  [ 0.6444  0.3556]
Training distribution:  [ 995.  611.]
Percentages:  [ 0.6196  0.3804]


### Option 1: Auto Finetune Model

In [50]:
def train_vgg(save_file='trained_weights.h5', nb_epoch=1):
    batch_size=64
    batches = vgg.get_batches(path+'train', batch_size=batch_size)
    val_batches = vgg.get_batches(path+'valid', batch_size=batch_size*2)
    vgg.fit(batches, val_batches, nb_epoch=nb_epoch)
    vgg.model.save_weights(model_path + save_file)

In [43]:
vgg.finetune(batches)


train_vgg()

Found 1606 images belonging to 2 classes.
Found 1167 images belonging to 2 classes.
Epoch 1/1


In [46]:
train_vgg('trained_weights_n2.h5')

Found 1606 images belonging to 2 classes.
Found 1167 images belonging to 2 classes.
Epoch 1/1


In [56]:
train_vgg('trained_weights_n3.h5')

Found 1606 images belonging to 2 classes.
Found 1167 images belonging to 2 classes.
Epoch 1/1


In [55]:
vgg.model.load_weights(model_path + 'trained_weights_n2.h5')

In [57]:
train_vgg('trained_weights_n4.h5', nb_epoch=2)

Found 1606 images belonging to 2 classes.
Found 1167 images belonging to 2 classes.
Epoch 1/2
Epoch 2/2


#### Load from Cache:

In [None]:
batch_size=64
batches = vgg.get_batches(path+'train', batch_size=batch_size)
val_batches = vgg.get_batches(path+'valid', batch_size=batch_size*2)
vgg.finetune(batches)

if os.path.exists(weights_fullpath):
    vgg.model.load_weights(weights_fullpath)
    print('Loaded model from cache')

## Option 2: Manual fine tuning

In [13]:
model = vgg.model

In [None]:
model.summary()

In [15]:
model.pop()
model.add(Dense(2, activation='softmax'))

In [71]:
gen=image.ImageDataGenerator()
batches = gen.flow(trn_data, trn_labels, batch_size=batch_size, shuffle=True)
val_batches = gen.flow(val_data, val_labels, batch_size=batch_size, shuffle=False)

In [72]:
def fit_model(model, batches, val_batches, nb_epoch=1):
    model.fit_generator(batches, samples_per_epoch=batches.N, nb_epoch=nb_epoch,
                        validation_data=val_batches, nb_val_samples=val_batches.N)

In [20]:
opt = Adam(lr=0.01)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

In [21]:
fit_model(model, batches, val_batches, nb_epoch=2)

Epoch 1/2
Epoch 2/2


In [23]:
model.save_weights(model_path+'manual_finetune1.h5')

In [None]:
model.load_weights(model_path+'manual_finetune1.h5')

### Fine tuning previous layers

In [69]:
model = vgg.model
layers = model.layers
# Get the index of the first dense layer...
first_dense_idx = [index for index,layer in enumerate(layers) if type(layer) is Dense][0]
# ...and set this and all subsequent layers to trainable
for layer in layers[first_dense_idx:]: layer.trainable=True

In [73]:
K.set_value(opt.lr, 0.01)
fit_model(model, batches, val_batches, 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [74]:
model.save_weights(model_path+'manual_finetune_dense_1.h5')

In [75]:
for layer in layers[12:]: layer.trainable=True
K.set_value(opt.lr, 0.001)

In [76]:
fit_model(model, batches, val_batches, 4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [77]:
model.save_weights(model_path+'manual_finetune_12th_layer_1.h5')

## Evaluating Results

In [24]:
vgg.model.evaluate(val_data, val_labels)



[5.7317992098431656, 0.64438731790916881]

In [25]:
preds = vgg.model.predict_classes(val_data, batch_size=batch_size)
# probs = vgg.model.predict_proba(val_data, batch_size=batch_size)[:,0]
# probs[:8]



array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.], dtype=float32)

In [27]:
import sklearn
cm = confusion_matrix(val_classes, preds)
print(cm)
print(sklearn.metrics.f1_score(val_classes, preds))

[[752   0]
 [415   0]]
0.0


In [None]:
plot_confusion_matrix(cm, {'noninvasive':0, 'invasive':1})

### Predict results in test set

In [58]:
testFolder = path+'test/'

# another test directory embeded in test folder
embeddedFolder = testFolder+'test/'
_, _, files = os.walk(embeddedFolder).next()
num_test_images = len(files)
test_batch_size = 100

print(num_test_images)
# num_test_images = 8
# test_batch_size = 1

batches = vgg.get_batches(testFolder, batch_size=test_batch_size, shuffle=False, class_mode=None)


1531
Found 1531 images belonging to 1 classes.


Let's look at our predictions...

Need to convert file names to label ids

In [59]:
import re
filenames = batches.filenames

p = re.compile('.*/([0-9]+).jpg')
def find_file_id(filename):
    m = p.match(filename)
    if m is not None:
        return int(m.group(1))
    else:
        print('Could not regex filename: ', filename)
        return -1
file_ids = map(find_file_id, filenames)

Option A: Predict values using vgg test function (Not recommended. No progress)

Option B: Predict values with batches and progress

In [79]:
import math
from tqdm import tqdm_notebook as tqdm

predict_file = model_path + 'predict.bc'

def predict_test():
    batches = vgg.get_batches(testFolder, batch_size=test_batch_size, shuffle=False, class_mode=None)

    p_results = np.zeros(num_test_images)
    current_index = 0
    # Iterative loop
    for batch in tqdm(batches, total=math.ceil(num_test_images/test_batch_size)):
        if batch is None:
            break
        p = vgg.model.predict_on_batch(batch)
        p_true = p[:, 1]
        p_size = p.shape[0]
#         print('Predictions: {}\n Size: {}'.format(p_true, p_size))
        new_index = current_index + p_size
#         print('Current index: {} New index: {} PResults: {}'.format(current_index, new_index, p_results))
        p_results[current_index:new_index] = p_true
        current_index = new_index
        if current_index >= num_test_images:
            break
    print(p_results)
    utils.save_array(predict_file, p_results)
    return p_results

if os.path.exists(predict_file):
    p_results = utils.load_array(predict_file)
    print('Loaded predictions from cache')
else:
    p_results = predict_test()


  0%|          | 0/16.0 [00:00<?, ?it/s]

Found 1531 images belonging to 1 classes.


[A

  6%|▋         | 1/16.0 [00:05<01:27,  5.81s/it][A
 12%|█▎        | 2/16.0 [00:11<01:21,  5.81s/it][A
 19%|█▉        | 3/16.0 [00:17<01:15,  5.77s/it][A
 25%|██▌       | 4/16.0 [00:23<01:09,  5.77s/it][A
 31%|███▏      | 5/16.0 [00:28<01:03,  5.76s/it][A
 38%|███▊      | 6/16.0 [00:34<00:57,  5.74s/it][A
 44%|████▍     | 7/16.0 [00:40<00:51,  5.74s/it][A
 50%|█████     | 8/16.0 [00:45<00:45,  5.73s/it][A
 56%|█████▋    | 9/16.0 [00:51<00:40,  5.72s/it][A
 62%|██████▎   | 10/16.0 [00:57<00:34,  5.73s/it][A
 69%|██████▉   | 11/16.0 [01:03<00:28,  5.75s/it][A
 75%|███████▌  | 12/16.0 [01:08<00:22,  5.75s/it][A
 81%|████████▏ | 13/16.0 [01:14<00:17,  5.76s/it][A
 88%|████████▊ | 14/16.0 [01:20<00:11,  5.75s/it][A
 94%|█████████▍| 15/16.0 [01:26<00:05,  5.74s/it][A

[  7.6409e-04   9.7042e-01   9.7979e-01 ...,   9.9461e-01   9.7114e-01   9.9493e-01]


In [80]:
# Verify the arrays match
print(p_results.shape)
print(len(file_ids))
print(p_results)

(1531,)
1531
[  7.6409e-04   9.7042e-01   9.7979e-01 ...,   9.9461e-01   9.7114e-01   9.9493e-01]


In [64]:
rounded_results = np.rint(p_results)

In [83]:
# Clip results for better log loss
clipped_results = np.clip(p_results, 0.05, 0.95)

clipped_file = model_path + 'clip.bc'
utils.save_array(clipped_file, clipped_results)

In [None]:
# Load saved clip file
clipped_file = models_folder + 'clip.bc'
clipped_results = load_array(clipped_file)

### Combine ids with labels and save

In [84]:
import pandas as pd

agg = pd.DataFrame({'name': file_ids, 'invasive': clipped_results})
agg = agg[agg.columns[::-1]]
# agg = pd.DataFrame([file_ids, clipped_results], columns=['name', 'invasive'])
agg = agg.sort_values(['name'])
print(agg)
agg.to_csv(path + 'clipped.csv', index=False)


      name  invasive
865      1  0.244312
487      2  0.950000
432      3  0.924252
688      4  0.602737
334      5  0.099905
468      6  0.950000
1163     7  0.661849
1017     8  0.050000
1135     9  0.050000
774     10  0.950000
295     11  0.950000
900     12  0.950000
1460    13  0.950000
250     14  0.050000
1206    15  0.642296
657     16  0.657809
164     17  0.950000
20      18  0.324960
1482    19  0.458225
1088    20  0.850550
837     21  0.050000
242     22  0.950000
178     23  0.050000
215     24  0.704051
411     25  0.050000
1242    26  0.050000
95      27  0.050000
605     28  0.950000
1307    29  0.950000
1479    30  0.950000
...    ...       ...
246   1502  0.946256
1378  1503  0.950000
744   1504  0.950000
1210  1505  0.050000
554   1506  0.050000
1423  1507  0.769502
1363  1508  0.950000
1021  1509  0.050000
1051  1510  0.050000
878   1511  0.050203
420   1512  0.884837
1386  1513  0.879919
988   1514  0.050000
922   1515  0.950000
1179  1516  0.050000
193   1517  0

In [None]:
from IPython.display import FileLink
FileLink(path+'clipped.csv')

In [85]:
!kg config -g -c 'invasive-species-monitoring'
!kg submit {path+'clipped.csv'}




------

## Implementing VGG with Keras Backend

In [None]:
from numpy.random import random, permutation
from scipy import misc, ndimage
from scipy.ndimage.interpolation import zoom

import keras
from keras import backend as K
from keras.utils.data_utils import get_file
from keras.models import Sequential, Model
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.layers import Input
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD, RMSprop
from keras.preprocessing import image

In [None]:
FILES_PATH = 'http://files.fast.ai/models/'
CLASS_FILE='imagenet_class_index.json'
fpath = get_file(CLASS_FILE, FILES_PATH+CLASS_FILE, cache_subdir='models')
with open(fpath) as f:
    class_dict = json.load(f)
# classes = [class_dict[i][1] for i in class_dict]
classes = [class_dict[str(i)][1] for i in range(len(class_dict))] # original

## Model creation

In [None]:
def ConvBlock(layers, model, filters):
    for i in range(layers): 
        model.add(ZeroPadding2D((1,1)))
        model.add(Convolution2D(filters, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

In [None]:
def FCBlock(model):
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))

In [None]:
# ??Convolution2D

# ??MaxPooling2D

# ??Dense

In [None]:
# Mean of each channel as provided by VGG researchers
vgg_mean = np.array([123.68, 116.779, 103.939]).reshape((3,1,1))

def vgg_preprocess(x):
    x = x - vgg_mean     # subtract mean
    return x[:, ::-1]    # reverse axis bgr->rgb

In [None]:
def VGG_16():
    model = Sequential()
    model.add(Lambda(vgg_preprocess, input_shape=(3,224,224)))

    ConvBlock(2, model, 64)
    ConvBlock(2, model, 128)
    ConvBlock(3, model, 256)
    ConvBlock(3, model, 512)
    ConvBlock(3, model, 512)

    model.add(Flatten())
    FCBlock(model)
    FCBlock(model)
    model.add(Dense(1000, activation='softmax'))
    return model

In [None]:
??MaxPooling2D

??Dense

In [None]:
model = VGG_16()

In [None]:
fpath = get_file('vgg16.h5', FILES_PATH+'vgg16.h5', cache_subdir='models')
model.load_weights(fpath)

In [None]:
batch_size=4

In [None]:
def get_batches(dirname, gen=image.ImageDataGenerator(), shuffle=True, 
                batch_size=batch_size, class_mode='categorical'):
    return gen.flow_from_directory(path+dirname, target_size=(224,224), 
                class_mode=class_mode, shuffle=shuffle, batch_size=batch_size)

In [None]:
batches = get_batches('train', batch_size=batch_size)
val_batches = get_batches('valid', batch_size=batch_size)
imgs,labels = next(batches)

# This shows the 'ground truth'
plots(imgs, titles=labels)

In [None]:
def pred_batch(imgs):
    preds = model.predict(imgs)
    idxs = np.argmax(preds, axis=1)

    print('Shape: {}'.format(preds.shape))
    print('First 5 classes: {}'.format(classes[:5]))
    print('First 5 probabilities: {}\n'.format(preds[0, :5]))
    print('Predictions prob/class: ')
    
    for i in range(len(idxs)):
        idx = idxs[i]
        print ('  {:.4f}/{}'.format(preds[i, idx], classes[idx]))

In [None]:
pred_batch(imgs)