In [7]:
import argparse
import glob
import hashlib
import os
import re
import warnings

import numpy as np
import pandas as pd
from skimage import io
import cv2

import six

from keras import applications
from keras.preprocessing.image import (ImageDataGenerator, Iterator,
                                       array_to_img, img_to_array, load_img)
from keras import optimizers
from keras.models import Sequential, Model 
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense, GlobalAveragePooling2D
from keras import backend as K
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping

from keras_tqdm import TQDMNotebookCallback

import tensorflow as tf
from tensorflow.python.platform import gfile

from tqdm import tqdm

from sklearn.metrics import fbeta_score
from sklearn.model_selection import KFold

In [10]:
# Dataset locations.
# DATA_DIR may be overridden with the AMAZON_PLANET_DATA_DIR environment
# variable so the notebook is not tied to one machine's absolute path; when the
# variable is unset the original location is used.
DATA_DIR = os.environ.get('AMAZON_PLANET_DATA_DIR', '/home/user/data/amazon_planet')
TRAIN_DIR = 'train-jpg'      # sub-directory of DATA_DIR with the training images
TRAIN_DATA = 'train_v2.csv'  # CSV (inside DATA_DIR) loaded into df_train
TEST_DIR = 'test-jpg'        # sub-directory of DATA_DIR with the test images
IMG_EXT = '.jpg'             # file extension of the image files

In [11]:
# Network input geometry: images are resized to input_size x input_size.
input_size, input_channels = 224, 3

# Default optimisation settings.
epochs, batch_size = 30, 96
learning_rate, lr_decay = 0.001, 1e-4

# Not referenced elsewhere in the visible notebook; presumably the size of a
# hold-out validation split -- TODO confirm or remove.
valid_data_size = 5000

In [15]:
# Placeholders; populated when the images are loaded.
x_train = []
x_test = []
y_train = []

# Training tags (one space-separated string per image) and the submission
# template that lists the test image names.
df_train = pd.read_csv(os.path.join(DATA_DIR, TRAIN_DATA))
df_test = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission_v2.csv'))

flatten = lambda l: [item for sublist in l for item in sublist]

# Fixed label order. It is spelled out explicitly (instead of being derived
# from a set of the observed tags) so that the column index of every label is
# identical from run to run.
labels = ['blow_down',
 'bare_ground',
 'conventional_mine',
 'blooming',
 'cultivation',
 'artisinal_mine',
 'haze',
 'primary',
 'slash_burn',
 'habitation',
 'clear',
 'road',
 'selective_logging',
 'partly_cloudy',
 'agriculture',
 'water',
 'cloudy']

# label -> column index, derived from `labels` so the two can never drift apart.
label_map = {name: index for index, name in enumerate(labels)}

# Fail early (with a readable message) if the CSV contains a tag that the
# hard-coded list does not cover; otherwise the one-hot encoding would hit a
# KeyError much later.
observed_tags = set(flatten([l.split(' ') for l in df_train['tags'].values]))
unknown_tags = sorted(observed_tags - set(labels))
assert not unknown_tags, 'tags missing from `labels`: {}'.format(unknown_tags)

In [None]:
def generate_arrays_from_file(path):
    """Endlessly yield (inputs, targets) pairs parsed from a text file.

    The file is read line by line; once the end is reached it is reopened and
    read again, so the generator never terminates on its own.

    Args:
        path: path of the text file to read.

    Yields:
        A tuple ``({'input_1': x1, 'input_2': x2}, {'output': y})`` for every
        line, where ``x1, x2, y`` are whatever ``process_line(line)`` returns.
        NOTE(review): ``process_line`` is not defined in the visible part of
        this notebook and must be provided before the generator is consumed.

    Raises:
        ValueError: if a full pass over the file produced no samples, which
            would otherwise turn into a silent infinite loop.
    """
    while True:
        produced = False
        # `with` closes the handle even when the consumer abandons the
        # generator in the middle of a pass.
        with open(path) as f:
            for line in f:
                # create numpy arrays of input data
                # and labels, from each line in the file
                x1, x2, y = process_line(line)
                produced = True
                yield ({'input_1': x1, 'input_2': x2}, {'output': y})
        if not produced:
            raise ValueError('No samples found in {!r}'.format(path))

In [5]:
def _load_resized(image_path):
    """Read an image with OpenCV and resize it to (input_size, input_size).

    Raises:
        IOError: if OpenCV cannot read the file (cv2.imread returns None in
            that case instead of raising).
    """
    img = cv2.imread(image_path)
    if img is None:
        raise IOError('Could not read image: {}'.format(image_path))
    return cv2.resize(img, (input_size, input_size))


# Local lists are used (rather than appending to x_train / y_train / x_test)
# so the cell can be re-run after those names have been rebound to arrays.
train_images = []
train_targets = []
for f, tags in tqdm(df_train.values, miniters=1000):
    # Multi-hot target vector: one slot per label, 1 for every tag of the image.
    targets = np.zeros(len(label_map))
    for t in tags.split(' '):
        targets[label_map[t]] = 1
    train_images.append(
        _load_resized(os.path.join(DATA_DIR, TRAIN_DIR, '{}{}'.format(f, IMG_EXT))))
    train_targets.append(targets)

test_images = []
for f, tags in tqdm(df_test.values, miniters=1000):
    test_images.append(
        _load_resized(os.path.join(DATA_DIR, TEST_DIR, '{}{}'.format(f, IMG_EXT))))

y_train = np.array(train_targets, np.uint8)

# Scale pixel values to [0, 1]. The division is done in place and the source
# lists are dropped right away to avoid holding extra full-size copies.
x_train = np.array(train_images, np.float32)
del train_images
x_train /= 255.
x_test = np.array(test_images, np.float32)
del test_images
x_test /= 255.

print(x_train.shape)
print(y_train.shape)

# Cross-validation bookkeeping used by the training and ensembling cells.
nfolds = 5

num_fold = 0
sum_score = 0

yfull_test = []
yfull_train = []

100%|██████████| 40479/40479 [01:10<00:00, 575.61it/s]
100%|██████████| 61191/61191 [01:45<00:00, 581.13it/s]


(40479, 224, 224, 3)
(40479, 17)


In [None]:
# Convolutional base: ResNet-50 initialised with ImageNet weights, created
# without its classification top and sized for the square inputs used here.
model = applications.resnet50.ResNet50(
    include_top=False,
    weights="imagenet",
    input_shape=(input_size, input_size, 3),
)
print('Model loaded.')

In [None]:
# Display the layer list of the loaded base model.
model.layers

In [None]:
# Freeze every layer of the pretrained base so that, once the model is
# compiled, only the newly added head is updated.
for layer in model.layers:
    layer.trainable = False
    # print() call form works on both Python 2 and Python 3.
    print('Not training: ' + str(layer))

In [None]:
# ResNet-50 classification head
# add a global spatial average pooling layer
x = model.output
x = GlobalAveragePooling2D()(x)
# let's add a fully-connected layer
x = Dense(1024, activation='relu')(x)
# Output layer: one independent probability per label.
# The targets are multi-hot (an image can carry several tags) and the model is
# trained with binary_crossentropy and thresholded per label, so the activation
# must be sigmoid. softmax would force the 17 outputs to sum to 1 and make the
# labels compete with each other.
predictions = Dense(len(labels), activation='sigmoid')(x)

In [None]:
# Combine the pretrained base and the new head into one model.
# No loss or optimizer is attached in this cell.
model_final = Model(inputs=model.input, outputs=predictions)

In [None]:
# Print the layer-by-layer summary of the assembled model.
model_final.summary()

In [None]:
# Augmenting generator for the training images.
# The feature-wise options rely on statistics computed by datagen.fit(...),
# which therefore has to be called before the generator is used.
datagen = ImageDataGenerator(
    # normalisation based on dataset statistics
    featurewise_center=True,
    featurewise_std_normalization=True,
    # random geometric augmentation
    rotation_range=90,
    height_shift_range=0.2,
    width_shift_range=0.2,
    vertical_flip=True,
    horizontal_flip=True,
)

In [None]:
kf = KFold(nfolds, shuffle=True, random_state=1)


def _predict_standardized(images, chunk_size=128):
    """Predict with model_final after applying datagen's normalization.

    datagen standardizes the training batches (feature-wise centering and
    scaling), so inputs used for inference must go through the same transform.
    The work is done chunk by chunk on copies, which leaves `images` untouched.

    Args:
        images: array of images, first axis is the sample axis.
        chunk_size: number of images predicted per call.

    Returns:
        The concatenated predictions for all images.
    """
    outputs = []
    for start in range(0, len(images), chunk_size):
        # np.array(...) copies the slice; standardize() works in place.
        chunk = np.array(images[start:start + chunk_size], dtype=np.float32)
        outputs.append(model_final.predict_on_batch(datagen.standardize(chunk)))
    return np.concatenate(outputs)


for train_index, test_index in kf.split(x_train):
    X_train = x_train[train_index]
    Y_train = y_train[train_index]
    X_valid = x_train[test_index]
    Y_valid = y_train[test_index]

    # Compute the feature-wise mean/std on the training part of the fold only.
    datagen.fit(X_train)
    # datagen.flow() normalizes the training batches; give the validation
    # images the same normalization so val_loss is measured on comparable
    # inputs. X_valid is a fresh copy (fancy indexing), so in-place is safe.
    X_valid = datagen.standardize(X_valid)

    num_fold += 1
    print('Start KFold number {} from {}'.format(num_fold, nfolds))
    print('Split train: ', len(X_train), len(Y_train))
    print('Split valid: ', len(X_valid), len(Y_valid))

    kfold_weights_path = os.path.join('', 'weights_kfold_' + str(num_fold) + '.h5')

    # Three training stages with a decreasing learning rate.
    epochs_arr = [30, 10, 10]
    learn_rates = [0.001, 0.0001, 0.00001]

    # One checkpoint callback for all stages: its best val_loss is kept across
    # stages, so a later stage cannot overwrite the file with worse weights.
    checkpoint = ModelCheckpoint(kfold_weights_path, monitor='val_loss',
                                 save_best_only=True, verbose=0)

    # `stage_epochs` (not `epochs`) so the notebook-level setting is not clobbered.
    for learn_rate, stage_epochs in zip(learn_rates, epochs_arr):
        opt = optimizers.SGD(lr=learn_rate)
        # We NEED binary here, since categorical_crossentropy l1 norms the
        # output before calculating loss.
        model_final.compile(loss='binary_crossentropy',
                            optimizer=opt,
                            metrics=['accuracy'])
        callbacks = [EarlyStopping(monitor='val_loss', patience=2, verbose=0),
                     checkpoint,
                     TQDMNotebookCallback()]

        model_final.fit_generator(
            datagen.flow(X_train, Y_train, batch_size=batch_size),
            validation_data=(X_valid, Y_valid),
            # integer number of batches, including the final partial one
            steps_per_epoch=int(np.ceil(len(X_train) / float(batch_size))),
            verbose=2, epochs=stage_epochs, callbacks=callbacks,
            workers=8, use_multiprocessing=False)

        # Unfreeze the base network; this takes effect when the model is
        # compiled again at the start of the next stage.
        for layer in model.layers:
            layer.trainable = True
            print('Training: ' + str(layer))

    # Restore the best weights seen during this fold.
    if os.path.isfile(kfold_weights_path):
        model_final.load_weights(kfold_weights_path)

    p_valid = model_final.predict(X_valid, batch_size=128, verbose=2)
    print(fbeta_score(Y_valid, np.array(p_valid) > 0.2, beta=2, average='samples'))

    p_train = _predict_standardized(x_train)
    yfull_train.append(p_train)

    p_test = _predict_standardized(x_test)
    yfull_test.append(p_test)

    # NOTE(review): only the first fold is trained. Before removing this
    # `break`, model_final must be reset (weights and frozen layers) at the
    # start of every fold; otherwise later folds start from weights that have
    # already seen their validation images.
    break

In [None]:
# Average the test predictions over the folds that were actually trained.
# Using the number of collected predictions (instead of nfolds) keeps this
# cell working when fewer than nfolds folds were run.
if not yfull_test:
    raise ValueError('yfull_test is empty: run the training cell first')
result = np.mean(np.array(yfull_test), axis=0)
result = pd.DataFrame(result, columns=labels)

# NOTE(review): per-label thresholds are defined but not applied; every label
# is cut at the same 0.2 below. Confirm which of the two is intended.
thres = [0.07, 0.17, 0.2, 0.04, 0.23, 0.33, 0.24, 0.22, 0.1, 0.19, 0.23, 0.24, 0.12, 0.14, 0.25, 0.26, 0.16]

# For every test image keep the labels whose averaged probability exceeds 0.2
# and join them (in column order) into the space-separated tag string.
above_threshold = result.values > 0.2
label_names = np.array(labels)
preds = [' '.join(label_names[row_mask])
         for row_mask in tqdm(above_threshold, miniters=1000)]

df_test['tags'] = preds
df_test.to_csv('submission_keras_5_fold_CV_0.9136_LB_0.913.csv', index=False)