In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import os.path
import glob

import keras as k
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.normalization import BatchNormalization

import cv2
from tqdm import tqdm

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import fbeta_score
import time
from matplotlib import pyplot as plt

%matplotlib inline

In [None]:
# Root directory of the dataset. The default is the original hard-coded
# location; set the PLANET_DATA_DIR environment variable to run the notebook
# on another machine without editing code.
DATA_DIR = os.environ.get('PLANET_DATA_DIR', '/home/chicm/data/planet')
# Weights, cached predictions and submission files are written below this directory.
RESULT_DIR = DATA_DIR + '/resultsv2'


# Training table: the loaders unpack each row as (image name, space-separated tags).
df_train = pd.read_csv(DATA_DIR+'/train_v2.csv')
# The sample submission supplies the test image names.
df_test = pd.read_csv(DATA_DIR+'/sample_submission_v2.csv')

# Fixed class order: the position of a tag in this list is its column index in
# the target vectors (see label_map).
classes = ['clear', 'haze', 'partly_cloudy', 'cloudy', 
           'primary', 'agriculture', 'water', 'cultivation', 'habitation', 'road',
            'slash_burn', 'conventional_mine', 'bare_ground', 'artisinal_mine', 
           'blooming', 'selective_logging', 'blow_down']

In [None]:
def flatten(l):
    """Concatenate a list of lists into one flat list."""
    return [item for sublist in l for item in sublist]

# All distinct tags that occur in the training table (set order is arbitrary).
tags_per_row = [l.split(' ') for l in df_train['tags'].values]
t = list(set(flatten(tags_per_row)))
print(t)

In [None]:
# Tag name -> column index, following the order of `classes`.
label_map = dict(zip(classes, range(len(classes))))
# Column index -> tag name (inverse of label_map).
inv_label_map = dict((i, l) for l, i in label_map.items())
print(inv_label_map)

In [None]:
# Size passed to cv2.resize for every image.
img_size = (224, 224)

def get_train_data(start, end):
    """Load rows ``start:end`` of ``df_train`` as image and target arrays.

    Args:
        start: index of the first row of ``df_train.values`` to load.
        end: index one past the last row to load.

    Returns:
        ``(x_train, y_train)``: an array of the images resized to ``img_size``
        and an array of multi-hot target vectors with one column per entry of
        ``label_map``.

    Raises:
        FileNotFoundError: if an image file cannot be read.
        KeyError: if a row carries a tag that is missing from ``label_map``.
    """
    x_train_list = []
    y_train_list = []

    for f, tags in tqdm(df_train.values[start:end]):
        fn = DATA_DIR+'/train-jpg/'+f+'.jpg'
        img = cv2.imread(fn)
        if img is None:
            # cv2.imread reports failure by returning None instead of raising;
            # fail here with the file name rather than later inside cv2.resize.
            raise FileNotFoundError('could not read image: ' + fn)
        # One slot per known class (17 with the class list used in this notebook).
        targets = np.zeros(len(label_map))
        for t in tags.split(' '):
            targets[label_map[t]] = 1
        x_train_list.append(cv2.resize(img, img_size))
        y_train_list.append(targets)

    x_train = np.array(x_train_list)
    y_train = np.array(y_train_list)
    print(x_train.shape)
    print(y_train.shape)
    return x_train,y_train


In [None]:
# Load a small preview chunk first: on a fresh kernel nothing has populated
# x_train / y_train yet (the training loop loads its own chunks much later),
# so this cell and the visual checks after it would fail with a NameError.
# 1000 rows cover the sample indices plotted below (up to 412).
x_train, y_train = get_train_data(0, 1000)
print(df_train[:5])
print(y_train[:5])

In [None]:
# Visual sanity check: four training images side by side, each titled with its
# target vector.
plt.figure(figsize=(32,32))
for position, sample_idx in enumerate([26, 273, 290, 412], start=1):
    plt.subplot(1,4,position)
    # The images were read with cv2.imread (BGR channel order) while imshow
    # interprets the last axis as RGB, so reverse the channels for display only.
    plt.imshow(x_train[sample_idx][..., ::-1])
    plt.title(y_train[sample_idx])

# Split training data

In [None]:
def split_train(x_train, y_train, split_percent=0.85):
    """Split arrays into a leading training part and a trailing validation part.

    The split is positional (no shuffling): the first ``split_percent`` of the
    rows form the training set and the remaining rows the validation set.

    Args:
        x_train: array of samples, split along its first axis.
        y_train: array of targets, sliced at the same position.
        split_percent: fraction of rows kept for training. Defaults to 0.85,
            the value that used to be hard-coded in this function.

    Returns:
        ``(x_train, y_train, x_val, y_val)``.
    """
    split = int(x_train.shape[0] * split_percent)
    x_val, y_val = x_train[split:], y_train[split:]
    x_train, y_train = x_train[:split], y_train[:split]
    print(x_val.shape, y_val.shape, x_train.shape, y_train.shape)

    return x_train, y_train, x_val, y_val



In [None]:
# Mean raw pixel value of the currently loaded training images.
print(x_train.mean())

# Data augmentation

In [None]:
# Mini-batch size used by the data generators and by model.predict on the validation data.
batch_size=16

In [None]:
from keras.preprocessing.image import ImageDataGenerator
# Training-time augmentation: small random shifts and flips in both
# directions; pixel values are multiplied by 1/255.
augmentation_options = dict(
    rescale=1./255,
    width_shift_range=0.05,
    height_shift_range=0.05,
    horizontal_flip=True,
    vertical_flip=True,
)
train_datagen = ImageDataGenerator(**augmentation_options)

In [None]:
# Validation/test generator: only the 1/255 rescaling, no augmentation.
test_datagen = ImageDataGenerator(rescale=1./255)

# Create Model

In [None]:
from keras.layers import Input, concatenate, Conv2D, MaxPooling2D, UpSampling2D
from keras.models import Model
from keras.optimizers import Adam
from keras.layers.pooling import GlobalAveragePooling2D
from keras.layers import Activation

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Activation
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers.normalization import BatchNormalization
from keras.layers.pooling import GlobalAveragePooling2D
from keras import backend as K

In [None]:
from keras import applications
# ResNet50 with ImageNet weights and without its classification top; it serves
# as the convolutional base of the model.
res50 = applications.ResNet50(
    weights='imagenet',
    include_top=False,
    input_shape=(224,224,3),
)

In [None]:
def get_fc_model(input_shape):
    """Build the classification head: flatten the input, then 17 sigmoid units.

    Args:
        input_shape: shape of one input to the head, without the batch axis.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    # An optional hidden stage (Dense(256, relu) followed by Dropout(0.6)) is
    # currently disabled and not part of the head.
    return Sequential([
        Flatten(input_shape=input_shape),
        Dense(17, activation='sigmoid'),
    ])

In [None]:
# Attach the dense head to the output of the ResNet50 base.
fc_head = get_fc_model(res50.output_shape[1:])
model = Model(inputs=res50.input, outputs=fc_head(res50.output))

In [None]:
# Each output unit is an independent sigmoid, hence binary cross-entropy.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model

In [None]:
from keras.callbacks import LearningRateScheduler
def lr_schedule(epoch):
    """Step-wise learning rate keyed on the epoch index.

    Returns 0.001 up to and including epoch 10, 0.0001 up to and including
    epoch 20, and 0.00005 afterwards.
    """
    for last_epoch, rate in ((10, 0.001), (20, 0.0001)):
        if epoch <= last_epoch:
            return rate
    return 0.00005
    
# The checkpoint callback writes the weights with the lowest val_loss to this file.
w_filename = RESULT_DIR + '/res50_224.h5'
early_stopping = EarlyStopping(monitor='val_loss', patience=40, verbose=0)
checkpoint = ModelCheckpoint(w_filename, monitor='val_loss', save_best_only=True, verbose=0)
callbacks = [early_stopping, checkpoint, LearningRateScheduler(lr_schedule)]

In [None]:
# Maximum number of epochs per fit_generator call in the training loop.
epochs = 100

In [None]:
# Row ranges of df_train. Each chunk is loaded, split and trained on in turn
# (presumably to limit how many images are held in memory at once).
ranges = [[0, 10000], [10000, 20000], [20000, 30000], [30000, 40480]]

for first_row, stop_row in ranges:
    x_train, y_train = get_train_data(first_row, stop_row)
    x_train, y_train, x_val, y_val = split_train(x_train, y_train)
    train_flow = train_datagen.flow(x_train, y_train, batch_size=batch_size)
    val_flow = test_datagen.flow(x_val, y_val, batch_size=batch_size)
    # NOTE(review): the same callback objects and the epoch-indexed LR schedule
    # are reused for every chunk - confirm that restarting them per chunk is intended.
    model.fit_generator(
        train_flow,
        steps_per_epoch=x_train.shape[0] // batch_size,
        epochs=epochs,
        validation_data=val_flow,
        validation_steps=x_val.shape[0] // batch_size,
        verbose=2,
        callbacks=callbacks)

# Find best thresholds

In [None]:
# Restore the weights saved by the ModelCheckpoint callback (save_best_only on val_loss).
model.load_weights(w_filename)

In [None]:
# Largest pixel value in the validation images; shows whether they are still unscaled.
print(x_val.max())

In [None]:
# Scale pixels by 1/255, matching the `rescale` used by the data generators.
# Guarded on dtype so that re-running this cell cannot divide a second time:
# the images as loaded are integer-typed, the scaled array is floating point.
if np.issubdtype(x_val.dtype, np.integer):
    x_val = x_val / 255.

In [None]:
# Mean pixel value of the validation images after scaling.
print(x_val.mean())

In [None]:
# Scores for the current x_val, i.e. the validation split of the last chunk
# processed by the training loop.
pred_val = model.predict(x_val, batch_size=batch_size, verbose=2)

In [None]:
# Peek at the first five rows of validation scores.
print(pred_val[:5])

In [None]:
def optimise_f2_thresholds(y, p, verbose=True, resolution=100):
    """Greedy per-class search for thresholds maximising the sample-averaged F2 score.

    Starting from 0.18 for every class, each class threshold in turn is swept
    over ``0, 1/resolution, ..., (resolution - 1)/resolution`` while the other
    thresholds stay fixed, and the best-scoring value is kept. The sweep over
    all classes is performed twice, so the second pass re-tunes each class
    against the thresholds found in the first.

    Args:
        y: ground-truth label matrix, passed to ``fbeta_score`` as ``y_true``.
        p: 2-D array of predicted scores with one column per class.
        verbose: if true, print ``class index, threshold, score`` after each class.
        resolution: number of candidate thresholds tried per class.

    Returns:
        A list with one threshold per column of ``p``.
    """
    # Follow the width of the predictions instead of a hard-coded 17.
    n_classes = p.shape[1]

    def mf(x):
        # Binarise every column against its own threshold in one step. Plain
        # `int` is used because the `np.int` alias has been removed from NumPy.
        p2 = (p > np.asarray(x)).astype(int)
        return fbeta_score(y, p2, beta=2, average='samples')

    x = [0.18] * n_classes
    for _ in range(2):
        for i in range(n_classes):
            best_i2 = 0
            best_score = 0
            for step in range(resolution):
                i2 = step / resolution
                x[i] = i2
                score = mf(x)
                if score > best_score:
                    best_i2 = i2
                    best_score = score
            # Keep the best candidate before moving on to the next class.
            x[i] = best_i2
            if verbose:
                print(i, best_i2, best_score)

    return x

In [None]:
# Tune the per-class thresholds on the validation targets and scores.
thr = optimise_f2_thresholds(y_val, pred_val)

In [None]:
# Show the tuned thresholds.
print(thr)

In [None]:
# NOTE(review): this hard-coded list replaces whatever optimise_f2_thresholds
# returned above - presumably values copied from an earlier run; confirm they
# still match the current weights before submitting.
thr = [0.2, 0.25, 0.12, 0.09, 0.31, 0.13, 0.18, 0.26, 0.26, 0.21, 0.32, 0.28, 0.18, 0.16, 0.24, 0.19, 0.33]

# Make submission

In [None]:
import bcolz
import glob

def save_array(fname, arr):
    """Write ``arr`` as a bcolz carray rooted at ``fname`` (opened with mode 'w') and flush it."""
    stored = bcolz.carray(arr, rootdir=fname, mode='w')
    stored.flush()

def load_array(fname):
    """Open the bcolz array stored at ``fname`` and return its full contents."""
    stored = bcolz.open(fname)
    return stored[:]


In [None]:
# Re-read the sample submission so df_test starts from the file contents again.
df_test = pd.read_csv(DATA_DIR+'/sample_submission_v2.csv')

In [None]:
# Score the test set one image at a time; preds gets one row per test image
# and one column per class.
preds = np.zeros((df_test.values.shape[0], 17))
for index, (f, tags) in enumerate(tqdm(df_test.values)):
    fn = DATA_DIR+'/test-jpg/'+f+'.jpg'
    if not os.path.isfile(fn):
        # Not in the main test directory: fall back to the additional one.
        fn = DATA_DIR+'/test-jpg-add/'+f+'.jpg'
    img = cv2.imread(fn)
    if img is None:
        # cv2.imread returns None instead of raising when the file is missing
        # or unreadable; fail with the file name instead of a TypeError below.
        raise FileNotFoundError('could not read image: ' + fn)
    # Apply the same preprocessing as for training: resize to img_size, then
    # scale by 1/255. Without the resize the image keeps its on-disk size,
    # which need not match the fixed 224x224 input the model was built with.
    img = cv2.resize(img, img_size) / 255.
    x_test = np.expand_dims(img, axis=0)
    # predict returns a batch of one row; store that row.
    preds[index] = model.predict(x_test)[0]


In [None]:
# Expect (number of test images, 17).
print(preds.shape)

In [None]:
# Cache the raw test scores on disk under RESULT_DIR.
save_array(RESULT_DIR+'/preds.dat', preds)

In [None]:
# Peek at the first five rows of test scores.
print(preds[:5])

In [None]:
def get_multi_classes(score, threshold, nil=''):
    """Turn one score vector into a space-separated tag string.

    Args:
        score: per-class scores, indexed in the order of ``classes``.
        threshold: per-class cut-offs, indexed the same way.
        nil: value returned when no score exceeds its threshold.

    Returns:
        The names of all classes whose score is strictly greater than its
        threshold, joined by single spaces in ``classes`` order, or ``nil``
        when there are none.
    """
    # Collect the names first and join once. The previous incremental string
    # building used ``s == nil`` as its "nothing selected yet" test, which
    # misfires whenever ``nil`` equals a class name.
    selected = [classes[n] for n in range(len(classes)) if score[n] > threshold[n]]
    return ' '.join(selected) if selected else nil

In [None]:
# Assign the whole column in one step. Element-wise `df_test['tags'][i] = ...`
# is chained indexing: pandas may apply the write to a temporary object (and
# warns about it), so the tags could silently fail to reach df_test.
# preds has one row per row of df_test, so the list lines up with the frame.
df_test['tags'] = [get_multi_classes(pred, thr) for pred in preds]



In [None]:
# Write the submission file without the index column.
df_test.to_csv(RESULT_DIR+'/sub1.csv', index=False)

In [None]:
# Record the thresholds that were used for this submission.
print(thr)

In [None]:
# K-fold training: one model per fold, each checkpointed to its own weight file.
# NOTE(review): x_train / y_train here are whatever an earlier cell left behind
# (after the chunked training loop that is only the last chunk) - confirm.
nfolds = 6
batch_size = 128

num_fold = 0
sum_score = 0

# Legacy KFold signature (sample count as first argument, iterable object);
# this matches the sklearn.cross_validation import at the top of the notebook.
kf = KFold(len(y_train), n_folds=nfolds, shuffle=True, random_state=1)

for train_index, test_index in kf:
        start_time_model_fitting = time.time()
        
        X_train = x_train[train_index]
        Y_train = y_train[train_index]
        X_valid = x_train[test_index]
        Y_valid = y_train[test_index]

        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))
        
        # Per-fold weight file; ensemble() later globs for wconv_*.h5.
        w_filename = RESULT_DIR + '/wconv_' + str(num_fold) + '.h5'
        
        # NOTE(review): get_model is not defined in the visible part of this
        # notebook - confirm where it comes from.
        model = get_model()
        
        # Rebinds the global `callbacks` list defined in the "Train model" section.
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=10, verbose=0),
            ModelCheckpoint(w_filename, monitor='val_loss', save_best_only=True, verbose=0)]
        
        # Phase 1: five epochs without callbacks (no early stopping, no checkpoint).
        model.fit(x = X_train, y= Y_train, validation_data=(X_valid, Y_valid),
                  batch_size=batch_size,verbose=2, epochs=5, shuffle=True)
        
        # Set the optimizer's learning rate to 1e-5 for the second phase.
        K.set_value(model.optimizer.lr, 0.00001)
        
        # Phase 2: up to 80 epochs with early stopping and best-weights checkpointing.
        model.fit(x = X_train, y= Y_train, validation_data=(X_valid, Y_valid),
                  batch_size=batch_size, verbose=2, epochs=80, shuffle=True, callbacks = callbacks)
        
        
        # Fold score: sample-averaged F2 at a fixed 0.18 threshold, using the
        # model as it is after the last epoch (not the checkpointed weights).
        p_valid = model.predict(X_valid, batch_size = batch_size, verbose=2)
        print(fbeta_score(Y_valid, np.array(p_valid) > 0.18, beta=2, average='samples'))
        
        

## Ensemble

In [None]:
import bcolz
import glob

# These two helpers repeat the definitions from the "Make submission" section.
def save_array(fname, arr):
    """Write ``arr`` as a bcolz carray rooted at ``fname`` (opened with mode 'w') and flush it."""
    stored = bcolz.carray(arr, rootdir=fname, mode='w')
    stored.flush()

def load_array(fname):
    """Open the bcolz array stored at ``fname`` and return its full contents."""
    stored = bcolz.open(fname)
    return stored[:]


# Location where ensemble() stores the averaged predictions. Note this is
# '/preds', not the '/preds.dat' path used for the single-model predictions.
PREDICTS_FILE = RESULT_DIR + '/preds'

def ensemble():
    """Average the predictions of all saved fold models.

    Every ``wconv_*.h5`` weight file found in ``RESULT_DIR`` is loaded into a
    fresh ``get_model()`` instance and used to predict on the global
    ``x_test``. The element-wise mean over the models is saved to
    ``PREDICTS_FILE`` and returned.
    """
    fold_predictions = []
    for weights_path in glob.glob(RESULT_DIR +'/wconv_*.h5'):
        fold_model = get_model()
        print(weights_path)
        fold_model.load_weights(weights_path)
        fold_predictions.append(fold_model.predict(x_test, batch_size=128))
    # Mean across models (axis 0 of the stacked predictions).
    m = np.mean(fold_predictions, axis=0)
    print(m.shape)
    save_array(PREDICTS_FILE, m)
    return m

In [None]:
result = ensemble()

# Label the score columns with `classes`: that list fixes the column order of
# the target vectors (through label_map). The name `labels` used here before
# is not defined anywhere in the visible notebook.
result = pd.DataFrame(result, columns = classes)
result

In [None]:
from tqdm import tqdm

# Turn each row of ensemble scores into a space-separated tag string using a
# fixed 0.18 cut-off. Boolean selection on the row replaces DataFrame.ix,
# which has been removed from pandas.
preds = []
for _, row in tqdm(result.iterrows(), total=result.shape[0], miniters=1000):
    # Column names whose score exceeds the cut-off, in column order.
    preds.append(' '.join(row[row > 0.18].index))

In [None]:
# Store the ensemble tag strings in the submission frame and display it.
df_test['tags'] = preds
df_test

In [None]:
# Write the ensemble submission without the index column.
df_test.to_csv(RESULT_DIR+'/v2_sub2.csv', index=False)

In [None]:
%pwd
