In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc

import keras as k
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.normalization import BatchNormalization

import cv2
from tqdm import tqdm

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import fbeta_score
import time

Using TensorFlow backend.


In [2]:
DATA_DIR = '/home/chicm/data/planet'
x_train = []
x_test = []
y_train = []

df_train = pd.read_csv(DATA_DIR+'/train.csv')
df_test = pd.read_csv(DATA_DIR+'/sample_submission.csv')

#flatten = lambda l:[item for sublist in l for item in sublist]
labels = ['haze', 'cultivation', 'blooming', 'partly_cloudy', 'habitation', 'primary',
            'road', 'agriculture', 'selective_logging', 'artisinal_mine', 'slash_burn',
            'blow_down', 'cloudy', 'bare_ground', 'conventional_mine', 'clear', 'water']

In [3]:
label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}
print(inv_label_map)

{0: 'haze', 1: 'cultivation', 2: 'blooming', 3: 'partly_cloudy', 4: 'habitation', 5: 'primary', 6: 'road', 7: 'agriculture', 8: 'selective_logging', 9: 'artisinal_mine', 10: 'slash_burn', 11: 'blow_down', 12: 'cloudy', 13: 'bare_ground', 14: 'conventional_mine', 15: 'clear', 16: 'water'}


In [4]:
for f, tags in tqdm(df_train.values, miniters=10000):
    fn = DATA_DIR+'/train-jpg/'+f+'.jpg'
    img = cv2.imread(fn) 
    targets = np.zeros(17)
    for t in tags.split(' '):
        targets[label_map[t]] = 1
    x_train.append(cv2.resize(img, (64,64)))
    y_train.append(targets)
    


100%|██████████| 40479/40479 [00:45<00:00, 892.77it/s]


In [4]:
for f, tags in tqdm(df_test.values, miniters=10000):
    fn = DATA_DIR+'/test-jpg/'+f+'.jpg'
    img = cv2.imread(fn)
    x_test.append(cv2.resize(img, (64, 64)))

100%|██████████| 40669/40669 [00:45<00:00, 892.52it/s]


In [14]:
x_train = np.array(x_train, np.float32) / 255.
y_train = np.array(y_train, np.uint8)
x_test = np.array(x_test, np.float32) / 255.
print(x_train.shape) 
print(y_train.shape)
print(x_test.shape)


(0,)
(0,)
(40669, 64, 64, 3)


In [11]:
from keras.layers import Input, concatenate, Conv2D, MaxPooling2D, UpSampling2D
from keras.models import Model
from keras.optimizers import Adam
from keras.layers.pooling import GlobalAveragePooling2D
from keras.layers import Activation

def get_unet():
    inputs = Input((64, 64, 3))
    conv1 = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
    conv1 = Conv2D(32, (3, 3), activation='relu', padding='same')(conv1)
    pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)

    conv2 = Conv2D(64, (3, 3), activation='relu', padding='same')(pool1)
    conv2 = Conv2D(64, (3, 3), activation='relu', padding='same')(conv2)
    pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)

    conv3 = Conv2D(128, (3, 3), activation='relu', padding='same')(pool2)
    conv3 = Conv2D(128, (3, 3), activation='relu', padding='same')(conv3)
    pool3 = MaxPooling2D(pool_size=(2, 2))(conv3)

    conv4 = Conv2D(256, (3, 3), activation='relu', padding='same')(pool3)
    conv4 = Conv2D(256, (3, 3), activation='relu', padding='same')(conv4)
    pool4 = MaxPooling2D(pool_size=(2, 2))(conv4)

    conv5 = Conv2D(512, (3, 3), activation='relu', padding='same')(pool4)
    conv5 = Conv2D(512, (3, 3), activation='relu', padding='same')(conv5)

    up6 = concatenate([UpSampling2D(size=(2, 2))(conv5), conv4], axis=3)
    conv6 = Conv2D(256, (3, 3), activation='relu', padding='same')(up6)
    conv6 = Conv2D(256, (3, 3), activation='relu', padding='same')(conv6)

    up7 = concatenate([UpSampling2D(size=(2, 2))(conv6), conv3], axis=3)
    conv7 = Conv2D(128, (3, 3), activation='relu', padding='same')(up7)
    conv7 = Conv2D(128, (3, 3), activation='relu', padding='same')(conv7)

    up8 = concatenate([UpSampling2D(size=(2, 2))(conv7), conv2], axis=3)
    conv8 = Conv2D(64, (3, 3), activation='relu', padding='same')(up8)
    conv8 = Conv2D(64, (3, 3), activation='relu', padding='same')(conv8)

    up9 = concatenate([UpSampling2D(size=(2, 2))(conv8), conv1], axis=3)
    conv9 = Conv2D(32, (3, 3), activation='relu', padding='same')(up9)
    conv9 = Conv2D(32, (3, 3), activation='relu', padding='same')(conv9)

    conv10 = Conv2D(1, (1, 1), activation='relu')(conv9)
    result = Flatten()(conv10)
    #result = Dense(256, activation='relu')(result)
    result = Dense(17, activation='sigmoid')(result)
    
    #result = GlobalAveragePooling2D(activation='sigmoid')(conv10)
    #result = Activation('sigmoid')

    model = Model(inputs=inputs, outputs=result)

    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

    return model


In [10]:
from keras import backend as K

RESULT_DIR = DATA_DIR +'/results'
nfolds = 5
batch_size = 128

num_fold = 0
sum_score = 0

yfull_test = []
yfull_train =[]

kf = KFold(len(y_train), n_folds=nfolds, shuffle=True, random_state=1)

for train_index, test_index in kf:
        start_time_model_fitting = time.time()
        
        X_train = x_train[train_index]
        Y_train = y_train[train_index]
        X_valid = x_train[test_index]
        Y_valid = y_train[test_index]

        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))
        
        kfold_weights_path = RESULT_DIR +'/w_unet_' + str(num_fold) + '.h5'
        
        model = get_unet()
        model.fit(x = X_train, y= Y_train, validation_data=(X_valid, Y_valid),
                  batch_size=batch_size, epochs=10, verbose=2)
        
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=20, verbose=0),
            ModelCheckpoint(kfold_weights_path, monitor='val_loss', save_best_only=True, verbose=0)]
        
        K.set_value(model.optimizer.lr, 0.00001)
        model.fit(x = X_train, y= Y_train, validation_data=(X_valid, Y_valid),
                  batch_size=batch_size, epochs=50,callbacks=callbacks, verbose=2)
        
        
        #if os.path.isfile(kfold_weights_path):
        #    model.load_weights(kfold_weights_path)
        
        #p_valid = model.predict(X_valid, batch_size = batch_size, verbose=2)
        #print(fbeta_score(Y_valid, np.array(p_valid) > 0.15, beta=2, average='samples'))
        
        #p_test = model.predict(x_train, batch_size = batch_size, verbose=2)
        #yfull_train.append(p_test)
        
        #p_test = model.predict(x_test, batch_size = batch_size, verbose=2)
        #yfull_test.append(p_test)

Start KFold number 1 from 5
Split train:  32383 32383
Split valid:  8096 8096
Train on 32383 samples, validate on 8096 samples
Epoch 1/10
49s - loss: 0.2311 - acc: 0.9075 - val_loss: 0.1785 - val_acc: 0.9231
Epoch 2/10
48s - loss: 0.1663 - acc: 0.9321 - val_loss: 0.1608 - val_acc: 0.9353
Epoch 3/10
48s - loss: 0.1539 - acc: 0.9376 - val_loss: 0.1507 - val_acc: 0.9388
Epoch 4/10
48s - loss: 0.1448 - acc: 0.9411 - val_loss: 0.1478 - val_acc: 0.9396
Epoch 5/10
48s - loss: 0.1402 - acc: 0.9429 - val_loss: 0.1398 - val_acc: 0.9435
Epoch 6/10
48s - loss: 0.1353 - acc: 0.9451 - val_loss: 0.1323 - val_acc: 0.9464
Epoch 7/10
48s - loss: 0.1320 - acc: 0.9462 - val_loss: 0.1334 - val_acc: 0.9452
Epoch 8/10
48s - loss: 0.1289 - acc: 0.9475 - val_loss: 0.1308 - val_acc: 0.9474
Epoch 9/10
48s - loss: 0.1257 - acc: 0.9489 - val_loss: 0.1264 - val_acc: 0.9497
Epoch 10/10
47s - loss: 0.1233 - acc: 0.9500 - val_loss: 0.1300 - val_acc: 0.9477
Train on 32383 samples, validate on 8096 samples
Epoch 1/50
48

In [19]:
import bcolz
import glob

def save_array(fname, arr):
    c=bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()
def load_array(fname):
    return bcolz.open(fname)[:]


RESULT_DIR = DATA_DIR + '/results'
PREDICTS_FILE = RESULT_DIR + '/preds_unet'

def ensemble():
    preds = []
    w_files = glob.glob(RESULT_DIR +'/w_unet_*.h5')
    for fn in w_files:
        model = get_unet()
        print(fn)
        model.load_weights(fn)
        preds.append(model.predict(x_test, batch_size=128))
    m = np.mean(preds, axis=0)
    print(m.shape)
    save_array(PREDICTS_FILE, m)
    return m

In [20]:
result = ensemble()

/home/chicm/data/planet/results/w_unet_2.h5
/home/chicm/data/planet/results/w_unet_4.h5
/home/chicm/data/planet/results/w_unet_3.h5
/home/chicm/data/planet/results/w_unet_5.h5
/home/chicm/data/planet/results/w_unet_1.h5
(40669, 17)


In [21]:
print(result[:2])

[[ 0.02869967  0.03061962  0.03616161  0.03835546  0.02895715  0.97363645
   0.04487121  0.06509243  0.02450787  0.0235552   0.02343875  0.02384736
   0.02583506  0.02438288  0.02332936  0.93729609  0.04892532]
 [ 0.02693769  0.0521944   0.04016238  0.0408045   0.03084663  0.97363508
   0.0646693   0.10611455  0.03913807  0.02358452  0.02372024  0.03400344
   0.02585673  0.02559867  0.02337409  0.93529862  0.12509288]]


In [22]:
result = pd.DataFrame(result, columns = labels)
result

Unnamed: 0,haze,cultivation,blooming,partly_cloudy,habitation,primary,road,agriculture,selective_logging,artisinal_mine,slash_burn,blow_down,cloudy,bare_ground,conventional_mine,clear,water
0,0.028700,0.030620,0.036162,0.038355,0.028957,0.973636,0.044871,0.065092,0.024508,0.023555,0.023439,0.023847,0.025835,0.024383,0.023329,0.937296,0.048925
1,0.026938,0.052194,0.040162,0.040805,0.030847,0.973635,0.064669,0.106115,0.039138,0.023585,0.023720,0.034003,0.025857,0.025599,0.023374,0.935299,0.125093
2,0.026471,0.063229,0.024015,0.836597,0.036900,0.972833,0.073065,0.128349,0.024471,0.023968,0.024879,0.024058,0.026394,0.025432,0.023435,0.139797,0.130319
3,0.031146,0.348414,0.037752,0.041573,0.037574,0.973624,0.060415,0.317086,0.031035,0.023622,0.032051,0.030560,0.025854,0.034290,0.023419,0.932555,0.110725
4,0.027897,0.043863,0.023641,0.625688,0.036407,0.822585,0.096402,0.154245,0.023595,0.023720,0.023486,0.023383,0.269760,0.028236,0.023494,0.139999,0.144990
5,0.026563,0.030426,0.030768,0.038251,0.028648,0.973687,0.046380,0.065224,0.024200,0.023555,0.023432,0.023834,0.025831,0.024158,0.023328,0.939171,0.049384
6,0.057428,0.355900,0.025466,0.265411,0.107987,0.972856,0.201706,0.642872,0.027171,0.024039,0.041086,0.026417,0.026000,0.039254,0.023888,0.503907,0.218843
7,0.030707,0.036791,0.023561,0.040037,0.763600,0.790682,0.818932,0.270763,0.024248,0.048928,0.023457,0.023327,0.025879,0.060152,0.035618,0.930863,0.173067
8,0.026597,0.030199,0.027406,0.038283,0.028414,0.973676,0.043991,0.064822,0.023731,0.023555,0.023430,0.023494,0.025827,0.024164,0.023328,0.939115,0.044568
9,0.726279,0.141329,0.024536,0.048267,0.044932,0.972255,0.164649,0.409359,0.024145,0.023572,0.024541,0.023517,0.026720,0.028662,0.023430,0.309497,0.298286


In [23]:
from tqdm import tqdm

preds = []
for i in tqdm(range(result.shape[0]), miniters=1000):
    a = result.ix[[i]]
    a = a.apply(lambda x: x > 0.18, axis=1)
    a = a.transpose()
    a = a.loc[a[i] == True]
    ' '.join(list(a.index))
    preds.append(' '.join(list(a.index)))

100%|██████████| 40669/40669 [01:01<00:00, 656.74it/s]


In [24]:
df_test['tags'] = preds
df_test

Unnamed: 0,image_name,tags
0,test_0,primary clear
1,test_1,primary clear
2,test_2,partly_cloudy primary
3,test_3,cultivation primary agriculture clear
4,test_4,partly_cloudy primary cloudy
5,test_5,primary clear
6,test_6,cultivation partly_cloudy primary road agricul...
7,test_7,habitation primary road agriculture clear
8,test_8,primary clear
9,test_9,haze primary agriculture clear water


In [25]:
df_test.to_csv(RESULT_DIR+'/sub13.csv', index=False)

In [13]:
%pwd


'/home/chicm/ml/cnnpractices/planet'