In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc

import keras as k
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.normalization import BatchNormalization

import cv2
from tqdm import tqdm

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import fbeta_score
import time

Using TensorFlow backend.


In [2]:
DATA_DIR = '/home/chicm/data/planet'
x_train = []
x_test = []
y_train = []

df_train = pd.read_csv(DATA_DIR+'/train.csv')
df_test = pd.read_csv(DATA_DIR+'/sample_submission.csv')

flatten = lambda l:[item for sublist in l for item in sublist]
labels = list(set(flatten([l.split(' ') for l in df_train['tags'].values])))

In [3]:
label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}
print(inv_label_map)

{0: 'blooming', 1: 'water', 2: 'conventional_mine', 3: 'selective_logging', 4: 'blow_down', 5: 'primary', 6: 'cultivation', 7: 'artisinal_mine', 8: 'slash_burn', 9: 'haze', 10: 'habitation', 11: 'cloudy', 12: 'road', 13: 'clear', 14: 'bare_ground', 15: 'partly_cloudy', 16: 'agriculture'}


In [4]:
for f, tags in tqdm(df_train.values, miniters=10000):
    fn = DATA_DIR+'/train-jpg/'+f+'.jpg'
    img = cv2.imread(fn) 
    targets = np.zeros(17)
    for t in tags.split(' '):
        targets[label_map[t]] = 1
    x_train.append(cv2.resize(img, (64,64)))
    y_train.append(targets)
    


100%|██████████| 40479/40479 [00:37<00:00, 1066.01it/s]


In [5]:
for f, tags in tqdm(df_test.values, miniters=10000):
    fn = DATA_DIR+'/test-jpg/'+f+'.jpg'
    img = cv2.imread(fn)
    x_test.append(cv2.resize(img, (64, 64)))

100%|██████████| 40669/40669 [00:37<00:00, 1075.37it/s]


In [6]:
x_train = np.array(x_train, np.float32) / 255.
y_train = np.array(y_train, np.uint8)
x_test = np.array(x_test, np.float32) / 255.
print(x_train.shape) 
print(y_train.shape)
print(x_test.shape)


(40479, 64, 64, 3)
(40479, 17)
(40669, 64, 64, 3)


In [7]:
nfolds = 4
batch_size = 64

num_fold = 0
sum_score = 0

yfull_test = []
yfull_train =[]

kf = KFold(len(y_train), n_folds=nfolds, shuffle=True, random_state=1)

for train_index, test_index in kf:
        start_time_model_fitting = time.time()
        
        X_train = x_train[train_index]
        Y_train = y_train[train_index]
        X_valid = x_train[test_index]
        Y_valid = y_train[test_index]

        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))
        
        kfold_weights_path = os.path.join('', 'weights_kfold_' + str(num_fold) + '.h5')
        
        model = Sequential()
        model.add(Conv2D(24, 3, 3, activation='relu', input_shape=(64, 64, 3)))
        model.add(BatchNormalization(axis=1))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Conv2D(24, 3, 3, activation='relu'))
        model.add(BatchNormalization(axis=1))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))
        model.add(Conv2D(48, 3, 3, activation='relu'))
        model.add(BatchNormalization(axis=1))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(0.5))
        model.add(Dense(17, activation='sigmoid'))
        

        model.compile(loss='binary_crossentropy', 
                      optimizer='adam',
                      metrics=['accuracy'])
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=30, verbose=0),
            ModelCheckpoint(kfold_weights_path, monitor='val_loss', save_best_only=True, verbose=0)]
        
        model.fit(x = X_train, y= Y_train, validation_data=(X_valid, Y_valid),
                  batch_size=batch_size,verbose=2, epochs=200,callbacks=callbacks,
                  shuffle=True)
        
        
        if os.path.isfile(kfold_weights_path):
            model.load_weights(kfold_weights_path)
        
        p_valid = model.predict(X_valid, batch_size = batch_size, verbose=2)
        print(fbeta_score(Y_valid, np.array(p_valid) > 0.15, beta=2, average='samples'))
        
        p_test = model.predict(x_train, batch_size = batch_size, verbose=2)
        yfull_train.append(p_test)
        
        p_test = model.predict(x_test, batch_size = batch_size, verbose=2)
        yfull_test.append(p_test)

Start KFold number 1 from 4
Split train:  30359 30359
Split valid:  10120 10120




Train on 30359 samples, validate on 10120 samples
Epoch 1/200
41s - loss: 0.3607 - acc: 0.8484 - val_loss: 0.3241 - val_acc: 0.9075
Epoch 2/200
39s - loss: 0.2039 - acc: 0.9193 - val_loss: 0.2341 - val_acc: 0.9208
Epoch 3/200
39s - loss: 0.1958 - acc: 0.9222 - val_loss: 0.2290 - val_acc: 0.9088
Epoch 4/200
39s - loss: 0.1885 - acc: 0.9245 - val_loss: 0.2015 - val_acc: 0.9216
Epoch 5/200
39s - loss: 0.1766 - acc: 0.9293 - val_loss: 0.1814 - val_acc: 0.9265
Epoch 6/200
39s - loss: 0.1705 - acc: 0.9321 - val_loss: 0.1788 - val_acc: 0.9295
Epoch 7/200
39s - loss: 0.1664 - acc: 0.9338 - val_loss: 0.2513 - val_acc: 0.8952
Epoch 8/200
40s - loss: 0.1636 - acc: 0.9346 - val_loss: 0.1492 - val_acc: 0.9383
Epoch 9/200
39s - loss: 0.1587 - acc: 0.9364 - val_loss: 0.1560 - val_acc: 0.9383
Epoch 10/200
39s - loss: 0.1580 - acc: 0.9366 - val_loss: 0.1697 - val_acc: 0.9320
Epoch 11/200
39s - loss: 0.1551 - acc: 0.9380 - val_loss: 0.1524 - val_acc: 0.9397
Epoch 12/200
39s - loss: 0.1538 - acc: 0.9386 

In [8]:
result = np.array(yfull_test[0])
for i in range(1, nfolds):
    result += np.array(yfull_test[i])
result /= nfolds
result = pd.DataFrame(result, columns = labels)
result

Unnamed: 0,blooming,water,conventional_mine,selective_logging,blow_down,primary,cultivation,artisinal_mine,slash_burn,haze,habitation,cloudy,road,clear,bare_ground,partly_cloudy,agriculture
0,4.187037e-03,0.006508,2.239051e-06,3.265152e-04,2.784243e-04,0.999925,0.002206,2.901490e-06,1.460756e-05,1.912363e-03,0.000581,2.735349e-05,0.002223,0.998504,1.956200e-04,0.000173,0.004219
1,1.574001e-02,0.014201,1.427954e-05,2.192134e-03,8.971733e-04,0.999848,0.005349,2.357117e-05,4.387184e-05,3.943304e-04,0.001893,5.316763e-05,0.005112,0.993583,3.820121e-04,0.004240,0.007561
2,3.989369e-10,0.012993,7.408844e-12,1.583066e-09,7.009374e-09,0.999999,0.001438,3.620583e-11,7.892543e-08,2.713171e-09,0.000017,6.973309e-08,0.002749,0.000002,1.404901e-07,0.999955,0.010156
3,1.166559e-02,0.013358,1.200881e-05,1.334625e-03,1.367413e-03,0.999413,0.009360,1.972745e-05,8.155288e-05,5.460133e-03,0.001859,5.822387e-04,0.004693,0.928549,4.373735e-04,0.029141,0.017166
4,5.999260e-08,0.013059,3.894754e-08,7.484086e-07,8.856111e-07,0.775569,0.000396,1.659431e-07,1.228211e-06,5.391875e-05,0.000039,4.643797e-01,0.001562,0.000189,8.331506e-06,0.617367,0.002580
5,1.591140e-03,0.002748,5.342253e-07,1.111360e-04,9.911368e-05,0.999953,0.000804,8.614473e-07,4.840585e-06,4.161067e-04,0.000323,2.022511e-05,0.000872,0.999511,5.425073e-05,0.000045,0.001556
6,3.653046e-03,0.362232,3.012814e-03,1.026863e-02,2.481610e-03,0.990403,0.253619,5.221325e-03,1.040746e-02,1.749028e-01,0.124173,6.014342e-03,0.243665,0.254685,3.541502e-02,0.474365,0.530505
7,1.278380e-03,0.200582,6.601067e-04,1.530771e-03,2.555231e-04,0.723589,0.017740,4.885948e-03,2.286214e-04,8.135287e-03,0.839634,1.353330e-02,0.859779,0.977337,4.830353e-02,0.003038,0.262824
8,1.289203e-03,0.003475,5.258984e-07,9.464831e-05,1.033047e-04,0.999944,0.000975,6.654043e-07,5.843174e-06,9.032952e-04,0.000286,2.250541e-05,0.001005,0.999287,6.425558e-05,0.000078,0.001947
9,1.566804e-03,0.286961,2.339527e-04,1.351018e-03,7.270651e-04,0.974669,0.181596,3.290103e-04,2.742939e-03,8.155764e-01,0.025796,1.415047e-02,0.149848,0.166722,1.513151e-02,0.011640,0.509606


In [9]:
from tqdm import tqdm

preds = []
for i in tqdm(range(result.shape[0]), miniters=1000):
    a = result.ix[[i]]
    a = a.apply(lambda x: x > 0.18, axis=1)
    a = a.transpose()
    a = a.loc[a[i] == True]
    ' '.join(list(a.index))
    preds.append(' '.join(list(a.index)))

100%|██████████| 40669/40669 [01:00<00:00, 671.58it/s]


In [10]:
df_test['tags'] = preds
df_test

Unnamed: 0,image_name,tags
0,test_0,primary clear
1,test_1,primary clear
2,test_2,primary partly_cloudy
3,test_3,primary clear
4,test_4,primary cloudy partly_cloudy
5,test_5,primary clear
6,test_6,water primary cultivation road clear partly_cl...
7,test_7,water primary habitation road clear agriculture
8,test_8,primary clear
9,test_9,water primary cultivation haze agriculture


In [11]:
df_test.to_csv('sub7.csv', index=False)

In [13]:
%pwd


'/home/chicm/ml/cnnpractices/planet'