In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc

import keras as k
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.normalization import BatchNormalization

import cv2
from tqdm import tqdm

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import fbeta_score
import time

Using TensorFlow backend.


In [2]:
DATA_DIR = '/home/chicm/data/planet'
x_train = []
x_test = []
y_train = []

df_train = pd.read_csv(DATA_DIR+'/train.csv')
df_test = pd.read_csv(DATA_DIR+'/sample_submission.csv')

flatten = lambda l:[item for sublist in l for item in sublist]
labels = list(set(flatten([l.split(' ') for l in df_train['tags'].values])))

In [3]:
label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}
print(inv_label_map)

{0: 'partly_cloudy', 1: 'slash_burn', 2: 'clear', 3: 'agriculture', 4: 'haze', 5: 'primary', 6: 'habitation', 7: 'cultivation', 8: 'cloudy', 9: 'selective_logging', 10: 'water', 11: 'blow_down', 12: 'road', 13: 'bare_ground', 14: 'blooming', 15: 'artisinal_mine', 16: 'conventional_mine'}


In [4]:
for f, tags in tqdm(df_train.values, miniters=10000):
    fn = DATA_DIR+'/train-jpg/'+f+'.jpg'
    img = cv2.imread(fn) 
    targets = np.zeros(17)
    for t in tags.split(' '):
        targets[label_map[t]] = 1
    x_train.append(cv2.resize(img, (64,64)))
    y_train.append(targets)
    


100%|██████████| 40479/40479 [00:41<00:00, 979.91it/s] 


In [5]:
for f, tags in tqdm(df_test.values, miniters=10000):
    fn = DATA_DIR+'/test-jpg/'+f+'.jpg'
    img = cv2.imread(fn)
    x_test.append(cv2.resize(img, (64, 64)))

100%|██████████| 40669/40669 [00:41<00:00, 982.60it/s] 


In [6]:
x_train = np.array(x_train, np.float32) / 255.
y_train = np.array(y_train, np.uint8)
x_test = np.array(x_test, np.float32) / 255.
print(x_train.shape) 
print(y_train.shape)
print(x_test.shape)


(40479, 64, 64, 3)
(40479, 17)
(40669, 64, 64, 3)


In [7]:
nfolds = 4
batch_size = 64

num_fold = 0
sum_score = 0

yfull_test = []
yfull_train =[]

kf = KFold(len(y_train), n_folds=nfolds, shuffle=True, random_state=1)

for train_index, test_index in kf:
        start_time_model_fitting = time.time()
        
        X_train = x_train[train_index]
        Y_train = y_train[train_index]
        X_valid = x_train[test_index]
        Y_valid = y_train[test_index]

        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))
        
        kfold_weights_path = os.path.join('', 'weights_kfold_' + str(num_fold) + '.h5')
        
        model = Sequential()
        model.add(Conv2D(24, 3, 3, activation='relu', input_shape=(64, 64, 3)))
        model.add(BatchNormalization(axis=1))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Conv2D(24, 3, 3, activation='relu'))
        model.add(BatchNormalization(axis=1))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))
        model.add(Conv2D(48, 3, 3, activation='relu'))
        model.add(BatchNormalization(axis=1))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(0.5))
        model.add(Dense(17, activation='sigmoid'))
        

        model.compile(loss='binary_crossentropy', 
                      optimizer='adam',
                      metrics=['accuracy'])
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=4, verbose=0),
            ModelCheckpoint(kfold_weights_path, monitor='val_loss', save_best_only=True, verbose=0)]
        
        model.fit(x = X_train, y= Y_train, validation_data=(X_valid, Y_valid),
                  batch_size=batch_size,verbose=2, epochs=50,callbacks=callbacks,
                  shuffle=True)
        
        
        if os.path.isfile(kfold_weights_path):
            model.load_weights(kfold_weights_path)
        
        p_valid = model.predict(X_valid, batch_size = batch_size, verbose=2)
        print(fbeta_score(Y_valid, np.array(p_valid) > 0.15, beta=2, average='samples'))
        
        p_test = model.predict(x_train, batch_size = batch_size, verbose=2)
        yfull_train.append(p_test)
        
        p_test = model.predict(x_test, batch_size = batch_size, verbose=2)
        yfull_test.append(p_test)

Start KFold number 1 from 4
Split train:  30359 30359
Split valid:  10120 10120




Train on 30359 samples, validate on 10120 samples
Epoch 1/50
43s - loss: 0.3655 - acc: 0.8513 - val_loss: 0.2502 - val_acc: 0.9124
Epoch 2/50
41s - loss: 0.2148 - acc: 0.9152 - val_loss: 0.2537 - val_acc: 0.8984
Epoch 3/50
41s - loss: 0.2001 - acc: 0.9198 - val_loss: 0.9647 - val_acc: 0.6900
Epoch 4/50
41s - loss: 0.1972 - acc: 0.9211 - val_loss: 0.2028 - val_acc: 0.9231
Epoch 5/50
41s - loss: 0.1840 - acc: 0.9269 - val_loss: 0.1757 - val_acc: 0.9313
Epoch 6/50
41s - loss: 0.1859 - acc: 0.9261 - val_loss: 0.1859 - val_acc: 0.9279
Epoch 7/50
41s - loss: 0.1725 - acc: 0.9314 - val_loss: 0.1641 - val_acc: 0.9346
Epoch 8/50
42s - loss: 0.1656 - acc: 0.9342 - val_loss: 0.1565 - val_acc: 0.9377
Epoch 9/50
42s - loss: 0.1612 - acc: 0.9359 - val_loss: 0.1469 - val_acc: 0.9415
Epoch 10/50
42s - loss: 0.1567 - acc: 0.9377 - val_loss: 0.1440 - val_acc: 0.9429
Epoch 11/50
42s - loss: 0.1870 - acc: 0.9258 - val_loss: 0.2500 - val_acc: 0.9166
Epoch 12/50
41s - loss: 0.1714 - acc: 0.9307 - val_loss: 

In [8]:
result = np.array(yfull_test[0])
for i in range(1, nfolds):
    result += np.array(yfull_test[i])
result /= nfolds
result = pd.DataFrame(result, columns = labels)
result

Unnamed: 0,partly_cloudy,slash_burn,clear,agriculture,haze,primary,habitation,cultivation,cloudy,selective_logging,water,blow_down,road,bare_ground,blooming,artisinal_mine,conventional_mine
0,0.000321,0.000096,0.998689,0.007809,0.001745,0.999700,0.000908,0.004365,0.000403,0.000848,0.016251,0.000483,0.005040,0.000432,0.006839,0.000033,0.000026
1,0.002104,0.000422,0.997810,0.023605,0.000923,0.999595,0.003187,0.015995,0.000360,0.003834,0.031903,0.002209,0.013872,0.001018,0.020174,0.000095,0.000053
2,0.997227,0.000251,0.000194,0.026053,0.000017,0.998330,0.001742,0.011111,0.002125,0.000081,0.035165,0.000081,0.013927,0.000401,0.000014,0.000050,0.000030
3,0.111453,0.001030,0.872645,0.062407,0.008446,0.998819,0.006256,0.040440,0.001145,0.003955,0.063875,0.002134,0.023253,0.002405,0.011309,0.000296,0.000146
4,0.808044,0.000217,0.000535,0.078357,0.000742,0.841615,0.003982,0.011551,0.193764,0.000049,0.055296,0.000038,0.040184,0.001016,0.000010,0.000074,0.000091
5,0.000162,0.000027,0.999354,0.004036,0.000798,0.999813,0.000476,0.002087,0.000350,0.000225,0.009384,0.000246,0.002463,0.000144,0.003312,0.000010,0.000008
6,0.449951,0.009997,0.363133,0.524360,0.136666,0.979262,0.092321,0.224516,0.014529,0.009087,0.382357,0.003146,0.232080,0.026271,0.004999,0.005501,0.001915
7,0.006448,0.000549,0.966423,0.479572,0.014555,0.736510,0.670810,0.039208,0.015563,0.003460,0.265492,0.000781,0.855151,0.066760,0.001034,0.014814,0.003849
8,0.000221,0.000044,0.998997,0.004893,0.001216,0.999793,0.000556,0.002644,0.000328,0.000382,0.010858,0.000304,0.002944,0.000224,0.004229,0.000016,0.000012
9,0.010317,0.001357,0.365857,0.310131,0.732570,0.964247,0.024126,0.071171,0.030132,0.001128,0.242182,0.000907,0.125871,0.013572,0.001504,0.000403,0.000329


In [23]:
from tqdm import tqdm

preds = []
for i in tqdm(range(result.shape[0]), miniters=1000):
    a = result.ix[[i]]
    a = a.apply(lambda x: x > 0.18, axis=1)
    a = a.transpose()
    a = a.loc[a[i] == True]
    ' '.join(list(a.index))
    preds.append(' '.join(list(a.index)))

100%|██████████| 40669/40669 [01:00<00:00, 670.40it/s]


In [24]:
df_test['tags'] = preds
df_test

Unnamed: 0,image_name,tags
0,test_0,clear primary
1,test_1,clear primary
2,test_2,partly_cloudy primary
3,test_3,clear primary
4,test_4,partly_cloudy primary cloudy
5,test_5,clear primary
6,test_6,partly_cloudy clear agriculture primary cultiv...
7,test_7,clear agriculture primary habitation water road
8,test_8,clear primary
9,test_9,clear agriculture haze primary water


In [25]:
df_test.to_csv('sub6.csv', index=False)

In [13]:
%pwd


'/home/chicm/ml/cnnpractices/planet'