In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
from sklearn.model_selection import train_test_split
from os.path import join as opj

train = pd.read_json("../data/train.json")
test = pd.read_json("../data/test.json")

In [9]:
X_band_1=np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in train["band_1"]])
X_band_2=np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in train["band_2"]])
X_train = np.concatenate([X_band_1[:, :, :, np.newaxis], 
                          X_band_2[:, :, :, np.newaxis],
                          ((X_band_1+X_band_2)/2)[:, :, :, np.newaxis]], axis=-1)

In [4]:
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Input, Flatten, Activation
from keras.layers import GlobalMaxPooling2D
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import Concatenate
from keras.models import Model
from keras import initializers
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping

Using TensorFlow backend.
  return f(*args, **kwds)


In [29]:
def getModel():
    #Building the model
    gmodel=Sequential()
    #Conv Layer 1
    gmodel.add(Conv2D(64, kernel_size=(3, 3),activation='relu', input_shape=(75, 75, 3)))
    gmodel.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))
    gmodel.add(Dropout(0.2))

    #Conv Layer 2
    gmodel.add(Conv2D(128, kernel_size=(3, 3), activation='relu' ))
    gmodel.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    gmodel.add(Dropout(0.2))

    #Conv Layer 3
    gmodel.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
    gmodel.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    gmodel.add(Dropout(0.2))

    #Conv Layer 4
    gmodel.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    gmodel.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    gmodel.add(Dropout(0.2))

    #Flatten the data for upcoming dense layers
    gmodel.add(Flatten())

    #Dense Layers
    gmodel.add(Dense(512))
    gmodel.add(Activation('relu'))
    gmodel.add(Dropout(0.2))

    #Dense Layer 2
    gmodel.add(Dense(256))
    gmodel.add(Activation('relu'))
    gmodel.add(Dropout(0.2))

    #Sigmoid Layer
    gmodel.add(Dense(1))
    gmodel.add(Activation('sigmoid'))

    mypotim=Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.001)
    gmodel.compile(loss='binary_crossentropy',
                  optimizer=mypotim,
                  metrics=['accuracy'])
    gmodel.summary()
    return gmodel


def get_callbacks(filepath, patience=2):
    es = EarlyStopping('val_loss', patience=patience, mode="min")
    msave = ModelCheckpoint(filepath, save_best_only=True)
    return [es, msave]

In [30]:
target_train=train['is_iceberg']
X_train_cv, X_valid, y_train_cv, y_valid = train_test_split(X_train, target_train, random_state=66, train_size=0.75)



In [31]:
#Data Augmentation
batch_size = 64
file_path = "../weights11100229.hdf5"
callbacks = get_callbacks(filepath=file_path, patience=15)

# this is the augmentation configuration we will use for training
train_datagen = ImageDataGenerator(
            #rescale=1./255,
            rotation_range=20,
            horizontal_flip=True,
            vertical_flip=True,
            width_shift_range = 0.30,
            height_shift_range = 0.30,
            zoom_range = 0.1
            )

# this is the augmentation configuration we will use for testing:
# only rescaling
test_datagen = ImageDataGenerator(
            #rescale=1./255
            )

# this is a generator that will read pictures found in
# subfolers of 'data/train', and indefinitely generate
# batches of augmented image data
train_generator = train_datagen.flow(X_train_cv, y_train_cv, batch_size=batch_size)  

# this is a similar generator, for validation data
validation_generator = train_datagen.flow(X_valid, y_valid, batch_size=401)

In [32]:
import os
gmodel=getModel()
gmodel.fit_generator(train_generator,
                     steps_per_epoch=len(X_train_cv)/batch_size,
                     epochs=200,
                     verbose=1,
                     validation_data = (X_valid, y_valid),
                     #validation_data = validation_generator,
                     #validation_steps = 1,
                     callbacks=callbacks)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_24 (Conv2D)           (None, 73, 73, 64)        1792      
_________________________________________________________________
max_pooling2d_22 (MaxPooling (None, 36, 36, 64)        0         
_________________________________________________________________
dropout_29 (Dropout)         (None, 36, 36, 64)        0         
_________________________________________________________________
conv2d_25 (Conv2D)           (None, 34, 34, 128)       73856     
_________________________________________________________________
max_pooling2d_23 (MaxPooling (None, 17, 17, 128)       0         
_________________________________________________________________
dropout_30 (Dropout)         (None, 17, 17, 128)       0         
_________________________________________________________________
conv2d_26 (Conv2D)           (None, 15, 15, 128)       147584    
__________

<keras.callbacks.History at 0x1aba56940>

In [33]:
gmodel.load_weights(filepath=file_path)
#score = gmodel.evaluate_generator(validation_generator,steps = 1)
score = gmodel.evaluate(X_valid, y_valid)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.232776457291
Test accuracy: 0.905236907731


In [35]:
X_band_test_1=np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in test["band_1"]])
X_band_test_2=np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in test["band_2"]])
X_test = np.concatenate([X_band_test_1[:, :, :, np.newaxis]
                          , X_band_test_2[:, :, :, np.newaxis]
                         , ((X_band_test_1+X_band_test_2)/2)[:, :, :, np.newaxis]], axis=-1)

In [36]:
#predicted_test=gmodel.predict_generator(test_datagen.flow(X_test,batch_size=len(X_test)), steps=1)
predicted_test=gmodel.predict_proba(X_test)
submission = pd.DataFrame()
submission['id']=test['id']
submission['is_iceberg']=predicted_test.reshape((predicted_test.shape[0]))
#submission.to_csv('../submit/submission11072000.csv', index=False)
#predicted_test=gmodel.predict_proba(X_test)



In [None]:
#11070205: test loss: 0.2707
#11071435: test loss: 0.252399
#11080014: PL: 0.2161 Test Loss: 0.246969386935
#11081153: PL: 0.2190 Test Loss: 0.2518
#11091153_0: PL: 0.2161 Test Loss: 0.2209

In [37]:
leaky_angle = [34.4721, 42.5591, 33.6352, 36.1061, 39.2340]
mask = [test['inc_angle'][i] in leaky_angle for i in range(len(test))]
column_name = 'is_iceberg'
submission.loc[mask, column_name] = 1

In [38]:
submission.to_csv('../submit/submission11101117_1.csv', index=False)