# CNN #2

In [1]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils

# Use the channels-first image layout, i.e. inputs shaped (channels, rows, cols).
# set_image_data_format('channels_first') is the Keras 2 equivalent of the
# legacy set_image_dim_ordering('th') alias, which later Keras releases removed.
from keras import backend as K
K.set_image_data_format('channels_first')

from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Single seed for the notebook: it seeds NumPy's global RNG here and is reused
# as `random_state` for the up-sampling `resample` call further down.
# NOTE(review): only NumPy is seeded explicitly; no backend (TensorFlow) seed is
# set in this notebook, so confirm whether fully repeatable training is required.
seed = 28
np.random.seed(seed)

Using TensorFlow backend.


In [2]:
# Directory that holds the four CSV files. The default is the FloydHub dataset
# mount; for a local run, point it at the local copy instead (e.g. "data").
floydhub_dir = "/floyd/input/volcanoes_venus"

# Image files carry no header row (one flattened image per row); the label
# files do have a header row, so they are read with pandas' default.
data_train = pd.read_csv(floydhub_dir + "/train_images.csv", header=None)
labels_train = pd.read_csv(floydhub_dir + "/train_labels.csv")
data_test = pd.read_csv(floydhub_dir + "/test_images.csv", header=None)
labels_test = pd.read_csv(floydhub_dir + "/test_labels.csv")

# Report the dimensions of every frame as a quick sanity check on the load.
for frame_name, frame in (
    ("Train-data", data_train),
    ("Train-labels", labels_train),
    ("Test-data", data_test),
    ("Test-labels", labels_test),
):
    print("{0} rows: {1}, {0} columns: {2}".format(frame_name, frame.shape[0], frame.shape[1]))

Train-data rows: 7000, Train-data columns: 12100
Train-labels rows: 7000, Train-labels columns: 4
Test-data rows: 2734, Test-data columns: 12100
Test-labels rows: 2734, Test-labels columns: 4


In [3]:
# Prep data for modeling.
# `.values` already returns a NumPy array, so cast it to float32 directly;
# wrapping it in np.array() first only added an extra full copy of the data.
X_train = data_train.values.astype("float32")
y_train = labels_train["Volcano?"].values.astype("float32")
X_test = data_test.values.astype("float32")
y_test = labels_test["Volcano?"].values.astype("float32")

# Shape to include channel dim for Conv2D: every flattened row becomes one
# single-channel 110x110 image in channels-first order.
image_shape = (1, 110, 110)
X_train = X_train.reshape((X_train.shape[0],) + image_shape)
X_test = X_test.reshape((X_test.shape[0],) + image_shape)

# Normalize input (presumably 8-bit pixel intensities -- TODO confirm range).
X_train = X_train / 255
X_test = X_test / 255

# Up-sample to balance target class: draw label-1 samples with replacement
# until there are as many of them as there are label-0 samples.
# The masks are computed once and reused instead of re-evaluating the
# comparisons (and re-indexing the image array) for every use.
is_positive = y_train == 1
is_negative = y_train == 0
X_upsampled, y_upsampled = resample(X_train[is_positive], y_train[is_positive], replace=True,
                                    n_samples=int(np.count_nonzero(is_negative)), random_state=seed)

X_train_up = np.vstack((X_train[is_negative], X_upsampled))
y_train_up = np.hstack((y_train[is_negative], y_upsampled))

# Fraction of label-0 samples after up-sampling (0.5 means perfectly balanced).
print("New class distribution:", np.mean(y_train_up == 0))

print("X_train_up shape:", X_train_up.shape)
print("y_train_up shape:", y_train_up.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

New class distribution: 0.5
X_train_up shape: (12000, 1, 110, 110)
y_train_up shape: (12000,)
X_test shape: (2734, 1, 110, 110)
y_test shape: (2734,)


In [4]:
# Per-sample input shape: everything after the leading sample axis.
input_dim = X_train_up.shape[1:]
print("Input dim:", input_dim)

Input dim: (1, 110, 110)


In [5]:
# Define model
def create_model(input_dim, output_dim):
    """Build and compile the convolutional classifier.

    The network stacks two convolution + max-pooling stages (30 filters of
    5x5, then 15 filters of 3x3), applies dropout at rate 0.2, flattens, and
    finishes with dense layers of 128 and 50 ReLU units followed by a
    sigmoid output layer. It is compiled with binary cross-entropy loss, the
    Adam optimizer, and accuracy as the reported metric.

    Args:
        input_dim: Shape of one input sample; passed to the first Conv2D
            layer as `input_shape`.
        output_dim: Number of units in the final sigmoid layer.

    Returns:
        The compiled `Sequential` model.
    """
    layer_stack = [
        Conv2D(filters=30, kernel_size=(5, 5), input_shape=input_dim, activation="relu"),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(filters=15, kernel_size=(3, 3), activation="relu"),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(rate=0.2),
        Flatten(),
        Dense(units=128, activation="relu"),
        Dense(units=50, activation="relu"),
        Dense(units=output_dim, activation="sigmoid"),
    ]
    network = Sequential(layer_stack)
    network.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return network

In [6]:
# One sigmoid output unit: the network emits a single score per image.
model = create_model(input_dim=input_dim, output_dim=1)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 30, 106, 106)      780       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 30, 53, 53)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 15, 51, 51)        4065      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 15, 25, 25)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 15, 25, 25)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 9375)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               1200128   
__________

In [7]:
# Train on the up-sampled (class-balanced) training set.
# NOTE(review): the test set is passed as validation data, so the per-epoch
# validation figures and the later test metrics come from the same samples.
model.fit(
    x=X_train_up,
    y=y_train_up,
    batch_size=200,
    epochs=25,
    verbose=1,
    validation_data=(X_test, y_test),
)

Train on 12000 samples, validate on 2734 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f643282e160>

In [8]:
# evaluate() returns the loss followed by the compiled metrics, so index 1 is
# the accuracy the model was compiled with.
scores = model.evaluate(X_test, y_test, verbose=0)
accuracy = scores[1]
error_rate = 1 - accuracy
print("Baseline accuracy: {:.2f}%".format(100 * accuracy))
print("Baseline error rate: {:.2f}%".format(100 * error_rate))

Baseline accuracy: 96.42%
Baseline error rate: 3.58%


In [9]:
# Hard 0/1 labels from the sigmoid scores using a 0.5 cut-off, which is what
# predict_classes does for a model with a single output unit.
preds = (model.predict(X_test) > 0.5).astype("int32")

In [10]:
# Compute every test-set summary first, then print them together.
test_accuracy = accuracy_score(y_test, preds)
conf_matrix = confusion_matrix(y_test, preds)  # rows: true class, columns: predicted class
class_report = classification_report(y_test, preds)

print("Test accuracy:", test_accuracy)
print("\nConfusion matrix:\n")
print(conf_matrix)
print("\nClassification report:\n")
print(class_report)

Test accuracy: 0.964155084125823

Confusion matrix:

[[2265   35]
 [  63  371]]

Classification report:

             precision    recall  f1-score   support

        0.0       0.97      0.98      0.98      2300
        1.0       0.91      0.85      0.88       434

avg / total       0.96      0.96      0.96      2734

