# Baseline NN

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils

from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

%matplotlib inline

# Single seed for the run: it seeds NumPy's global RNG here and is passed
# again as `random_state` to the up-sampling `resample` call further down.
seed = 28
np.random.seed(seed)

Using TensorFlow backend.


In [2]:
# Folder that holds the four CSV files. It defaults to the FloydHub dataset
# mount; for a local run, point it at the local copy instead (e.g. "data").
floydhub_dir = "/floyd/input/volcanoes_venus"

# Image tables are read without a header row (every column is a pixel value);
# label tables keep their header row so columns can be selected by name.
data_train = pd.read_csv("{}/train_images.csv".format(floydhub_dir), header=None)
labels_train = pd.read_csv("{}/train_labels.csv".format(floydhub_dir))
data_test = pd.read_csv("{}/test_images.csv".format(floydhub_dir), header=None)
labels_test = pd.read_csv("{}/test_labels.csv".format(floydhub_dir))

# Print the shape of every table as a quick sanity check on the load.
for table_name, table in (
    ("Train-data", data_train),
    ("Train-labels", labels_train),
    ("Test-data", data_test),
    ("Test-labels", labels_test),
):
    print("{0} rows: {1}, {0} columns: {2}".format(table_name, table.shape[0], table.shape[1]))

Train-data rows: 7000, Train-data columns: 12100
Train-labels rows: 7000, Train-labels columns: 4
Test-data rows: 2734, Test-data columns: 12100
Test-labels rows: 2734, Test-labels columns: 4


In [3]:
# Prep data for modeling: float32 feature matrices and float32 "Volcano?" targets.
X_train = data_train.values.astype("float32")
y_train = labels_train["Volcano?"].values.astype("float32")
X_test = data_test.values.astype("float32")
y_test = labels_test["Volcano?"].values.astype("float32")

# Normalize input (divides by 255 — assumes 8-bit pixel intensities; TODO confirm)
X_train = X_train / 255
X_test = X_test / 255

# Up-sample to balance target class: rows with label 1 are drawn with
# replacement until there are as many of them as rows with label 0.
# The masks and the class-0 slices are computed once and reused, because each
# boolean-indexing of X_train materializes a full copy of the selected rows.
is_negative = y_train == 0
is_positive = y_train == 1
X_negative = X_train[is_negative]
y_negative = y_train[is_negative]

X_upsampled, y_upsampled = resample(
    X_train[is_positive],
    y_train[is_positive],
    replace=True,
    n_samples=X_negative.shape[0],
    random_state=seed,
)

# Class-0 rows first, then the resampled class-1 rows (order is not shuffled here).
X_train_up = np.vstack((X_negative, X_upsampled))
y_train_up = np.hstack((y_negative, y_upsampled))

# Share of class-0 rows in the balanced training set.
print("New class distribution:", np.mean(y_train_up == 0))

print("X_train_up shape:", X_train_up.shape)
print("y_train_up shape:", y_train_up.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

New class distribution: 0.5
X_train_up shape: (12000, 12100)
y_train_up shape: (12000,)
X_test shape: (2734, 12100)
y_test shape: (2734,)


In [4]:
# Both sizes are the number of feature columns in the training matrix:
# `num_pixels` is later used as the hidden-layer width, `input_dim` as the
# network's input size.
num_pixels = input_dim = X_train.shape[1]
print("Number of pixels: {}".format(num_pixels))
print("Input dim: {}".format(input_dim))

Number of pixels: 12100
Input dim: 12100


In [5]:
# Define baseline model
def create_model(num_pixels, input_dim, output_dim):
    """Build and compile the baseline fully connected network.

    The network is a `Sequential` stack of two `Dense` layers: a ReLU hidden
    layer followed by a sigmoid output layer, both using the "normal" kernel
    initializer.

    Args:
        num_pixels: Number of units in the hidden layer.
        input_dim: Number of input features fed to the hidden layer.
        output_dim: Number of units in the sigmoid output layer.

    Returns:
        The model, compiled with binary cross-entropy loss, the Adam
        optimizer and accuracy as the reported metric.
    """
    hidden_layer = Dense(
        units=num_pixels,
        input_dim=input_dim,
        kernel_initializer="normal",
        activation="relu",
    )
    output_layer = Dense(
        units=output_dim,
        kernel_initializer="normal",
        activation="sigmoid",
    )

    network = Sequential([hidden_layer, output_layer])
    network.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )
    return network

In [6]:
# Instantiate the baseline with a single output unit and print its
# layer-by-layer summary.
model = create_model(
    num_pixels=num_pixels,
    input_dim=input_dim,
    output_dim=1,
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 12100)             146422100 
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 12101     
Total params: 146,434,201
Trainable params: 146,434,201
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Train on the up-sampled training set; the test set is passed as validation
# data, so validation metrics are reported after every epoch.
model.fit(
    x=X_train_up,
    y=y_train_up,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=200,
    verbose=1,
)

Train on 12000 samples, validate on 2734 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4e80280c88>

In [8]:
# `scores` holds the loss followed by the compiled metrics, so index 1 is accuracy.
scores = model.evaluate(X_test, y_test, verbose=0)
accuracy_pct = 100 * scores[1]
error_pct = 100 * (1 - scores[1])
print("Baseline accuracy: {:.2f}%".format(accuracy_pct))
print("Baseline error rate: {:.2f}%".format(error_pct))

Baseline accuracy: 18.00%
Baseline error rate: 82.00%


In [9]:
# Hard 0/1 predictions for the test set.
# `Sequential.predict_classes` is deprecated and was removed in TensorFlow 2.6.
# For a single sigmoid output unit, its documented replacement is to threshold
# the predicted probabilities at 0.5, which yields the same int32 array.
preds = (model.predict(X_test) > 0.5).astype("int32")

In [10]:
# Compute every test-set metric first, then print them under their headings.
overall_accuracy = accuracy_score(y_test, preds)
class_confusion = confusion_matrix(y_test, preds)
per_class_report = classification_report(y_test, preds)

print("Test accuracy:", overall_accuracy)
print("\nConfusion matrix:\n")
print(class_confusion)
print("\nClassification report:\n")
print(per_class_report)

Test accuracy: 0.1799561082662765

Confusion matrix:

[[  58 2242]
 [   0  434]]

Classification report:

             precision    recall  f1-score   support

        0.0       1.00      0.03      0.05      2300
        1.0       0.16      1.00      0.28       434

avg / total       0.87      0.18      0.09      2734

