In [None]:
# coding: utf-8
import pickle
import pandas as pd
import numpy as np

from keras import models
from keras import layers
from keras.callbacks import TensorBoard
from keras.callbacks import EarlyStopping
from keras.initializers import TruncatedNormal
from keras.optimizers import Adam


In [None]:
# Load the pre-computed feature matrix and the target labels.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# from a trusted source.
# The `with` statement closes each file on exit, so no explicit close() is needed.
with open('../../data/transformed_data.pkl', 'rb') as f:
    transformed_data = pickle.load(f)

with open('../../data/targets.pkl', 'rb') as f:
    targets = pickle.load(f)

# Engineered features, each appended as a new right-most column:
#   * row-wise sum of columns 9, 11 and 13
#   * row-wise mean of columns 10, 12 and 14
transformed_data = np.c_[transformed_data, transformed_data[:, [9, 11, 13]].sum(axis=1)]
transformed_data = np.c_[transformed_data, transformed_data[:, [10, 12, 14]].mean(axis=1)]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Bucket column 15 into a coarse category: ceil(value / 0.1), with every
# bucket >= 2 collapsed into 2. The category is then standardised and
# appended as a feature.
# NOTE(review): column 15 is presumably the sum feature added above (true if
# the pickled matrix has 15 columns) -- confirm against the data.
strlength_cat = np.ceil(transformed_data[:, 15] / 0.1)
strlength_cat[strlength_cat >= 2] = 2.0
strlength_cat = scaler.fit_transform(strlength_cat.reshape(-1, 1))
transformed_data = np.c_[transformed_data, strlength_cat]

In [None]:
# Bucket column 7 into a coarse category: ceil(value / 1e-4), with every
# bucket above 1 collapsed into 2. The category is standardised with the
# shared scaler (re-fitted here) and appended as a new right-most column.
# Note: re-running this cell appends the column again.
delta_buckets = np.ceil(transformed_data[:, 7] / 0.0001)
delta_buckets = np.where(delta_buckets > 1, 2, delta_buckets)
delta_cat = scaler.fit_transform(delta_buckets.reshape(-1, 1))

transformed_data = np.column_stack((transformed_data, delta_cat))

Add a bias column (a constant feature of ones) to the data.

In [None]:
# Prepend a constant column of ones as column 0; every existing column
# index therefore shifts right by one.
# Note: re-running this cell prepends another column.
bias_column = np.ones((transformed_data.shape[0], 1))
transformed_data = np.column_stack((bias_column, transformed_data))
transformed_data.shape

Create a stratified train/test split (stratified on the string-length category), then check that the train and test sets contain similar proportions of targets and non-targets.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single shuffled 80/20 split, stratified on the string-length category so
# both subsets keep the same category mix. The seed is fixed for
# reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1337)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(transformed_data, strlength_cat))


In [None]:
from sklearn.utils import column_or_1d

# Report the size of each subset and the fraction of positive targets in it,
# labelled so the test and train lines can be told apart.
print('test_m =', len(test_index))
print('train_m =', len(train_index))
print('test proportion of targets =', np.mean(targets[test_index]))
print('train proportion of targets =', np.mean(targets[train_index]))

# Materialise the feature matrices and flatten the labels to 1-D vectors.
train_X = transformed_data[train_index]
train_y = column_or_1d(targets[train_index])
test_X = transformed_data[test_index]
test_y = column_or_1d(targets[test_index])

# What about string length categories
# Read the category values from strlength_cat itself rather than from a
# hard-coded column index of train_X/test_X: such an index goes stale
# whenever a column is added or prepended (e.g. the bias column).
train_strlength = strlength_cat[train_index].ravel()
test_strlength = strlength_cat[test_index].ravel()

print(pd.Series(train_strlength).value_counts() / train_X.shape[0])
print(pd.Series(test_strlength).value_counts() / test_X.shape[0])

In [None]:
# m = number of training examples, n = number of input features
# (n sizes the network's input layer).
m, n = train_X.shape

In [None]:
from keras.utils import to_categorical

n_nodes = 150

# Truncated-normal initialiser, applied to the first hidden layer's kernel.
tnorm = TruncatedNormal(mean=0.0, stddev=0.05, seed=None)

# Three ELU hidden layers of n_nodes units followed by a 2-way softmax output.
network = models.Sequential()
network.add(
    layers.Dense(n_nodes, activation='elu',
                 input_shape=(n,), kernel_initializer=tnorm))
# Only the first layer takes input_shape; later layers infer their input
# size from the previous layer's output.
network.add(layers.Dense(n_nodes, activation='elu'))
network.add(layers.Dense(n_nodes, activation='elu'))
network.add(layers.Dense(2, activation='softmax'))


am = Adam(
    lr=0.001, beta_1=0.9, beta_2=0.999,
    epsilon=1e-08, decay=0.0)


network.compile(optimizer=am,
                loss='categorical_crossentropy',
                metrics=['accuracy'])

# One-hot encode the labels for categorical cross-entropy.
# The ndim guard keeps this cell safe to re-run: labels that are already
# one-hot (2-D) are left alone instead of being encoded a second time.
# num_classes is pinned to 2 so the encoding always matches the softmax
# width, even if one subset happens to contain a single class.
if np.ndim(train_y) == 1:
    train_y = to_categorical(train_y, num_classes=2)
if np.ndim(test_y) == 1:
    test_y = to_categorical(test_y, num_classes=2)

# Write logs to tensorboard (note you need to start a server with
# tensorboard --logdir ./tf_logs/)

tb = TensorBoard(
    log_dir='./tf_logs', histogram_freq=0, batch_size=32,
    write_graph=True, write_grads=False, write_images=False,
    embeddings_freq=0, embeddings_layer_names=None,
    embeddings_metadata=None)

# Employ early stopping as a means of regularisation

In [None]:
# Stop training once val_loss has failed to improve by at least 0.001 for
# 10 consecutive epochs.
es = EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=10, verbose=1,
    mode='auto')

# 200 epochs seems to minimise the test set error; with early stopping
# enabled it acts as an upper bound on training length.
# The early-stopping callback must be passed to fit() to have any effect.
# NOTE(review): the test split doubles as validation data here, so early
# stopping is tuned on the same data used for the final evaluation.
network.fit(
    train_X, train_y, epochs=200,
    batch_size=128, callbacks=[tb, es],
    validation_data=(test_X, test_y))

In [None]:
# Score the trained network on the held-out split. The model was compiled
# with a single 'accuracy' metric, so evaluate() yields (loss, accuracy).
evaluation = network.evaluate(test_X, test_y)
test_loss, test_acc = evaluation

print('test_acc:', test_acc)

In [None]:
# Predicted class index per test row: the argmax over the softmax outputs.
# This is what Sequential.predict_classes computed for a multi-unit softmax
# output; that method is deprecated and absent from newer Keras releases.
test_pred = np.argmax(network.predict(test_X), axis=-1)

In [None]:
from sklearn.metrics import classification_report

# test_y is one-hot encoded, so its second column is the true class label
# (1 for a target, 0 otherwise).
true_labels = test_y[:, 1]
report = classification_report(true_labels, test_pred)
print(report)