# Neural Networks with Bayesian Optimization

In [1]:
# Importing the required libraries

import numpy as np
import pandas as pd
import os
import tensorflow as tf
import pandas as pd
import keras_tuner as kt
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn import metrics

## Read the train and test datasets

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

## Identifying the real and synthetic records

In [3]:
test = df_test.drop(['ID_code'], axis=1).values

unique_count = np.zeros_like(test)

for feature in range(test.shape[1]):
    _, index, count = np.unique(test[:, feature], return_counts=True, return_index=True)
    unique_count[index[count == 1], feature] += 1
    
real_samples = np.argwhere(np.sum(unique_count, axis=1) > 0)[:, 0]
synth_samples = np.argwhere(np.sum(unique_count, axis=1) == 0)[:, 0]

print('Number of real samples in test set is {}'.format(len(real_samples)))
print('Number of synthetic samples in test set is {}'.format(len(synth_samples)))

Number of real samples in test set is 100000
Number of synthetic samples in test set is 100000


## Magic Features Creation

In [4]:
features = [col for col in df_train.columns if col.startswith('var')]
df_all = pd.concat([df_train, df_test.iloc[real_samples]])

for feature in features:
    temp = df_all[feature].value_counts(dropna=True)

    df_train[feature + 'vc'] = df_train[feature].map(temp).map(lambda x: min(10, x)).astype(np.uint8)
    df_test[feature + 'vc'] = df_test[feature].map(temp).map(lambda x: min(10, x)).astype(np.uint8)

    df_train[feature + 'sum'] = ((df_train[feature] - df_all[feature].mean()) * df_train[feature + 'vc'] \
                                 .map(lambda x: int(x > 1))).astype(np.float32)
    df_test[feature + 'sum'] = ((df_test[feature] - df_all[feature].mean()) * df_test[feature + 'vc'] \
                                .map(lambda x: int(x > 1))).astype(np.float32) 

    df_train[feature + 'sum2'] = ((df_train[feature]) * df_train[feature + 'vc'] \
                                  .map(lambda x: int(x > 2))).astype(np.float32)
    df_test[feature + 'sum2'] = ((df_test[feature]) * df_test[feature + 'vc'] \
                                 .map(lambda x: int(x > 2))).astype(np.float32)

    df_train[feature + 'sum3'] = ((df_train[feature]) * df_train[feature + 'vc'] \
                                  .map(lambda x: int(x > 4))).astype(np.float32) 
    df_test[feature + 'sum3'] = ((df_test[feature]) * df_test[feature + 'vc'] \
                                 .map(lambda x: int(x > 4))).astype(np.float32)
    
print('Training set shape after creating magic features: {}'.format(df_train.shape))
print('Test set shape after creating magic features: {}'.format(df_test.shape))

  df_train[feature + 'sum'] = ((df_train[feature] - df_all[feature].mean()) * df_train[feature + 'vc'] \
  df_train[feature + 'sum2'] = ((df_train[feature]) * df_train[feature + 'vc'] \
  df_test[feature + 'sum2'] = ((df_test[feature]) * df_test[feature + 'vc'] \
  df_train[feature + 'sum3'] = ((df_train[feature]) * df_train[feature + 'vc'] \
  df_test[feature + 'sum3'] = ((df_test[feature]) * df_test[feature + 'vc'] \
  df_train[feature + 'vc'] = df_train[feature].map(temp).map(lambda x: min(10, x)).astype(np.uint8)
  df_test[feature + 'vc'] = df_test[feature].map(temp).map(lambda x: min(10, x)).astype(np.uint8)
  df_test[feature + 'sum'] = ((df_test[feature] - df_all[feature].mean()) * df_test[feature + 'vc'] \


Training set shape after creating magic features: (200000, 1002)
Test set shape after creating magic features: (200000, 1001)


In [5]:
# Seperating the predictor and target variables 
x = df_train.iloc[:,2:].values
y = df_train.iloc[:, 1].values

In [None]:
# # For Class Imbalance

# from imblearn.over_sampling import SMOTE
# # sm = SMOTE(random_state=42)
# # x, y = sm.fit_resample(x, y)

## Test and train split for the model fit and evaluation

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.1, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

## Metrics shown during the training process

In [9]:
METRICS = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
      keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]

## Bayesian Optimization for optimal hyper parameters using Keras Tuner

KerasTuner is an easy-to-use, scalable hyperparameter optimization framework that solves the pain points of hyperparameter search. KerasTuner comes with Bayesian Optimization, Hyperband, and Random Search algorithms built-in, and is also designed to be easy for researchers to extend in order to experiment with new search algorithms.

For the current project we've used Bayesian Optimization for hyper parameter tuning.

Reference: https://keras.io/keras_tuner/

### Building a base model with parameter search space

The function that creates and returns a Keras model. Use the hp argument to define the hyperparameters during model creation.

In [36]:
def build_model(hp):
    model = keras.Sequential()
    model.add(layers.Flatten())
    # Tune the number of layers.
    for i in range(hp.Int("num_layers", 1, 3)):
        model.add(
            layers.Dense(
                # Tune number of units separately.
                units=hp.Int(f"units_{i}", min_value=32, max_value=512, step=32),
                activation=hp.Choice("activation", ["relu", "tanh"]),
            )
        )
    if hp.Boolean("dropout"):
        model.add(layers.Dropout(rate=0.2))
    model.add(layers.Dense(1, activation="sigmoid"))
    learning_rate = hp.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log")
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss="binary_crossentropy",
        metrics=METRICS, # ["val_accuracy"]
    )
    return model

build_model(kt.HyperParameters())

<keras.engine.sequential.Sequential at 0x1c5aa0e5fd0>

### Defining the tuner which will use bayesian optimization to find the optimal hyper parameters from the pre defined search space

Tuner is initialized here. We use objective to specify the objective to select the best models, and we use max_trials to specify the number of different models to try.

In [37]:
tuner = kt.BayesianOptimization(
    hypermodel = build_model,
    objective = kt.Objective("auc", direction="max"), # 'val_accuracy'
    max_trials = 5,
    num_initial_points=2,
    alpha=0.0001,
    beta=2.6,
    seed=42,
    hyperparameters=None,
    tune_new_entries=True,
    allow_new_entries=True
)
tuner.search_space_summary()

In [40]:
tuner.search(X_train, Y_train, epochs=2, validation_data=(X_test, y_test))
# tuner.search(X_res, y_res, epochs=2, validation_data=(X_val, y_val)) # With SMOTE

Trial 5 Complete [00h 01m 46s]
auc: 0.8594379425048828

Best auc So Far: 0.8604117035865784
Total elapsed time: 00h 06m 37s
INFO:tensorflow:Oracle triggered exit


### Get the best hyperparameters after the optimization

In [41]:
best_hps = tuner.get_best_hyperparameters()[0]
print(best_hps.values)

{'num_layers': 2, 'units_0': 288, 'activation': 'relu', 'dropout': False, 'lr': 0.0003069273045576997, 'units_1': 512}


### Use the above optimal hyperparameters to build the Neural Network model

In [12]:
model = Sequential()
model.add(Dense(288,kernel_initializer='normal', activation='relu', input_dim=1000))
# model.add(tf.keras.layers.BatchNormalization())
# model.add(Dropout(0.2))
model.add(Dense(512,activation='relu'))
model.add(Dense(1, kernel_initializer='normal', activation= 'sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Fit the model on train set
model.fit(X_train, Y_train, epochs=5) # validation_data=(X_test, Y_test)

### Model evaluation on validation data

In [15]:
y_pred = model.predict(X_test)
metrics.roc_auc_score(Y_test, y_pred)

## Model prediction on Kaggle Test Data

In [26]:
# Kaggle test data
df_test.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_197sum2,var_197sum3,var_198vc,var_198sum,var_198sum2,var_198sum3,var_199vc,var_199sum,var_199sum2,var_199sum3
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,10.72,0.0,4,-0.402238,15.4722,0.0,4,-5.42092,-8.7197,-0.0
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,9.8714,9.8714,3,3.254862,19.129299,0.0,3,-17.677219,-20.976,-0.0
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,...,7.0618,0.0,2,4.021162,0.0,0.0,1,-0.0,-0.0,-0.0
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,9.2295,9.2295,7,-2.857638,13.0168,13.0168,1,-0.0,-0.0,-0.0
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,...,7.2882,0.0,2,-1.948438,0.0,0.0,3,-5.88582,-9.1846,-0.0


In [56]:
x_test = df_test.iloc[:, 1:].values
Y_pred = model.predict(x_test)
sub = pd.read_csv('sample_submission.csv')
sub['target'] = Y_pred
sub.to_csv('submission_6.csv', index=False)