In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
import warnings
import gc
import time
import sys
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

from sklearn.utils import shuffle



In [3]:
train= pd.read_csv("../input/train.csv")
test= pd.read_csv("../input/test.csv")
test_lgbm = pd.read_csv("../input/output_test_lgbm_base1.csv") # contains the lgbm prediction on the test set
lgbm = pd.read_csv("../input/output_lgbm_base1.csv")   #contains the lgbm predictions on the train set

#squash together to give both sets an additional column from the lgbm

train_df = train.copy()
test_df = test.copy()
train_df["lgbm_out"] = lgbm["output"]
test_df["lgbm_out"] = test_lgbm["target"]

train_df.info()
test_df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 203 entries, ID_code to lgbm_out
dtypes: float64(201), int64(1), object(1)
memory usage: 309.8+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 202 entries, ID_code to lgbm_out
dtypes: float64(201), object(1)
memory usage: 308.2+ MB


In [19]:
feats = ["var_{}".format(i) for i in range(200)]
feats.append("lgbm_out")
targets2 = ["target{}".format(i) for i in range(0,2)]

split_tar_df = train_df.copy(deep=True)
split_tar_df['target1'] = train_df['target']
split_tar_df['target0'] = 1 - train_df['target']
split_tar_df.drop(columns=['target'])

zero_subset_df = split_tar_df[split_tar_df['target0'] > 0.9]
ones_subset_df = split_tar_df[split_tar_df['target1'] > 0.9]

#Balance the data by adding more ones.
frames = [zero_subset_df]
for i in range(0,4):
    frames.append(ones_subset_df)  #NN will add Gaussian noise.

combined_df = pd.concat(frames)
combined_df = shuffle(combined_df)

combined_df.info()

X = combined_df[feats]
X_test = test_df[feats]
y = combined_df[targets2]


scaler = StandardScaler()

X_sc = scaler.fit_transform(X)
X_test_sc = scaler.fit_transform(X_test)

print(X_sc.shape)
print(y.shape)



<class 'pandas.core.frame.DataFrame'>
Int64Index: 260294 entries, 129009 to 196381
Columns: 205 entries, ID_code to target0
dtypes: float64(201), int64(3), object(1)
memory usage: 409.1+ MB
(260294, 201)
(260294, 2)


In [20]:
import keras
from keras.layers import (Flatten, Conv1D, Conv2D, Input, Dense, Dropout, BatchNormalization,
                          concatenate, GaussianNoise, Reshape, TimeDistributed, LeakyReLU, PReLU, Embedding)
from keras.models import Model, load_model, save_model
from keras.optimizers import SGD, Adam
from sklearn.base import BaseEstimator, ClassifierMixin
from pathlib import Path
from keras.callbacks import Callback

class ROC_AUC(Callback):
    def __init__(self, validation_data):
        self.X_val, self.y_val = validation_data
    
    def on_epoch_end(self, epoch, logs={}):
        print("ROC AUC for this fold is ", roc_auc_score(self.y_val, self.model.predict(X_val)))
        
class NNv1(BaseEstimator, ClassifierMixin):
    def __init__(self,
                 inp_shape=len(feats),
                 gaussian_noise=0.02,
                 dense1_dim=128,
                 dense2_dim=128,
                 dense3_dim=64,
                 dense1_kwargs=None,
                 dense2_kwargs=None,
                 dense3_kwargs=None,
                 classifier_kwargs=None,
                 optimizer=SGD,
                 opt_kwargs=None,
                 ):
        self.inp_shape = inp_shape
        self.gaussian_noise = gaussian_noise
        self.dense1_dim = dense1_dim
        self.dense2_dim = dense2_dim
        self.dense3_dim = dense3_dim
        self.dense1_kwargs = dense1_kwargs
        self.dense2_kwargs = dense2_kwargs
        self.dense3_kwargs = dense3_kwargs
        self.classifier_kwargs = classifier_kwargs
        self.optimizer = optimizer
        self.opt_kwargs = opt_kwargs
        self._default_initiaization()

    def _default_initiaization(self):
        if self.dense1_kwargs is None:
            self.dense1_kwargs = {"kernel_initializer": "glorot_uniform"}
        if self.dense2_kwargs is None:
            self.dense2_kwargs = {"kernel_initializer": "he_uniform"}
        if self.classifier_kwargs is None:
            self.classifier_kwargs = {"kernel_initializer": "he_uniform"}
        if self.opt_kwargs is None:
            self.opt_kwargs = {}

    def _build_model(self):
        inp = Input(shape=(self.inp_shape,))
        x = GaussianNoise(self.gaussian_noise)(inp)
        x = Reshape((self.inp_shape, 1))(inp)
        x = Dense(self.dense1_dim, activation='relu',)(x)
        #d1 = TimeDistributed(Dropout(0.2))(d1)
        x = Dense(self.dense1_dim, activation='relu',)(x)
        #d2 = PReLU()(d2)
        #d2 = TimeDistributed(Dropout(0.2))(d2)
        #x = concatenate([d1, d2])
        x = Flatten()(x)
        out = Dense(2, activation='softmax', **self.classifier_kwargs)(x)

        model = Model(inputs=inp, outputs=out)
        opt = self.optimizer(**self.opt_kwargs)
        model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
        return model

    def fit(self, X, y, *args, **kwargs):
        self.model = self._build_model()
        print(self.model.summary())
        self.model.fit(X, y, *args, **kwargs)
        return self

    def predict(self, X, y=None, weight_path=None, **kwargs):
        if self.model:
            if weight_path is not None:
                self.model.load_weights(weight_path)
            y_hat = self.model.predict(X, **kwargs)
        else:
            raise ValueError("Model not fit yet")
        return y_hat

In [21]:
y_test_pred = np.zeros(len(test))
    
X_train, X_valid, y_train, y_valid = train_test_split(X_sc, y, test_size=0.2, random_state=42)

model = NNv1(opt_kwargs = {"lr": 0.02, "momentum": 0.9, "nesterov": True, "clipnorm": 1})

model.fit(X_train, y_train, epochs=300, batch_size = 1000, validation_data =(X_valid, y_valid))
pred = model.predict(X_valid)
print(pred.shape)
print( "  auc = ", roc_auc_score(y_valid, pred) )
y_test_pred = model.predict(X_test_sc)[:,1]
    

#save base submission
sub_df1 = pd.DataFrame({"ID_code":test["ID_code"].values})
sub_df1["target"] = y_test_pred 
sub_df1.to_csv("submission_full_nn_lgbm_guide.csv", index=False)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 201)               0         
_________________________________________________________________
reshape_7 (Reshape)          (None, 201, 1)            0         
_________________________________________________________________
dense_19 (Dense)             (None, 201, 128)          256       
_________________________________________________________________
dense_20 (Dense)             (None, 201, 128)          16512     
_________________________________________________________________
flatten_7 (Flatten)          (None, 25728)             0         
_________________________________________________________________
dense_21 (Dense)             (None, 2)                 51458     
Total params: 68,226
Trainable params: 68,226
Non-trainable params: 0
_________________________________________________________________
None
T

Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300


Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
Epoch 150/300
Epoch 151/300
Epoch 152/300
Epoch 153/300
Epoch 154/300
Epoch 155/300
Epoch 156/300
Epoch 157/300
Epoch 158/300
Epoch 159/300
Epoch 160/300
Epoch 161/300
Epoch 162/300
Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/300


Epoch 167/300
Epoch 168/300
Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epoch 209/300
Epoch 210/300
Epoch 211/300
Epoch 212/300
Epoch 213/300
Epoch 214/300
Epoch 215/300
Epoch 216/300
Epoch 217/300
Epoch 218/300
Epoch 219/300
Epoch 220/300
Epoch 221/300
Epoch 222/300
Epoch 223/300
Epoch 224/300


Epoch 225/300
Epoch 226/300
Epoch 227/300
Epoch 228/300
Epoch 229/300
Epoch 230/300
Epoch 231/300
Epoch 232/300
Epoch 233/300
Epoch 234/300
Epoch 235/300
Epoch 236/300
Epoch 237/300
Epoch 238/300
Epoch 239/300
Epoch 240/300
Epoch 241/300
Epoch 242/300
Epoch 243/300
Epoch 244/300
Epoch 245/300
Epoch 246/300
Epoch 247/300
Epoch 248/300
Epoch 249/300
Epoch 250/300
Epoch 251/300
Epoch 252/300
Epoch 253/300
Epoch 254/300
Epoch 255/300
Epoch 256/300
Epoch 257/300
Epoch 258/300
Epoch 259/300
Epoch 260/300
Epoch 261/300
Epoch 262/300
Epoch 263/300
Epoch 264/300
Epoch 265/300
Epoch 266/300
Epoch 267/300
Epoch 268/300
Epoch 269/300
Epoch 270/300
Epoch 271/300
Epoch 272/300
Epoch 273/300
Epoch 274/300
Epoch 275/300
Epoch 276/300
Epoch 277/300
Epoch 278/300
Epoch 279/300
Epoch 280/300
Epoch 281/300


Epoch 282/300
Epoch 283/300
Epoch 284/300
Epoch 285/300
Epoch 286/300
Epoch 287/300
Epoch 288/300
Epoch 289/300
Epoch 290/300
Epoch 291/300
Epoch 292/300
Epoch 293/300
Epoch 294/300
Epoch 295/300
Epoch 296/300
Epoch 297/300
Epoch 298/300
Epoch 299/300
Epoch 300/300
(52059, 2)
  auc =  0.997381271782025
