In [23]:
import pandas as pd
from feature_engine.outliers import Winsorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

from tensorflow.keras import Sequential
from tensorflow.keras import callbacks
from tensorflow.keras.layers import Dense

In [2]:
tb_data = pd.read_csv('../data/train.csv', index_col='id')
tb_test = pd.read_csv('../data/test.csv', index_col='id')

In [3]:
top = Pipeline(steps=[('outliers', Winsorizer(capping_method='iqr',
                                              tail='both')),
                      ('power', PowerTransformer()),
                      ('range', MinMaxScaler()),
                      ('select', SelectKBest(score_func=mutual_info_classif, k=80))])
top.get_params()

{'memory': None,
 'steps': [('outliers', Winsorizer(capping_method='iqr', tail='both')),
  ('power', PowerTransformer()),
  ('range', MinMaxScaler()),
  ('select',
   SelectKBest(k=80,
               score_func=<function mutual_info_classif at 0x0000019006EDC3A0>))],
 'verbose': False,
 'outliers': Winsorizer(capping_method='iqr', tail='both'),
 'power': PowerTransformer(),
 'range': MinMaxScaler(),
 'select': SelectKBest(k=80,
             score_func=<function mutual_info_classif at 0x0000019006EDC3A0>),
 'outliers__capping_method': 'iqr',
 'outliers__fold': 3,
 'outliers__missing_values': 'raise',
 'outliers__tail': 'both',
 'outliers__variables': None,
 'power__copy': True,
 'power__method': 'yeo-johnson',
 'power__standardize': True,
 'range__clip': False,
 'range__copy': True,
 'range__feature_range': (0, 1),
 'select__k': 80,
 'select__score_func': <function sklearn.feature_selection._mutual_info.mutual_info_classif(X, y, *, discrete_features='auto', n_neighbors=3, copy=True, ran

In [5]:
tb_data_Y = tb_data['target']
tb_data_fe = top.fit_transform(tb_data.loc[:, :'f99'], tb_data_Y)

In [7]:
tb_f_trainX, tb_f_testX, tb_f_trainY, tb_f_testY = train_test_split(tb_data_fe, tb_data_Y, test_size=0.3)

In [8]:
tb_data_fe.shape, tb_data_Y.shape, tb_f_trainX.shape, tb_f_trainY.shape, tb_f_testX.shape, tb_f_testY.shape

((600000, 80), (600000,), (420000, 80), (420000,), (180000, 80), (180000,))

In [21]:
model = Sequential()

model.add(Dense(units=8, input_shape=[80]))
model.add(Dense(units=8))
model.add(Dense(units=1))

model.compile(optimizer='sgd',
              loss='mean_squared_error',
              metrics=['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 8)                 648       
_________________________________________________________________
dense_5 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 9         
Total params: 729
Trainable params: 729
Non-trainable params: 0
_________________________________________________________________


In [22]:
early_stopping = callbacks.EarlyStopping(min_delta=0.001,
                                         patience=10,
                                         restore_best_weights=True)

model_history = model.fit(tb_f_trainX, tb_f_trainY,
                          validation_data=(tb_f_testX, tb_f_testY),
                          batch_size=512,
                          epochs=50,
                          callbacks=[early_stopping])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50


In [24]:

fpr, tpr, thresh = roc_curve(tb_f_testY, model.predict(tb_f_testX))
auc(fpr, tpr)

0.737507117518082