In [1]:
import pandas as pd
from feature_engine.outliers import Winsorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras import callbacks
from tensorflow.keras.layers import Dense

In [2]:
# Read the train and test CSV files, using the `id` column as the row index.
train_csv = '../data/train.csv'
test_csv = '../data/test.csv'

tb_data = pd.read_csv(train_csv, index_col='id')
tb_test = pd.read_csv(test_csv, index_col='id')

In [3]:
# Preprocessing pipeline, applied in order:
#   1. 'outliers' - cap both tails with the IQR rule,
#   2. 'power'    - power-transform the features,
#   3. 'range'    - min-max scale the features,
#   4. 'select'   - keep the 80 features with the highest mutual information
#                   with the target.
outlier_capper = Winsorizer(capping_method='iqr', tail='both')
feature_selector = SelectKBest(score_func=mutual_info_classif, k=80)

top = Pipeline(steps=[
    ('outliers', outlier_capper),
    ('power', PowerTransformer()),
    ('range', MinMaxScaler()),
    ('select', feature_selector),
])

# Show the full resolved configuration, defaults included.
top.get_params()

{'memory': None,
 'steps': [('outliers', Winsorizer(capping_method='iqr', tail='both')),
  ('power', PowerTransformer()),
  ('range', MinMaxScaler()),
  ('select',
   SelectKBest(k=80,
               score_func=<function mutual_info_classif at 0x000001E5136ED3A0>))],
 'verbose': False,
 'outliers': Winsorizer(capping_method='iqr', tail='both'),
 'power': PowerTransformer(),
 'range': MinMaxScaler(),
 'select': SelectKBest(k=80,
             score_func=<function mutual_info_classif at 0x000001E5136ED3A0>),
 'outliers__capping_method': 'iqr',
 'outliers__fold': 3,
 'outliers__missing_values': 'raise',
 'outliers__tail': 'both',
 'outliers__variables': None,
 'power__copy': True,
 'power__method': 'yeo-johnson',
 'power__standardize': True,
 'range__clip': False,
 'range__copy': True,
 'range__feature_range': (0, 1),
 'select__k': 80,
 'select__score_func': <function sklearn.feature_selection._mutual_info.mutual_info_classif(X, y, *, discrete_features='auto', n_neighbors=3, copy=True, ran

In [4]:
# Separate the label column, then fit the preprocessing pipeline on the
# feature columns and transform them in one pass.
tb_data_Y = tb_data['target']

# Label-based slicing is inclusive: every column up to and including 'f99'.
raw_features = tb_data.loc[:, :'f99']
tb_data_fe = top.fit_transform(raw_features, y=tb_data_Y)

In [34]:
# Cache the engineered features and the labels on disk so later sessions can
# skip the feature-engineering step.
for cached_obj, cache_path in ((tb_data_fe, '../data/engineered_x.pkl'),
                               (tb_data_Y, '../data/y.pkl')):
    pd.to_pickle(cached_obj, cache_path)

In [2]:
# Reload the cached engineered features and labels written by the pickling cell.
tb_data_fe, tb_data_Y = (
    pd.read_pickle(cache_path)
    for cache_path in ('../data/engineered_x.pkl', '../data/y.pkl')
)

In [3]:
# Hold out 30% of the rows for validation.
# The split is seeded so that re-running the notebook yields the same
# train/hold-out partition (and therefore comparable metrics); without a
# seed every run shuffles the rows differently.
SPLIT_SEED = 42
tb_f_trainX, tb_f_testX, tb_f_trainY, tb_f_testY = train_test_split(
    tb_data_fe,
    tb_data_Y,
    test_size=0.3,
    random_state=SPLIT_SEED,
)

In [4]:
# Sanity check: shapes of the full data, then of the train and hold-out parts.
tuple(
    part.shape
    for part in (tb_data_fe, tb_data_Y,
                 tb_f_trainX, tb_f_trainY,
                 tb_f_testX, tb_f_testY)
)

((600000, 80), (600000,), (420000, 80), (420000,), (180000, 80), (180000,))

In [11]:
# Fully connected binary classifier: three hidden layers of 52 ReLU units
# followed by a single sigmoid output unit.
# The input width (80) has to match the number of features kept by the
# 'select' step of the preprocessing pipeline (k=80).
model = Sequential([
    Dense(units=52, input_shape=[80], activation=tf.nn.relu),
    Dense(units=52, activation=tf.nn.relu),
    Dense(units=52, activation=tf.nn.relu),
    Dense(units=1, activation=tf.nn.sigmoid),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 52)                4212      
_________________________________________________________________
dense_11 (Dense)             (None, 52)                2756      
_________________________________________________________________
dense_12 (Dense)             (None, 52)                2756      
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 53        
Total params: 9,777
Trainable params: 9,777
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Stop training once the monitored quantity has failed to improve by at least
# 0.001 for 10 consecutive epochs, and roll the model back to its best weights.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',  # the Keras default, spelled out for readability
    min_delta=0.001,
    patience=10,
    restore_best_weights=True,
)

# NOTE(review): the hold-out split passed as validation_data here is also the
# split scored in the ROC/AUC cell, so that hold-out AUC is not fully
# independent of training (early stopping has already seen it).
model_history = model.fit(
    x=tb_f_trainX,
    y=tb_f_trainY,
    batch_size=128,
    epochs=100,
    validation_data=(tb_f_testX, tb_f_testY),
    callbacks=[early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


In [13]:

# Score both partitions with the trained network, then compare the area under
# the ROC curve on the training part with the one on the hold-out part.
train_scores = model.predict(tb_f_trainX)
holdout_scores = model.predict(tb_f_testX)

fpr_t, tpr_t, thresh_t = roc_curve(tb_f_trainY, train_scores)
fpr, tpr, thresh = roc_curve(tb_f_testY, holdout_scores)

# (train AUC, hold-out AUC)
auc(fpr_t, tpr_t), auc(fpr, tpr)

(0.7390590063351319, 0.7386457862808006)

In [25]:
# Run the test set through the already-fitted preprocessing pipeline.
# NOTE(review): `top` must be the fitted pipeline from the feature-engineering
# cell; only the engineered features and labels are pickled, not the pipeline,
# so this cell depends on kernel state after a restart.
trans_data = top.transform(tb_test)
# Display the transformed array as a quick visual check.
trans_data

array([[0.8354811 , 0.84425748, 0.90762651, ..., 0.4440909 , 0.7867716 ,
        0.68700636],
       [0.83701809, 0.29304954, 0.69375135, ..., 0.68175982, 0.68667289,
        0.82702683],
       [0.9979642 , 0.55584032, 0.68191053, ..., 0.63458976, 0.7618954 ,
        0.76260336],
       ...,
       [0.91343771, 0.5674381 , 0.7257823 , ..., 0.02102096, 1.        ,
        0.83088971],
       [1.        , 0.67689124, 0.78345354, ..., 0.24430531, 0.6960673 ,
        0.78498785],
       [0.92569181, 0.585279  , 0.86814269, ..., 0.22375937, 0.71845969,
        0.80751571]])

### Submission 06: neural-network baseline

In [31]:
# Predict with the trained network on the transformed test features; the
# sigmoid output layer yields one score per row.
nn_predictions = model.predict(trans_data)
# Display the predictions as a quick visual check.
nn_predictions

array([[0.73354024],
       [0.60515183],
       [0.8091847 ],
       ...,
       [0.51023966],
       [0.65409553],
       [0.59418863]], dtype=float32)

In [32]:
# Assemble the submission table: one 'target' score per test-set id, then
# write it out as CSV (the id index is written as the first column).
submission_path = '../data/nn_prediction_baseline.csv'
temp_nn = pd.DataFrame(
    nn_predictions,
    columns=['target'],
    index=tb_test.index,
)
temp_nn.to_csv(submission_path)