In [1]:
import pandas as pd
from helpers.helper_functions import load_data, get_scaled_data
from sklearn.preprocessing import OneHotEncoder, StandardScaler


## Load data

In [2]:
# Raw train/test frames come from the project helper, pointed at the data directory.
train, test = load_data('data')

# Features are read from a pre-built CSV indexed by `id`; labels are taken from the raw train frame.
# NOTE(review): nothing here checks that the CSV rows and `train.target` have the same
# length and order -- confirm before slicing both positionally.
X_train = pd.read_csv('data/X_prepped.csv', index_col='id')
y_train = train.target

# The test frame gets a fresh 0..n-1 index before going through the project helper.
# NOTE(review): presumably `get_scaled_data` applies the same preprocessing that produced
# X_prepped.csv -- verify in helpers.helper_functions.
X_test = get_scaled_data(test.reset_index(drop=True), is_test=True)

In [3]:
# Preview the first five rows of the prepared test features.
X_test.head(n=5)

Unnamed: 0,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,...,10_F,10_G,10_H,10_I,10_J,10_K,10_L,10_M,10_N,10_O
0,0.443296,0.173355,-1.000476,0.763976,0.187318,-1.075194,0.502626,6,6,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.605986,-0.307128,0.626171,-0.577429,-1.75007,1.355436,-0.190213,1,3,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.304615,2.445921,0.245214,0.819474,0.360241,-1.332297,1.359411,3,3,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.154511,0.25917,-1.367563,-0.091791,-1.110279,-0.948885,1.119995,0,0,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.653458,-0.425775,-0.668187,-0.3207,-0.088877,0.181443,1.785797,2,2,2,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Preview the first five rows of the prepared training features (indexed by `id`).
X_train.head(n=5)

Unnamed: 0_level_0,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,...,10_F,10_G,10_H,10_I,10_J,10_K,10_L,10_M,10_N,10_O
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-1.37449,0.237914,-0.244425,0.568674,-0.647037,0.839148,0.113849,1,5,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.699197,-1.712872,-2.23036,-0.544198,1.113558,-1.552654,0.448561,1,3,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.683885,0.616078,-1.028335,0.811719,-0.608415,0.113695,-0.707992,1,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.118017,-0.589476,-0.805398,2.087827,0.371515,-0.129132,-0.281882,3,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.150047,-0.177876,-0.665703,-1.099783,0.468368,0.499896,0.408249,3,3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model definition and cross-validated training

In [5]:
from tensorflow import keras
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from helpers.train_helpers import BATCH_SIZE, EPOCHS

In [6]:

# Number of feature columns fed to the first layer, taken from the test matrix.
# NOTE(review): assumes X_train has the same column count as X_test -- confirm.
INPUT_SHAPE = X_test.shape[1]

# Feed-forward binary classifier: 100 -> 50 swish units, single sigmoid output.
model_3 = keras.models.Sequential(name="Dense_model_3")
model_3.add(keras.layers.Dense(100, activation="swish", input_dim=INPUT_SHAPE))
model_3.add(keras.layers.Dense(50, activation="swish"))
model_3.add(keras.layers.Dense(1, activation="sigmoid"))

In [9]:
def test_predictor(model_in, X_train, y_train, X_test, n_folds = 5):
    early_stopping = keras.callbacks.EarlyStopping(
                    patience=20, monitor="val_loss", restore_best_weights=True, verbose = 1
                )
    learn_reducer = keras.callbacks.ReduceLROnPlateau(monitor = 'val_loss', factor = 0.7, patience = 5, verbose = 1)
    kf = KFold(n_folds)
    store = []

    model_in.summary()

    for fold, (train_idx, val_idx) in enumerate(
        kf.split(X_train)
    ):
        
        print(f"Fitting fold {fold} for {model_in.name}...")
        model = keras.models.clone_model(model_in)
        model.compile(
            optimizer="adam", loss="binary_crossentropy", metrics=[keras.metrics.AUC()]
        )

        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model.fit(
            X_tr,
            y_tr,
            validation_data=(X_val, y_val),
            epochs=EPOCHS,
            verbose=1,
            batch_size=BATCH_SIZE,
            callbacks=[
                early_stopping, learn_reducer
            ],
        )
        auc = roc_auc_score(y_val, model.predict(X_val).squeeze())
        print(f"The val auc for fold {fold}, {model_in.name} is {auc}")
        store.append(model.predict(X_test).squeeze())

    result = sum(store) / n_folds 
    return result

In [10]:
# Run 5-fold training of model_3 and keep the fold-averaged test predictions.
preds = test_predictor(
    model_in=model_3,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    n_folds=5,
)

Model: "Dense_model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 100)               14800     
                                                                 
 dense_1 (Dense)             (None, 50)                5050      
                                                                 
 dense_2 (Dense)             (None, 1)                 51        
                                                                 
Total params: 19,901
Trainable params: 19,901
Non-trainable params: 0
_________________________________________________________________
Fitting fold 0 for Dense_model_3...
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400


IndexError: positional indexers are out-of-bounds

In [None]:
# Start from the sample submission and attach the rounded predictions as a 'state' column.
# NOTE(review): the training labels are read from `train.target` and validation is scored
# with ROC AUC, yet this writes hard 0/1 labels into a column called 'state' -- confirm
# the column name and whether probabilities should be submitted instead.
sub = pd.read_csv('data/sample_submission.csv').assign(state=preds.round())

In [None]:
import os

# makedirs(..., exist_ok=True) creates the folder (and any missing parent) and does
# nothing when it already exists, replacing the separate exists()/mkdir() check.
os.makedirs('data/submissions', exist_ok=True)

# Write the submission without the DataFrame index column.
sub.to_csv('data/submissions/nn_sub.csv', index = False)