In [1]:
import os # for detecting CPU cores
import configparser # to load standard config and parameters
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

import tensorflow as tf
from tensorflow import keras

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from
# the libraries used below; narrow the filter if that is not intended.
warnings.filterwarnings('ignore')
# Load the `watermark` IPython extension (no %watermark call is visible in
# the cells shown here).
%load_ext watermark
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Load external config file
CONFIG_FILE = "../src/config.ini"
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of
# files it actually parsed. Fail here with a clear message instead of a
# KeyError on the first section lookup below.
if not config.read(CONFIG_FILE):
    raise FileNotFoundError(f"Config file not found or unreadable: {CONFIG_FILE}")

# Project directories (all taken from the [PATHS] section)
PATH_DATA_RAW = config["PATHS"]["PATH_DATA_RAW"]
PATH_DATA_INT = config["PATHS"]["PATH_DATA_INT"]
PATH_DATA_PRO = config["PATHS"]["PATH_DATA_PRO"]
PATH_REPORTS = config["PATHS"]["PATH_REPORTS"]
PATH_MODELS = config["PATHS"]["PATH_MODELS"]
PATH_SUB = config["PATHS"]["PATH_SUB"]

# Telegram Bot (credentials are read from the config file, not hardcoded)
token = config["TELEGRAM"]["token"]
chat_id = config["TELEGRAM"]["chat_id"]
FILENAME_NB = "keras" # for Telegram messages

# Set global random state (NumPy and TensorFlow are seeded separately)
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

# Define available cpu cores
n_cpu = os.cpu_count()
print("Number of CPUs used:", n_cpu)

Number of CPUs used: 16


In [3]:
### check gpu before training
# Ask TensorFlow for the physical GPU devices it can see and report the count.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Tensorflow Number of GPUs Available: ", len(gpu_devices))

Tensorflow Number of GPUs Available:  1


# Load Data

In [4]:
# INTERIM OPT
# Train and test come from the memory-optimised interim pickles; the raw
# train/test CSVs are not parsed because the pickles supersede them.
# NOTE: read_pickle can execute arbitrary code -- only load pickles that
# were produced by this project.
train_df = pd.read_pickle(PATH_DATA_INT+'train-opt.pkl')
test_df = pd.read_pickle(PATH_DATA_INT+'test-opt.pkl')

# RAW (the sample submission only exists as a CSV)
sample_df = pd.read_csv(PATH_DATA_RAW+'sample_submission.csv')

# Memory Usage
train_df.info(memory_usage="deep")
print()
test_df.info(memory_usage="deep")

# Preparing features and target
X = train_df.drop(['id','target'], axis=1).copy()
y = train_df['target'].copy()
X_test = test_df.drop(['id'], axis=1).copy()

# Feature names are taken from the training feature frame itself.
features_num = X.columns
feature_cols = features_num.to_list()

# The scaler and the model assume identical feature columns in train and test.
assert X_test.columns.to_list() == feature_cols, "train/test feature columns differ"

X.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Columns: 102 entries, id to target
dtypes: float32(100), int32(1), int8(1)
memory usage: 231.7 MB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540000 entries, 0 to 539999
Columns: 101 entries, id to f99
dtypes: float32(100), int32(1)
memory usage: 208.1 MB


Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,0.106643,3.59437,132.804001,3.18428,0.081971,1.18859,3.73238,2.26627,2.09959,0.01233,...,0.010739,1.09862,0.013331,-0.011715,0.052759,0.0654,4.21125,1.97877,0.085974,0.240496
1,0.125021,1.67336,76.5336,3.37825,0.0994,5.09366,1.27562,-0.471318,4.54594,0.037706,...,0.135838,3.46017,0.017054,0.124863,0.154064,0.606848,-0.267928,2.57786,-0.020877,0.024719
2,0.03633,1.49747,233.546005,2.19435,0.026914,3.12694,5.05687,3.84946,1.80187,0.056995,...,0.11731,4.883,0.085222,0.032396,0.116092,-0.001689,-0.520069,2.14112,0.124464,0.148209
3,-0.014077,0.246,779.96698,1.89064,0.006948,1.53112,2.698,4.51733,4.50332,0.123494,...,-0.015347,3.47439,-0.017103,-0.0081,0.062013,0.041193,0.511657,1.9686,0.040017,0.044873
4,-0.003259,3.71542,156.128006,2.14772,0.018284,2.09859,4.15492,-0.038236,3.37145,0.034166,...,0.013781,1.91059,-0.042943,0.105616,0.125072,0.037509,1.04379,1.07481,-0.012819,0.072798


# Standardize / Scale

In [5]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

### standardize data
# Drop-in alternatives: MinMaxScaler(), RobustScaler()
scaler = StandardScaler()

# Learn the scaling statistics from the training features only, then apply
# the same fitted transformation to both the training and the test features.
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Modeling

In [6]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

### define callbacks
# Stop training once validation AUC has not improved for 20 epochs and
# restore the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=20,
    min_delta=0,
    baseline=None,
    restore_best_weights=True,
    verbose=0,
)

# Multiply the learning rate by 0.2 when validation AUC stalls for 5 epochs.
reduce_lr = ReduceLROnPlateau(monitor='val_auc', mode='max', factor=0.2, patience=5)


In [7]:
#from tensorflow.keras import Sequential, Model

In [12]:
def create_model(input_dim=None):
    """Build and compile the dense binary classifier.

    The network stacks five fully connected ``swish`` layers
    (128 -> 64 -> 32 -> 16 -> 8 units), each followed by dropout with
    rate 0.2, and ends in a single sigmoid output unit.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, the column count of the
        notebook-level feature frame ``X`` is used.

    Returns
    -------
    keras.models.Sequential
        Model compiled with Adam (learning rate 1e-3), binary cross-entropy
        with label smoothing 1e-3, and an AUC metric named ``"auc"``.
    """
    if input_dim is None:
        # Derive the input width from the training features instead of
        # depending on a separately maintained global variable.
        input_dim = X.shape[1]

    hidden_units = (128, 64, 32, 16, 8)
    dropout_rate = 0.2

    model = keras.models.Sequential()
    model.add(keras.Input(shape=(input_dim,)))
    for units in hidden_units:
        model.add(keras.layers.Dense(units=units, activation="swish"))
        model.add(keras.layers.Dropout(dropout_rate))
    model.add(keras.layers.Dense(units=1, activation="sigmoid"))

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=keras.losses.BinaryCrossentropy(label_smoothing=1e-3),
        # Explicit name: the callbacks monitor "val_auc", and auto-generated
        # metric names can receive a numeric suffix ("auc_1", ...) when
        # several models are built in the same session.
        metrics=[keras.metrics.AUC(name="auc")],
    )

    return model


In [11]:
from sklearn.model_selection import StratifiedKFold

# Training hyper-parameters
EPOCHS = 100        # upper bound on training epochs
BATCH_SIZE = 1024   # samples per gradient update
VERBOSE = 0         # Keras verbosity level
N_SPLITS = 10       # number of stratified folds

# Cross-validation: shuffled, stratified folds with a fixed seed
cv = StratifiedKFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=seed,
)

In [None]:
# Hold out one stratified fold (the first split produced by `cv`) as labelled
# validation data. The test frame `X_test` has no target column, so it cannot
# serve as validation data.
train_idx, valid_idx = next(cv.split(X, y))
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

model = create_model()
history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stopping, reduce_lr],
    verbose=VERBOSE,
)

# Metrics of the final epochs (training output itself is controlled by VERBOSE)
pd.DataFrame(history.history).tail()