# Submission 3: Denoising Autoencoder

## Introduction 
This denoising autoencoder (DAE) architecture is inspired by Danzel's first-place solution to the January Tabular Playground competition: [1st place - turn your data into DAEta](https://www.kaggle.com/springmanndaniel/1st-place-turn-your-data-into-daeta/report). It is speculated that this approach works well because the competition data was itself created artificially, with noise, using CTGAN.

In [1]:
# fundamentals
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd 
import numpy as np
import scipy
import math

# data exploration 
from pandas_profiling import ProfileReport
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=True, world_readable=True)
from plotly.offline import iplot

# data preprocessing 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, PowerTransformer, MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

#tensorflow
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.applications import EfficientNetB3
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# hyperparameter tuning 
import kerastuner as kt


# metrics for evaluation
from sklearn.metrics import mean_squared_error
from scipy import stats

# saving parameters
from joblib import dump, load

# hyperparameter searching and tuning 
import optuna
import tqdm

In [2]:
# fixing every random seed for reproducibility
import random

import tensorflow as tf

random.seed(0)
np.random.seed(0)
# TensorFlow keeps its own global generator (used by Keras for weight
# initialisation and dropout); the two seeds above do not control it.
tf.random.set_seed(0)

In [3]:
import joblib

### model.rmse

In [4]:
# Load the pre-computed encoded training features from disk.
# presumably written by an earlier DAE notebook — verify the file's origin
encoded_X_train = joblib.load(filename="encoded_X_train.joblib")

In [5]:
# Load the pre-computed encoded test features from disk.
# presumably laid out like the training array — TODO confirm
encoded_X_test = joblib.load(filename="encoded_X_test.joblib")

In [6]:
# Peek at the first encoded row of the first stack.
encoded_X_train[0, 0]

array([    0.     ,     0.     ,     0.     ,     0.     ,     0.     ,
           0.     , 37180.562  , 24219.914  ,     0.     ,     0.     ,
       20001.055  , 14856.365  , 23469.883  ,     0.     ,     0.     ,
           0.     ,     0.     ,    78.19136,     0.     ,     0.     ,
           0.     ,     0.     ,     0.     , 58539.926  ,     0.     ,
           0.     ,     0.     ,     0.     ,     0.     ,     0.     ,
           0.     , 36162.508  ,     0.     , 67226.34   ,     0.     ,
           0.     ,     0.     ,     0.     ,     0.     ,     0.     ,
       43411.902  , 63334.99   ,  7321.4873 , 25025.104  ,     0.     ,
           0.     ,     0.     ,     0.     ,     0.     ,     0.     ,
       10153.46   , 12536.168  ,     0.     ,     0.     ,     0.     ,
           0.     ,     0.     ,     0.     ,     0.     , 12306.902  ,
           0.     ,     0.     ,     0.     , 37416.43   ], dtype=float32)

In [7]:
# Shape of a single stack: (rows, encoded features).
encoded_X_train.shape[1:]

(300000, 64)

### Optuna 

In [8]:
# Read the raw training table, using the `id` column as the row index.
original_df = pd.read_csv(filepath_or_buffer='train.csv', index_col='id')

In [9]:
# Split the raw table into the feature frame and the regression target.
Y_train = original_df['target']
X_train = original_df.drop(columns='target')

In [10]:
# Sanity check: a zero buffer shaped like the target is one-dimensional.
np.shape(np.zeros_like(Y_train))

(300000,)

In [11]:
# Full layout of the encoded training data: (stacks, rows, encoded features).
np.shape(encoded_X_train)

(3, 300000, 64)

In [12]:
class KFoldsAverageMLP():
    """Cross-validated multi-input MLP whose hyperparameters come from an Optuna trial.

    ``fit`` trains one freshly initialised network per fold and collects
    out-of-fold (OOF) predictions; ``predict`` averages the predictions of
    every fold model.

    Attributes set by ``fit``:
        models: list holding one trained Keras model per fold.
        oof_preds: 1-D array of out-of-fold predictions aligned with ``train_y``.
        rmse: root mean squared error of ``oof_preds`` against ``train_y``.
    """

    def __init__(self, FOLDS):
        """Create the fold splitter.

        Args:
            FOLDS: number of cross-validation folds.
        """
        self.models = []
        # shuffle=False keeps the folds as contiguous, deterministic blocks
        self.kfolds = KFold(n_splits = FOLDS, shuffle = False)

    @staticmethod
    def _make_callbacks():
        """Return fresh training callbacks so no state is shared between folds."""
        # stop once val_loss has not improved by at least 0.001 for 5 epochs,
        # then roll back to the best epoch's weights
        early_stop = EarlyStopping(monitor = 'val_loss', min_delta = 0.001,
                                   patience = 5, mode = 'min', verbose = 10,
                                   restore_best_weights = True)
        # multiply the learning rate by 0.1 after 3 epochs without improvement
        reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.1,
                                      patience = 3, min_delta = 0.001,
                                      mode = 'min', verbose = 10)
        return [early_stop, reduce_lr]

    @staticmethod
    def _sample_params(trial):
        """Draw every tunable hyperparameter from ``trial`` exactly once."""
        params = {}
        params['input_dense'] = trial.suggest_int('input_dense_layer', 16, 1024, log = True)
        params['input_dropout'] = trial.suggest_loguniform('input_dropout', 1e-5, 1)
        num_layers = trial.suggest_int('num_layers', 2, 20)
        # Learning rate for Adam, chosen from 0.999, 0.1, 0.05 or 0.01.
        # NOTE(review): 0.999 is an unusually large step size for Adam —
        # confirm it is intended.
        params['learning_rate'] = trial.suggest_categorical('learning_rate', [0.999, 1e-1, 5e-2, 1e-2])
        params['combined_dense'] = trial.suggest_int('combined_dense_layer', 2, 2048, log = True)
        params['combined_dropout'] = trial.suggest_loguniform('combined_dropout', 1e-5, 1)

        hidden_layers = []
        for i in range(num_layers):
            num_hidden = trial.suggest_int(f'n_units_l{i}', 4, 1024, log = True)
            dropout_rate = trial.suggest_loguniform(f'dropout_rate{i}', 1e-5, 1)
            hidden_layers.append((num_hidden, dropout_rate))
        params['hidden_layers'] = hidden_layers
        return params

    @staticmethod
    def _build_model(stack_num, encoded_features, params):
        """Build and compile a new, untrained network from sampled ``params``.

        Each of the ``stack_num`` inputs passes through its own Dense + Dropout
        branch; the branches are concatenated and fed through the shared
        hidden layers down to a single linear output.
        """
        inputs = []
        branches = []
        for stack_idx in range(1, stack_num + 1):
            stack_input = keras.Input(shape = (encoded_features,),
                                      name = f'deepstack layer {stack_idx} input')
            branch = Dense(params['input_dense'], activation = 'relu')(stack_input)
            branch = Dropout(params['input_dropout'])(branch)
            inputs.append(stack_input)
            branches.append(branch)

        # merge the per-stack branches into one feature vector
        merged = keras.layers.Concatenate(axis=1)(branches)
        combined = Dense(params['combined_dense'], activation = 'relu', name = 'combining_dense')(merged)
        piped_data = Dropout(params['combined_dropout'])(combined)

        for num_hidden, dropout_rate in params['hidden_layers']:
            piped_data = Dense(num_hidden, activation='relu')(piped_data)
            piped_data = Dropout(rate=dropout_rate)(piped_data)

        output = Dense(units=1, activation = 'linear')(piped_data)
        MLP = keras.Model(inputs, output)
        MLP.compile(optimizer = Adam(learning_rate = params['learning_rate']),
                    loss = 'mse',
                    metrics=[keras.metrics.RootMeanSquaredError()])
        return MLP

    def fit(self, trial, train_x, train_y, prune = True):
        """Train one model per fold and record the out-of-fold RMSE.

        Args:
            trial: Optuna trial used to sample the hyperparameters.
            train_x: array of shape (stacks, rows, encoded features); each
                stack feeds its own input branch.
            train_y: regression target with one value per row (pandas Series
                or array-like).
            prune: when True, report each fold's validation RMSE to the trial
                and stop early if the study's pruner asks for it.

        Raises:
            optuna.TrialPruned: if ``prune`` is True and the pruner decides
                to stop the trial after a fold.
        """
        self.train_x = train_x
        self.train_y = np.asarray(train_y)
        # start from a clean slate so repeated fits do not pile up models
        self.models = []

        stack_num, num_data, encoded_features = self.train_x.shape

        # hyperparameters are fixed for the whole trial; only weights differ per fold
        params = self._sample_params(trial)

        oof_preds = np.zeros(num_data, dtype = np.float64)

        for fold, (train_idx, val_idx) in enumerate(self.kfolds.split(self.train_x[0])):
            # one array per stack, in the same order as the model inputs
            X_train_CV = [self.train_x[s][train_idx] for s in range(stack_num)]
            X_val_CV = [self.train_x[s][val_idx] for s in range(stack_num)]
            Y_train_CV, Y_val_CV = self.train_y[train_idx], self.train_y[val_idx]

            # a new model per fold: reusing one model would let it train on
            # rows that later serve as its validation data
            MLP = self._build_model(stack_num, encoded_features, params)
            MLP.fit(x = X_train_CV,
                    y = Y_train_CV,
                    epochs = 1000,
                    verbose = 1,
                    validation_data = (X_val_CV, Y_val_CV),
                    callbacks = self._make_callbacks())
            self.models.append(MLP)

            # predict() returns shape (rows, 1); flatten so that every
            # validation row receives its own prediction
            fold_pred = MLP.predict(X_val_CV).ravel()
            oof_preds[val_idx] = fold_pred

            if prune:
                fold_rmse = float(np.sqrt(mean_squared_error(Y_val_CV, fold_pred)))
                trial.report(fold_rmse, step = fold)
                if trial.should_prune():
                    raise optuna.TrialPruned()

        self.oof_preds = oof_preds
        # sqrt of the MSE works on every scikit-learn version, unlike squared=False
        self.rmse = float(np.sqrt(mean_squared_error(self.train_y, oof_preds)))

    def predict(self, test_x):
        """Average the predictions of all fold models.

        Args:
            test_x: list with one array per stack, or a single array of shape
                (stacks, rows, encoded features) like the one given to ``fit``.

        Returns:
            Element-wise mean of the fold models' predictions.

        Raises:
            RuntimeError: if no model has been fitted yet.
        """
        if not self.models:
            raise RuntimeError('fit() must be called before predict()')
        # the models expect one array per input, so split a stacked array
        if isinstance(test_x, np.ndarray) and test_x.ndim == 3:
            test_x = list(test_x)
        preds = [model.predict(test_x) for model in tqdm.tqdm(self.models)]
        return np.mean(preds, axis=0)

In [13]:
def objective_keras(trial):
    """Optuna objective: fit a 5-fold MLP for ``trial`` and return its RMSE."""
    cv_model = KFoldsAverageMLP(5)
    cv_model.fit(trial, encoded_X_train, Y_train, prune = True)
    return cv_model.rmse

In [14]:
# Minimise the value returned by objective_keras over 50 trials.
mlp_study = optuna.create_study(direction="minimize", pruner = optuna.pruners.HyperbandPruner())
mlp_study.optimize(objective_keras, n_trials=50)
# report the best trial of this study
print(mlp_study.best_trial)

[32m[I 2021-02-21 12:15:30,543][0m A new study created in memory with name: no-name-e761c9af-1b25-463e-bd75-d27ff2ec1bfa[0m


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0009999999776482583.
Epoch 8/1000
Epoch 9/1000
Restoring model weights from the end of the best epoch.
Epoch 00009: early stopping
Epoch 1/1000
Epoch 2/1000

KeyboardInterrupt: 

In [93]:
from tensorflow import keras