In [1]:
from __future__ import print_function
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

import seaborn as sns
import gc

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score
from scipy.stats import norm, rankdata

import keras
from keras import regularizers
from keras.layers import Input,Dropout,BatchNormalization,Activation,Add,PReLU, LSTM
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Reshape, Flatten, Conv2D, MaxPooling2D
from keras.optimizers import SGD
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
import tensorflow as tf
import horovod.keras as hvd

Using TensorFlow backend.


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


In [2]:
# reduce memory
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [3]:
# Horovod: initialize Horovod.
hvd.init()

# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
K.set_session(tf.Session(config=config))

In [4]:
train = reduce_mem_usage(pd.read_csv('../input/train.csv'))
test = reduce_mem_usage(pd.read_csv('../input/test.csv'))

Memory usage after optimization is: 78.01 MB
Decreased by 74.7%
Memory usage after optimization is: 77.82 MB
Decreased by 74.6%


In [5]:
features = [f for f in train if f not in ['ID_code','target']]

In [6]:
df_original = pd.concat([train, test],axis=0,sort=False)
df = df_original[features]
target = df_original['target'].values
id = df_original['ID_code']

In [7]:
#for feature in features:
#    df['mean_'+feature] = (train[feature].mean()-train[feature])
#    df['z_'+feature] = (train[feature] - train[feature].mean())/train[feature].std(ddof=0)
#    df['sq_'+feature] = (train[feature])**2
#    df['sqrt_'+feature] = np.abs(train[feature])**(1/2)
#    df['cp_'+feature] = pd.DataFrame(rankdata(train[feature]))
#    df['cnp_'+feature] = pd.DataFrame((norm.cdf(train[feature])))
#df = reduce_mem_usage(df)

In [8]:
for df in [df]:
#####Handling Missing Values#####     
    for i in range(len(df.columns)):
        df.iloc[:,i] = (df.iloc[:,i]).fillna(-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [9]:
df.isnull().values.any()

False

In [10]:
from scipy.special import erfinv
trafo_columns = [c for c in df.columns if len(df[c].unique()) != 2]
for col in trafo_columns:
    values = sorted(set(df[col]))
    # Because erfinv(1) is inf, we shrink the range into (-0.9, 0.9)
    f = pd.Series(np.linspace(-0.9, 0.9, len(values)), index=values)
    f = np.sqrt(2) * erfinv(f)
    f -= f.mean()
    df[col] = df[col].map(f)

df = reduce_mem_usage(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Memory usage after optimization is: 155.64 MB
Decreased by 74.6%


In [11]:
#from sklearn.decomposition import PCA
#pca = PCA(n_components=200)
#pca.fit(df[trafo_columns])
#df = pca.transform(df[trafo_columns])
#df = pd.DataFrame(df)

In [12]:
df['target'] = df_original.target.values
df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199,target
0,0.276123,-1.248047,0.492676,0.178345,0.301514,-1.103516,0.08905,0.470459,-1.241211,-0.489014,...,0.264893,1.095703,0.69043,0.287354,-1.189453,1.255859,0.127563,-0.142212,-0.439697,0.0
1,0.544922,-1.052734,0.788086,0.232422,0.491943,1.179688,0.271484,0.289795,1.054688,0.565918,...,0.720703,1.007812,1.376953,-0.075195,1.229492,1.270508,0.197388,0.609375,0.682129,0.0
2,0.245361,-0.890137,0.516602,0.739258,0.126343,-1.097656,0.821289,0.021896,-1.241211,-0.388916,...,0.89209,0.833008,0.689453,0.625488,1.481445,-1.262695,0.036865,0.174316,0.313965,0.0
3,0.49707,-0.824707,0.111084,0.583984,0.54248,-0.621582,0.354492,0.07074,-1.325195,0.606445,...,0.355225,0.544922,0.640137,0.805664,-0.897949,-0.942871,0.70752,0.571777,-0.994141,0.0
4,0.368164,-0.703613,0.632324,0.469238,0.4729,0.818848,0.391846,0.526855,1.381836,0.387207,...,0.871094,-0.407227,1.296875,-0.51709,-0.96582,0.975586,0.424805,0.574707,-0.98877,0.0


In [13]:
#train = df[df['target'].notnull()]
#target = train['target']
#test = df[df['target'].isnull()]
#trafo_columns = [c for c in train.columns if c not in ['target']]
#train.shape

In [14]:
trafo_columns = [c for c in df.columns if c not in ['target']]

In [15]:
from keras import backend as K
from keras.activations import elu
from keras.layers import Input, Dense, Lambda
from keras.models import Model
from keras.objectives import binary_crossentropy
from keras.callbacks import LearningRateScheduler
from keras import backend as K
from imblearn.keras import balanced_batch_generator
from imblearn.under_sampling import NearMiss, RandomUnderSampler, CondensedNearestNeighbour, AllKNN, InstanceHardnessThreshold
from sklearn.model_selection import KFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from keras.utils import multi_gpu_model
import math

learning_rate = 0.0001
mom = 0.15
dcy = 0.996
nb_folds = 4
nb_epoch = int(math.ceil(200.0 / hvd.size()))
batch_size = 128
encoding_dim =1500
hidden_dim = int(encoding_dim) #i.e. 7
sgd = SGD(lr=learning_rate, momentum=mom, decay=dcy)
#folds = StratifiedKFold(n_splits=nb_folds, shuffle=True, random_state=420)
folds = KFold(n_splits = nb_folds, random_state = 338, shuffle = True)
auto = np.zeros(df[trafo_columns].shape)
layer_output = np.zeros((len(df), encoding_dim)) # change when nn shape changes
#layer_output = np.zeros(df[trafo_columns].shape)
#train_auto = np.zeros(train[trafo_columns].shape)
#test_auto = np.zeros(test[trafo_columns].shape)
predictions = np.zeros(len(df))
#label_cols = ["target"]
#y_split = train[label_cols].values

# Horovod: adjust learning rate based on number of GPUs.
opt = keras.optimizers.SGD(lr=(learning_rate * hvd.size()), decay=dcy, momentum=mom, nesterov=True)
# Horovod: add Horovod Distributed Optimizer.
opt = hvd.DistributedOptimizer(opt)

cp = ModelCheckpoint(filepath="autoencoder_0.h5",
                               save_best_only=True,
                               verbose=0)

tb = TensorBoard(log_dir='./logs',
                histogram_freq=0,
                write_graph=True,
                write_images=True)

es= EarlyStopping(monitor='val_acc',
                  min_delta=0,
                  patience=10,
                  verbose=1, mode='min')


    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
bgvc= hvd.callbacks.BroadcastGlobalVariablesCallback(0)

    # Horovod: average metrics among workers at the end of every epoch.
    #
    # Note: This callback must be in the list before the ReduceLROnPlateau,
    # TensorBoard or other metrics-based callbacks.
mac= hvd.callbacks.MetricAverageCallback()

    # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
    # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
    # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
lrwc = hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1)

    # Reduce the learning rate if training plateaues.
rlp = keras.callbacks.ReduceLROnPlateau(patience=5, verbose=1)


#for fold_, (trn_idx, val_idx) in enumerate(folds.split(y_split[:,0], y_split[:,0])):
#    print("fold {}".format(fold_))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(df)):
    print("fold {}".format(fold_))

    trn_data = df[trafo_columns].iloc[trn_idx]
    val_data = df[trafo_columns].iloc[val_idx]

    def add_noise(series, noise_level):
        return series * (1 + noise_level * np.random.randn(series.shape[1]))
    
    noisy_trn_data = add_noise(trn_data, 0.05)

    input_dim = noisy_trn_data.shape[1] #num of columns, 30
    input_layer = Input(shape=(input_dim, ))
    encoder = Dense(encoding_dim, activation="tanh", activity_regularizer=regularizers.l1(learning_rate))(input_layer)
    encoder = BatchNormalization()(encoder)
    encoder = Dense(hidden_dim, activation="relu")(encoder)
    decoder = Dense(hidden_dim, activation='relu')(encoder)
    decoder = Dense(input_dim, activation='tanh')(decoder)
    autoencoder = Model(inputs=input_layer, outputs=decoder)    
#    model = Model(inputs=input_layer, outputs=decoder)
#   autoencoder = multi_gpu_model(model, gpus=2)
    autoencoder.summary()
    
    autoencoder.compile(metrics=['accuracy'],
                        loss='mean_squared_error',
                        optimizer='sgd')

    cp = ModelCheckpoint(filepath="autoencoder_fraud.h5",
                                   save_best_only=True,
                                   verbose=0)

    tb = TensorBoard(log_dir='./logs',
                    histogram_freq=0,
                    write_graph=True,
                    write_images=True)
    
    es= EarlyStopping(monitor='val_acc',
                  min_delta=0,
                  patience=12,
                  verbose=1, mode='auto')

    history = autoencoder.fit(noisy_trn_data, trn_data,
                        epochs=nb_epoch,
                        batch_size=batch_size,
                        shuffle=True,
                        validation_data=(val_data, val_data),
                        verbose=1,
                        callbacks=[cp, tb, es, bgvc, mac, lrwc, rlp]).history
    
    
    auto[val_idx] += autoencoder.predict(df.iloc[val_idx][trafo_columns], verbose=1)
    mse = autoencoder.predict(df[trafo_columns] / folds.n_splits, verbose=1)
    predictions += np.mean(np.power(df[trafo_columns] - mse, 2), axis=1)
    # we build a new model with the activations of the old model
    # this model is truncated after the first layer
    get_1st_layer_output = K.function([autoencoder.layers[0].input],
                                  [autoencoder.layers[1].output])
    layer_output[val_idx] += pd.DataFrame(np.concatenate(get_1st_layer_output([df.iloc[val_idx][trafo_columns]])))
    
auto_final = pd.DataFrame(auto / folds.n_splits)
hidden = pd.DataFrame(layer_output / folds.n_splits)

fold 0
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1500)              301500    
_________________________________________________________________
batch_normalization_1 (Batch (None, 1500)              6000      
_________________________________________________________________
dense_2 (Dense)              (None, 1500)              2251500   
_________________________________________________________________
dense_3 (Dense)              (None, 1500)              2251500   
_________________________________________________________________
dense_4 (Dense)              (None, 200)               300200    
Total params: 5,110,700
Trainable params: 5,107,700
Non-trainable params: 3,000
_______________________________________________________

In [16]:
#hidden
hidden['target'] = target
hidden['ID_code'] = id.values
hidden.head(5)
#final
auto_final['target'] = target
auto_final['ID_code'] = id.values
auto_final.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,192,193,194,195,196,197,198,199,target,ID_code
0,0.095245,-0.232096,0.114008,0.052224,0.060247,-0.235883,0.008829,0.152451,-0.234009,-0.10907,...,0.219451,0.166259,0.07768,-0.242313,0.239918,0.04223,-0.002304,-0.158073,0.0,train_0
1,0.144771,-0.237629,0.152369,0.055982,0.128035,0.232224,0.065603,0.06517,0.199946,0.129121,...,0.209524,0.240359,0.014283,0.23691,0.240021,0.041866,0.077078,0.190503,0.0,train_1
2,0.071671,-0.22784,0.108156,0.158549,0.049578,-0.226953,0.160241,0.010538,-0.239851,-0.109305,...,0.188004,0.208677,0.156755,0.241404,-0.221771,-0.000449,-0.004868,0.094834,0.0,train_2
3,0.133116,-0.193924,0.02262,0.147084,0.127124,-0.208932,0.078299,0.025473,-0.230423,0.13954,...,0.179277,0.19062,0.168366,-0.1847,-0.201374,0.159318,0.113921,-0.193304,0.0,train_3
4,0.115653,-0.181926,0.145287,0.120495,0.09648,0.16727,0.079684,0.161471,0.240113,0.10496,...,-0.106068,0.227203,-0.122813,-0.231156,0.226698,0.109604,0.149273,-0.227088,0.0,train_4


In [17]:
%whos DataFrame

Variable         Type         Data/Info
---------------------------------------
auto_final       DataFrame                   0         <...>00000 rows x 202 columns]
df               DataFrame               var_0     var_<...>00000 rows x 201 columns]
df_original      DataFrame                ID_code  targ<...>00000 rows x 202 columns]
hidden           DataFrame                   0         <...>0000 rows x 1502 columns]
noisy_trn_data   DataFrame               var_0     var_<...>00000 rows x 200 columns]
test             DataFrame                ID_code      <...>00000 rows x 201 columns]
train            DataFrame                 ID_code  tar<...>00000 rows x 202 columns]
trn_data         DataFrame               var_0     var_<...>00000 rows x 200 columns]
val_data         DataFrame               var_0     var_<...>00000 rows x 200 columns]


In [18]:
del df_original, df, noisy_trn_data, test, train, trn_data, val_data
gc.collect()

171

In [19]:
%whos DataFrame

Variable      Type         Data/Info
------------------------------------
auto_final    DataFrame                   0         <...>00000 rows x 202 columns]
df_original   DataFrame                ID_code  targ<...>00000 rows x 202 columns]
hidden        DataFrame                   0         <...>0000 rows x 1502 columns]


In [89]:
#hidden
#ae_columns = [c for c in hidden.columns if c not in ['ID_code', 'target']]
#df_train = hidden[hidden['target'].notnull()]
#target = df_train['target']
#df_test = hidden[hidden['target'].isnull()]
#df_train.shape

#final
ae_columns = [c for c in auto_final.columns if c not in ['ID_code', 'target']]
df_train = auto_final[auto_final['target'].notnull()]
target = df_train['target']
df_test = auto_final[auto_final['target'].isnull()]
df_train.shape

(200000, 202)

In [99]:
df_train['ID_code'] = df_train['ID_code'].astype('category')
df_test['ID_code'] = df_test['ID_code'].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import PredefinedSplit
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

#%% Prepare data
def prepLGB(data,
            classCol='',
            IDCol='',
            fDrop=[]):

        # Drop class column
        if classCol != '':
            labels = data[classCol]
#            fDrop = fDrop + [classCol]
        else:
            labels = []

        if IDCol != '':
            IDs = data[IDCol]
#            fDrop = fDrop + [IDCol]
        else:
            IDs = []

        if fDrop != []:
            data = data.drop(fDrop,
                            axis=1)

        # Create LGB mats
        lData = lgb.Dataset(data, label=labels,
                            free_raw_data=False,
                            feature_name=list(data.columns),
                            categorical_feature=['ID_code'])

        return lData, labels, IDs, data


# Specify columns to drop
fDrop = []

# Split training data in to training and validation sets.
# Validation set is used for early stopping.
trainData, validData = train_test_split(df_train,
                                        test_size=0.3,
                                        stratify=df_train.target)

# Prepare the data sets
trainDataL, trainLabels, trainIDs, trainData = prepLGB(trainData,
                                                       classCol='target',
                                                       IDCol='ID_code',
                                                       fDrop=fDrop)

validDataL, validLabels, validIDs, validData = prepLGB(validData,
                                                       classCol='target',
                                                       IDCol='ID_code',
                                                       fDrop=fDrop)

testDataL, _, _ , testData = prepLGB(df_test,
                                     classCol='target',
                                     IDCol='ID_code',
                                     fDrop=fDrop)

# Prepare data set using all the training data
allTrainDataL, allTrainLabels, _ , allTrainData = prepLGB(df_train,
                                                          classCol='target',
                                                          IDCol='ID_code',
                                                          fDrop=fDrop)

gridParams = {'learning_rate': [0.005],
              'n_estimators': [40],
              'num_leaves': [6,8,12,16],
              'boosting_type' : ['gbdt'],
              'objective' : ['binary'],
              'random_state' : [420], # Updated from 'seed'
              'colsample_bytree' : [0.65, 0.66],
              'subsample' : [0.7,0.75],
              'reg_alpha' : [1,1.2],
              'reg_lambda' : [1,1.2,1.4],}

# Create parameters to search
params = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'objective': 'binary',
          'nthread': 3, # Updated from nthread
          'num_leaves': 64,
          'learning_rate': 0.05,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 5,
          'reg_lambda': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'auc',
          'n_jobs' : -1,
          'device' : 'gpu'}

mdl = lgb.LGBMClassifier(boosting_type= 'gbdt',
          objective = 'binary',
          max_depth = params['max_depth'],
          max_bin = params['max_bin'],
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'],
          subsample_freq = params['subsample_freq'],
          min_split_gain = params['min_split_gain'],
          min_child_weight = params['min_child_weight'],
          min_child_samples = params['min_child_samples'],
          scale_pos_weight = params['scale_pos_weight'])

# To view the default model params:
mdl.get_params().keys()

# Create the grid
grid = GridSearchCV(mdl, gridParams,
                    verbose=10,
                    cv=4,
                    n_jobs=6)
# Run the grid
grid.fit(allTrainData, allTrainLabels)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

# Using parameters already set above, replace in the best from the grid search
params['colsample_bytree'] = grid.best_params_['colsample_bytree']
params['learning_rate'] = grid.best_params_['learning_rate']
# params['max_bin'] = grid.best_params_['max_bin']
params['num_leaves'] = grid.best_params_['num_leaves']
params['reg_alpha'] = grid.best_params_['reg_alpha']
params['reg_lambda'] = grid.best_params_['reg_lambda']
params['subsample'] = grid.best_params_['subsample']
# params['subsample_for_bin'] = grid.best_params_['subsample_for_bin']


Fitting 4 folds for each of 96 candidates, totalling 384 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:    8.0s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:    8.3s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:   20.2s
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:   26.3s
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:   36.1s
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   44.0s
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:   56.6s


In [None]:
print('Fitting with params: ')
print(params)

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import PredefinedSplit
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4590)
oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))
feature_importance_df = pd.DataFrame()
label_cols = ["target"]
y_split = df_train[label_cols].values

for fold_, (trn_idx, val_idx) in enumerate(folds.split(y_split[:,0], y_split[:,0])):
    print("fold {}".format(fold_))

    trn_data = lgb.Dataset(df_train.iloc[trn_idx][ae_columns], label=df_train['target'].iloc[trn_idx])
    val_data = lgb.Dataset(df_train.iloc[val_idx][ae_columns], label=df_train['target'].iloc[val_idx])

    num_round = 100000
    clf = lgb.train(params, trn_data, num_round, valid_sets = [trn_data, val_data], valid_names=['train', 'test'],
                    verbose_eval=100, early_stopping_rounds=200)
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][ae_columns], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(df_test[ae_columns], num_iteration=clf.best_iteration) / 5

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

In [None]:
sub_df = pd.DataFrame({"ID_code":df_test["ID_code"].values})
sub_df['target'] = np.int32(predsTest > 0.5)
sub_df.to_csv("submission.csv", index=False)

In [None]:
#df_original.to_csv("df_original.csv", index=False)
df_train.to_csv("auto_model_reconstructions/train_auto_0.csv", index=False)
df_test.to_csv("auto_model_reconstructions/test_auto_0.csv", index=False)