In [1]:
from __future__ import print_function
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

import seaborn as sns
import gc

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score
from scipy.stats import norm, rankdata

import keras
from keras import regularizers
from keras.layers import Input,Dropout,BatchNormalization,Activation,Add,PReLU, LSTM
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Reshape, Flatten, Conv2D, MaxPooling2D
from keras.optimizers import SGD
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
import tensorflow as tf
import horovod.keras as hvd

Using TensorFlow backend.


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


In [2]:
# reduce memory
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [3]:
# Horovod: initialize Horovod.
hvd.init()

# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
K.set_session(tf.Session(config=config))

In [4]:
train = reduce_mem_usage(pd.read_csv('../input/train.csv'))
test = reduce_mem_usage(pd.read_csv('../input/test.csv'))

Memory usage after optimization is: 78.01 MB
Decreased by 74.7%
Memory usage after optimization is: 77.82 MB
Decreased by 74.6%


In [5]:
features = [f for f in train if f not in ['ID_code','target']]

In [6]:
df_original = pd.concat([train, test],axis=0,sort=False)
df = df_original[features]
target = df_original['target'].values
id = df_original['ID_code']

In [7]:
#for feature in features:
#    df['mean_'+feature] = (train[feature].mean()-train[feature])
#    df['z_'+feature] = (train[feature] - train[feature].mean())/train[feature].std(ddof=0)
#    df['sq_'+feature] = (train[feature])**2
#    df['sqrt_'+feature] = np.abs(train[feature])**(1/2)
#    df['cp_'+feature] = pd.DataFrame(rankdata(train[feature]))
#    df['cnp_'+feature] = pd.DataFrame((norm.cdf(train[feature])))

In [8]:
for df in [df]:
#####Handling Missing Values#####     
    for i in range(len(df.columns)):
        df.iloc[:,i] = (df.iloc[:,i]).fillna(-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [9]:
df.isnull().values.any()
df = reduce_mem_usage(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Memory usage after optimization is: 155.64 MB
Decreased by 0.0%


In [11]:
from scipy.special import erfinv
trafo_columns = [c for c in df.columns if len(df[c].unique()) != 2]
for col in trafo_columns:
    values = sorted(set(df[col]))
    # Because erfinv(1) is inf, we shrink the range into (-0.9, 0.9)
    f = pd.Series(np.linspace(-0.9, 0.9, len(values)), index=values)
    f = np.sqrt(2) * erfinv(f)
    f -= f.mean()
    df[col] = df[col].map(f)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=200)
pca.fit(df[trafo_columns])
df = pca.transform(df[trafo_columns])
df = pd.DataFrame(df)

In [12]:
df['target'] = df_original.target.values
df = reduce_mem_usage(df)
df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Memory usage after optimization is: 156.40 MB
Decreased by 74.6%


Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199,target
0,0.276123,-1.248047,0.492676,0.178345,0.301514,-1.103516,0.08905,0.470459,-1.241211,-0.489014,...,0.264893,1.095703,0.69043,0.287354,-1.189453,1.255859,0.127563,-0.142212,-0.439697,0.0
1,0.544922,-1.052734,0.788086,0.232422,0.491943,1.179688,0.271484,0.289795,1.054688,0.565918,...,0.720703,1.007812,1.376953,-0.075195,1.229492,1.270508,0.197388,0.609375,0.682129,0.0
2,0.245361,-0.890137,0.516602,0.739258,0.126343,-1.097656,0.821289,0.021896,-1.241211,-0.388916,...,0.89209,0.833008,0.689453,0.625488,1.481445,-1.262695,0.036865,0.174316,0.313965,0.0
3,0.49707,-0.824707,0.111084,0.583984,0.54248,-0.621582,0.354492,0.07074,-1.325195,0.606445,...,0.355225,0.544922,0.640137,0.805664,-0.897949,-0.942871,0.70752,0.571777,-0.994141,0.0
4,0.368164,-0.703613,0.632324,0.469238,0.4729,0.818848,0.391846,0.526855,1.381836,0.387207,...,0.871094,-0.407227,1.296875,-0.51709,-0.96582,0.975586,0.424805,0.574707,-0.98877,0.0


In [13]:
#train = df[df['target'].notnull()]
#target = train['target']
#test = df[df['target'].isnull()]
#trafo_columns = [c for c in train.columns if c not in ['target']]
#train.shape

In [14]:
trafo_columns = [c for c in df.columns if c not in ['target']]

In [15]:
from keras import backend as K
from keras.activations import elu
from keras.layers import Input, Dense, Lambda
from keras.models import Model
from keras.objectives import binary_crossentropy
from keras.callbacks import LearningRateScheduler
from keras import backend as K
from imblearn.keras import balanced_batch_generator
from imblearn.under_sampling import NearMiss, RandomUnderSampler, CondensedNearestNeighbour, AllKNN, InstanceHardnessThreshold
from sklearn.model_selection import KFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

learning_rate = 0.0003
mom = 0.2
dcy = 0.996
nb_folds = 4
nb_epoch = 100
batch_size = 140
encoding_dim =1500
hidden_dim = int(encoding_dim) #i.e. 7
sgd = SGD(lr=learning_rate, momentum=mom, decay=dcy)
#folds = StratifiedKFold(n_splits=nb_folds, shuffle=True, random_state=420)
folds = KFold(n_splits = nb_folds, random_state = 338, shuffle = True)
auto = np.zeros(df[trafo_columns].shape)
layer_output = np.zeros((len(df), 1500)) # change when nn shape changes
#layer_output = np.zeros(df[trafo_columns].shape)
#train_auto = np.zeros(train[trafo_columns].shape)
#test_auto = np.zeros(test[trafo_columns].shape)
predictions = np.zeros(len(df))
#label_cols = ["target"]
#y_split = train[label_cols].values

# Horovod: adjust learning rate based on number of GPUs.
opt = keras.optimizers.SGD(lr=learning_rate, decay=dcy, momentum=mom, nesterov=True)
# Horovod: add Horovod Distributed Optimizer.
opt = hvd.DistributedOptimizer(opt)

cp = ModelCheckpoint(filepath="autoencoder_0.h5",
                               save_best_only=True,
                               verbose=0)

tb = TensorBoard(log_dir='./logs',
                histogram_freq=0,
                write_graph=True,
                write_images=True)

es= EarlyStopping(monitor='val_acc',
                  min_delta=0,
                  patience=20,
                  verbose=1, mode='min')

#for fold_, (trn_idx, val_idx) in enumerate(folds.split(y_split[:,0], y_split[:,0])):
#    print("fold {}".format(fold_))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(df)):
    print("fold {}".format(fold_))

    trn_data = df[trafo_columns].iloc[trn_idx]
    val_data = df[trafo_columns].iloc[val_idx]

    def add_noise(series, noise_level):
        return series * (1 + noise_level * np.random.randn(series.shape[1]))
    
    noisy_trn_data = add_noise(trn_data, 0.04)

    input_dim = noisy_trn_data.shape[1] #num of columns, 30
    input_layer = Input(shape=(input_dim, ))
    encoder = Dense(encoding_dim, activation="tanh", activity_regularizer=regularizers.l1(learning_rate))(input_layer)
    encoder = BatchNormalization()(encoder)
    encoder = Dense(hidden_dim, activation="relu")(encoder)
    encoder = BatchNormalization()(encoder)
    decoder = Dense(hidden_dim, activation='relu')(encoder)
    decoder = Dense(input_dim, activation='tanh')(decoder)
    autoencoder = Model(inputs=input_layer, outputs=decoder)
    autoencoder.summary()
    
    autoencoder.compile(metrics=['accuracy'],
                        loss='mean_squared_error',
                        optimizer='sgd')

    cp = ModelCheckpoint(filepath="autoencoder_fraud.h5",
                                   save_best_only=True,
                                   verbose=0)

    tb = TensorBoard(log_dir='./logs',
                    histogram_freq=0,
                    write_graph=True,
                    write_images=True)
    
    es= EarlyStopping(monitor='val_acc',
                  min_delta=0,
                  patience=10,
                  verbose=1, mode='auto')

    history = autoencoder.fit(noisy_trn_data, trn_data,
                        epochs=nb_epoch,
                        batch_size=batch_size,
                        shuffle=True,
                        validation_data=(val_data, val_data),
                        verbose=1,
                        callbacks=[cp, tb, es]).history
    
    
    auto[val_idx] += autoencoder.predict(df.iloc[val_idx][trafo_columns], verbose=1)
    mse = autoencoder.predict(df[trafo_columns] / folds.n_splits, verbose=1)
    predictions += np.mean(np.power(df[trafo_columns] - mse, 2), axis=1)
    # we build a new model with the activations of the old model
    # this model is truncated after the first layer
    get_1st_layer_output = K.function([autoencoder.layers[0].input],
                                  [autoencoder.layers[1].output])
    layer_output[val_idx] += pd.DataFrame(np.concatenate(get_1st_layer_output([df.iloc[val_idx][trafo_columns]])))
    
auto_final = pd.DataFrame(auto / folds.n_splits)
hidden = pd.DataFrame(layer_output / folds.n_splits)

fold 0
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1500)              301500    
_________________________________________________________________
batch_normalization_1 (Batch (None, 1500)              6000      
_________________________________________________________________
dense_2 (Dense)              (None, 1500)              2251500   
_________________________________________________________________
batch_normalization_2 (Batch (None, 1500)              6000      
_________________________________________________________________
dense_3 (Dense)              (None, 1500)              2251500   
_________________________________________________________________
dense_4 (Dense)              (None, 200)               300200    
Tot

In [16]:
#hidden
hidden['target'] = target
hidden['ID_code'] = id.values
hidden.head(5)
#final
auto_final['target'] = target
auto_final['ID_code'] = id.values
auto_final.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,192,193,194,195,196,197,198,199,target,ID_code
0,0.037098,-0.246259,0.161921,0.113324,0.109084,-0.227911,0.037021,0.170681,-0.204606,-0.160791,...,0.23347,0.127236,0.102966,-0.207186,0.208276,0.066652,-0.016748,-0.201666,0.0,train_0
1,0.049664,-0.236835,0.141943,0.095345,0.099269,0.18861,0.047097,0.006559,0.243091,0.164853,...,0.241266,0.237709,-0.021244,0.240099,0.229344,0.072282,0.083841,0.192511,0.0,train_1
2,0.11584,-0.209504,0.185729,0.17269,0.040462,-0.242156,0.173452,-0.095028,-0.18904,-0.041403,...,0.239066,0.208457,0.174665,0.246209,-0.241916,0.02983,-0.000666,0.056036,0.0,train_2
3,0.147483,-0.206408,0.021531,0.128069,0.142741,-0.172584,0.084944,0.025649,-0.22332,0.156077,...,0.177676,0.200196,0.172018,-0.199926,-0.2229,0.133089,0.133401,-0.229188,0.0,train_3
4,0.105962,-0.22293,0.163316,0.110949,0.146369,0.101697,0.109621,0.110542,0.244619,0.073885,...,-0.095144,0.230949,-0.115674,-0.20846,0.2236,0.136776,0.172106,-0.239442,0.0,train_4


In [17]:
%whos DataFrame

Variable         Type         Data/Info
---------------------------------------
auto_final       DataFrame                   0         <...>00000 rows x 202 columns]
df               DataFrame               var_0     var_<...>00000 rows x 201 columns]
df_original      DataFrame                ID_code  targ<...>00000 rows x 202 columns]
hidden           DataFrame                   0         <...>0000 rows x 1502 columns]
noisy_trn_data   DataFrame               var_0     var_<...>00000 rows x 200 columns]
test             DataFrame                ID_code      <...>00000 rows x 201 columns]
train            DataFrame                 ID_code  tar<...>00000 rows x 202 columns]
trn_data         DataFrame               var_0     var_<...>00000 rows x 200 columns]
val_data         DataFrame               var_0     var_<...>00000 rows x 200 columns]


In [18]:
del df, noisy_trn_data, test, train, trn_data, val_data
gc.collect()

172

In [19]:
%whos DataFrame

Variable      Type         Data/Info
------------------------------------
auto_final    DataFrame                   0         <...>00000 rows x 202 columns]
df_original   DataFrame                ID_code  targ<...>00000 rows x 202 columns]
hidden        DataFrame                   0         <...>0000 rows x 1502 columns]


In [22]:
#hidden
ae_columns = [c for c in hidden.columns if c not in ['ID_code', 'target']]
df_train = hidden[hidden['target'].notnull()]
target = df_train['target']
df_test = hidden[hidden['target'].isnull()]
df_train.shape

#final
#ae_columns = [c for c in auto_final.columns if c not in ['ID_code', 'target']]
#df_train = auto_final[auto_final['target'].notnull()]
#target = df_train['target']
#df_test = auto_final[auto_final['target'].isnull()]
#df_train.shape


(200000, 1502)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import PredefinedSplit
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4590)
oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))
feature_importance_df = pd.DataFrame()
label_cols = ["target"]
y_split = df_train[label_cols].values

param_test ={'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}


for fold_, (trn_idx, val_idx) in enumerate(folds.split(y_split[:,0], y_split[:,0])):
    print("fold {}".format(fold_))

    train_x, train_y = df_train[ae_columns].iloc[trn_idx], df_train['target'].iloc[trn_idx]
#    trn_data, trn_y = df_train[ae_columns].iloc[trn_idx], df_train['target'].iloc[trn_idx]
    val_x, val_y = df_train[ae_columns].iloc[val_idx], df_train['target'].iloc[val_idx]

#    classes=[]
#    for i in np.unique(trn_y):
#        classes.append(i)
#        print("Before OverSampling, counts of label " + str(i) + ": {}".format(sum(trn_y==i)))

#    sm=SMOTE(random_state=2)
#    train_x, train_y = sm.fit_sample(trn_data, trn_y.ravel())

#    print('After OverSampling, the shape of train_X: {}'.format(train_x.shape))
#    print('After OverSampling, the shape of train_y: {} \n'.format(train_y.shape))

#    for eachClass in classes:
#        print("After OverSampling, counts of label " + str(eachClass) + ": {}".format(sum(train_y==eachClass)))  
    
    num_round = 20000 
    train_data = pd.DataFrame(train_x)
    train_y = pd.DataFrame(train_y)
    train = lgb.Dataset(train_x, label=train_y)#, categorical_feature=categorical_feats)
    val = lgb.Dataset(val_x, label=val_y)#, categorical_feature=categorical_feats)
    
    # Create parameters to search
    fit_params={"early_stopping_rounds":100, 
                "eval_metric" : 'auc', 
                'eval_set' : [(val_x,val_y)],
                'eval_names': ['valid'],
                #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
                'verbose': 100}

    clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True, metric='None', n_jobs=-1, device = 'gpu', n_estimators=5000)
    gs = RandomizedSearchCV(
        estimator=clf, param_distributions=param_test, 
        n_iter=num_round,
        scoring='roc_auc',
        cv=3,
        refit=True,
        random_state=314,
        verbose=True)
    
    gs.fit(train_data, train_y, **fit_params)
    oof[val_idx] = gs.predict(df_train.iloc[val_idx][ae_columns], num_iteration=clf.best_iteration)

    
    train_prediction = gs.predict(df_train[ae_columns] / folds.n_splits)
    predictions += gs.predict(df_test[ae_columns] / folds.n_splits)
    print("BEST PARAMETERS: " + str(gs.best_params_))
    print("BEST CV SCORE: " + str(gs.best_score_))

np.sqrt(mean_squared_error(oof, target))

fold 0
Fitting 3 folds for each of 20000 candidates, totalling 60000 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 100 rounds.
[100]	valid's auc: 0.782865
[200]	valid's auc: 0.810072
[300]	valid's auc: 0.821876
[400]	valid's auc: 0.827531
[500]	valid's auc: 0.830813
[600]	valid's auc: 0.832416
[700]	valid's auc: 0.833096
[800]	valid's auc: 0.833342
[900]	valid's auc: 0.833351
[1000]	valid's auc: 0.833519
[1100]	valid's auc: 0.833789
[1200]	valid's auc: 0.834354
[1300]	valid's auc: 0.834716
[1400]	valid's auc: 0.834623
Early stopping, best iteration is:
[1325]	valid's auc: 0.834793


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 100 rounds.


In [None]:
sub_df = pd.DataFrame({"ID_code":df_test["ID_code"].values})
sub_df["target"] = predictions
sub_df.to_csv("submission.csv", index=False)

In [None]:
#df_original.to_csv("df_original.csv", index=False)
df_train.to_csv("auto_model_reconstructions/train_auto_0.csv", index=False)
df_test.to_csv("auto_model_reconstructions/test_auto_0.csv", index=False)