In [None]:
# https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/discussion/44629
# https://www.kaggle.com/rspadim/simple-denoise-autoencoder-with-keras
import numpy as np
import pandas as pd
import re
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set(font_scale=1.56)

import gc
import os, sys, random, math

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score, confusion_matrix
from tqdm.autonotebook import tqdm

import keras
import tensorflow as tf
import keras.backend as K

from keras import layers
from keras.models import Model
from keras.layers import Dense, Input, Dropout, BatchNormalization, Activation
from keras.utils.generic_utils import get_custom_objects
from keras.optimizers import Adam, Nadam
from keras.callbacks import Callback

from scipy.stats import rankdata, spearmanr
from scipy.special import erfinv

In [None]:
def seed_everything(seed=0):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, exports
    ``PYTHONHASHSEED`` and calls ``tf.set_random_seed``.

    Parameters
    ----------
    seed : int, default 0
        Value handed to every generator.
    """
    seed_text = str(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = seed_text
    np.random.seed(seed)
    tf.set_random_seed(seed)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    The frame is modified in place and also returned.

    Integer columns are tried against int8, int16, int32 and int64; every
    other numeric column against float16 and float32, falling back to float64.
    Bounds are inclusive, so a column whose extreme value sits exactly on a
    dtype limit (e.g. 127 for int8) still gets the narrower type.

    Note that the float16 / float32 choice looks only at the value range, not
    at precision: float16 keeps roughly three significant decimal digits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # An empty column yields NaN bounds; every comparison is then
            # False and the column is left as it is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN bounds.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
%%time

# Read the raw train / test CSV files and report their shapes.
print('Loading data...')

temp_train = pd.read_csv('E-Sun_Credit_Card_Fraud_Data/train.csv')
print('\tSuccessfully loaded train!')

temp_test = pd.read_csv('E-Sun_Credit_Card_Fraud_Data/test.csv')
print('\tSuccessfully loaded test!')

print('Data was successfully loaded!\n')

print("Train shape : {}".format(temp_train.shape))
print("Test shape  : {}".format(temp_test.shape))

In [None]:
# Apply the same clean-up to both frames:
#   * 'loctm' is integer-divided by 10000 and stored as int,
#   * missing 'flbmk' / 'flg_3dsmk' flags become the literal string 'Nan'
#     (mapped to 0 later, next to 'Y' / 'N').
for frame in (temp_train, temp_test):
    frame['loctm'] = (frame['loctm'] // 10000).astype(int)
    for flag_col in ('flbmk', 'flg_3dsmk'):
        frame[flag_col] = frame[flag_col].fillna('Nan')

# The test set has no label; add an empty one so both frames share columns.
temp_test['fraud_ind'] = np.nan

In [None]:
print('Merging test and train')
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0; for two frames it produces the same stacked result.
# reset_index() (without drop=True) keeps the old per-file row labels as an
# 'index' column, which the feature selection further down drops by name.
temp_train = pd.concat([temp_train, temp_test]).reset_index() # merge train and test
del temp_test
print('Done, shape=',np.shape(temp_train))

In [None]:
def rank_gauss(x):
    """Rank-based inverse-erf ("RankGauss") transform of a 1-D array.

    Values are replaced by their rank scaled into (-1, 1), passed through
    ``erfinv`` and mean-centred.

    Tied values share the average of the ranks they span, so equal inputs
    always map to the same output. For input without ties the result equals
    the plain ``argsort().argsort()`` ranking.

    Parameters
    ----------
    x : array-like, 1-D
        Values to transform.

    Returns
    -------
    numpy.ndarray
        Float array of the same length as ``x``; empty input gives an empty
        array.
    """
    values = np.asarray(x)
    N = values.shape[0]
    if N == 0:
        return np.zeros(0, dtype=np.float64)

    # Distinct values come back sorted; `inverse` maps every element to its
    # distinct value and `counts` says how many elements share it.
    _, inverse, counts = np.unique(values, return_inverse=True, return_counts=True)
    inverse = inverse.reshape(-1)
    last_rank = np.cumsum(counts) - 1          # highest 0-based rank in each tie group
    first_rank = last_rank - counts + 1        # lowest 0-based rank in each tie group
    mean_rank = (first_rank + last_rank) / 2.0

    rank_x = mean_rank[inverse] / N
    rank_x -= rank_x.mean()
    rank_x *= 2                                # now strictly inside (-1, 1)
    efi_x = erfinv(rank_x)
    efi_x -= efi_x.mean()
    return efi_x

In [None]:
from sklearn.preprocessing import MinMaxScaler
from scipy.special import erfinv
# Columns treated as categorical codes: replaced by their rank_gauss transform.
categorical_features = [
                        'cano', 'bacno', 'acqic', 'mchno', 'mcc', 'stocn','scity','csmcu',
                        'contp', 'etymd', 'hcefg', 'iterm', 'stscd',
                        'loctm', 'locdt'
                       ]
#for col in categorical_features:
#    min_max_scaler = MinMaxScaler()
#    temp_train[col] = min_max_scaler.fit_transform(temp_train[col].values.reshape(-1,1))
for col in categorical_features:
    temp_train[col] = rank_gauss(temp_train[col].values)

# Y/N flags become +1 / -1; the 'Nan' placeholder for missing flags becomes 0.
# Any other value is left unmapped by Series.map and turns into NaN.
binary_features = [
                   'ecfg','flbmk','flg_3dsmk','insfg','ovrlt',
                  ]
for col in binary_features:
    temp_train[col] = temp_train[col].map({'Y':1, 'N':-1, 'Nan':0})

# Continuous columns: optional log1p for large non-negative values, then
# standardisation.
continuous_features = ['conam',]
for column in continuous_features:
    scaler = StandardScaler()
    if temp_train[column].max() > 100 and temp_train[column].min() >= 0:
        temp_train[column] = np.log1p(temp_train[column])
    # Fit on the column being scaled. The scaler used to be fitted on `col`,
    # the leftover loop variable of the binary-feature loop above, so the
    # column was standardised with another column's mean and variance.
    column_values = temp_train[column].values.reshape(-1,1)
    temp_train[column] = scaler.fit_transform(column_values).ravel()

In [None]:
# Preview the first rows of the merged frame after the feature transforms.
temp_train.head()

In [None]:
import threading
class ReadWriteLock:
    """Readers/writer lock built on a single ``threading.Condition``.

    Any number of readers may hold the lock at once. A writer takes the
    underlying lock, waits until the reader count drops to zero and keeps the
    lock until ``release_write``; while it holds it, ``acquire_read`` and
    ``release_read`` block.
    """

    def __init__(self):
        self._read_ready = threading.Condition(threading.Lock())
        self._readers = 0   # number of readers currently inside

    def acquire_read(self):
        """Register one more active reader."""
        with self._read_ready:
            self._readers += 1

    def release_read(self):
        """Unregister a reader and wake waiting writers once none remain."""
        with self._read_ready:
            self._readers -= 1
            if not self._readers:
                # notify_all() is the supported spelling; the camelCase alias
                # notifyAll() is deprecated.
                self._read_ready.notify_all()

    def acquire_write(self):
        """Take the lock for writing, blocking until no reader is active.

        The underlying lock stays held on return; pair every call with
        ``release_write``.
        """
        self._read_ready.acquire()
        while self._readers > 0:
            # wait() releases the lock while sleeping and re-acquires it
            # before returning.
            self._read_ready.wait()

    def release_write(self):
        """Release the lock taken by ``acquire_write``."""
        self._read_ready.release()

In [None]:
from math import ceil
from keras.utils import Sequence
class DAESequence(Sequence):
    """Keras ``Sequence`` yielding ``(noisy_batch, clean_batch)`` pairs for a denoising autoencoder.

    Noise is "swap noise": in the first ``rows_to_sample`` rows of a batch, a
    random set of columns is overwritten with the values the same columns
    have in randomly chosen rows outside the batch.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; its values are copied into an ndarray.
    batch_size : int, default 128
        Rows per batch (the last batch may be shorter).
    random_cols : float, default 0.15
        Fraction of columns to corrupt per batch. ``<= 0`` disables noise,
        ``>= 1`` selects as many column draws as there are columns.
    random_rows : float, default 1
        Fraction of each batch's rows that receive noise; ``<= 0`` disables
        noise, values above 1 are treated as 1.
    use_cache : bool, default False
        Pre-compute every batch at construction and at each epoch end.
    use_lock : bool, default False
        Guard cache rebuilds and cache reads with a ``ReadWriteLock``.
    verbose : bool, default True
        Print progress messages.
    """

    def __init__(self, df, batch_size=128, random_cols=.15, random_rows=1, use_cache=False, use_lock=False, verbose=True): #batch_size=128
        self.df = df.values.copy()     # ndarray baby
        self.batch_size = int(batch_size)
        self.len_data = df.shape[0]
        self.len_input_columns = df.shape[1]
        # Translate the column fraction into a number of column draws,
        # clamped to [0, number of columns].
        if random_cols <= 0:
            self.random_cols = 0
        elif random_cols >= 1:
            self.random_cols = self.len_input_columns
        else:
            self.random_cols = min(int(random_cols*self.len_input_columns), self.len_input_columns)
        self.random_rows = random_rows
        self.cache = None
        self.use_cache = use_cache
        self.use_lock = use_lock
        self.verbose = verbose

        self.lock = ReadWriteLock()
        self.on_epoch_end()

    def on_epoch_end(self):
        """Rebuild the batch cache (no-op unless ``use_cache`` is set)."""
        if not self.use_cache:
            return
        if self.use_lock:
            self.lock.acquire_write()
        try:
            if self.verbose:
                print("Doing Cache")
            self.cache = {}
            for i in range(0, self.__len__()):
                self.cache[i] = self.__getitem__(i, True)
        finally:
            # Always hand the write lock back, even if a batch failed.
            if self.use_lock:
                self.lock.release_write()
        gc.collect()
        if self.verbose:
            print("Done")

    def __len__(self):
        """Number of batches per epoch."""
        return int(ceil(self.len_data / float(self.batch_size)))

    def __getitem__(self, idx, doing_cache=False):
        """Return batch number ``idx`` as ``(noisy_x, clean_x)``.

        ``doing_cache=True`` is used internally while the cache is being
        filled; it bypasses the cache lookup and the progress print.
        """
        noise_enabled = self.random_cols > 0 and self.random_rows > 0
        if not doing_cache and self.cache is not None and noise_enabled:
            if idx in self.cache:
                if self.use_lock:
                    self.lock.acquire_read()
                try:
                    ret0, ret1 = self.cache[idx][0], self.cache[idx][1]
                finally:
                    if self.use_lock:
                        self.lock.release_read()
                if self.verbose:
                    print('DAESequence Cache ', idx)
                return ret0, ret1

        # `idx` is a batch number, so the batch starts at idx * batch_size.
        # Using `idx` itself as the row offset made consecutive batches
        # overlap and left most of the data unused.
        start = idx * self.batch_size
        stop = min(start + self.batch_size, self.len_data)
        cur_len = stop - start
        rows_to_sample = min(int(self.random_rows * cur_len), cur_len)
        input_x = self.df[start:stop]
        if not noise_enabled or rows_to_sample <= 0:
            return input_x, input_x

        # Draw donor rows from everything outside the current batch: sample
        # positions in the data with the batch removed, then shift the ones
        # at or past the batch start over the batch.
        rows_outside = self.len_data - cur_len
        if rows_outside > 0:
            donor_rows = np.random.randint(low=0, high=rows_outside, size=rows_to_sample)
            donor_rows[donor_rows >= start] += cur_len # just to don't select twice the current rows
        else:
            # The batch is the whole data set; there is nothing to exclude.
            donor_rows = np.random.randint(low=0, high=self.len_data, size=rows_to_sample)
        cols_to_shuffle = np.random.randint(low=0, high=self.len_input_columns, size=self.random_cols)
        noise_x = input_x.copy()
        noise_x[0:rows_to_sample, cols_to_shuffle] = self.df[donor_rows[:,None], cols_to_shuffle]
        if not doing_cache and self.verbose:
            print('DAESequence ', idx)
        return noise_x, input_x

In [None]:
from keras.models import Sequential
print("Create Model")
# Feature matrix: everything except the row ids and the target.
dae_data = temp_train[temp_train.columns.drop(['index','txkey','fraud_ind'])] # only get "X" vector

len_data, len_input_columns = dae_data.shape[0], dae_data.shape[1]
NUM_GPUS=1

# Symmetric uniform init ranges: sqrt(1/n) for the first layer and
# sqrt(1/(10*n)) for the others, n being the number of input columns.
init_bound_0 = math.sqrt(1.0/(len_input_columns))
init_bound_1 = math.sqrt(1.0/(len_input_columns*10))
kernel_initializer_1 = keras.initializers.RandomUniform(-init_bound_1, init_bound_1)
kernel_initializer_0 = keras.initializers.RandomUniform(-init_bound_0, init_bound_0)

print("Input len=", len_input_columns, len_data)
# Three ReLU hidden layers, each 5x the input width, and a linear output of
# the input width.
model_dae = Sequential()
model_dae.add(Dense(units=len_input_columns*5, activation='relu', name='Hidden1', input_shape=(len_input_columns,), kernel_initializer=kernel_initializer_0))
for hidden_name in ('Hidden2', 'Hidden3'):
    model_dae.add(Dense(units=len_input_columns*5, activation='relu', name=hidden_name, kernel_initializer=kernel_initializer_1))
model_dae.add(Dense(units=len_input_columns, activation='linear', name='Output', kernel_initializer=kernel_initializer_1))
model_opt = keras.optimizers.SGD(lr=0.003, decay=1-0.995, momentum=0, nesterov=False)

# Best effort: a previously saved model, if loadable, replaces the fresh one.
try:
    print('Loading model from file')
    model_dae = keras.models.load_model('DAE.keras.model.h5')
except Exception as e:
    print("Can't load previous fitting parameters and model", repr(e))

if NUM_GPUS <= 1:
    model_dae.compile(loss='mean_squared_error', optimizer=model_opt)
else:
    try:
        multi_gpu_model = keras.utils.multi_gpu_model(model_dae, gpus=NUM_GPUS)
        multi_gpu_model.compile(loss='mean_squared_error', optimizer=model_opt) #model_opt Nadam()
        print("MULTI GPU MODEL")
        print(multi_gpu_model.summary())
    except Exception as e:
        # Fall back to the plain model; NUM_GPUS=0 routes the later cells
        # to the single-device branch.
        print("Can't run multi gpu, error=", repr(e))
        model_dae.compile(loss='mean_squared_error', optimizer=model_opt)
        NUM_GPUS=0

print("BASE MODEL")
print(model_dae.summary())

In [None]:
from math import ceil
batch_size = 128
epochs = 1000
multi_process_workers = 1

# Choose the model to fit and its effective batch size once, so the
# fit_generator call is not duplicated per branch.
if (NUM_GPUS > 1):
    fit_model, fit_batch_size = multi_gpu_model, batch_size*NUM_GPUS
else: # single CPU/GPU
    fit_model, fit_batch_size = model_dae, batch_size

fit_model.fit_generator(
    DAESequence(dae_data, batch_size=fit_batch_size, verbose=False),
    steps_per_epoch=int(ceil(dae_data.shape[0]/fit_batch_size)),
    epochs=epochs,
    workers=multi_process_workers, use_multiprocessing=True if multi_process_workers>1 else False,
    verbose=1,
    callbacks=[
        # keras.callbacks.LambdaCallback(on_epoch_end=lambda x,y: model_dae.save('DAE.keras.model.h5')) # save weights
    ])

#model_dae.save('DAE.keras.model.h5') # save weights
# get_weights() returns tensors of different shapes (2-D kernels, 1-D
# biases). Flatten each one so hist receives a plain sequence of 1-D
# datasets, one histogram series per weight tensor.
plt.hist([weights.ravel() for weights in model_dae.get_weights()], bins = 100)
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
# Reconstruct the clean inputs with whichever model was compiled for the
# current GPU setting and report the reconstruction error.
denoiser = multi_gpu_model if NUM_GPUS > 1 else model_dae
dae_denoised_data = denoiser.predict(dae_data)

print("DAE MSE from train data: ", mean_squared_error(dae_data, dae_denoised_data))

In [None]:
# Collect the ids/target plus one frame of activations per hidden layer and
# join them in a single concat. Creating each 'Hidden_*' column one at a time
# (as integer zeros, later overwritten with floats) fragments the frame and
# forces a dtype change on every column.
hidden_width = len_input_columns*5
hidden_frames = [temp_train[['txkey','fraud_ind']].copy()]

for i in ['1','2','3']:
    print('Hidden layer',i)
    columns_names = ['Hidden_'+str(i)+'_'+str(l) for l in range(0, hidden_width)]
    # Sub-model that stops at the requested hidden layer.
    intermediate_layer_model = Model(inputs=model_dae.input, outputs=model_dae.get_layer('Hidden' + i).output)
    hidden_frames.append(
        pd.DataFrame(intermediate_layer_model.predict(dae_data),
                     columns=columns_names,
                     index=temp_train.index))

DAE_new_df = pd.concat(hidden_frames, axis=1)
del hidden_frames

print('DONE!')
print(DAE_new_df)

In [None]:
%%time
# reduce_mem_usage downcasts the frame it is given in place and returns that
# same object, so DAE_new_df is already shrunk when it is written out.
df = reduce_mem_usage(DAE_new_df)

DAE_new_df.to_csv("DAE-hidden-features_new.csv")