In [None]:
import numpy as np
import pandas as pd
import re
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set(font_scale=1.56)

import gc
import os, sys, random, math

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score, confusion_matrix
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold, train_test_split
from tqdm.autonotebook import tqdm
import itertools

import keras
import tensorflow as tf
import keras.backend as K

from keras import layers
from keras.models import Model
from keras.layers import Dense, Input, Dropout, BatchNormalization, Activation
from keras.utils.generic_utils import get_custom_objects
from keras.optimizers import Adam, Nadam
from keras.callbacks import Callback

from scipy.stats import rankdata, spearmanr

In [None]:
def seed_everything(seed=0):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and,
    when TensorFlow has already been imported, TensorFlow's global seed, so
    that Keras weight initialisation and dropout follow the seed as well.
    ``PYTHONHASHSEED`` is exported too; it only influences interpreters
    started after this call.

    Parameters
    ----------
    seed : int, default 0
        Seed value handed to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

    # Look TensorFlow up in sys.modules instead of importing it: the notebook
    # imports it at the top, and this keeps the helper usable without it.
    tf_module = sys.modules.get('tensorflow')
    if tf_module is None:
        return
    tf_random = getattr(tf_module, 'random', None)
    if tf_random is not None and hasattr(tf_random, 'set_seed'):
        tf_random.set_seed(seed)            # TensorFlow 2.x API
    elif hasattr(tf_module, 'set_random_seed'):
        tf_module.set_random_seed(seed)     # TensorFlow 1.x API
    
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that fits.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive). Float columns
    are cast to float16 or float32 when min and max lie strictly inside that
    type's finite range, otherwise to float64.

    NOTE(review): the float rule looks only at the value range, not at
    precision, so casting to float16 can round values noticeably.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.
    verbose : bool, default True
        When true, print the memory usage before and after.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage(deep=True).sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: the limits themselves are representable.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached when min/max are NaN (every comparison is False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage(deep=True).sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a frame reporting zero memory does not yield NaN.
        percent = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
        print('Mem. usage decreased from {:5.2f} Mb to {:5.2f} Mb ({:.1f}% reduction)'.format(start_mem, end_mem, percent))
    return df

In [None]:
%%time

# Read the raw train and test CSVs from the competition data folder
# (paths are relative to the notebook's working directory).
print('Loading data...')

# The default integer index is kept; using txkey as the index was tried and left disabled.
train = pd.read_csv('E-Sun_Credit_Card_Fraud_Data/train.csv')#, index_col='txkey')
print('\tSuccessfully loaded train!')

test = pd.read_csv('E-Sun_Credit_Card_Fraud_Data/test.csv')#, index_col='txkey')
print('\tSuccessfully loaded test!')

print('Data was successfully loaded!\n')

In [None]:
def date_time_split(dataframe):
    """Add coarse date/time bucket columns to ``dataframe`` in place.

    New columns:

    * ``locdt_M`` - bucket 1..4 for ``locdt`` in 1-31, 32-61, 62-92, 93-120.
    * ``locdt_W`` - 7-day bucket 1..18 for ``locdt`` in 1..120.
    * ``loct_hour`` - ``loctm // 10000`` cast to int
      (presumably the hour of an HHMMSS value - TODO confirm).

    ``locdt`` values outside 1..120 have no entry in the lookup tables and
    therefore map to NaN.

    Returns the same ``dataframe`` object.
    """
    last_day = 120

    # Last locdt value that belongs to each of the four month-like buckets.
    bucket_last_days = [31, 61, 92, last_day]
    month_of_day = {}
    first_day = 1
    for bucket, bucket_end in enumerate(bucket_last_days, start=1):
        for day in range(first_day, bucket_end + 1):
            month_of_day[day] = bucket
        first_day = bucket_end + 1

    # Consecutive 7-day windows: days 1-7 -> 1, 8-14 -> 2, ..., day 120 -> 18.
    week_of_day = {day: (day - 1) // 7 + 1 for day in range(1, last_day + 1)}

    dataframe['locdt_M'] = dataframe['locdt'].map(month_of_day)
    dataframe['locdt_W'] = dataframe['locdt'].map(week_of_day)

    hour = dataframe['loctm'] // 10000
    dataframe['loct_hour'] = hour.astype(int)
    gc.collect()

    return dataframe

# Add the date/time bucket columns to both frames (each is modified in place).
train, test = date_time_split(train), date_time_split(test)

In [None]:
# Y/N flags become +1/-1; a missing flbmk/flg_3dsmk value is first given the
# explicit 'Nan' label so that it maps to 0 instead of staying NaN.
flag_codes = {'Y':1, 'N':-1, 'Nan':0}
binary_features = [
                   'ecfg','flbmk','flg_3dsmk','insfg','ovrlt',
                  ]

for frame in (train, test):
    for col in ('flbmk', 'flg_3dsmk'):
        frame[col] = frame[col].fillna('Nan')
    for col in binary_features:
        frame[col] = frame[col].map(flag_codes)
    # Combined transaction signature built from three of the encoded flags.
    frame['txn_info'] = (frame['ecfg'].astype(str)
                         + frame['flg_3dsmk'].astype(str)
                         + frame['flbmk'].astype(str))

cat_columns = ['txn_info']

for col in cat_columns:
    for frame in (train, test):
        frame[col] = frame[col].fillna('unseen_before_label').astype(str)

    # Fit on train and test together so both share one label space.
    lbl = LabelEncoder()
    lbl.fit(list(train[col]) + list(test[col]))
    train[col] = lbl.transform(train[col])
    test[col] = lbl.transform(test[col])

# Every remaining missing value gets the -999 sentinel.
train.fillna(-999, inplace=True)
test.fillna(-999, inplace=True)

In [None]:
# check amount
# For the amount (conam) and the card number (cano): +1 when the row's value
# also occurs in the other data set, -1 otherwise.
for key in ('conam', 'cano'):
    flag = key + '_check'
    train[flag] = np.where(train[key].isin(test[key]), 1, -1)
    test[flag] = np.where(test[key].isin(train[key]), 1, -1)

In [None]:
# Snapshots of the frames as they are now; the counting cells read from these
# while new feature columns are written to train/test.
train_len = len(train)
temp_train = train.copy()
temp_test = test.copy()

# Number of rows (train + test) sharing the same (cano, <col>) pair.
feq_cols = ['mchno','stocn','csmcu','etymd','stscd','txn_info']
for col in feq_cols:
    needed = ['cano', 'conam', col]
    train_test_all = pd.concat([temp_train[needed], temp_test[needed]])

    new_col_name = 'cano_' + col + '_fq'
    pair_groups = train_test_all.groupby(['cano', col])['conam']
    train_test_all[new_col_name] = pair_groups.transform('count')

    # The stacked frame is train rows followed by test rows: split by position.
    stacked = train_test_all[new_col_name]
    train[new_col_name] = stacked.iloc[:train_len].tolist()
    test[new_col_name] = stacked.iloc[train_len:].tolist()

    print("'" + new_col_name + "'", ', ', end='')

In [None]:
# Number of rows (train + test) per (<col>, cano) pair, for the day and hour columns.
feq_cols = ['locdt','loct_hour']
for col in feq_cols:
    needed = ['cano', 'conam', col]
    train_test_all = pd.concat([temp_train[needed], temp_test[needed]])

    new_col_name = col + '_cano_count'
    pair_groups = train_test_all.groupby([col, 'cano'])['conam']
    train_test_all[new_col_name] = pair_groups.transform('count')

    # Train rows come first in the stacked frame, test rows after them.
    stacked = train_test_all[new_col_name]
    train[new_col_name] = stacked.iloc[:train_len].tolist()
    test[new_col_name] = stacked.iloc[train_len:].tolist()

    print("'" + new_col_name + "'", ', ', end='')

In [None]:
# Rows per (week bucket, acquirer, merchant, city, currency) group over train + test,
# bucketed to 1 / 2 / 5 / 10 and stored as strings.
store_keys = ['locdt_W', 'acqic', 'mchno', 'scity', 'csmcu']
store_cols = store_keys + ['conam']
train_test_all = pd.concat([temp_train[store_cols], temp_test[store_cols]])
train_test_all['past_w_same_store'] = train_test_all.groupby(store_keys)['conam'].transform('count')

for frame, rows in ((train, slice(None, train_len)), (test, slice(train_len, None))):
    visits = np.asarray(train_test_all['past_w_same_store'].iloc[rows].tolist())
    # First matching rule wins: >=10 -> 10, 5..9 -> 5, 2..4 -> 2, otherwise unchanged.
    frame['past_w_same_store'] = np.select(
        [visits >= 10, visits >= 5, visits > 1],
        [10, 5, 2],
        default=visits,
    )
    frame['past_w_same_store'] = frame['past_w_same_store'].astype(str)

In [None]:
# Number of rows (train + test) carrying each value of <col>.
feq_cols = ['mchno','stocn','csmcu','etymd','stscd','txn_info']
for col in feq_cols:
    needed = ['cano', col]
    train_test_all = pd.concat([temp_train[needed], temp_test[needed]])

    new_col_name = col + '_cano' + '_fq'
    train_test_all[new_col_name] = train_test_all.groupby([col])['cano'].transform('count')

    # Train rows come first in the stacked frame, test rows after them.
    stacked = train_test_all[new_col_name]
    train[new_col_name] = stacked.iloc[:train_len].tolist()
    test[new_col_name] = stacked.iloc[train_len:].tolist()

    print("'" + new_col_name + "'", ', ', end='')

In [None]:
# Number of rows (train + test) per (locdt, <col>) pair.
feq_cols = ['mchno','stocn','csmcu']
for col in feq_cols:
    needed = ['locdt', 'cano', col]
    train_test_all = pd.concat([temp_train[needed], temp_test[needed]])

    new_col_name = 'day_' + col + '_cano' + '_fq'
    train_test_all[new_col_name] = train_test_all.groupby(['locdt', col])['cano'].transform('count')

    # Train rows come first in the stacked frame, test rows after them.
    stacked = train_test_all[new_col_name]
    train[new_col_name] = stacked.iloc[:train_len].tolist()
    test[new_col_name] = stacked.iloc[train_len:].tolist()

    print("'" + new_col_name + "'", ', ', end='')

In [None]:
def values_normalization(dt_df, periods, columns):
    """Add per-period min-max scaled copies of ``columns`` to ``dt_df`` in place.

    For every ``period`` in ``periods`` and ``col`` in ``columns`` a column
    named ``<col>_<period>_min_max`` is added, holding
    ``(value - group_min) / (group_max - group_min)`` where the min and max
    are taken over all rows sharing the same ``period`` value. ``col`` itself
    is cast to float first.

    Groups whose min equals their max produce NaN (0 / 0).

    The group statistics are computed with ``groupby.transform``, so no
    scratch columns are written to ``dt_df``; existing columns named
    ``temp_min`` / ``temp_max`` are left untouched.

    Parameters
    ----------
    dt_df : pandas.DataFrame
        Frame to extend; modified in place.
    periods : iterable of str
        Names of the grouping columns.
    columns : iterable of str
        Names of the numeric columns to scale.

    Returns
    -------
    pandas.DataFrame
        The same ``dt_df`` object.
    """
    for period in periods:
        for col in columns:
            dt_df[col] = dt_df[col].astype(float)

            per_period = dt_df.groupby(period)[col]
            group_min = per_period.transform('min')
            group_max = per_period.transform('max')

            dt_df[col + '_' + period + '_min_max'] = (dt_df[col] - group_min) / (group_max - group_min)
    return dt_df


# Earliest locdt at which each card (cano) appears anywhere in train + test:
# sort by locdt and keep the first row per card.
train_test_all = (
    pd.concat([temp_train[['cano', 'locdt']], temp_test[['cano', 'locdt']]])
    .reset_index(drop=True)
    .sort_values('locdt')
    .drop_duplicates('cano', keep='first')
    .set_index('cano')
)
cano_date = train_test_all['locdt'].to_dict()

for frame in (train, test):
    frame['first_cano_dt'] = frame['cano'].map(cano_date)

# Min-max scale the first-seen date within each week bucket.
train = values_normalization(train, ['locdt_W'], ['first_cano_dt'])
test = values_normalization(test, ['locdt_W'], ['first_cano_dt'])

# The snapshots are no longer needed.
del temp_train, temp_test, train_test_all

In [None]:
# FREQUENCY ENCODE
def encode_FE(df1, df2, cols):
    """Frequency-encode ``cols``; adds ``<col>_FE`` (float32) to both frames in place.

    The frequency of a value is its share of the non-missing values in
    ``df1`` and ``df2`` stacked together. The value ``-1`` is always encoded
    as ``-1`` regardless of how often it occurs. Prints each new column name.
    """
    for col in cols:
        stacked = pd.concat([df1[col], df2[col]])
        freq = stacked.value_counts(dropna=True, normalize=True).to_dict()
        freq[-1] = -1  # value -1 gets a fixed code instead of a frequency
        nm = col + '_FE'
        for frame in (df1, df2):
            encoded = frame[col].map(freq)
            frame[nm] = encoded.astype('float32')
        print("'" + nm + "'", ', ', end='')
        
# LABEL ENCODE
def encode_LE(col,train=train,test=test,verbose=True):
    """Label-encode ``col`` in place using one code space for ``train`` and ``test``.

    The two columns are stacked and factorized with sorted uniques, then the
    codes are written back over ``col`` in each frame. Codes are stored as
    int32 when the largest code exceeds 32000, otherwise as int16.

    Note: the default ``train``/``test`` frames are bound when this function
    is defined, not when it is called.
    """
    stacked = pd.concat([train[col], test[col]], axis=0)
    codes, _ = stacked.factorize(sort=True)
    code_dtype = 'int32' if codes.max() > 32000 else 'int16'
    n_train = len(train)
    train[col] = codes[:n_train].astype(code_dtype)
    test[col] = codes[n_train:].astype(code_dtype)
    del stacked, codes
    gc.collect()
    if verbose:
        print("'" + col + "'", ', ', end='')

# COMBINE FEATURES
def encode_CB(col1,col2,df1=train,df2=test):
    """Add the label-encoded combination ``<col1>_<col2>`` to ``df1`` and ``df2``.

    The combined column is the string ``"<col1 value>_<col2 value>"``; it is
    then label-encoded across both frames by ``encode_LE``. Prints the new
    column name.

    ``df1``/``df2`` are passed on to ``encode_LE`` explicitly, so the frames
    that receive the combined column are also the ones that get encoded.
    Relying on ``encode_LE``'s defaults would encode the notebook-level
    ``train``/``test`` no matter which frames were handed in.
    """
    nm = col1+'_'+col2
    df1[nm] = df1[col1].astype(str)+'_'+df1[col2].astype(str)
    df2[nm] = df2[col1].astype(str)+'_'+df2[col2].astype(str)
    encode_LE(nm, train=df1, test=df2, verbose=False)
    print("'"+nm+"'",', ',end='')

# GROUP AGGREGATION MEAN AND STD
def encode_AG(main_columns, uids, aggregations=['mean'], train_df=train, test_df=test, 
              fillna=True, usena=False):
    """Add group-aggregate features ``<main>_<uid>_<agg>`` (float32) to both frames.

    For every ``main_column`` / ``uid`` / ``agg_type`` combination the
    aggregate of ``main_column`` per ``uid`` value is computed over
    ``train_df`` and ``test_df`` stacked together and mapped back onto each
    frame. Prints each new column name.

    Parameters
    ----------
    main_columns : list of str
        Columns to aggregate.
    uids : list of str
        Grouping columns.
    aggregations : list of str
        Aggregation names understood by pandas ``groupby.agg``. The default
        list is never mutated here.
    train_df, test_df : pandas.DataFrame
        Frames to extend in place. The defaults are bound when this function
        is defined.
    fillna : bool
        Replace missing aggregate values with -999.
    usena : bool
        Treat the -999 sentinel in ``main_column`` as missing while aggregating.
    """
    for main_column in main_columns:
        for col in uids:
            # The stacked source frame is the same for every aggregation.
            temp_df = pd.concat([train_df[[col, main_column]], test_df[[col, main_column]]])
            if usena:
                # mask() yields NaN without assigning NaN into an integer
                # column through .loc.
                temp_df[main_column] = temp_df[main_column].mask(temp_df[main_column] == -999)
            grouped = temp_df.groupby(col)[main_column]

            for agg_type in aggregations:
                new_col_name = main_column+'_'+col+'_'+agg_type
                mapping = grouped.agg(agg_type).to_dict()

                train_df[new_col_name] = train_df[col].map(mapping).astype('float32')
                test_df[new_col_name]  = test_df[col].map(mapping).astype('float32')

                if fillna:
                    # Assign the result back: an in-place fillna on a column
                    # selection does not update the frame under pandas Copy-on-Write.
                    train_df[new_col_name] = train_df[new_col_name].fillna(-999)
                    test_df[new_col_name] = test_df[new_col_name].fillna(-999)

                print("'"+new_col_name+"'",', ',end='')
# GROUP AGGREGATION NUNIQUE
def encode_AG2(main_columns, uids, train_df=train, test_df=test):
    """Add ``<uid>_<main>_ct``: distinct ``main`` values per ``uid`` over train + test.

    Both frames are extended in place with an integer column, and each new
    column name is printed. The default frames are bound when this function
    is defined.
    """
    for main_column in main_columns:
        for col in uids:
            pair = [col] + [main_column]
            comb = pd.concat([train_df[pair], test_df[pair]], axis=0)
            mp = comb.groupby(col)[main_column].nunique().to_dict()
            out_name = col + '_' + main_column + '_ct'
            for frame in (train_df, test_df):
                frame[out_name] = frame[col].map(mp).astype('int')
            print("'" + out_name + "'", ', ', end='')

# COUNT ENCODE            
def encode_CT(uids,train=train,test=test):
    """Add ``<col>_count_full``: occurrences of each value over train + test.

    Missing values are counted as their own category. Both frames are
    extended in place with an integer column, and each new column name is
    printed. The default frames are bound when this function is defined.
    """
    for col in uids:
        out_name = col + '_count_full'
        # Both frames share one count table built from the stacked column.
        counts = pd.concat([train[col], test[col]]).value_counts(dropna=False)
        train[out_name] = train[col].map(counts).astype('int')
        test[out_name] = test[col].map(counts).astype('int')
        print("'" + out_name + "'", ', ', end='')

def encode_TG(uids,train=train,test=test):
    """Add ``<col>_target_mean``: mean of ``fraud_ind`` per value of ``col``.

    The means are computed on ``train`` only and mapped onto both frames;
    values that never occur in ``train`` map to NaN. Prints each new column
    name.

    NOTE(review): each train row contributes its own label to the mean it is
    then encoded with - confirm this leakage is acceptable.
    """
    for col in uids:
        out_name = col + '_target_mean'
        target_mean = train.groupby(col)['fraud_ind'].mean().to_dict()

        train[out_name] = train[col].map(target_mean)
        test[out_name] = test[col].map(target_mean)
        print("'" + out_name + "'", ', ', end='')

In [None]:
# Mean and std of the amount (conam) per card / account / merchant-level id;
# usena=True makes -999 count as missing while aggregating.
encode_AG(['conam'],['cano', 'bacno', 'mcc', 'mchno', 'acqic',],['mean','std'],usena=True)

# Frequency encodings of the id-like columns ...
encode_FE(train,test,['cano', 'bacno', 'mcc', 'mchno', 'acqic',])

# ... and of the remaining categorical columns.
encode_FE(train,test,['etymd', 'stscd', 'txn_info', 'contp', 'hcefg', 'iterm',])

# Mean of the target per category value, computed on train.
encode_TG(['etymd', 'stscd', 'txn_info'],train,test)

# Pairwise combined categories; each call adds a label-encoded '<a>_<b>' column
# that the count encoding below reads.
encode_CB('mcc','mchno')
encode_CB('scity','stocn')
encode_CB('mchno','stocn')
#encode_FE(train,test,['mcc_mchno','scity_stocn','mchno_stocn'])
encode_CT(['mcc_mchno','scity_stocn','mchno_stocn'],train,test,)

# Two more combinations, frequency-encoded instead of count-encoded.
encode_CB('csmcu','stscd')
encode_CB('acqic','etymd')
encode_FE(train,test,['acqic_etymd','csmcu_stscd'])
#encode_CT(['acqic_etymd','csmcu_stscd'],train,test,)

# Combinations kept as label-encoded columns only.
encode_CB('stscd','etymd')
encode_CB('txn_info','etymd')

In [None]:
# uid  = card number + account number; uid2 = uid + merchant number.
# Both are string keys used only for grouping.
train['uid'] = train['cano'].astype(str)+'_'+train['bacno'].astype(str)
test['uid'] = test['cano'].astype(str)+'_'+test['bacno'].astype(str)

train['uid2'] = train['uid'].astype(str)+'_'+train['mchno'].astype(str)
test['uid2'] = test['uid'].astype(str)+'_'+test['mchno'].astype(str)

# Per-uid features: frequency of the uid and amount statistics.
encode_FE(train,test,['uid'])
encode_AG(['conam'],['uid'],['mean','std'],usena=True)

# Per-uid statistics of the label-encoded combination / category columns,
# plus the number of distinct values seen per uid.
encode_AG(['mcc_mchno','scity_stocn','mchno_stocn'],['uid'],['mean','std'],usena=True)
encode_AG(['etymd','stscd','txn_info'],['uid'],['mean'],fillna=True,usena=True)
encode_AG(['stscd_etymd','txn_info_etymd'],['uid'],['std'],fillna=True,usena=True)
encode_AG2(['conam', 'scity_stocn', 'mcc_mchno', 'mchno_stocn'], ['uid'], train_df=train, test_df=test)

# The same kinds of features at the finer uid2 (uid + merchant) level.
encode_FE(train,test,['uid2'])
encode_AG(['conam'],['uid2'],['mean','std'],usena=True)

encode_AG(['acqic_etymd','csmcu_stscd'],['uid2'],['mean','std'],usena=True)
encode_AG2(['acqic_etymd', 'csmcu_stscd'], ['uid2'], train_df=train, test_df=test)

In [None]:
%%time
from sklearn.decomposition import PCA, FastICA, SparsePCA, KernelPCA, TruncatedSVD

# Columns fed to the decompositions and the number of components kept from each.
cat_cols = ['cano', 'bacno', 'mcc', 'mchno', 'acqic', 'stocn', 'scity', 'csmcu',
            'etymd', 'stscd', 'txn_info', 'contp', 'hcefg', 'iterm', 'ovrlt']
n_comp = 5

temp_train = train[cat_cols].copy()
temp_test = test[cat_cols].copy()

# Each decomposition is fitted on the train rows only and then applied to test.
# ICA
ica = FastICA(n_components=n_comp, random_state=42)
ica_results_train = ica.fit_transform(temp_train)
ica_results_test = ica.transform(temp_test)

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=42)
tsvd_results_train = tsvd.fit_transform(temp_train)
tsvd_results_test = tsvd.transform(temp_test)

# SparsePCA
spca = SparsePCA(n_components=n_comp, random_state=42)
spca_results_train = spca.fit_transform(temp_train)
spca_results_test = spca.transform(temp_test)

# Component i of every method is stored as '<method>_<i>' (1-based); the columns
# are appended interleaved: tSVD_i, ica_i, spca_i for each i.
for i in range(1, n_comp + 1):

    train['tSVD_' + str(i)] = tsvd_results_train[:, i - 1]
    test['tSVD_' + str(i)] = tsvd_results_test[:, i - 1]

    train['ica_' + str(i)] = ica_results_train[:, i - 1]
    test['ica_' + str(i)] = ica_results_test[:, i - 1]
    
    train['spca_' + str(i)] = spca_results_train[:, i - 1]
    test['spca_' + str(i)] = spca_results_test[:, i - 1]

# Drop the intermediates; the fitted estimators (ica, tsvd, spca) stay defined.
del temp_train, temp_test, ica_results_train, ica_results_test, tsvd_results_train, tsvd_results_test, spca_results_train, spca_results_test
gc.collect()

In [None]:
# reduce_mem_usage downcasts the frame it is given in place and returns that
# same object, so the return value does not need to be kept.
for df in (train, test):
    reduce_mem_usage(df)
gc.collect()

In [None]:
%%time
# Load the precomputed DAE hidden features and downcast them in place.
DAE_new_df = pd.read_csv('DAE-hidden-features_new.csv')
print('\tSuccessfully loaded DAE!')

df = reduce_mem_usage(DAE_new_df)

DAE_new_df.head()

In [None]:
# The first train_len rows of the DAE table belong to train, the rest to test.
# The label and the saved index column are dropped from both parts.
dae_drop_cols = ['fraud_ind', 'Unnamed: 0']
train_dae = DAE_new_df.iloc[:train_len].drop(columns=dae_drop_cols)
test_dae = DAE_new_df.iloc[train_len:].drop(columns=dae_drop_cols)
del DAE_new_df
gc.collect()

In [None]:
# Attach the DAE features by transaction key; a left join keeps every train/test row.
train = train.merge(train_dae, on='txkey', how='left')
test = test.merge(test_dae, on='txkey', how='left')
del train_dae, test_dae
gc.collect()

In [None]:
# Report the frame shapes after feature engineering.
for label, frame in (("Train shape : ", train), ("Test shape  : ", test)):
    print(label + str(frame.shape))

In [None]:
# Train columns that still contain missing values, most affected first.
check_null_train = train.isna().sum()
check_null_train.loc[check_null_train > 0].sort_values(ascending=False)

In [None]:
# Test columns that still contain missing values, most affected first.
check_null_test = test.isna().sum()
check_null_test.loc[check_null_test > 0].sort_values(ascending=False)

In [None]:
# Columns that must not be used as model inputs (raw date/time, helper keys, ids).
remove_features = ['loctm', 'locdt', 'insfg', 'uid', 'uid2',
                   'first_cano_dt', 'first_cano_dt_locdt_W_min_max', 
                   'locdt_W', 'locdt_M', 'txkey'
                  ]
target = 'fraud_ind'

# Everything else in train, minus the label, is a model feature.
excluded = set(remove_features)
features_columns = [name for name in train.columns if name not in excluded]
features_columns.remove(target)

In [None]:
# Show how many features were selected and list their names.
print('NOW USING THE FOLLOWING',len(features_columns),'FEATURES.')
np.array(features_columns)

In [None]:
# Label-encode every feature that is still stored as Python objects (strings),
# with one label space shared by train and test.
for col in features_columns:
    if train[col].dtype != 'O':
        continue
    print(col)

    for frame in (train, test):
        frame[col] = frame[col].fillna('unseen_before_label').astype(str)

    le = LabelEncoder()
    le.fit(list(train[col]) + list(test[col]))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [None]:
# Split the engineered frames into the label, the model features and the
# transaction keys (kept aside for the out-of-fold export).
y = train[target]
X = train[features_columns]
X_index = train['txkey']

X_test = test[features_columns]

print ("Size of X data : {}" .format(X.shape))
print ("Size of X_test data : {}" .format(X_test.shape))

# The full frames are not needed from here on.
del train, test
gc.collect()

In [None]:
# Features treated as categories by the network; every other column of X is
# treated as continuous.
categorical_features = [
                        'cano', 'bacno', 'acqic', 'mchno', 'mcc', 'stocn', 'scity', 'csmcu',
                        'contp', 'etymd', 'hcefg', 'iterm', 'stscd', 'txn_info',
                        'ecfg', 'flbmk', 'flg_3dsmk', 'ovrlt',
                        'conam_check', 'cano_check',
                        'mcc_mchno', 'scity_stocn', 'mchno_stocn',
                        'csmcu_stscd', 'acqic_etymd', 'stscd_etymd', 'txn_info_etymd',
                        'past_w_same_store'
                       ]

continuous_features = [name for name in X if name not in categorical_features]

In [None]:
# Standardise every continuous feature; the scaler is fitted on the train and
# test values together and then applied to each set.
for column in continuous_features:
    train_values = X[column].values.reshape(-1, 1)
    test_values = X_test[column].values.reshape(-1, 1)

    scaler = StandardScaler()
    scaler.fit(np.concatenate([train_values, test_values]))

    X[column] = scaler.transform(train_values)
    X_test[column] = scaler.transform(test_values)

In [None]:
# Fix the random seed before any model is built.
SEED = 42
seed_everything(SEED)

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize = False,
                          title = 'Confusion matrix',
                          cmap = plt.cm.Blues) :
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, shape (n_classes, n_classes)
        Confusion matrix with true labels on rows, predictions on columns.
    classes : sequence
        Tick labels, one per class.
    normalize : bool, default False
        When true, every row is divided by its sum so cells show the share of
        each true class; cells are then printed with two decimals. Rows that
        sum to zero are shown as zeros.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colour map for the heat map.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype(float), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 0)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text so they stay readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])) :
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment = 'center',
                 color = 'white' if cm[i, j] > thresh else 'black')
 
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
class roc_callback(Callback):
    """Keras callback that prints validation ROC-AUC, F1, recall and precision
    after every epoch.

    Parameters
    ----------
    training_data : tuple (inputs, labels)
        Stored on the instance as ``x`` / ``y``; not used by the callback itself.
    validation_data : tuple (inputs, labels)
        Data scored at the end of each epoch. ``labels`` must expose
        ``.values`` (e.g. a pandas Series).

    The other Keras hooks (train/epoch/batch begin and end) are inherited
    from ``Callback``; there is no need to override them with no-ops.
    """

    def __init__(self,training_data,validation_data):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation data with the current model and print the metrics."""
        y_pred_val = self.model.predict(self.x_val)
        roc_val = roc_auc_score(self.y_val, y_pred_val)

        # Threshold-based metrics use the probabilities rounded to 0/1.
        y_true = self.y_val.values
        y_label = y_pred_val.round()
        F1_score = f1_score(y_true, y_label)
        recalls = recall_score(y_true, y_label)
        precisions = precision_score(y_true, y_label)

        print('\rROC        : %s' % (str(round(roc_val,6))),end=100*' '+'\n')
        print('\rF1_score   : %s' % (str(round(F1_score,6))),end=100*' '+'\n')
        print('\rRecall     : %s' % (str(round(recalls,6))),end=100*' '+'\n')
        print('\rPrecisions : %s' % (str(round(precisions,6))),end=100*' '+'\n')
        return
    
def focal_loss(gamma=2., alpha=.25):
    """Return a binary focal-loss function ``f(y_true, y_pred)`` for Keras.

    ``gamma`` is the exponent of the modulating factor, ``alpha`` the weight
    of the positive-class term (the negative-class term gets ``1 - alpha``).
    """
    def focal_loss_fixed(y_true, y_pred):
        # Predictions for positive samples (1 elsewhere, so the term vanishes) ...
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        # ... and for negative samples (0 elsewhere).
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))

        positive_term = alpha * K.pow(1. - pt_1, gamma) * K.log(K.epsilon()+pt_1)
        negative_term = (1-alpha) * K.pow( pt_0, gamma) * K.log(1. - pt_0 + K.epsilon())
        return -K.mean(positive_term)-K.mean(negative_term)
    return focal_loss_fixed

# Register the default-parameter loss under a name usable in model.compile().
get_custom_objects()['focal_loss_fn'] = focal_loss()

In [None]:
def get_keras_dataset(df):
    """Turn a DataFrame into a ``{column name (str): numpy array}`` dict."""
    feeds = {}
    for name in df.columns:
        feeds[str(name)] = np.array(df[name])
    return feeds

def create_model(loss_fn):
    """Build and compile the tabular network.

    Inputs are one scalar ``Input`` per feature, named after the column:
    embedded categorical features first, then continuous features, then
    binary flags. The embeddings and the raw continuous/binary inputs are
    concatenated and passed through three Dense -> BatchNorm -> Dropout
    stages (256, 128, 64 units) and a sigmoid output unit.

    Reads the notebook-level ``categorical_features``, ``continuous_features``,
    ``X`` and ``X_test``; none of them is modified, so the function can be
    called once per fold.

    Parameters
    ----------
    loss_fn : str or callable
        Loss passed to ``model.compile``.

    Returns
    -------
    keras.models.Model
        The compiled model.
    """
    print('Create Model')

    cat_inputs = []
    num_inputs = []
    bin_inputs = []
    embeddings = []

    binary_features = ['ecfg', 'flbmk', 'flg_3dsmk', 'ovrlt', 'conam_check', 'cano_check',]
    # Work on a filtered copy: removing entries from the shared
    # categorical_features list would make the next call fail with ValueError.
    embedded_features = [col for col in categorical_features if col not in binary_features]

    for col in embedded_features:
        _input = layers.Input(shape=(1,), name=col)
        n_levels = X[col].nunique()
        emb_n = min(10, n_levels//2+1)
        # Embedding needs input_dim = largest index + 1. The codes are not
        # guaranteed to be 0..n_levels-1 within train, and test may hold larger ones.
        # NOTE(review): negative codes (e.g. a -999 sentinel) are still invalid
        # embedding indices - confirm none reach these columns.
        largest_code = max(X[col].max(), X_test[col].max(), n_levels - 1)
        vocab_size = int(largest_code) + 1
        _embed = layers.Embedding(vocab_size, emb_n, name=col+'_emb')(_input)
        _embed = layers.Reshape(target_shape=(emb_n,))(_embed)
        cat_inputs.append(_input)
        embeddings.append(_embed)

    for col in continuous_features:
        numeric_input = layers.Input(shape=(1,), name=col)
        num_inputs.append(numeric_input)

    for col in binary_features:
        binary_input = layers.Input(shape=(1,), name=col)
        bin_inputs.append(binary_input)

    merged_num_inputs = layers.concatenate(num_inputs)
    merged_bin_inputs = layers.concatenate(bin_inputs)
    merged_inputs = layers.concatenate(embeddings)
    inps = layers.concatenate([merged_inputs, merged_num_inputs, merged_bin_inputs])

    x = Dense(256, activation='relu')(inps)
    x = BatchNormalization()(x)
    x = Dropout(0.25)(x)
    x = Dense(128, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)
    x = Dense(64, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.15)(x)
    output = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=cat_inputs + num_inputs + bin_inputs, outputs=output)
    model.compile(
          optimizer=Adam(), #Nadam()
          loss=[loss_fn],
          metrics=['accuracy'],
    )
    print('Done!')

    #print('\n Model Summary: ')
    #print(model.summary())
    return model

In [None]:
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import load_model
# Training configuration.
n_epochs = 15
patience = 3
n_batch_size = 512

# Fold-averaged test predictions and out-of-fold (OOF) train predictions.
y_preds = np.zeros(X_test.shape[0])
y_oof = np.zeros(X.shape[0])

# Unshuffled K-fold: folds are contiguous row blocks.
NFOLDS = 5
folds = KFold(n_splits=NFOLDS)
splits = folds.split(X, y)

auc_score = 0
F1_score = []
recalls = []
precisions = []
cms= []

# The test inputs are identical for every fold, so build them once.
test_inputs = get_keras_dataset(X_test)

for fold_n, (train_idx, valid_idx) in enumerate(splits):    

    x_train_fold= X[features_columns].iloc[train_idx,:]
    y_train_fold= y.iloc[train_idx]
    x_val_fold= X[features_columns].iloc[valid_idx,:]
    y_val_fold= y.iloc[valid_idx]

    train_inputs = get_keras_dataset(x_train_fold)
    valid_inputs = get_keras_dataset(x_val_fold)
    
    model = create_model('focal_loss_fn') #binary_crossentropy
    callbacks = [
        roc_callback(training_data=(train_inputs, y_train_fold),
                     validation_data=(valid_inputs, y_val_fold)),
        # Early stopping can only act on validation metrics, which exist only
        # when fit() is given validation data (see below). 'val_loss' is
        # monitored because its name is the same across Keras versions,
        # unlike 'val_acc' / 'val_accuracy'.
        EarlyStopping(monitor='val_loss', patience=patience, mode='min', verbose=1),
        #ModelCheckpoint("model_" + str(fold_n+1) + ".hdf5",
        #                 save_best_only=True, verbose=1,
        #                 monitor='val_acc', mode='max')
    ]
    model.fit(train_inputs,
              y_train_fold,
              validation_data=(valid_inputs, y_val_fold),
              epochs=n_epochs,
              batch_size=n_batch_size,
              callbacks=callbacks,
              verbose=1)
    #del model
    #model = load_model("model_" + str(fold_n+1) + ".hdf5", custom_objects={'focal_loss_fn': focal_loss()})
    y_pred_valid = model.predict(valid_inputs)
    y_oof[valid_idx] = y_pred_valid.reshape(y_pred_valid.shape[0])

    # Fold metrics; the label-based ones use probabilities rounded to 0/1.
    y_val_true = y_val_fold.values
    y_val_label = y_pred_valid.round()
    fold_auc = roc_auc_score(y_val_fold, y_pred_valid)
    fold_f1 = f1_score(y_val_true, y_val_label)

    print(f"\nFold {fold_n + 1} | AUC: {fold_auc}")
    print(f"Fold {fold_n + 1} | F1:  {fold_f1}")
    
    auc_score += fold_auc / NFOLDS
    F1_score.append(fold_f1) 
    recalls.append(recall_score(y_val_true, y_val_label))
    precisions.append(precision_score(y_val_true, y_val_label))
    cms.append(confusion_matrix(y_val_true, y_val_label))
    
    # Each fold's model contributes 1/NFOLDS of the final test prediction.
    y_temp_preds = model.predict(test_inputs)
    y_preds += y_temp_preds.reshape(y_temp_preds.shape[0]) / NFOLDS

print(f"\nMean AUC = {auc_score}")
print(f"Out of folds AUC = {roc_auc_score(y, y_oof)}")
print(f"\nMean F1 Score = {np.mean(F1_score)}")
print(f"Mean Recall score = {np.mean(recalls)}")
print(f"Mean Precision score = {np.mean(precisions)}")

In [None]:
# Confusion maxtrix & metrics
# Confusion matrix averaged over the folds.
plt.rcParams["axes.grid"] = False
cm = np.mean(cms, axis=0)
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(
    cm,
    classes=class_names,
    title='Confusion matrix [averaged/folds]',
)

In [None]:
# Distribution of the out-of-fold predictions (y axis clipped at 5000).
fig, ax = plt.subplots()
ax.hist(y_oof, bins=100)
ax.set_ylim((0, 5000))
ax.set_title('NN OOF')
plt.show()

# Save the out-of-fold predictions together with their transaction keys,
# then index X by txkey.
X['oof'] = y_oof
X['txkey'] = X_index
X = X.reset_index()
X.loc[:, ['txkey', 'oof']].to_csv('oof_NN.csv')
X = X.set_index('txkey', drop=True)

In [None]:
# Distribution of the fold-averaged test predictions (y axis clipped at 5000).
fig, ax = plt.subplots()
ax.hist(y_preds, bins=100)
ax.set_ylim((0, 5000))
ax.set_title('NN Submission')
plt.show()

In [None]:
# Scan decision thresholds 0.10 .. 0.49 and keep the one with the best
# out-of-fold F1 score.
thresholds = []
for thresh in np.round(np.arange(0.1, 0.5, 0.01), 2):
    res = f1_score(y, (y_oof > thresh).astype(int))
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))

# Best score first; the sort is stable, so ties keep the lower threshold first.
thresholds = sorted(thresholds, key=lambda pair: pair[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

In [None]:
# Fill the submission template with the fold-averaged probabilities.
# NOTE(review): the values are assigned by position, so this assumes the
# template rows are in the same order as the test rows - confirm.
submission_test = pd.read_csv('E-Sun_Credit_Card_Fraud_Data/submission_test.csv')
print('\tSuccessfully loaded submission_test!')

# Raw probabilities are written; best_thresh from the threshold scan is not applied.
submission_test['fraud_ind'] = y_preds
submission_test.to_csv("E-Sun_Credit_Card_Fraud_Detection_NN_without_optimize.csv", index=False)