In [1]:
import pandas as pd
import numpy as np
import multiprocessing
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
from time import time
import datetime
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

In [2]:
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold,TimeSeriesSplit, GroupKFold
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR

from sklearn.metrics import mean_absolute_error as sk_mean_absolute_error
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd

import eli5
from eli5.sklearn import PermutationImportance

class processutil:
    """Namespace for small helper utilities used by this notebook."""

    @staticmethod
    def _str2class(s):
        """Resolve the string ``s`` to a class or other callable object.

        Lookup order:
          1. a class bound to the name ``s`` in this module's globals;
          2. the result of evaluating ``s`` as a Python expression, if that
             result is a class or any other callable.

        Parameters
        ----------
        s : str
            A name or expression such as ``"KFold"`` or ``"np.mean"``.

        Returns
        -------
        object or None
            The resolved class/callable, or ``None`` when the expression
            evaluates to something that is neither.

        Raises
        ------
        Exception
            Whatever ``eval`` raises for an unknown name or invalid expression
            (e.g. ``NameError``, ``SyntaxError``) is propagated unchanged.

        Notes
        -----
        Declared as a ``staticmethod`` so it can be called both on the class
        and on an instance (the method takes no ``self``).
        """
        if s in globals() and isinstance(globals()[s], type):
            return globals()[s]
        # SECURITY: eval() executes arbitrary code. Only pass trusted,
        # hard-coded strings here, never user-supplied or external data.
        # The expression is evaluated exactly once so side effects are not repeated.
        resolved = eval(s)
        if isinstance(resolved, type) or callable(resolved):
            return resolved
        return None
    

In [3]:
# Directory (relative to the notebook) that holds the pickled data frames.
# It already ends with '/', and the f-string paths that use it add another one.
folder_path = '../data/IEEE-CIS-Fraud-Detection/'

In [4]:
# Load the pre-built training frame from a gzip-compressed pickle under folder_path.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The trailing commented-out .iloc and the commented df_test line are switches for
# working on a row subset / loading the test frame.
df_train = pd.read_pickle(f'{folder_path}/df_train3.gzde', compression='gzip')#.iloc[:100000,:]
# df_test = pd.read_pickle(f'{folder_path}/df_test2.gzde', compression='gzip').iloc[:10000,:]

In [5]:
# Map +/-inf to NaN, then replace every NaN with the sentinel value -999.
# This rebinds df_train, so the raw loaded frame is no longer available afterwards.
df_train = df_train.replace([np.inf, -np.inf], np.nan).fillna(-999)

# df_test = df_test.replace([np.inf, -np.inf], np.nan).fillna(-999)

In [6]:
# All column names of df_train except the target 'isFraud'.
# 'TransactionID' and 'TransactionDT' are kept (their removal is commented out).
columns = df_train.columns.tolist()
# columns.remove('TransactionID')
# columns.remove('TransactionDT')
columns.remove('isFraud')

In [7]:
# Number of columns left after dropping the 'isFraud' target.
len(columns)

925

In [8]:
# LightGBM parameters for a binary objective evaluated with AUC.
# process() reads this dict as a module-level global (it is not passed as an argument),
# so this cell must run before process() is called.
params = {'num_leaves': 491,
          'min_child_weight': 0.03454472573214212,
          'feature_fraction': 0.3797454081646243,
          'bagging_fraction': 0.4181193142567742,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.006883242363721497,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.3899927210061127,
          'reg_lambda': 0.6485237330340494,
          'random_state': 47
         }

In [9]:
def process(folds, df_train,df_test, columns, is_output_feature_importance=1, verbose=0):
    """Cross-validate a LightGBM binary classifier and collect per-fold results.

    Rows are ordered by ``TransactionDT`` before splitting. Features, labels and
    ``TransactionID`` values are all taken from that single sorted frame, so the
    out-of-fold predictions are always paired with the right transaction.

    The LightGBM parameters are read from the module-level ``params`` dict.

    Parameters
    ----------
    folds : object
        Splitter exposing ``split(X, y)`` that yields ``(train_idx, valid_idx)``
        positional index arrays (e.g. ``TimeSeriesSplit``).
    df_train : pandas.DataFrame
        Must contain ``TransactionID``, ``TransactionDT``, ``isFraud`` and every
        name in ``columns``.
    df_test : pandas.DataFrame or None
        Optional frame to predict with each fold's model; must contain
        ``TransactionID``, ``TransactionDT`` and every name in ``columns``.
    columns : list of str
        Feature columns used for training.
    is_output_feature_importance : int or bool, default 1
        When truthy, an ``LGBMClassifier`` is refit per fold with the best
        iteration count and eli5 permutation importance is computed on the
        validation part.
    verbose : int, default 0
        ``> 0`` prints per-fold timing and a final summary, ``> 1`` also
        announces each fold before training.

    Returns
    -------
    tuple
        ``(his, df_feature_importances, df_valid_pred, df_test_pred, mean_val_auc)``

        * ``his`` - DataFrame with one row per fold (``val_auc``, ``trn_auc``,
          ``y_val_pred``, ``y_test_pred``, ``test_idx``).
        * ``df_feature_importances`` - per-fold permutation importances merged
          on ``feature``, or ``None`` when importance output is disabled.
        * ``df_valid_pred`` - out-of-fold predictions (``TransactionID``,
          ``predict``, ``fold``) sorted by ``TransactionID``.
        * ``df_test_pred`` - ``TransactionID`` plus one column per fold (named
          by the fold number), sorted by ``TransactionID``; an empty DataFrame
          when ``df_test`` is ``None``.
        * ``mean_val_auc`` - mean validation AUC over the folds.

    Raises
    ------
    ValueError
        If ``folds.split`` yields no folds.
    """
    training_start_time = time()
    has_test = df_test is not None

    # Sort once; X, y and the IDs all come from the same ordered frame.
    train_sorted = df_train.sort_values('TransactionDT')
    X, y = train_sorted[columns], train_sorted['isFraud']
    train_ids = train_sorted['TransactionID'].values

    df_test_pred = pd.DataFrame()
    if has_test:
        test_sorted = df_test.sort_values('TransactionDT')
        X_test = test_sorted[columns]
        # Use raw values so the ID column lines up positionally with the predictions.
        df_test_pred = pd.DataFrame({'TransactionID': test_sorted['TransactionID'].values})

    his = []
    valid_pred_parts = []
    df_feature_importances_i_list = []

    for fold, (trn_idx, test_idx) in enumerate(folds.split(X, y)):
        start_time = time()
        if verbose > 1:
            print('Training on fold {}'.format(fold + 1))

        X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
        X_val, y_val = X.iloc[test_idx], y.iloc[test_idx]

        trn_data = lgb.Dataset(X_trn, label=y_trn)
        val_data = lgb.Dataset(X_val, label=y_val)
        # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments of
        # older LightGBM releases; confirm they are accepted by the installed version.
        clf = lgb.train(params, trn_data, 10000, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds=500)

        y_trn_pred = clf.predict(X_trn.values)
        y_val_pred = clf.predict(X_val.values)

        valid_pred_parts.append(pd.DataFrame({
            'TransactionID': train_ids[test_idx],
            'predict': y_val_pred,
            'fold': np.zeros(y_val_pred.shape[0]) + fold,
        }))

        y_test_pred = None
        if has_test:
            y_test_pred = clf.predict(X_test.values)
            df_test_pred[fold] = y_test_pred

        trn_auc = roc_auc_score(y_trn.values, y_trn_pred)
        val_auc = roc_auc_score(y_val.values, y_val_pred)

        his.append({'val_auc':val_auc, 'trn_auc':trn_auc, 'y_val_pred':y_val_pred, 'y_test_pred':y_test_pred, 'test_idx':test_idx})

        if is_output_feature_importance:
            # Permutation importance needs an sklearn-style estimator, so refit a
            # classifier for the number of rounds chosen by early stopping.
            best_iter = clf.best_iteration
            perm_model = lgb.LGBMClassifier(**params, num_boost_round=best_iter)
            perm_model.fit(X_trn.values, y_trn.values)
            perm = PermutationImportance(perm_model, random_state=42).fit(X_val.values, y_val.values)
            df_fold_importance = eli5.explain_weights_dfs(perm, feature_names=columns, top=len(columns))['feature_importances']
            df_fold_importance = df_fold_importance.sort_values(by=['feature']).reset_index(drop=True)
            df_feature_importances_i_list.append(df_fold_importance)

        if verbose > 0:
            print('Fold {} finished in {}'.format(fold + 1, str(datetime.timedelta(seconds=time() - start_time))))

    if not his:
        raise ValueError('folds.split(X, y) produced no folds; nothing was trained')

    his = pd.DataFrame(his)

    df_feature_importances = None
    if is_output_feature_importance:
        df_feature_importances = df_feature_importances_i_list[0]
        for idx, df_feature_importances_i in enumerate(df_feature_importances_i_list[1:]):
            # Fold 0 keeps the plain column names; later folds get the suffix 1, 2, ...
            df_feature_importances = pd.merge(df_feature_importances, df_feature_importances_i, on='feature', suffixes=('', str(idx + 1)))

    # Build the out-of-fold frame once instead of growing it inside the loop.
    df_valid_pred = pd.concat(valid_pred_parts, axis=0)
    df_valid_pred = df_valid_pred.sort_values(by=['TransactionID']).reset_index(drop=True)

    if has_test:
        df_test_pred = df_test_pred.sort_values(by=['TransactionID']).reset_index(drop=True)

    mean_val_auc = his.val_auc.mean()
    if verbose > 0:
        print('-' * 30)
        print('Training has finished.')
        print('Total training time is {}'.format(str(datetime.timedelta(seconds=time() - training_start_time))))
        print('Mean AUC:', mean_val_auc, his.trn_auc.mean())
        print('-' * 30)
    return his, df_feature_importances, df_valid_pred, df_test_pred, mean_val_auc

In [12]:
# Hand-listed subset of df_train feature names; it is the `columns` argument of the
# process() call that computes permutation importances further down.
# NOTE(review): the variable name suggests these were selected by some "emd" score
# exceeding a threshold -- confirm where and how this list was generated.
columns_emd_over_o4 = ['C7_fq_enc',
 'ProductCD_target_mean',
 'C4_fq_enc',
 'id_35',
 'addr2_fq_enc',
 'C12_fq_enc',
 'D7_fq_enc',
 'V94',
 'almost-com_addr2',
 'addr2',
 'V200',
 'V201',
 'TransactionAmt_to_mean_addr1',
 'id_17',
 'C8_fq_enc',
 'ProductCD',
 'V171',
 'card3_TransactionAmt_std',
 'C10_fq_enc',
 'V170',
 'TransactionAmt_to_std_addr1',
 'ieee-gb-2-_id_33',
 'email_check',
 'card3_fq_enc',
 'card3_count_full',
 'id_29',
 'id_28',
 'V194',
 'V242',
 'V189',
 'R_emaildomain_suffix',
 'V188',
 'V186',
 'feature-en_R_emaildomain_suffix',
 'extensive-_R_emaildomain_suffix',
 'M4_target_mean',
 'eda-and-mo_addr2',
 'D6_fq_enc',
 'V244',
 'extensive-_PCA_V_17',
 'PCA_V_17',
 'eda-and-mo_card3',
 'D14',
 'V184',
 'R_emaildomain_2',
 'id_38',
 'card3',
 'id_09',
 'id_10',
 'ieee-gb-2-_D9',
 'ieee-gb-2-_id_29',
 'D8_fq_enc',
 'V197',
 'id_16',
 'id_15',
 'D12',
 'V239',
 'V195',
 'V185',
 'id_30_version',
 'id_01_count_dist',
 'V198',
 'id_03',
 'id_20',
 'V238',
 'ieee-gb-2-_id_16',
 'almost-com_R_emaildomain',
 'DeviceInfo_version_fq_enc',
 'V243',
 'id_31_device',
 'eda-and-mo_id_02_to_mean_card4',
 'id_02_to_mean_card4',
 'eda-and-mo_id_02_to_std_card4',
 'id_02_to_std_card4',
 'id_04',
 'id_02',
 'V199',
 'V169',
 'id_36',
 'ieee-gb-2-_id_31',
 'D7',
 'V190',
 'V176',
 'V191',
 'D15_to_mean_addr2',
 'id_02_to_mean_card1',
 'eda-and-mo_id_02_to_mean_card1',
 'id_02_to_std_card1',
 'eda-and-mo_id_02_to_std_card1',
 'id_31_count_dist',
 'DeviceInfo_fq_enc',
 'DeviceInfo_device_fq_enc',
 'V196',
 'id_11',
 'eda-and-mo_id_11',
 'lgb-single_id_11',
 'D15_to_mean_addr1',
 'eda-and-mo_D15_to_mean_addr1',
 'V220',
 'id_05',
 'V251',
 'V181',
 'V222',
 'V193',
 'V187',
 'lgb-single_V208',
 'eda-and-mo_V208',
 'V208',
 'R_emaildomain_fq_enc',
 'D15_to_std_addr1',
 'id_31_device_fq_enc',
 'V259',
 'V260',
 'V174',
 'eda-and-mo_V214',
 'lgb-single_V214',
 'V214',
 'V216',
 'eda-and-mo_V216',
 'lgb-single_V216',
 'eda-and-mo_V202',
 'lgb-single_V202',
 'V202',
 'eda-and-mo_V215',
 'V215',
 'lgb-single_V215',
 'eda-and-mo_V211',
 'V211',
 'lgb-single_V206',
 'eda-and-mo_V206',
 'V206',
 'eda-and-mo_V213',
 'lgb-single_V213',
 'V213',
 'lgb-single_V204',
 'V204',
 'eda-and-mo_V204',
 'eda-and-mo_V205',
 'lgb-single_V205',
 'V205',
 'eda-and-mo_V210',
 'lgb-single_V210',
 'V210',
 'eda-and-mo_V212',
 'lgb-single_V212',
 'V212',
 'V221',
 'eda-and-mo_V207',
 'lgb-single_V207',
 'V207',
 'V183',
 'V302',
 'V250',
 'lgb-single_V203',
 'V203',
 'eda-and-mo_V203',
 'eda-and-mo_id_19',
 'V235',
 'V262',
 'V192',
 'id_36_count_dist',
 'V247',
 'V209',
 'lgb-single_V209',
 'eda-and-mo_V209',
 'V245',
 'V249',
 'id_36_count_full',
 'V256',
 'V258',
 'V180',
 'V257',
 'V172',
 'V229',
 'V304',
 'V175',
 'V255',
 'V252',
 'V167',
 'V177',
 'eda-and-mo_V271',
 'lgb-single_V271',
 'V271',
 'lgb-single_V272',
 'eda-and-mo_V272',
 'V272',
 'browser_id_31',
 'V230',
 'lgb-single_V270',
 'eda-and-mo_V270',
 'V270',
 'V227',
 'id_06',
 'V182',
 'V179',
 'V168',
 'V246',
 'V79',
 'V173',
 'V248',
 'V178',
 'V228',
 'V232',
 'D13',
 'id_13',
 'V218',
 'V261',
 'ieee-gb-2-_ProductCD',
 'V219',
 'V237',
 'V217',
 'V233',
 'id_31',
 'V236',
 'V231',
 'V123',
 'V254',
 'id_30_device_fq_enc',
 'V234',
 'D15_to_std_addr2',
 'V57',
 'ieee-gb-2-_id_37',
 'V253',
 'D6',
 'V264',
 'lgb-single_V264',
 'eda-and-mo_V264',
 'eda-and-mo_V265',
 'lgb-single_V265',
 'V265',
 'lgb-single_V274',
 'eda-and-mo_V274',
 'V274',
 'eda-and-mo_V275',
 'lgb-single_V275',
 'lgb-single_V263',
 'V263',
 'eda-and-mo_V263',
 'lgb-single_V273',
 'eda-and-mo_V273',
 'V226',
 'lgb-single_V276',
 'eda-and-mo_V276',
 'V276',
 'V225',
 'lgb-single_V277',
 'eda-and-mo_V277',
 'V277',
 'id_30_fq_enc',
 'V278',
 'eda-and-mo_V278',
 'lgb-single_V278',
 'V240',
 'id_33_fq_enc',
 'V241',
 'eda-and-mo_V268',
 'lgb-single_V268',
 'V268',
 'eda-and-mo_V266',
 'lgb-single_V266',
 'V266',
 'eda-and-mo_V269',
 'V269',
 'V267',
 'eda-and-mo_V267',
 'lgb-single_V267',
 'id_30_version_fq_enc',
 'V224',
 'V223',
 'V303',
 'V93',
 'almost-com_M6',
 'id_01',
 'card3_TransactionAmt_mean',
 'V92',
 'V71',
 'id_37',
 'id_12',
 'DeviceInfo_version',
 'eda-and-mo_id_13',
 'V21',
 'DeviceType',
 'id_19',
 'V74',
 'id_02__id_20',
 'addr1_fq_enc',
 'id_02__D8',
 'PCA_V_28',
 'lgb-single_D9',
 'eda-and-mo_D9',
 'D9',
 'lgb-single_D8',
 'eda-and-mo_D8',
 'D8',
 'addr1',
 'extensive-_PCA_V_28',
 'feature-en_R_emaildomain_bin',
 'extensive-_R_emaildomain_bin',
 'almost-com_addr1',
 'V125',
 'C5_fq_enc',
 'V58',
 'V73',
 'eda-and-mo_addr1',
 'V72',
 'ieee-gb-2-_id_28',
 'almost-com_ProductCD',
 'V273',
 'V69',
 'R_emaildomain',
 'V64',
 'extensive-_PCA_V_19',
 'PCA_V_19',
 'V283',
 'card6_count_full',
 'ieee-gb-2-_card6',
 'card6',
 'M6',
 'V275',
 'V10',
 'id_33_0',
 'V11',
 'V63',
 'PCA_V_8',
 'extensive-_PCA_V_8',
 'PCA_V_23',
 'extensive-_PCA_V_23',
 'id_33_1',
 'addr1__card1',
 'device_version',
 'device_name',
 'PCA_V_34',
 'first_value_addr1',
 'V29',
 'PCA_V_1',
 'M_na',
 'PCA_V_31',
 'ieee-gb-2-_id_15',
 'dist2_fq_enc',
 'M_sum',
 'V85',
 'V70',
 'M1',
 'almost-com_M1',
 'ieee-gb-2-_M1',
 'V90',
 'ieee-gb-2-_M2',
 'dist2',
 'V1',
 'D11',
 'V22',
 'V6',
 'V9',
 'V111',
 'V8',
 'R_emaildomain_1',
 'V3',
 'V2',
 'V7',
 'eda-and-mo_id_20',
 'V5',
 'V4',
 'V113',
 'almost-com_M4',
 'PCA_V_30',
 'ieee-gb-2-_M3',
 'V84',
 'card2_TransactionAmt_mean',
 'PCA_V_29',
 'C2_fq_enc',
 'V112',
 'V114',
 'V282',
 'V18',
 'PCA_V_3',
 'V60',
 'V50',
 'V17',
 'extensive-_PCA_V_29',
 'V54',
 'C11_fq_enc',
 'V30',
 'ieee-gb-2-_DeviceInfo',
 'V59',
 'V33',
 'DeviceInfo_device',
 'id_18',
 'C1_fq_enc',
 'DeviceInfo',
 'V53',
 'DeviceInfo__P_emaildomain',
 'V15',
 'D2',
 'V124',
 'M2',
 'extensive-_PCA_V_25',
 'extensive-_PCA_V_26',
 'PCA_V_26',
 'PCA_V_25',
 'V108',
 'PCA_V_22',
 'D11__DeviceInfo',
 'extensive-_PCA_V_22',
 'V91',
 'V31',
 'D15_to_std_card4',
 'eda-and-mo_D15_to_std_card4',
 'PCA_V_4',
 'extensive-_PCA_V_4',
 'V116',
 'version_id_31',
 'D10',
 'PCA_V_11',
 'extensive-_PCA_V_11',
 'V81',
 'M3',
 'D15_to_mean_card4',
 'eda-and-mo_D15_to_mean_card4',
 'id_30_device',
 'almost-com_M7',
 'D15',
 'PCA_V_7',
 'extensive-_PCA_V_7',
 'M7',
 'ieee-gb-2-_M9',
 'ieee-gb-2-_M6',
 'V13',
 'dist1_fq_enc',
 'V110',
 'D1',
 'V80',
 'V12',
 'extensive-_PCA_V_12',
 'PCA_V_12',
 'V67',
 'V51',
 'dist1',
 'eda-and-mo_dist1',
 'lgb-single_dist1',
 'PCA_V_0',
 'V76',
 'PCA_V_33',
 'card1_TransactionAmt_std',
 'extensive-_PCA_V_10',
 'PCA_V_10',
 'M8',
 'V281',
 'V43',
 'PCA_V_27',
 'V75',
 'uid2_TransactionAmt_std',
 'ieee-gb-2-_R_emaildomain',
 'card2_TransactionAmt_std',
 'uid_TransactionAmt_std',
 'extensive-_PCA_V_27',
 'R_emaildomain_prefix',
 'M9',
 'V48',
 'V34',
 'V49',
 'V66',
 'V16',
 'D2_fq_enc',
 'V23',
 'PCA_V_15',
 'extensive-_PCA_V_15',
 'V56',
 'V32']

In [10]:
# 5-split time-series cross-validation splitter, passed as `folds` to process().
folds = TimeSeriesSplit(n_splits=5)

In [None]:
# Cross-validate on the columns_emd_over_o4 feature subset with permutation importance
# enabled. No test frame is passed (df_test=None), so only out-of-fold results are produced.
his, df_feature_importances, df_valid_pred, df_test_pred, val_metric = process(folds, df_train,None, columns_emd_over_o4, is_output_feature_importance=1, verbose=1)

Training until validation scores don't improve for 500 rounds.


In [None]:
def sort_feature_importances(df_feature_importances, key='average_permutation_weight'):
    """Rank features by importance and return their names, best first.

    A column ``average_permutation_weight`` is written into the passed frame
    (in place). It holds the row-wise mean of every column whose name contains
    ``'weight'`` but not ``'model'``. The frame is then ordered by ``key`` in
    descending order.

    Parameters
    ----------
    df_feature_importances : pandas.DataFrame
        Needs a ``feature`` column and at least one matching weight column.
    key : str, default 'average_permutation_weight'
        Column used for the descending sort.

    Returns
    -------
    list
        Values of the ``feature`` column in sorted order.
    """
    weight_columns = []
    for name in df_feature_importances.columns.tolist():
        if 'weight' in name and 'model' not in name:
            weight_columns.append(name)

    row_means = df_feature_importances[weight_columns].mean(axis=1)
    df_feature_importances['average_permutation_weight'] = row_means

    ranked = df_feature_importances.sort_values(by=[key], ascending=False)
    return ranked.feature.tolist()

In [None]:
# Feature names ordered by descending average permutation weight.
# Side effect: the call adds an 'average_permutation_weight' column to df_feature_importances.
sorted_columns = sort_feature_importances(df_feature_importances)

In [None]:
# Display the ranked feature list.
sorted_columns

In [None]:
def add_feature_obo(trial, folds, df_train, columns, max_col = 50, max_no_progress = 50):
    """Greedy forward feature selection, adding candidates one by one.

    Starting from ``columns[0]``, each following column is tentatively added
    and the set is scored with ``process`` (no test frame, no feature
    importance). The candidate is kept only when the mean validation metric is
    strictly better than the best value seen so far.

    Parameters
    ----------
    trial : list
        Mutated in place: one dict is appended per accepted column with the
        keys ``col_count``, ``val_metric`` and ``selected_columns``.
    folds : object
        Splitter forwarded to ``process``.
    df_train : pandas.DataFrame
        Training frame forwarded to ``process``.
    columns : sequence of str
        Candidate columns in the order they should be tried.
    max_col : int or None, default 50
        Stop once this many columns are selected; ``None`` means no limit
        other than ``len(columns)``.
    max_no_progress : int, default 50
        Stop once the total number of rejected candidates exceeds this value.
        The counter is cumulative and is not reset when a candidate is accepted.

    Returns
    -------
    list
        The selected column names (empty when ``columns`` is empty).

    Notes
    -----
    ``columns[0]`` is always kept and is never scored on its own; the initial
    best metric is 0, so the first candidate pair is accepted whenever its
    metric is positive.
    """
    if len(columns) == 0:
        return []
    if max_col is None:
        max_col = len(columns)

    selected_columns = [columns[0]]
    best_val_metric = 0
    no_progress_count = 0
    for col in columns[1:]:
        # Check the cap before training so the selection never exceeds max_col.
        if len(selected_columns) >= max_col:
            break
        candidate_columns = selected_columns + [col]
        *_, val_metric = process(folds, df_train, None, candidate_columns, is_output_feature_importance=0, verbose=0)
        if val_metric > best_val_metric:
            # A fresh list is bound each time, so earlier `trial` entries keep their snapshot.
            selected_columns = candidate_columns
            best_val_metric = val_metric
            trial.append({'col_count':len(selected_columns), 'val_metric':val_metric, 'selected_columns':selected_columns})
        else:
            no_progress_count += 1
            if no_progress_count > max_no_progress:
                break
    return selected_columns

In [None]:
# `trial` is filled in place by add_feature_obo with one record per accepted feature.
trial=[]
# Greedy one-by-one selection over the importance-ranked columns, capped via max_col.
select_columns = add_feature_obo(trial, folds, df_train, sorted_columns, max_col = 50)