In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [2]:
import numpy as np
import pandas as pd
import os
import time
import datetime
import json
import copy
import gc
import warnings
from tqdm import tqdm_notebook, tqdm

import optuna

import lightgbm as lgb
import xgboost as xgb

from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold,TimeSeriesSplit, GroupKFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing

import eli5
from eli5.sklearn import PermutationImportance

import networkx as nx

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
folder_path =  '../data/IEEE-CIS-Fraud-Detection/'
os.listdir(folder_path)

['test_identity.csv',
 'ieee-gb-2-make-amount-useful-again_train.pkl',
 'merge_two_csv.ipynb',
 'test_identity.csv.zip',
 'lgb-single-model-lb-0-9419_test.pkl',
 'extensive-eda-and-modeling-xgb-hyperopt_train.pkl',
 'test.csv',
 'day-and-time-powerful-predictive-feature_test.pkl',
 'eda-and-models_test.pkl',
 'train_identity_fixed.pkl',
 'test_transaction_fixed.pkl',
 'almost-complete-feature-engineering-ieee-data_train.pkl',
 'almost-complete-feature-engineering-ieee-data_test.pkl',
 'test_identity.pkl',
 'train_transaction.pkl',
 'sample_submission.csv',
 'train_transaction.csv',
 'train_transaction.csv.zip',
 'extensive-eda-and-modeling-xgb-hyperopt_test.pkl',
 'feature-engineering-lightgbm_test.pkl',
 'fraud-complete-eda_train.pkl',
 'test_transaction.csv',
 'feature-engineering-lightgbm_train.pkl',
 'sample_submission.csv.zip',
 'test_identity_fixed.pkl',
 'lgb-single-model-lb-0-9419_train.pkl',
 '.DS_Store',
 'train_identity.csv.zip',
 'df_train.gzde',
 'train_identity.pkl',
 'tr

In [4]:
feature_pkl_list = [pkl for pkl in os.listdir(folder_path) if pkl.endswith('train.pkl') or pkl.endswith('test.pkl')]
print(len(feature_pkl_list))
print(feature_pkl_list)

16
['ieee-gb-2-make-amount-useful-again_train.pkl', 'lgb-single-model-lb-0-9419_test.pkl', 'extensive-eda-and-modeling-xgb-hyperopt_train.pkl', 'day-and-time-powerful-predictive-feature_test.pkl', 'eda-and-models_test.pkl', 'almost-complete-feature-engineering-ieee-data_train.pkl', 'almost-complete-feature-engineering-ieee-data_test.pkl', 'extensive-eda-and-modeling-xgb-hyperopt_test.pkl', 'feature-engineering-lightgbm_test.pkl', 'fraud-complete-eda_train.pkl', 'feature-engineering-lightgbm_train.pkl', 'lgb-single-model-lb-0-9419_train.pkl', 'day-and-time-powerful-predictive-feature_train.pkl', 'eda-and-models_train.pkl', 'fraud-complete-eda_test.pkl', 'ieee-gb-2-make-amount-useful-again_test.pkl']


In [5]:
csv_list = ['train_transaction.csv', 'test_transaction.csv', 'train_identity.csv', 'test_identity.csv']
print(len(csv_list))
print(csv_list)

4
['train_transaction.csv', 'test_transaction.csv', 'train_identity.csv', 'test_identity.csv']


In [6]:
def prepare_basic_feature(folder_path, csv_list):
    df = pd.read_csv(f'{folder_path}{csv_list[0]}')
    for file in csv_list[1:]:
        df = pd.merge(df, pd.read_csv(f'{folder_path}{file}'), how='left', on='TransactionID')
    return df.sort_values(by=['TransactionID']).reset_index(drop=True)

In [7]:
# df_train, df_test = prepare_basic_feature(file_folder, csv_list)
# print('basic feature size')
# print(df_train.shape, df_test.shape)

# for f in feature_pkl_list:
#     print(f'========================={f}===============================')
#     if (f.endswith('.pkl')) and (not f.startswith('.')):
#         if f[:-4].endswith('train'):
#             df_feature_i = pd.read_pickle(f'{file_folder}/{f}', compression='gzip').sort_values(by=['TransactionID'])
#             df_feature_i = df_feature_i.reset_index(drop=True)
#             columns_i = df_feature_i.columns.tolist()
#             new_columns = set(columns_i) - set(df_train.columns.tolist())
#             duplicates_columns = [col for col in columns_i if col not in list(new_columns)]
#             duplicates_columns_diff_values = []
#             for col in duplicates_columns:
#                 try:
#                     error = np.where(df_feature_i[col].values!=df_train[col].values)[0]
#                     if error.shape[0] > 0:
#                         duplicates_columns_diff_values.append(col)
#                 except Exception as e:
#                     raise Exception(e.__str__())
# #                     pass
#             df_train = pd.merge(df_train, df_feature_i[list(new_columns) + ['TransactionID']], on='TransactionID')
#             rename_col = {}
#             for col in duplicates_columns_diff_values:
#                 rename_col[col] = f'{f[:-4]}_{col}'
#             df_train = pd.merge(df_train, df_feature_i[list(duplicates_columns_diff_values) + ['TransactionID']].rename(columns=rename_col), on='TransactionID')
#             df_train = df_train.sort_values(by=['TransactionID'])
#             df_train = df_train.reset_index(drop=True)
#             print('train add', f, new_columns)
            
            
#         if f[:-4].endswith('test'):
#             df_feature_i = pd.read_pickle(f'{file_folder}/{f}', compression='gzip').sort_values(by=['TransactionID'])
#             df_feature_i = df_feature_i.reset_index(drop=True)
#             columns_i = df_feature_i.columns.tolist()
#             new_columns = set(columns_i) - set(df_test.columns.tolist())
#             duplicates_columns = [col for col in columns_i if col not in list(new_columns)]
#             duplicates_columns_diff_values = []
#             for col in duplicates_columns:
#                 try:
#                     error = np.where(df_feature_i[col].values!=df_test[col].values)[0]
#                     if error.shape[0] > 0:
#                         duplicates_columns_diff_values.append(col)
#                 except Exception as e:
#                     raise Exception(e.__str__())
# #                     pass
#             df_test = pd.merge(df_test, df_feature_i[list(new_columns) + ['TransactionID']], on='TransactionID')
#             rename_col = {}
#             for col in duplicates_columns_diff_values:
#                 rename_col[col] = f'{f[:-4]}_{col}'
#             df_test = pd.merge(df_test, df_feature_i[list(duplicates_columns_diff_values) + ['TransactionID']].rename(columns=rename_col), on='TransactionID')
#             df_test = df_test.sort_values(by=['TransactionID'])
#             df_test = df_test.reset_index(drop=True)
#             print('test add', f, new_columns)

In [8]:
def prepare_data(folder_path, feature_pkl_list, csv_list, prepare_basic_feature):
    
    df = prepare_basic_feature(folder_path, csv_list)
    print('basic feature size')
    print(df.shape)

    for f in feature_pkl_list:
        print(f'========================={f}===============================')
        if (f.endswith('.pkl')) and (not f.startswith('.')):
            if (f[:-4].endswith('train')) or (f[:-4].endswith('test')):
                df_feature_i = pd.read_pickle(f'{folder_path}/{f}', compression='gzip').sort_values(by=['TransactionID'])
                df_feature_i = df_feature_i.reset_index(drop=True)
                columns_i = df_feature_i.columns.tolist()
                new_columns = set(columns_i) - set(df.columns.tolist())
                duplicates_columns = [col for col in columns_i if col not in list(new_columns)]
                duplicates_columns_diff_values = []
                for col in duplicates_columns:
                    try:
                        error = np.where(df_feature_i[col].values!=df[col].values)[0]
                        if error.shape[0] > 0:
                            duplicates_columns_diff_values.append(col)
                    except Exception as e:
                        raise Exception(e.__str__())
    #                     pass
                df = pd.merge(df, df_feature_i[list(new_columns) + ['TransactionID']], on='TransactionID')
                rename_col = {}
                for col in duplicates_columns_diff_values:
                    rename_col[col] = f'{f[:-4]}_{col}'
                print('rename_col', len(rename_col))
#                 df = pd.merge(df, df_feature_i[list(duplicates_columns_diff_values) + ['TransactionID']].rename(columns=rename_col), on='TransactionID')
                df = df.sort_values(by=['TransactionID'])
                df = df.reset_index(drop=True)
                print('df add', f, new_columns)
    
#     print(f'=========================save tmp===============================')
#     df_train.to_pickle(f'{folder_path}/df_train.gzde', compression='gzip')
#     df_test.to_pickle(f'{folder_path}/df_test.gzde', compression='gzip')
    return df

In [9]:
folder_path, feature_pkl_list, csv_list

('../data/IEEE-CIS-Fraud-Detection/',
 ['ieee-gb-2-make-amount-useful-again_train.pkl',
  'lgb-single-model-lb-0-9419_test.pkl',
  'extensive-eda-and-modeling-xgb-hyperopt_train.pkl',
  'day-and-time-powerful-predictive-feature_test.pkl',
  'eda-and-models_test.pkl',
  'almost-complete-feature-engineering-ieee-data_train.pkl',
  'almost-complete-feature-engineering-ieee-data_test.pkl',
  'extensive-eda-and-modeling-xgb-hyperopt_test.pkl',
  'feature-engineering-lightgbm_test.pkl',
  'fraud-complete-eda_train.pkl',
  'feature-engineering-lightgbm_train.pkl',
  'lgb-single-model-lb-0-9419_train.pkl',
  'day-and-time-powerful-predictive-feature_train.pkl',
  'eda-and-models_train.pkl',
  'fraud-complete-eda_test.pkl',
  'ieee-gb-2-make-amount-useful-again_test.pkl'],
 ['train_transaction.csv',
  'test_transaction.csv',
  'train_identity.csv',
  'test_identity.csv'])

In [10]:
[pkl for pkl in feature_pkl_list if 'train' in pkl],[csv for csv in csv_list if 'train' in csv]

(['ieee-gb-2-make-amount-useful-again_train.pkl',
  'extensive-eda-and-modeling-xgb-hyperopt_train.pkl',
  'almost-complete-feature-engineering-ieee-data_train.pkl',
  'fraud-complete-eda_train.pkl',
  'feature-engineering-lightgbm_train.pkl',
  'lgb-single-model-lb-0-9419_train.pkl',
  'day-and-time-powerful-predictive-feature_train.pkl',
  'eda-and-models_train.pkl'],
 ['train_transaction.csv', 'train_identity.csv'])

In [12]:
df_train = prepare_data(folder_path, [pkl for pkl in feature_pkl_list if 'train' in pkl], [csv for csv in csv_list if 'train' in csv], prepare_basic_feature)

basic feature size
(590540, 434)
rename_col 402
df add ieee-gb-2-make-amount-useful-again_train.pkl {'card1_TransactionAmt_mean', 'card2_fq_enc', 'TransactionAmt_check', 'DT_day', 'C11_fq_enc', 'uid3_fq_enc', 'D7_fq_enc', 'id_30_fq_enc', 'D2_fq_enc', 'id_33_1', 'dist1_fq_enc', 'C13_fq_enc', 'id_31_device_fq_enc', 'C3_fq_enc', 'id_31_device', 'uid2_TransactionAmt_std', 'M_sum', 'uid', 'C9_fq_enc', 'R_emaildomain_prefix', 'uid3', 'C10_fq_enc', 'D1_fq_enc', 'D3_fq_enc', 'ProductCD_target_mean', 'D6_fq_enc', 'C12_fq_enc', 'id_33_0', 'R_emaildomain_fq_enc', 'card3_TransactionAmt_std', 'C6_fq_enc', 'C14_fq_enc', 'card5_TransactionAmt_std', 'dist2_fq_enc', 'card5_TransactionAmt_mean', 'DeviceInfo_device_fq_enc', 'DT_day_week', 'addr1_fq_enc', 'id_30_version_fq_enc', 'card2_TransactionAmt_mean', 'card1_fq_enc', 'uid_TransactionAmt_mean', 'P_emaildomain_prefix', 'P_emaildomain_fq_enc', 'card3_TransactionAmt_mean', 'uid2_TransactionAmt_mean', 'id_30_device_fq_enc', 'uid3_TransactionAmt_std', 'DT

In [13]:
for col in df_train.columns:
    print(col, df_train[col].dtype)

TransactionID int64
isFraud int64
TransactionDT int64
TransactionAmt float64
ProductCD object
card1 int64
card2 float64
card3 float64
card4 object
card5 float64
card6 object
addr1 float64
addr2 float64
dist1 float64
dist2 float64
P_emaildomain object
R_emaildomain object
C1 float64
C2 float64
C3 float64
C4 float64
C5 float64
C6 float64
C7 float64
C8 float64
C9 float64
C10 float64
C11 float64
C12 float64
C13 float64
C14 float64
D1 float64
D2 float64
D3 float64
D4 float64
D5 float64
D6 float64
D7 float64
D8 float64
D9 float64
D10 float64
D11 float64
D12 float64
D13 float64
D14 float64
D15 float64
M1 object
M2 object
M3 object
M4 object
M5 object
M6 object
M7 object
M8 object
M9 object
V1 float64
V2 float64
V3 float64
V4 float64
V5 float64
V6 float64
V7 float64
V8 float64
V9 float64
V10 float64
V11 float64
V12 float64
V13 float64
V14 float64
V15 float64
V16 float64
V17 float64
V18 float64
V19 float64
V20 float64
V21 float64
V22 float64
V23 float64
V24 float64
V25 float64
V26 float64
V27 f

In [14]:
# df_train.to_pickle(f'{folder_path}/df_train.gzde', compression='gzip')
# del df_train

In [14]:
df_test = prepare_data(folder_path, [pkl for pkl in feature_pkl_list if 'test' in pkl], [csv for csv in csv_list if 'test' in csv], prepare_basic_feature)

basic feature size
(506691, 433)
rename_col 248
df add lgb-single-model-lb-0-9419_test.pkl {'P_emaildomain__C2', 'card1_count_full', 'id_36_count_full', 'TransactionAmt_decimal', 'DeviceInfo__P_emaildomain', 'id_02__D8', 'Transaction_hour', 'addr1__card1', 'card2__dist1', 'card5__P_emaildomain', 'id_02__id_20', 'Transaction_day_of_week', 'card2__id_20', 'card1__card5'}
rename_col 0
df add day-and-time-powerful-predictive-feature_test.pkl {'make_hour_feature', 'make_day_feature'}
rename_col 313
df add eda-and-models_test.pkl {'TransactionAmt_to_mean_card1', 'R_emaildomain_1', 'D15_to_std_card1', 'TransactionAmt_to_std_card1', 'id_02_to_std_card1', 'D15_to_std_addr1', 'id_02_to_mean_card4', 'P_emaildomain_2', 'R_emaildomain_2', 'D15_to_mean_addr1', 'id_02_to_std_card4', 'P_emaildomain_1', 'D15_to_mean_card1', 'id_02_to_mean_card1'}
rename_col 20
df add almost-complete-feature-engineering-ieee-data_test.pkl {'PCA_V_28', 'count_cluster', 'PCA_D_3', 'PCA_D_2', 'PCA_C_2', 'R_emaildomain_bin'

In [15]:
for col in df_test.columns:
    print(col, df_test[col].dtype)

TransactionID int64
TransactionDT int64
TransactionAmt float64
ProductCD object
card1 int64
card2 float64
card3 float64
card4 object
card5 float64
card6 object
addr1 float64
addr2 float64
dist1 float64
dist2 float64
P_emaildomain object
R_emaildomain object
C1 float64
C2 float64
C3 float64
C4 float64
C5 float64
C6 float64
C7 float64
C8 float64
C9 float64
C10 float64
C11 float64
C12 float64
C13 float64
C14 float64
D1 float64
D2 float64
D3 float64
D4 float64
D5 float64
D6 float64
D7 float64
D8 float64
D9 float64
D10 float64
D11 float64
D12 float64
D13 float64
D14 float64
D15 float64
M1 object
M2 object
M3 object
M4 object
M5 object
M6 object
M7 object
M8 object
M9 object
V1 float64
V2 float64
V3 float64
V4 float64
V5 float64
V6 float64
V7 float64
V8 float64
V9 float64
V10 float64
V11 float64
V12 float64
V13 float64
V14 float64
V15 float64
V16 float64
V17 float64
V18 float64
V19 float64
V20 float64
V21 float64
V22 float64
V23 float64
V24 float64
V25 float64
V26 float64
V27 float64
V28 flo

In [16]:
# df_test.to_pickle(f'{folder_path}/df_test.gzde', compression='gzip')
# del df_test

In [9]:
df_train = pd.read_pickle(f'{folder_path}/df_train.gzde', compression='gzip')
df_test = pd.read_pickle(f'{folder_path}/df_test.gzde', compression='gzip')

In [13]:
df_train.shape

(590540, 591)

In [10]:
import sys
sys.path.append("..") # Adds higher directory to python modules path.
from kaggleKID.preprocessing import check_dataframe

df_train2, df_test2, diff_type_cols, not_numbic_cols, na_existed_cols, all_na_cols, trn_tst_imbalance_col = check_dataframe(df_train, df_test,['TransactionID','TransactionDT'])

train Transaction_hour is float16 and test Transaction_hour is float64
train Transaction_day_of_week is float16 and test Transaction_day_of_week is float64
train id_02_to_std_card4 is float16 and test id_02_to_std_card4 is float64
train TransactionAmt_to_mean_card1 is float16 and test TransactionAmt_to_mean_card1 is float64
train D15_to_mean_card1 is float16 and test D15_to_mean_card1 is float64
train D15_to_mean_addr1 is float16 and test D15_to_mean_addr1 is float64
train id_02_to_mean_card4 is float16 and test id_02_to_mean_card4 is float64
train id_02_to_mean_card1 is float16 and test id_02_to_mean_card1 is float64
ProductCD object
card4 object
card6 object
P_emaildomain object
R_emaildomain object
M1 object
M2 object
M3 object
M4 object
M5 object
M6 object
M7 object
M8 object
M9 object
id_12 object
id_15 object
id_16 object
id_23 object
id_27 object
id_28 object
id_29 object
id_30 object
id_31 object
id_33 object
id_34 object
id_35 object
id_36 object
id_37 object
id_38 object
Devi

V115 314 0 float64 [0.0, 0.0] [6.0, 9.0]
V116 314 0 float64 [0.0, 0.0] [6.0, 9.0]
V117 314 0 float64 [0.0, 0.0] [3.0, 2.0]
V118 314 0 float64 [0.0, 0.0] [3.0, 3.0]
V119 314 0 float64 [0.0, 0.0] [3.0, 2.0]
V120 314 0 float64 [0.0, 0.0] [3.0, 4.0]
V121 314 0 float64 [0.0, 0.0] [3.0, 4.0]
V122 314 0 float64 [0.0, 0.0] [3.0, 4.0]
V123 314 0 float64 [0.0, 0.0] [13.0, 12.0]
V124 314 0 float64 [0.0, 0.0] [13.0, 13.0]
V125 314 0 float64 [0.0, 0.0] [13.0, 12.0]
V126 314 0 float64 [0.0, 0.0] [160000.0, 519038.5]
V127 314 0 float64 [0.0, 0.0] [160000.0, 544500.0]
V128 314 0 float64 [0.0, 0.0] [160000.0, 519038.5]
V129 314 0 float64 [0.0, 0.0] [55125.0, 64800.0]
V130 314 0 float64 [0.0, 0.0] [55125.0, 167200.0]
V131 314 0 float64 [0.0, 0.0] [55125.0, 167200.0]
V132 314 0 float64 [0.0, 0.0] [93736.0, 519038.5]
V133 314 0 float64 [0.0, 0.0] [133915.0, 519038.5]
V134 314 0 float64 [0.0, 0.0] [98476.0, 519038.5]
V135 314 0 float64 [0.0, 0.0] [90750.0, 302500.0]
V136 314 0 float64 [0.0, 0.0] [90750.0, 

V280 12 3 float64 [0.0, 0.0] [975.0, 108.0]
V281 1269 6031 float64 [0.0, 0.0] [22.0, 30.0]
V282 1269 6031 float64 [0.0, 0.0] [32.0, 63.0]
V283 1269 6031 float64 [0.0, 0.0] [68.0, 68.0]
V284 12 3 float64 [0.0, 0.0] [12.0, 8.0]
V285 12 3 float64 [0.0, 0.0] [95.0, 90.0]
V286 12 3 float64 [0.0, 0.0] [8.0, 6.0]
V287 12 3 float64 [0.0, 0.0] [31.0, 31.0]
V288 1269 6031 float64 [0.0, 0.0] [10.0, 6.0]
V289 1269 6031 float64 [0.0, 0.0] [12.0, 11.0]
V290 12 3 float64 [1.0, 1.0] [67.0, 49.0]
V291 12 3 float64 [1.0, 1.0] [1055.0, 250.0]
V292 12 3 float64 [1.0, 1.0] [323.0, 75.0]
V293 12 3 float64 [0.0, 0.0] [869.0, 62.0]
V294 12 3 float64 [0.0, 0.0] [1286.0, 246.0]
V295 12 3 float64 [0.0, 0.0] [928.0, 78.0]
V296 1269 6031 float64 [0.0, 0.0] [93.0, 179.0]
V297 12 3 float64 [0.0, 0.0] [12.0, 85.0]
V298 12 3 float64 [0.0, 0.0] [93.0, 125.0]
V299 12 3 float64 [0.0, 0.0] [49.0, 106.0]
V300 1269 6031 float64 [0.0, 0.0] [11.0, 14.0]
V301 1269 6031 float64 [0.0, 0.0] [13.0, 14.0]
V302 12 3 float64 [0.0, 0.

In [11]:
print(df_train2.shape, df_test2.shape)
df_train2.columns.tolist()

(590540, 560) (506691, 559)


['TransactionID',
 'isFraud',
 'TransactionDT',
 'TransactionAmt',
 'ProductCD',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'dist1',
 'dist2',
 'P_emaildomain',
 'R_emaildomain',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D2',
 'D3',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'D11',
 'D12',
 'D13',
 'D14',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V19',
 'V20',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V41',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V61',
 'V62',
 'V65',
 'V66',
 'V67',
 'V68',
 'V69',
 'V70',
 'V73',
 'V74',
 'V75',
 'V76',
 'V77',
 'V78',
 'V79',
 'V82',
 'V83',
 '

In [16]:
for col in df_train2.columns:
    if not ('int' in str(df_train2[col].dtype)) and (not 'float' in str(df_train2[col].dtype)):
        print(col, df_train2[col].dtype)

In [None]:
df_train2.to_pickle(f'{folder_path}/df_train2.gzde', compression='gzip')
df_test2.to_pickle(f'{folder_path}/df_test2.gzde', compression='gzip')