In [1]:
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb

In [2]:
# Log the installed XGBoost release so the results below can be tied to a version.
print(f"XGBoost version: {xgb.__version__}")

XGBoost version: 0.90


In [4]:
# Directory holding the competition CSVs. The trailing slash matters: file names
# are appended to this string directly when the data is loaded.
folder_path = "../data/IEEE-CIS-Fraud-Detection/"

In [5]:
%%time
train_transaction = pd.read_csv(f'{folder_path}train_transaction.csv', index_col='TransactionID')
test_transaction = pd.read_csv(f'{folder_path}test_transaction.csv', index_col='TransactionID')

train_identity = pd.read_csv(f'{folder_path}train_identity.csv', index_col='TransactionID')
test_identity = pd.read_csv(f'{folder_path}test_identity.csv', index_col='TransactionID')

sample_submission = pd.read_csv(f'{folder_path}sample_submission.csv', index_col='TransactionID')

train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)

print(train.shape)
print(test.shape)

y_train = train['isFraud'].copy()
del train_transaction, train_identity, test_transaction, test_identity

# Drop target, fill in NaNs
X_train = train.drop('isFraud', axis=1)
X_test = test.copy()

del train, test

X_train = X_train.fillna(-999)
X_test = X_test.fillna(-999)

# Label Encoding
for f in X_train.columns:
    if X_train[f].dtype=='object' or X_test[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X_train[f].values) + list(X_test[f].values))
        X_train[f] = lbl.transform(list(X_train[f].values))
        X_test[f] = lbl.transform(list(X_test[f].values))   

(590540, 433)
(506691, 432)
CPU times: user 1min 25s, sys: 5.54 s, total: 1min 31s
Wall time: 1min 32s


In [6]:
# Gradient-boosted tree classifier trained on the GPU.
clf = xgb.XGBClassifier(
    # Histogram-based tree construction on the GPU; this is the setting this
    # notebook relies on for fast training.
    tree_method='gpu_hist',
    # Ensemble size and shape.
    n_estimators=500,
    max_depth=9,
    learning_rate=0.05,
    # Row / column subsampling per tree.
    subsample=0.9,
    colsample_bytree=0.9,
    # -999 is the sentinel the NaNs were filled with during preprocessing.
    missing=-999,
    # Fixed seed for repeatable runs.
    random_state=2019,
)

In [7]:
%time clf.fit(X_train, y_train)

CPU times: user 23.9 s, sys: 7.6 s, total: 31.5 s
Wall time: 32 s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=9,
              min_child_weight=1, missing=-999, n_estimators=500, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=2019,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.9, tree_method='gpu_hist', verbosity=1)

In [9]:
# Pair each training column with the importance score the fitted model assigns to it.
df_fe = pd.DataFrame({
    'feature': list(X_train.columns),
    'feature_importance': clf.feature_importances_,
})

In [12]:
# Show the 50 features with the highest importance, most important first.
(
    df_fe
    .sort_values(by='feature_importance', ascending=False)
    .iloc[:50]
)

Unnamed: 0,feature,feature_importance
310,V258,0.13578
253,V201,0.044701
122,V70,0.030315
143,V91,0.023595
346,V294,0.017217
241,V189,0.015663
208,V156,0.014473
298,V246,0.009837
28,C14,0.009135
21,C7,0.008525
