In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')
# Standard plotly imports
#import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode
#import cufflinks
#import cufflinks as cf
import plotly.figure_factory as ff
import datetime
# Using plotly + cufflinks in offline mode
init_notebook_mode(connected=True)
#cufflinks.go_offline(connected=True)

# Preprocessing, modelling and evaluating
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
# from xgboost import XGBClassifier
# import xgboost as xgb

## Hyperopt modules
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from functools import partial

import os
import gc
from tqdm import tqdm
print(os.listdir("../input/ieee-fraud-detection"))

['train_identity.csv', 'train_transaction.csv', 'test_transaction.csv', 'test_identity.csv', 'sample_submission.csv']


In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in tqdm(df.columns):
        col_type = df[col].dtype
        #print(col_type)
        if str(col_type)[:4]== 'date' : continue
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
train = pd.read_pickle('../input/datas4/train2.pkl')
test  = pd.read_pickle('../input/datas4/test2.pkl')
sample_submission = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv', index_col='TransactionID')

In [4]:
print('Memory usage of train is {:.2f} MB'.format(train.memory_usage().sum() / 1024**2))
print('Memory usage of test is {:.2f} MB'.format(test.memory_usage().sum() / 1024**2))

Memory usage of train is 1385.45 MB
Memory usage of test is 1195.99 MB


In [5]:
train['nulls'] = train.isnull().sum(axis=1)
test['nulls'] = test.isnull().sum(axis=1)

In [6]:
train

Unnamed: 0_level_0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,uid2_TransactionAmt_mean,uid2_TransactionAmt_std,uid3_TransactionAmt_mean,uid3_TransactionAmt_std,uid4_TransactionAmt_mean,uid4_TransactionAmt_std,uid5_TransactionAmt_mean,uid5_TransactionAmt_std,DayOfYear,nulls
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,0,2017-12-02 00:00:00,4.226562,W,13926,,150.0,discover,142.0,credit,...,5.164062,1.046875,4.992188,1.085938,4.226562,,4.992188,1.085938,336,325
2987001,0,2017-12-02 00:00:01,3.367188,W,2755,404.0,150.0,mastercard,102.0,credit,...,4.796875,0.937500,4.816406,1.009766,5.195312,1.457031,4.804688,1.029297,336,250
2987002,0,2017-12-02 00:01:09,4.078125,W,4663,490.0,150.0,visa,166.0,debit,...,4.281250,0.785156,3.988281,0.569824,4.277344,0.233154,3.980469,0.574219,336,260
2987003,0,2017-12-02 00:01:39,3.912109,W,18132,567.0,150.0,mastercard,117.0,debit,...,4.347656,0.835449,4.386719,0.697266,4.148438,0.547852,4.394531,0.686035,336,253
2987004,0,2017-12-02 00:01:46,3.912109,H,4497,514.0,150.0,mastercard,102.0,credit,...,4.324219,0.823242,3.912109,,3.912109,,3.912109,,336,207
2987005,0,2017-12-02 00:01:50,3.892578,W,5937,555.0,150.0,visa,226.0,debit,...,4.785156,0.664062,4.441406,0.829102,3.892578,0.000000,4.441406,0.829102,336,280
2987006,0,2017-12-02 00:02:02,5.070312,W,12308,360.0,150.0,visa,166.0,debit,...,4.277344,0.779297,4.718750,0.500000,4.718750,0.500000,4.718750,0.500000,336,266
2987007,0,2017-12-02 00:02:09,6.046875,W,12695,490.0,150.0,visa,226.0,debit,...,4.488281,0.866699,4.496094,0.873047,4.152344,1.143555,4.507812,0.880859,336,246
2987008,0,2017-12-02 00:02:15,2.708984,H,2803,100.0,150.0,visa,226.0,debit,...,4.441406,0.861328,4.382812,0.821289,4.460938,0.721680,4.429688,0.816895,336,124
2987009,0,2017-12-02 00:02:16,4.761719,W,17399,111.0,150.0,mastercard,224.0,debit,...,4.351562,0.838867,4.339844,0.823730,4.335938,0.818848,4.351562,0.825684,336,216


In [7]:
list(train.columns)

train = train.drop(['uid','uid2','uid3','uid4','uid5'],axis =1)
test = test.drop(['uid','uid2','uid3','uid4','uid5'],axis =1)

train = train.drop(['card1','card2','card3','card4','card5','card6'],axis =1)
test = test.drop(['card1','card2','card3','card4','card5','card6'],axis =1)

train = train.drop(['ProductCD'],axis =1)
test = test.drop(['ProductCD'],axis =1)



In [8]:
Y_train = train['isFraud'].copy()

X_train = train.drop('isFraud', axis=1)
print(X_train.shape)
X_train.drop('TransactionDT', axis=1, inplace=True)
print(X_train.shape)
X_test = test.drop('TransactionDT', axis=1) 
print(X_test.shape)

train, test = None, None
del train, test
gc.collect()


(590540, 570)
(590540, 569)
(506691, 569)


63

In [9]:
numerical_columns = list(X_train.select_dtypes(include=['float16','float32','float64','int8','int16','int64']).columns)

#list(test.select_dtypes(exclude=['object']).columns)
print(X_train.shape)
X_train[numerical_columns] =X_train[numerical_columns].fillna(X_train[numerical_columns].median())
X_test[numerical_columns] =X_test[numerical_columns].fillna(X_test[numerical_columns].median())
print("filling numerical columns null values done")
# print(X_train.shape)

(590540, 569)
filling numerical columns null values done


In [10]:
categorical_columns = list(filter(lambda x: x not in numerical_columns, list(X_train.columns)))
categorical_columns[:5]
X_train[categorical_columns] =X_train[categorical_columns].fillna(X_train[categorical_columns].mode())
X_test[categorical_columns] =X_test[categorical_columns].fillna(X_test[categorical_columns].mode())
print("filling numerical columns null values done")

filling numerical columns null values done


In [11]:
for f in X_train.select_dtypes(include='category').columns.tolist() + X_train.select_dtypes(include='object').columns.tolist():
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(X_train[f].values) + list(X_test[f].values))
    X_train[f] = lbl.transform(list(X_train[f].values))
    X_test[f] = lbl.transform(list(X_test[f].values))

In [12]:
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)
gc.collect()


  2%|▏         | 10/569 [00:00<00:05, 95.19it/s]

Memory usage of dataframe is 1476.95 MB


100%|██████████| 569/569 [00:44<00:00, 12.79it/s]
  2%|▏         | 11/569 [00:00<00:05, 108.71it/s]

Memory usage after optimization is: 962.20 MB
Decreased by 34.9%
Memory usage of dataframe is 1277.82 MB


100%|██████████| 569/569 [00:37<00:00, 15.23it/s]


Memory usage after optimization is: 841.95 MB
Decreased by 34.1%


189

In [13]:
from sklearn.model_selection import TimeSeriesSplit,KFold
n_fold = 5
folds = KFold(n_splits=n_fold,shuffle=True)

In [14]:
xgb_submission=sample_submission.copy()
xgb_submission['isFraud'] = 0
import xgboost as xgb
from sklearn.metrics import roc_auc_score
for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)):
    xgbclf = xgb.XGBClassifier(
        n_estimators=10000,
        max_depth=-1,
        learning_rate=0.048,
        subsample=0.85,
        colsample_bytree=0.85,
        missing=-999,
        tree_method='gpu_hist',  # THE MAGICAL PARAMETER
        reg_alpha=0.15,
        reg_lamdba=0.85
    )
    
    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = Y_train.iloc[train_index], Y_train.iloc[valid_index]
    xgbclf.fit(X_train_,y_train_)
    del X_train_,y_train_
    gc.collect()
    pred=xgbclf.predict_proba(X_test)[:,1]
    val=xgbclf.predict_proba(X_valid)[:,1]
    del xgbclf, X_valid
    gc.collect()
    print('ROC accuracy: {}'.format(roc_auc_score(y_valid, val)))
    del val,y_valid
    gc.collect()
    xgb_submission['isFraud'] = xgb_submission['isFraud']+pred/n_fold
    del pred
    gc.collect()
    gc.collect()

XGBoostError: value -1 for Parameter max_depth should be greater equal to 0

In [15]:
xgb_submission.to_csv('sub_xgboost.csv')
xgb_submission.head()

Unnamed: 0_level_0,isFraud
TransactionID,Unnamed: 1_level_1
3663549,0
3663550,0
3663551,0
3663552,0
3663553,0
