In [1]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

### 1. Importing Libraries and Reading the Dataset

In [2]:
import os
for dirname, _, filenames in os.walk('../input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../input/ieee-fraud-detection.zip
../input/test_transaction.csv
../input/test_identity.csv
../input/sample_submission.csv
../input/train_identity.csv
../input/train_transaction.csv


In [3]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import gc, datetime, random
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score

In [4]:
pd.options.display.max_rows = 4000

In [5]:
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

SEED = 24
seed_everything(SEED)
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')

In [6]:
%%time
train_transaction = pd.read_csv('../input/train_transaction.csv', index_col='TransactionID')
test_transaction  = pd.read_csv('../input/test_transaction.csv',  index_col='TransactionID')

train_identity = pd.read_csv('../input/train_identity.csv', index_col='TransactionID')
test_identity  = pd.read_csv('../input/test_identity.csv',  index_col='TransactionID')

sample_submission = pd.read_csv('../input/sample_submission.csv', index_col='TransactionID')

train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
test  = test_transaction.merge(test_identity,   how='left', left_index=True, right_index=True)

print("Train shape: ", train.shape)
print("Test shape: ", test.shape)

y = train['isFraud'].copy()
del train_transaction, train_identity, test_transaction, test_identity

# Drop target, fill in NaNs
train = train.drop('isFraud', axis=1)

Train shape:  (590540, 433)
Test shape:  (506691, 432)
CPU times: user 29.7 s, sys: 2.05 s, total: 31.8 s
Wall time: 33.3 s


In [7]:
train = reduce_mem_usage(train)
test  = reduce_mem_usage(test)

Memory usage of dataframe is 1970.87 MB
Memory usage after optimization is: 547.26 MB
Decreased by 72.2%
Memory usage of dataframe is 1693.87 MB
Memory usage after optimization is: 480.15 MB
Decreased by 71.7%


### 2. Feature Engineering

#### 2.1. Add New Features

In [8]:
def addNewFeatures(data): 
    data['uid'] = data['card1'].astype(str) + '_' + data['card2'].astype(str)

    data['uid2'] = data['uid'].astype(str) + '_' + data['card3'].astype(str) + '_' + data['card5'].astype(str)

    data['uid3'] = data['uid2'].astype(str) + '_' + data['addr1'].astype(str) + '_' + data['addr2'].astype(str)

    data['D9'] = np.where(data['D9'].isna(), 0, 1)
    
    return data

In [9]:
train = addNewFeatures(train)
test  = addNewFeatures(test)

In [10]:
i_cols = ['card1','card2','card3','card5','uid','uid2','uid3']

for col in i_cols:
    for agg_type in ['mean','std']:
        new_col_name = col + '_TransactionAmt_' + agg_type
        temp_df = pd.concat([train[[col, 'TransactionAmt']], test[[col,'TransactionAmt']]])
        #temp_df['TransactionAmt'] = temp_df['TransactionAmt'].astype(int)
        temp_df = temp_df.groupby([col])['TransactionAmt'].agg([agg_type]).reset_index().rename(
                                                columns={agg_type: new_col_name})

        temp_df.index = list(temp_df[col])
        temp_df = temp_df[new_col_name].to_dict()   

        train[new_col_name] = train[col].map(temp_df)
        test[new_col_name]  = test[col].map(temp_df)

train = train.replace(np.inf, 999)
test  = test.replace(np.inf, 999)

In [11]:
train['TransactionAmt'] = np.log1p(train['TransactionAmt'])
test['TransactionAmt']  = np.log1p(test['TransactionAmt'])

#### 2.2. Handle Email Domains

In [12]:
emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 'scranton.edu': 'other', 'optonline.net': 'other',
          'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo',
          'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 'aim.com': 'aol', 'hotmail.de': 'microsoft',
          'centurylink.net': 'centurylink', 'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 
          'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 'protonmail.com': 'other',
          'hotmail.fr': 'microsoft', 'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo',
          'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other',
          'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft', 'verizon.net': 'yahoo',
          'msn.com': 'microsoft', 'q.com': 'centurylink', 'prodigy.net.mx': 'att', 'frontier.com': 'yahoo',
          'anonymous.com': 'other', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo',
          'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 'bellsouth.net': 'other',
          'embarqmail.com': 'centurylink', 'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple',
          'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other',
          'cox.net': 'other', 'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}
us_emails = ['gmail', 'net', 'edu']

for c in ['P_emaildomain', 'R_emaildomain']:
    train[c + '_bin'] = train[c].map(emails)
    test[c + '_bin']  = test[c].map(emails)
    
    train[c + '_suffix'] = train[c].map(lambda x: str(x).split('.')[-1])
    test[c + '_suffix']  = test[c].map(lambda x: str(x).split('.')[-1])
    
    train[c + '_suffix'] = train[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')
    test[c + '_suffix']  = test[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')

#### 2.3. Handle P Email Domain and R Email Domain

In [13]:
p = 'P_emaildomain'
r = 'R_emaildomain'
uknown = 'email_not_provided'

def setDomain(df):
    df[p] = df[p].fillna(uknown)
    df[r] = df[r].fillna(uknown)
    
    # Check if P_emaildomain matches R_emaildomain
    df['email_check'] = np.where((df[p] == df[r]) & (df[p] != uknown), 1, 0)

    df[p + '_prefix'] = df[p].apply(lambda x: x.split('.')[0])
    df[r + '_prefix'] = df[r].apply(lambda x: x.split('.')[0])
    
    return df
    
train = setDomain(train)
test  = setDomain(test)

#### 2.4. Set Time

In [14]:
def setTime(df):
    df['TransactionDT'] = df['TransactionDT'].fillna(df['TransactionDT'].median())
    # Temporary
    df['DT'] = df['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))
    df['DT_M'] = (df['DT'].dt.year-2017)*12 + df['DT'].dt.month
    df['DT_W'] = (df['DT'].dt.year-2017)*52 + df['DT'].dt.weekofyear
    df['DT_D'] = (df['DT'].dt.year-2017)*365 + df['DT'].dt.dayofyear
    
    df['DT_hour'] = df['DT'].dt.hour
    df['DT_day_week'] = df['DT'].dt.dayofweek
    df['DT_day'] = df['DT'].dt.day
    
    return df
    
train = setTime(train)
test  = setTime(test)

#### 2.5. Handle Browser Version

In [15]:
train["lastest_browser"] = np.zeros(train.shape[0])
test["lastest_browser"]  = np.zeros(test.shape[0])

def setBrowser(df):
    df.loc[df["id_31"]=="samsung browser 7.0",'lastest_browser']=1
    df.loc[df["id_31"]=="opera 53.0",'lastest_browser']=1
    df.loc[df["id_31"]=="mobile safari 10.0",'lastest_browser']=1
    df.loc[df["id_31"]=="google search application 49.0",'lastest_browser']=1
    df.loc[df["id_31"]=="firefox 60.0",'lastest_browser']=1
    df.loc[df["id_31"]=="edge 17.0",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 69.0",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 67.0 for android",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 63.0 for android",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 63.0 for ios",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 64.0",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 64.0 for android",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 64.0 for ios",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 65.0",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 65.0 for android",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 65.0 for ios",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 66.0",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 66.0 for android",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 66.0 for ios",'lastest_browser']=1
    return df

train=setBrowser(train)
test=setBrowser(test)

#### 2.6. Handle Device Type

In [16]:
def setDevice(df):
    df['DeviceInfo'] = df['DeviceInfo'].fillna('unknown_device').str.lower()
    
    df['device_name'] = df['DeviceInfo'].str.split('/', expand=True)[0]

    df.loc[df['device_name'].str.contains('SM', na=False), 'device_name'] = 'Samsung'
    df.loc[df['device_name'].str.contains('SAMSUNG', na=False), 'device_name'] = 'Samsung'
    df.loc[df['device_name'].str.contains('GT-', na=False), 'device_name'] = 'Samsung'
    df.loc[df['device_name'].str.contains('Moto G', na=False), 'device_name'] = 'Motorola'
    df.loc[df['device_name'].str.contains('Moto', na=False), 'device_name'] = 'Motorola'
    df.loc[df['device_name'].str.contains('moto', na=False), 'device_name'] = 'Motorola'
    df.loc[df['device_name'].str.contains('LG-', na=False), 'device_name'] = 'LG'
    df.loc[df['device_name'].str.contains('rv:', na=False), 'device_name'] = 'RV'
    df.loc[df['device_name'].str.contains('HUAWEI', na=False), 'device_name'] = 'Huawei'
    df.loc[df['device_name'].str.contains('ALE-', na=False), 'device_name'] = 'Huawei'
    df.loc[df['device_name'].str.contains('-L', na=False), 'device_name'] = 'Huawei'
    df.loc[df['device_name'].str.contains('Blade', na=False), 'device_name'] = 'ZTE'
    df.loc[df['device_name'].str.contains('BLADE', na=False), 'device_name'] = 'ZTE'
    df.loc[df['device_name'].str.contains('Linux', na=False), 'device_name'] = 'Linux'
    df.loc[df['device_name'].str.contains('XT', na=False), 'device_name'] = 'Sony'
    df.loc[df['device_name'].str.contains('HTC', na=False), 'device_name'] = 'HTC'
    df.loc[df['device_name'].str.contains('ASUS', na=False), 'device_name'] = 'Asus'

    #df.loc[df.device_name.isin(df.device_name.value_counts()[df.device_name.value_counts() < 200].index), 'device_name'] = "Others"
    df['had_id'] = 1
    gc.collect()
    
    return df

train=setDevice(train)
test=setDevice(test)

#### 2.7. Set Frequency

In [17]:
i_cols = ['card1','card2','card3','card5',
          'C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14',
          'D1','D2','D3','D4','D5','D6','D7','D8',
          'addr1','addr2',
          'dist1','dist2',
          'P_emaildomain', 'R_emaildomain',
          'DeviceInfo','device_name',
          'id_30','id_33',
          'uid','uid2','uid3',
         ]

for col in i_cols:
    temp_df = pd.concat([train[[col]], test[[col]]])
    fq_encode = temp_df[col].value_counts(dropna=False).to_dict()   
    train[col+'_fq_enc'] = train[col].map(fq_encode)
    test[col+'_fq_enc']  = test[col].map(fq_encode)


for col in ['DT_M','DT_W','DT_D']:
    temp_df = pd.concat([train[[col]], test[[col]]])
    fq_encode = temp_df[col].value_counts().to_dict()
            
    train[col+'_total'] = train[col].map(fq_encode)
    test[col+'_total']  = test[col].map(fq_encode)
        

periods = ['DT_M','DT_W','DT_D']
i_cols = ['uid']
for period in periods:
    for col in i_cols:
        new_column = col + '_' + period
            
        temp_df = pd.concat([train[[col,period]], test[[col,period]]])
        temp_df[new_column] = temp_df[col].astype(str) + '_' + (temp_df[period]).astype(str)
        fq_encode = temp_df[new_column].value_counts().to_dict()
            
        train[new_column] = (train[col].astype(str) + '_' + train[period].astype(str)).map(fq_encode)
        test[new_column]  = (test[col].astype(str) + '_' + test[period].astype(str)).map(fq_encode)
        
        train[new_column] /= train[period+'_total']
        test[new_column]  /= test[period+'_total']

### 3. Data Preprocessing

In [18]:
def get_too_many_null_attr(data):
    many_null_cols = [col for col in data.columns if data[col].isnull().sum() / data.shape[0] > 0.9]
    return many_null_cols

def get_too_many_repeated_val(data):
    big_top_value_cols = [col for col in train.columns if train[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
    return big_top_value_cols

def get_useless_columns(data):
    too_many_null = get_too_many_null_attr(data)
    print("More than 90% null: " + str(len(too_many_null)))
    too_many_repeated = get_too_many_repeated_val(data)
    print("More than 90% repeated value: " + str(len(too_many_repeated)))
    cols_to_drop = list(set(too_many_null + too_many_repeated))
    #cols_to_drop.remove('isFraud')
    return cols_to_drop

In [19]:
cols_to_drop = get_useless_columns(train)

More than 90% null: 12
More than 90% repeated value: 71


In [20]:
train = train.drop(cols_to_drop, axis=1)
test  = test.drop(cols_to_drop, axis=1)

In [21]:
print(train.shape)
print(test.shape)
print(y.shape)

(590540, 440)
(506691, 440)
(590540,)


In [22]:
numerical_cols   = train.select_dtypes(exclude = 'object').columns
categorical_cols = train.select_dtypes(include = 'object').columns

In [23]:
categorical_cols[:5]

Index(['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain'], dtype='object')

In [24]:
# Label Encoding
for f in train.columns:
    if train[f].dtype.name == 'object' or test[f].dtype.name == 'object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f]  = lbl.transform(list(test[f].values))

In [25]:
train = train.fillna(-999)
test  = test.fillna(-999)

In [26]:
print(train.isnull().sum().max())
print(test.isnull().sum().max())

0
0


In [27]:
train.head()

Unnamed: 0_level_0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_33_fq_enc,uid_fq_enc,uid2_fq_enc,uid3_fq_enc,DT_M_total,DT_W_total,DT_D_total,uid_DT_M,uid_DT_W,uid_DT_D
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,86400,4.242188,4,13926,-999.0,150.0,1,142.0,1,315.0,...,953271,6,6,2,137321,12093,5122,1.5e-05,8.3e-05,0.000195
2987001,86401,3.400391,4,2755,404.0,150.0,2,102.0,1,325.0,...,953271,1328,1328,105,137321,12093,5122,0.001216,0.001075,0.001367
2987002,86469,4.09375,4,4663,490.0,150.0,4,166.0,2,330.0,...,953271,1787,1787,46,137321,12093,5122,0.001347,0.00215,0.001952
2987003,86499,3.931641,4,18132,567.0,150.0,2,117.0,2,476.0,...,953271,7602,7602,362,137321,12093,5122,0.006015,0.009179,0.009567
2987004,86506,3.931641,1,4497,514.0,150.0,2,102.0,1,420.0,...,1430,30,30,1,137321,12093,5122,1.5e-05,8.3e-05,0.000195


#### 3.1. Boruta Feature Selection

In [28]:
import gc
gc.collect()

21

In [29]:
from boruta import BorutaPy

In [30]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=400, n_jobs=-1, class_weight='balanced', max_depth=7)
boruta_selector = BorutaPy(rfc, n_estimators='auto', max_iter=50, verbose=2)

In [None]:
# clf = lgb.LGBMClassifier(boosting_type="rf",
#                          num_leaves=512,
#                          max_depth=7,
#                          n_estimators=1000,
#                          subsample=.623,
#                          colsample_bytree=.5
#                         )

In [None]:
# clf = lgb.LGBMClassifier(num_leaves=546,
#                          min_child_weight=0.03454472573214212,
#                          feature_fraction=0.1797454081646243,
#                          bagging_fraction=0.2181193142567742,
#                          min_data_in_leaf=106,
#                          objective='binary',
#                          max_depth=-1,
#                          learning_rate=0.005883242363721497,
#                          boosting_type='gbdt',
#                          bagging_seed=11,
#                          metric='auc',
#                          verbosity=-1,
#                          reg_alpha=0.3299927210061127,
#                          reg_lambda=0.3885237330340494,
# #                          random_state=24,
#                          num_boost_round=100
#                     )

In [None]:
# define Boruta feature selection method
# boruta_selector = BorutaPy(clf, n_estimators='auto', verbose=2, random_state=None)

In [31]:
from datetime import datetime

def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [32]:
start_time = timer(None)
boruta_selector.fit(train.drop(['TransactionDT', 'DT'], axis=1).values, y.values)
timer(start_time)

Iteration: 	1 / 50
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	2 / 50
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	3 / 50
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	4 / 50
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	5 / 50
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	6 / 50
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	7 / 50
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	8 / 50
Confirmed: 	375
Tentative: 	63
Rejected: 	0
Iteration: 	9 / 50
Confirmed: 	375
Tentative: 	63
Rejected: 	0
Iteration: 	10 / 50
Confirmed: 	375
Tentative: 	63
Rejected: 	0
Iteration: 	11 / 50
Confirmed: 	375
Tentative: 	55
Rejected: 	8
Iteration: 	12 / 50
Confirmed: 	384
Tentative: 	46
Rejected: 	8
Iteration: 	13 / 50
Confirmed: 	384
Tentative: 	46
Rejected: 	8
Iteration: 	14 / 50
Confirmed: 	384
Tentative: 	46
Rejected: 	8
Iteration: 	15 / 50
Confirmed: 	384
Tentative: 	46
Rejected: 	8
Iteration: 	16 / 50
Confirmed: 	388
Tentative: 	39
Rejec

In [33]:
# number of selected features
print ('\n Number of selected features:')
print (boruta_selector.n_features_)


 Number of selected features:
398


In [34]:
feature_df = pd.DataFrame(train.drop(['TransactionDT', 'DT'], axis=1).columns.tolist(), columns=['features'])
feature_df['rank']=boruta_selector.ranking_
feature_df = feature_df.sort_values('rank', ascending=True).reset_index(drop=True)
print ('\n Top %d features:' % boruta_selector.n_features_)
print (feature_df.head(boruta_selector.n_features_))
feature_df.to_csv('boruta-feature-ranking.csv', index=False)


 Top 398 features:
                      features  rank
0               TransactionAmt     1
1                         V283     1
2                         V282     1
3                         V280     1
4                         V279     1
5                         V277     1
6                         V275     1
7                         V274     1
8                         V273     1
9                         V272     1
10                        V271     1
11                        V285     1
12                        V270     1
13                        V267     1
14                        V266     1
15                        V265     1
16                        V264     1
17                        V263     1
18                        V262     1
19                        V261     1
20                        V259     1
21                        V258     1
22                        V257     1
23                        V268     1
24                        V256     1
25                

In [35]:
# check selected features
print ('\n Selected features:')
print (boruta_selector.support_)


 Selected features:
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True False  True  True
  True  True  True  True False False  True  True  True  True False False
 False False False False False False False False False False  True  True
  True  True False  True  True  True  True  True  True  True  True  True
  True False  True False False  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True False False  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True

In [36]:
# check weak features
print ('\n Support for weak features:')
print (boruta_selector.support_weak_)


 Support for weak features:
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False  True False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False  True False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False  True False False False False False False False
 False False False False False False False False False False False False
 False False False Fal

In [37]:
train.head()

Unnamed: 0_level_0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_33_fq_enc,uid_fq_enc,uid2_fq_enc,uid3_fq_enc,DT_M_total,DT_W_total,DT_D_total,uid_DT_M,uid_DT_W,uid_DT_D
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,86400,4.242188,4,13926,-999.0,150.0,1,142.0,1,315.0,...,953271,6,6,2,137321,12093,5122,1.5e-05,8.3e-05,0.000195
2987001,86401,3.400391,4,2755,404.0,150.0,2,102.0,1,325.0,...,953271,1328,1328,105,137321,12093,5122,0.001216,0.001075,0.001367
2987002,86469,4.09375,4,4663,490.0,150.0,4,166.0,2,330.0,...,953271,1787,1787,46,137321,12093,5122,0.001347,0.00215,0.001952
2987003,86499,3.931641,4,18132,567.0,150.0,2,117.0,2,476.0,...,953271,7602,7602,362,137321,12093,5122,0.006015,0.009179,0.009567
2987004,86506,3.931641,1,4497,514.0,150.0,2,102.0,1,420.0,...,1430,30,30,1,137321,12093,5122,1.5e-05,8.3e-05,0.000195


In [39]:
selected = train.drop(['TransactionDT', 'DT'], axis=1).columns[boruta_selector.support_]
print(list(selected))

['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M3', 'M4', 'M5', 'M6', 'V10', 'V11', 'V12', 'V13', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V26', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V90', 'V91', 'V92', 'V93', 'V94', 'V95', 'V96', 'V97', 'V99', 'V100', 'V126', 'V127', 'V128', 'V130', 'V131', 'V138', 'V139', 'V140', 'V141', '

In [41]:
trn = train[selected]
trn['isFraud'] = y
# trn = trn.set_index('TransactionID')
trn.to_csv('train_boruta_50iter_filtered.csv', index_label='TransactionID')

tst = test[selected]
# tst = tst.set_index('TransactionID')
tst.to_csv('test_boruta_50iter_filtered.csv', index_label='TransactionID')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


### 4. Model

In [43]:
# X      = train.drop(['TransactionDT', 'DT'], axis=1)
# X_test = test.drop(['TransactionDT', 'DT'], axis=1)

X      = train[selected]
X_test = test[selected]

In [44]:
print("X:", X.shape)
print("y_train:", y.shape)
print("X_test:", X_test.shape)

X: (590540, 398)
y_train: (590540,)
X_test: (506691, 398)


#### 4.1. LightGBM

In [1]:
params = {'num_leaves': 546,
          'min_child_weight': 0.03454472573214212,
          'feature_fraction': 0.1797454081646243,
          'bagging_fraction': 0.2181193142567742,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.005883242363721497,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.3299927210061127,
          'reg_lambda': 0.3885237330340494,
          'random_state': 42,
}

In [45]:
%%time

NFOLDS = 5
folds = KFold(n_splits=NFOLDS)

columns = X.columns
splits = folds.split(X, y)
y_preds = np.zeros(X_test.shape[0])
y_oof = np.zeros(X.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns
  
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    clf = lgb.train(params, dtrain, 10000, valid_sets = [dtrain, dvalid], verbose_eval=200, early_stopping_rounds=500)
    
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(X_valid)
    y_oof[valid_index] = y_pred_valid
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
    
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    y_preds += clf.predict(X_test) / NFOLDS
    
    del X_train, X_valid, y_train, y_valid
    gc.collect()
    
print(f"\nMean AUC = {score}")
print(f"Out of folds AUC = {roc_auc_score(y, y_oof)}")

Training until validation scores don't improve for 500 rounds.
[200]	training's auc: 0.957024	valid_1's auc: 0.888247
[400]	training's auc: 0.974644	valid_1's auc: 0.900444
[600]	training's auc: 0.986488	valid_1's auc: 0.909014
[800]	training's auc: 0.992718	valid_1's auc: 0.914903
[1000]	training's auc: 0.996172	valid_1's auc: 0.919175
[1200]	training's auc: 0.997981	valid_1's auc: 0.92228
[1400]	training's auc: 0.998953	valid_1's auc: 0.923951
[1600]	training's auc: 0.999459	valid_1's auc: 0.925257
[1800]	training's auc: 0.999724	valid_1's auc: 0.926066
[2000]	training's auc: 0.999865	valid_1's auc: 0.926552
[2200]	training's auc: 0.999933	valid_1's auc: 0.926765
[2400]	training's auc: 0.999969	valid_1's auc: 0.926857
[2600]	training's auc: 0.999986	valid_1's auc: 0.927103
[2800]	training's auc: 0.999994	valid_1's auc: 0.927237
[3000]	training's auc: 0.999997	valid_1's auc: 0.927258
[3200]	training's auc: 0.999999	valid_1's auc: 0.927194
[3400]	training's auc: 1	valid_1's auc: 0.9272

### 5. Submission

In [46]:
submission = pd.read_csv('../input/sample_submission.csv')

In [47]:
submission['isFraud'] = y_preds

In [48]:
submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000313
1,3663550,0.000599
2,3663551,0.001026
3,3663552,0.000756
4,3663553,0.000479


In [49]:
# submission.to_csv('../submissions/feature-engineering-feature-selection-lightgbm-baseline.csv', index=False)
submission.to_csv('../submissions/feature-engineering-feature-selection-lightgbm-baseline-boruta-50iter.csv', index=False)

**Result:** 
440 features  `0.9477`
398 features  `0.9475`