In [1]:
%matplotlib inline


import pandas as pd
import numpy as np
import multiprocessing
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
from time import time
import datetime
from reduce_mem_usage import reduce_mem_usage
from tqdm import tqdm, tqdm_notebook
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, TimeSeriesSplit, train_test_split
from sklearn.metrics import roc_auc_score
warnings.simplefilter('ignore')
sns.set()
from scipy.stats import ks_2samp

In [2]:
files = ['/input/test_identity.csv', 
         '/input/test_transaction.csv',
         '/input/train_identity.csv',
         '/input/train_transaction.csv',
         '/input/sample_submission.csv']

In [3]:
train_transaction = pd.read_csv('input/train_transaction.csv', index_col='TransactionID')
test_transaction = pd.read_csv('input/test_transaction.csv', index_col='TransactionID')

train_identity = pd.read_csv('input/train_identity.csv', index_col='TransactionID')
test_identity = pd.read_csv('input/test_identity.csv', index_col='TransactionID')

sample_submission = pd.read_csv('input/sample_submission.csv', index_col='TransactionID')

In [4]:
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

del test_identity, test_transaction, train_identity, train_transaction
gc.collect()

53

In [5]:
# train=reduce_mem_usage(train)
# test=reduce_mem_usage(test)

# double check no problems 
# train2 = reduce_mem_usage(train)
# numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
# for col in train.columns:
#     if train[col].dtype in numerics:
#         if (abs(train2[col] - train[col]) > 0.00001).any():
#             print(col)
            
# (train['TransactionAmt'] - train2['TransactionAmt'] == 0).all()

In [6]:
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')
for df in [train, test]:
    # Temporary
    df['DT'] = df['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))
    df['DT_M'] = (df['DT'].dt.year-2017)*12 + df['DT'].dt.month
    df['DT_W'] = (df['DT'].dt.year-2017)*52 + df['DT'].dt.weekofyear
    df['DT_D'] = (df['DT'].dt.year-2017)*365 + df['DT'].dt.dayofyear
    
    df['DT_hour'] = df['DT'].dt.hour
    df['DT_day_week'] = df['DT'].dt.dayofweek
    df['DT_day'] = df['DT'].dt.day

## Test different options for this
## Test different options for this
    # D9 column
    df['D9_isnull'] = np.where(df['D9'].isna(),0,1)

In [7]:
## Test different options for this
## Test different options for this
for feature in ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'id_36']:
    train[feature + '_count_full'] = train[feature].map(pd.concat([train[feature], test[feature]], ignore_index=True).value_counts(dropna=False))
    test[feature + '_count_full'] = test[feature].map(pd.concat([train[feature], test[feature]], ignore_index=True).value_counts(dropna=False))

# Encoding - count encoding separately for train and test
for feature in ['id_01', 'id_31', 'id_33', 'id_36']:
    train[feature + '_count_dist'] = train[feature].map(train[feature].value_counts(dropna=False))
    test[feature + '_count_dist'] = test[feature].map(test[feature].value_counts(dropna=False))

In [8]:
i_cols = ['M1','M2','M3','M5','M6','M7','M8','M9']

# Replace T with 1 and F with 0 before taking the sum
for col in i_cols:
    train[col] = train[col].apply(lambda x: 1 if x == 'T' else (0 if x == 'F' else np.nan))
    test[col] = test[col].apply(lambda x: 1 if x == 'T' else (0 if x == 'F' else np.nan))

for df in [train, test]:
    df['M_sum'] = df[i_cols].sum(axis=1).astype(np.int8)
    df['M_na'] = df[i_cols].isna().sum(axis=1).astype(np.int8)

In [9]:
TARGET='isFraud'

In [10]:
# ## Test different options for this
# ## Test different options for this
# # categorical_cols = train.select_dtypes('category').columns   => didn't work
# for col in ['ProductCD','M4']:
#     temp_dict = train.groupby([col])[TARGET].agg(['mean']).reset_index().rename(
#                                                         columns={'mean': col+'_target_mean'})
#     temp_dict.index = temp_dict[col].values
#     temp_dict = temp_dict[col+'_target_mean'].to_dict()

#     train[col+'_target_mean'] = train[col].map(temp_dict)
#     test[col+'_target_mean']  = test[col].map(temp_dict)

In [11]:
## Test different options for this and see what we can do with it
## Test different options for this
train['uid'] = train['card1'].astype(str)+'_'+train['card2'].astype(str)
test['uid'] = test['card1'].astype(str)+'_'+test['card2'].astype(str)

train['uid2'] = train['uid'].astype(str)+'_'+train['card3'].astype(str)+'_'+train['card4'].astype(str)
test['uid2'] = test['uid'].astype(str)+'_'+test['card3'].astype(str)+'_'+test['card4'].astype(str)

train['uid3'] = train['uid2'].astype(str)+'_'+train['addr1'].astype(str)+'_'+train['addr2'].astype(str)
test['uid3'] = test['uid2'].astype(str)+'_'+test['addr1'].astype(str)+'_'+test['addr2'].astype(str)


# Check if the Transaction Amount is present in the other set (train vs. test) or not 
# In our dialog with a model we are telling to trust or not to these values   
train['TransactionAmt_check']  = np.where(train['TransactionAmt'].isin(test['TransactionAmt']), 1, 0)
train['ProductCD_check'] = np.where(train['ProductCD'].isin(test['ProductCD']), 1, 0)
test['TransactionAmt_check']  = np.where(test['TransactionAmt'].isin(train['TransactionAmt']), 1, 0)
test['ProductCD_check']  = np.where(test['ProductCD'].isin(train['ProductCD']), 1, 0)

In [12]:
## Test different options for this and DO A FUNC ? 
## Test different options for this 

i_cols = ['card1','card2','card3','card5','uid','uid2','uid3']

for col in i_cols:
    for agg_type in ['mean','std']:
        new_col_name = col+'_TransactionAmt_'+agg_type
        temp_df = pd.concat([train[[col, 'TransactionAmt']], test[[col,'TransactionAmt']]])
        #temp_df['TransactionAmt'] = temp_df['TransactionAmt'].astype(int)
        temp_df = temp_df.groupby([col])['TransactionAmt'].agg([agg_type]).reset_index().rename(
                                                columns={agg_type: new_col_name})
        
        temp_df.index = list(temp_df[col])
        temp_df = temp_df[new_col_name].to_dict()   
    
        train[new_col_name] = train[col].map(temp_df)
        test[new_col_name]  = test[col].map(temp_df)
           

train['TransactionAmt_log'] = np.log1p(train['TransactionAmt'])
test['TransactionAmt_log'] = np.log1p(test['TransactionAmt'])

# Cell two (but it's more or less the same stuff so I put it here)
for col in ['card1','card2','card3','card5','addr1','addr2',
            'dist2','C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11',
            'C12','C13','C14','TransactionAmt','D1','D2','D3','D4','D5','D6',
            'D7','D8','D9','D10','D13','D14','D15','uid','uid2','uid3']:
    for agg_type in ['mean','std']:
        if (train[col].dtypes!='object'):
            new_col_name = col+'_ProductCD_'+agg_type
            temp_dict = train.groupby(['ProductCD'])[col].agg([agg_type]).reset_index().rename(columns={agg_type: new_col_name})
            temp_dict.index = temp_dict[new_col_name].values
            temp_dict = temp_dict[new_col_name].to_dict()
        
            train[new_col_name] = train[col].map(temp_dict)
            test[new_col_name]  = test[col].map(temp_dict)

In [13]:
p = 'P_emaildomain'
r = 'R_emaildomain'
uknown = 'email_not_provided'

for df in [train, test]:
    df[p] = df[p].astype(str)
    df[p] = df[p].fillna(uknown)
    df[r] = df[r].astype(str)
    df[r] = df[r].fillna(uknown)
    
    # Check if P_emaildomain matches R_emaildomain
    df['email_check'] = np.where((df[p]==df[r])&(df[p]!=uknown),1,0)

    df[p+'_prefix'] = df[p].apply(lambda x: x.split('.')[0])
    df[r+'_prefix'] = df[r].apply(lambda x: x.split('.')[0])

In [14]:
# REFACTO
# REFACTO

for df in [train, test]:
    for col in ['DeviceInfo', 'id_30', 'id_31']:
        df[col] = df[col].astype(str).fillna('unknown_device').str.lower()
        df[col+'_device'] = df[col].apply(lambda x: ''.join([i for i in x if i.isalpha()]))
        df[col+'_version'] = df[col].apply(lambda x: ''.join([i for i in x if i.isnumeric()]))

In [None]:
# Could be done with and without test... 
########################### Freq encoding


i_cols = ['card1','card2','card3','card5',
          'C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14',
          'D1','D2','D3','D4','D5','D6','D7','D8',
          'addr1','addr2',
          'dist1','dist2',
          'P_emaildomain', 'R_emaildomain',
          'DeviceInfo','DeviceInfo_device','DeviceInfo_version',
          'id_30','id_30_device','id_30_version',
          'id_31_device',
          'id_33',
          'uid','uid2','uid3',]

for col in i_cols:
    temp_df = pd.concat([train[[col]], test[[col]]])
    fq_encode = temp_df[col].value_counts(dropna=False).to_dict()   
    train[col+'_fq_enc'] = train[col].map(fq_encode)
    test[col+'_fq_enc']  = test[col].map(fq_encode)
    

# Same stuff with other variables and a different naming... why ?
for col in ['DT_M','DT_W','DT_D']:
    temp_df = pd.concat([train[[col]], test[[col]]])
    fq_encode = temp_df[col].value_counts().to_dict()
            
    train[col+'_total'] = train[col].map(fq_encode)
    test[col+'_total']  = test[col].map(fq_encode)

In [None]:
# What does this DO? => frequency encoding of a mix between uid and periods
## Test different options for this (different uids ?)
## Test different options for this (different uids ?)

periods = ['DT_M','DT_W','DT_D']
i_cols = ['uid']
for period in periods:
    for col in i_cols:
        new_column = col + '_' + period
            
        temp_df = pd.concat([train[[col,period]], test[[col,period]]])
        temp_df[new_column] = temp_df[col].astype(str) + '_' + (temp_df[period]).astype(str)
        fq_encode = temp_df[new_column].value_counts().to_dict()
            
        train[new_column] = (train[col].astype(str) + '_' + train[period].astype(str)).map(fq_encode)
        test[new_column]  = (test[col].astype(str) + '_' + test[period].astype(str)).map(fq_encode)
        
        train[new_column] /= train[period+'_total']
        test[new_column]  /= test[period+'_total']

In [None]:
def id_split(dataframe):
    dataframe['device_name'] = dataframe['DeviceInfo'].str.split('/', expand=True)[0]
    dataframe['device_version'] = dataframe['DeviceInfo'].str.split('/', expand=True)[1]

    dataframe['OS_id_30'] = dataframe['id_30'].str.split(' ', expand=True)[0]
    dataframe['version_id_30'] = dataframe['id_30'].str.split(' ', expand=True)[1]

    dataframe['browser_id_31'] = dataframe['id_31'].str.split(' ', expand=True)[0]
    dataframe['version_id_31'] = dataframe['id_31'].str.split(' ', expand=True)[1]

    dataframe['screen_width'] = dataframe['id_33'].str.split('x', expand=True)[0]
    dataframe['screen_height'] = dataframe['id_33'].str.split('x', expand=True)[1]

    dataframe.loc[dataframe['device_name'].str.contains('SM', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('SAMSUNG', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('GT-', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('Moto G', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('Moto', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('moto', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('LG-', na=False), 'device_name'] = 'LG'
    dataframe.loc[dataframe['device_name'].str.contains('rv:', na=False), 'device_name'] = 'RV'
    dataframe.loc[dataframe['device_name'].str.contains('HUAWEI', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('ALE-', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('-L', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('Blade', na=False), 'device_name'] = 'ZTE'
    dataframe.loc[dataframe['device_name'].str.contains('BLADE', na=False), 'device_name'] = 'ZTE'
    dataframe.loc[dataframe['device_name'].str.contains('Linux', na=False), 'device_name'] = 'Linux'
    dataframe.loc[dataframe['device_name'].str.contains('XT', na=False), 'device_name'] = 'Sony'
    dataframe.loc[dataframe['device_name'].str.contains('HTC', na=False), 'device_name'] = 'HTC'
    dataframe.loc[dataframe['device_name'].str.contains('ASUS', na=False), 'device_name'] = 'Asus'

    dataframe.loc[dataframe.device_name.isin(dataframe.device_name.value_counts()[dataframe.device_name.value_counts() < 200].index), 'device_name'] = "Others"
    dataframe['had_id'] = 1
    gc.collect()
    
    return dataframe
train=id_split(train)
test=id_split(test)

In [None]:
train['id_02_to_mean_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('mean')
train['id_02_to_mean_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('mean')
train['id_02_to_std_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('std')
train['id_02_to_std_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('std')

test['id_02_to_mean_card1'] = test['id_02'] / test.groupby(['card1'])['id_02'].transform('mean')
test['id_02_to_mean_card4'] = test['id_02'] / test.groupby(['card4'])['id_02'].transform('mean')
test['id_02_to_std_card1'] = test['id_02'] / test.groupby(['card1'])['id_02'].transform('std')
test['id_02_to_std_card4'] = test['id_02'] / test.groupby(['card4'])['id_02'].transform('std')

train['D15_to_mean_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('mean')
train['D15_to_mean_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('mean')
train['D15_to_std_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('std')
train['D15_to_std_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('std')

test['D15_to_mean_card1'] = test['D15'] / test.groupby(['card1'])['D15'].transform('mean')
test['D15_to_mean_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('mean')
test['D15_to_std_card1'] = test['D15'] / test.groupby(['card1'])['D15'].transform('std')
test['D15_to_std_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('std')

train['D15_to_mean_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('mean')
train['D15_to_mean_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('mean')
train['D15_to_std_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('std')
train['D15_to_std_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('std')

test['D15_to_mean_addr1'] = test['D15'] / test.groupby(['addr1'])['D15'].transform('mean')
test['D15_to_mean_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('mean')
test['D15_to_std_addr1'] = test['D15'] / test.groupby(['addr1'])['D15'].transform('std')
test['D15_to_std_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('std')

In [None]:
# New feature - decimal part of the transaction amount.
train['TransactionAmt_decimal'] = ((train['TransactionAmt'] - train['TransactionAmt'].astype(int)) * 1000).astype(int)
test['TransactionAmt_decimal'] = ((test['TransactionAmt'] - test['TransactionAmt'].astype(int)) * 1000).astype(int)

In [None]:
# combination of categorical variables
for feature in ['id_02__id_20', 'id_02__D8', 'D11__DeviceInfo', 'DeviceInfo__P_emaildomain', 'P_emaildomain__C2', 
                'card2__dist1', 'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1',
                'id_02__id_14','id_14__id_20','id_02__id_17','id_14__id_17','id_17__id_20','id_02__id_19','id_14__id_19','id_17__id_19',
                'id_19__id_20'
               ]:

    f1, f2 = feature.split('__')
    train[feature] = train[f1].astype(str) + '_' + train[f2].astype(str)
    test[feature] = test[f1].astype(str) + '_' + test[f2].astype(str)

    le = LabelEncoder()
    le.fit(list(train[feature].astype(str).values) + list(test[feature].astype(str).values))
    train[feature] = le.transform(list(train[feature].astype(str).values))
    test[feature] = le.transform(list(test[feature].astype(str).values))

In [None]:
# for col in 'C3,C5,C9'.split(','): 
#         train[col+'_third_quartile'] = train.groupby('uid3')[col].transform(lambda x: x > x.quantile(0.75))
#         test[col+'_third_quartile'] = test.groupby('uid3')[col].transform(lambda x: x > x.quantile(0.75))

In [None]:
## Test different options for this (with p-value and statistic changing)
## Test different options for this 

def get_diff_columns(train_df, test_df, show_plots=True, show_all=False, threshold=0.01):
    """"Use KS to estimate columns where distributions differ a lot from each other"""
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Find the columns where the distributions are very different
    diff_data = []
    for col in tqdm(test_df.columns):
        if (test_df[col].dtypes in numerics) and (col!='TransactionID'):
            statistic, pvalue = ks_2samp(
                train_df[col].values, 
                test_df[col].values
            )
            if pvalue == 0:
                diff_data.append({'feature': col, 'p': np.round(pvalue, 5), 'statistic': np.round(np.abs(statistic), 2)})

    # Put the differences into a dataframe
    diff_df = pd.DataFrame(diff_data).sort_values(by='statistic', ascending=False)
    return diff_df

diff_df=get_diff_columns(train,test)

100%|█████████▉| 639/640 [01:12<00:00,  9.20it/s]

In [None]:
diff_df[ diff_df['statistic'] > 0.15]['feature'].tolist()

In [24]:
train=train.drop(diff_df[ diff_df['statistic'] > 0.15]['feature'].tolist(),axis=1)
test=test.drop(diff_df[ diff_df['statistic'] > 0.15]['feature'].tolist(),axis=1)
del(diff_df)

In [25]:
# Label encoding
for col in list(train):
    if train[col].dtype=='O':
        train[col] = train[col].fillna('unseen_before_label')
        test[col]  = test[col].fillna('unseen_before_label')
        
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        
        le = LabelEncoder()
        le.fit(list(train[col])+list(test[col]))
        train[col] = le.transform(train[col])
        test[col]  = le.transform(test[col])
        
        train[col] = train[col].astype('category')
        test[col] = test[col].astype('category')

In [26]:
rm_cols = [
    'TransactionID','TransactionDT', # These columns are pure noise right now
    TARGET,                          # Not target in features))
    'uid','uid2','uid3',             # Our new client uID -> very noisy data
    'bank_type',                     # Victims bank could differ by time
    'DT','DT_M','DT_W','DT_D',       # Temporary Variables
    'DT_hour','DT_day_week','DT_day',
    'DT_D_total','DT_W_total','DT_M_total',
    'id_30','id_31','id_33',
]

In [27]:
features_columns = [col for col in list(train) if col not in rm_cols]

# The June month drops entirely
train['random_noise'] = np.random.randn(len(train))
print(train['DT'].max())
print(test['DT'].min())
# So we need to get rid of April and keep May as validation set
X_train = train[ train['DT'] <= '2018-03-31']
y_train = X_train[TARGET]
X_train = X_train[features_columns]
X_valid = train[ (train['DT'] >= '2018-05-01')]
y_valid = X_valid[TARGET]
X_valid = X_valid[features_columns]

2018-05-31 23:58:51
2018-07-01 00:00:24


In [28]:
lgb_params = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':-1,
                    'learning_rate':0.01,
                    'num_leaves': 496,
                    'max_depth':-1,
                    'min_data_in_leaf':50,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.7,
                    'subsample_freq':1,
                    'subsample':0.7,
                    'n_estimators':800,
                    'max_bin':255,
                    'verbose':-1,
                    'seed': 24,
                    'early_stopping_rounds':100, 
                } 

In [29]:
tr_data = lgb.Dataset(X_train, label=y_train)
vl_data = lgb.Dataset(X_valid, label=y_valid)  
        
estimator = lgb.train(
                lgb_params,
                tr_data,
                valid_sets = [tr_data, vl_data],
                verbose_eval = 100)   

#Early stopping, best iteration is:
#[518]	training's auc: 0.999265	valid_1's auc: 0.916453
#Early stopping, best iteration is:
#[661]	training's auc: 0.999878	valid_1's auc: 0.915671

# How much does is change when I keep a lot of V's ? 
#Early stopping, best iteration is:
#[540]	training's auc: 0.999628	valid_1's auc: 0.919858
#[800]	training's auc: 0.999987	valid_1's auc: 0.92052
# Early stopping, best iteration is:
# [689]	training's auc: 0.999944	valid_1's auc: 0.92547

#0.919613 vs.

Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.962749	valid_1's auc: 0.896209
[200]	training's auc: 0.983521	valid_1's auc: 0.908402
[300]	training's auc: 0.994043	valid_1's auc: 0.915231
[400]	training's auc: 0.998058	valid_1's auc: 0.918607
[500]	training's auc: 0.999395	valid_1's auc: 0.91944
[600]	training's auc: 0.999822	valid_1's auc: 0.919751
Early stopping, best iteration is:
[667]	training's auc: 0.999926	valid_1's auc: 0.919994


In [32]:
useful_vars = pd.read_csv('permut_imp_df.csv')
usecols = useful_vars[ useful_vars['permut_importances'] > 0.05]['cols']
usecols = [col for col in usecols if col in X_train.columns]
tr_data = lgb.Dataset(X_train[usecols], label=y_train)
vl_data = lgb.Dataset(X_valid[usecols], label=y_valid)  
        
estimator = lgb.train(
                lgb_params,
                tr_data,
                valid_sets = [tr_data, vl_data],
                verbose_eval = 100)   

#Early stopping, best iteration is:
#[518]	training's auc: 0.999265	valid_1's auc: 0.916453
#Early stopping, best iteration is:
#[661]	training's auc: 0.999878	valid_1's auc: 0.915671

# How much does is change when I keep a lot of V's ? 
#Early stopping, best iteration is:
#[540]	training's auc: 0.999628	valid_1's auc: 0.919858
#[800]	training's auc: 0.999987	valid_1's auc: 0.92052
# Early stopping, best iteration is:
# [689]	training's auc: 0.999944	valid_1's auc: 0.92547

Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.960883	valid_1's auc: 0.898043
[200]	training's auc: 0.978829	valid_1's auc: 0.906967
[300]	training's auc: 0.989237	valid_1's auc: 0.911635
[400]	training's auc: 0.994657	valid_1's auc: 0.913522
[500]	training's auc: 0.997254	valid_1's auc: 0.914146
[600]	training's auc: 0.998503	valid_1's auc: 0.914502
Early stopping, best iteration is:
[585]	training's auc: 0.998365	valid_1's auc: 0.914585


In [38]:
usecols = useful_vars[ useful_vars['permut_importances'] > 0.05]['cols'].tolist()
print(len(usecols))
for col in useful_vars[ useful_vars['permut_importances'] > 0.01]['cols'].tolist():
    if col in X_train.columns and train[col].nunique() <= 50: #not so many split this var can make to overfit
        usecols.append(col)
usecols = list(set(usecols))
usecols = [col for col in usecols if col in X_train.columns]
print(len(usecols))
tr_data = lgb.Dataset(X_train[usecols], label=y_train)
vl_data = lgb.Dataset(X_valid[usecols], label=y_valid)  
        
estimator = lgb.train(
                lgb_params,
                tr_data,
                valid_sets = [tr_data, vl_data],
                verbose_eval = 100)   

#Early stopping, best iteration is:
#[518]	training's auc: 0.999265	valid_1's auc: 0.916453
#Early stopping, best iteration is:
#[661]	training's auc: 0.999878	valid_1's auc: 0.915671

# How much does is change when I keep a lot of V's ? 
#Early stopping, best iteration is:
#[540]	training's auc: 0.999628	valid_1's auc: 0.919858
#[800]	training's auc: 0.999987	valid_1's auc: 0.92052
# Early stopping, best iteration is:
# [689]	training's auc: 0.999944	valid_1's auc: 0.92547

45
72
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.98051	valid_1's auc: 0.910601
[300]	training's auc: 0.990435	valid_1's auc: 0.916149
[400]	training's auc: 0.995444	valid_1's auc: 0.918278
[500]	training's auc: 0.997777	valid_1's auc: 0.91928
[600]	training's auc: 0.998919	valid_1's auc: 0.919325
[700]	training's auc: 0.999444	valid_1's auc: 0.919396
Early stopping, best iteration is:
[670]	training's auc: 0.999325	valid_1's auc: 0.919529


In [43]:
tr_data = lgb.Dataset(train[usecols], label=train[TARGET])
estimator = lgb.train(
                lgb_params,
                tr_data,
                valid_sets = [tr_data],
                verbose_eval = 100)   

Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.954769
[200]	training's auc: 0.974852
[300]	training's auc: 0.985592
[400]	training's auc: 0.991455
[500]	training's auc: 0.994956
[600]	training's auc: 0.996976
[700]	training's auc: 0.998135
[800]	training's auc: 0.998812
Did not meet early stopping. Best iteration is:
[800]	training's auc: 0.998812


In [44]:
preds = estimator.predict(test[usecols])

In [54]:
sub = pd.read_csv('submission.csv')
sub['isFraud'] = preds
sub.to_csv('submission.csv', index=False)

# Try Kfold on valid

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score


EPOCHS = 5
kf = KFold(n_splits = EPOCHS, shuffle = False)
y_preds = np.zeros(X_valid.shape[0])
y_oof = np.zeros(X_train.shape[0])

precisions = []
recalls = []
f1s = []
rocs = []
permut_importances = np.zeros(len(X_train.columns))

for tr_idx, val_idx in kf.split(X_train, y_train):
 
    X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
    y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    
    tr_data = lgb.Dataset(X_tr[usecols], label=y_tr)
    vl_data = lgb.Dataset(X_vl[usecols], label=y_vl)  
        
    clf = lgb.train(
                lgb_params,
                tr_data,
                valid_sets = [tr_data, vl_data],
                verbose_eval = 100)
    y_pred_train = clf.predict_proba(X_vl)[:,1]
    y_oof[val_idx] = y_pred_train
    print('train finished')
    precision = precision_score(y_vl, (y_pred_train > 0.5).astype(int))
    print('precision {}'.format(precision))
    recall = recall_score(y_vl, (y_pred_train > 0.5).astype(int))
    print('recall {}'.format(recall))
    f1 = f1_score(y_vl, (y_pred_train > 0.5).astype(int))
    print('f1_score {}'.format(f1))
    roc = roc_auc_score(y_vl, y_pred_train)
    print('ROC AUC {}'.format(roc))
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
    rocs.append(roc)
    
    
    y_preds+= clf.predict_proba(X_valid)[:,1] / EPOCHS
    
    # feat_importance:
    for i, col in enumerate(X_train.columns.tolist()):
        save = X_train[col].copy()
        valid_to_modify[col] = np.random.permutation(valid_to_modify[col])
        
        predict_permutation = clf.predict_proba(valid_to_modify)
        score_after_permut = roc_auc_score(y_vl, predict_permutation[:,1])
        perte = roc - score_after_permut
        permut_importances[i] += perte / EPOCHS
        valid_to_modify[col] = save
    if (permut_importances == 0 ).all():
        print('no change...')

In [None]:
# what do I need to do to maximize the score ? 
# -ensure with all that stuff the public LB keeps following the local valid
# if yes, keep doing some FE, notably W_HOUR_CENTS
# Ensemble multiple models (test if ensembling on X_train or X_subsample helps with valid as well)

# What do I need to do to maximize robustness ?
# do permutation importance on validation
# do permutation importance on cross-validation
# do multiple drop importance on undersampling
# do k2s and train-test prediction 
# remove the ones that are dangerous