In [1]:
%load_ext pycodestyle_magic
%flake8_on

In [2]:
%matplotlib inline

import gc
from time import time
import datetime
import multiprocessing
import warnings


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ks_2samp


from tqdm import tqdm, tqdm_notebook
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, TimeSeriesSplit, train_test_split
from sklearn.metrics import roc_auc_score

from reduce_mem_usage import reduce_mem_usage

In [3]:
warnings.simplefilter('ignore')
sns.set()

In [4]:
train = pd.read_csv('input/train_w_fe.csv')
test = pd.read_csv('input/test_w_fe.csv')

# Remove some cols and split 

In [7]:
TARGET = 'isFraud'

In [8]:
rm_cols = [
    'TransactionID', 'TransactionDT',  # These columns are pure noise right now
    TARGET,                            # Not target in features))
    'uid', 'uid2', 'uid3',             # Our new client uID -> very noisy data
    'bank_type',                       # Victims bank could differ by time
    'DT', 'DT_M', 'DT_W', 'DT_D',      # Temporary Variables
    'DT_hour', 'DT_day_week', 'DT_day',
    'DT_D_total', 'DT_W_total', 'DT_M_total',
    'id_30', 'id_31', 'id_33',
]

features_columns = [col for col in list(train) if col not in rm_cols]

# The June month drops entirely
train['random_noise'] = np.random.randn(len(train))
print(train['DT'].max())
print(test['DT'].min())
# So we need to get rid of April and keep May as validation set
X_train = train[train['DT'] <= '2018-03-31']
y_train = X_train[TARGET]
X_train = X_train[features_columns]
X_valid = train[(train['DT'] >= '2018-05-01')]
y_valid = X_valid[TARGET]
X_valid = X_valid[features_columns]

2018-05-31 23:58:51
2018-07-01 00:00:24


# Subsample

In [9]:
pos_class = X_train[y_train == 1]
neg_class = X_train[y_train == 0].sample(frac=0.1, random_state=42)

X_subsample = pd.concat([pos_class, neg_class])
X_subsample = X_subsample.sample(frac=1)  # quick way to shuffle
y_subsample = y_train[X_subsample.index]
assert len(X_subsample) == len(y_subsample)
assert (X_subsample.index == y_subsample.index).all()

# Baseline reminder 

In [11]:
lgb_params = {
                    'objective': 'binary',
                    'boosting_type': 'gbdt',
                    'metric': 'auc',
                    'n_jobs': -1,
                    'learning_rate': 0.01,
                    'num_leaves': 496,
                    'max_depth': -1,
                    'min_data_in_leaf': 50,
                    'tree_learner': 'serial',
                    'colsample_bytree': 0.7,
                    'subsample_freq': 1,
                    'subsample': 0.7,
                    'n_estimators': 800,
                    'max_bin': 255,
                    'verbose': -1,
                    'seed': 24,
                    'early_stopping_rounds': 100,
                } 

In [12]:
tr_data = lgb.Dataset(X_subsample, label=y_subsample)
vl_data = lgb.Dataset(X_valid, label=y_valid)

estimator = lgb.train(
                lgb_params,
                tr_data,
                valid_sets=[tr_data, vl_data],
                verbose_eval=100)

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.964286	valid_1's auc: 0.89266
[200]	training's auc: 0.97975	valid_1's auc: 0.901452
[300]	training's auc: 0.990258	valid_1's auc: 0.908027
[400]	training's auc: 0.995959	valid_1's auc: 0.912937
[500]	training's auc: 0.998521	valid_1's auc: 0.91514
[600]	training's auc: 0.999532	valid_1's auc: 0.915683
[700]	training's auc: 0.999878	valid_1's auc: 0.91592
[800]	training's auc: 0.999974	valid_1's auc: 0.915932
Did not meet early stopping. Best iteration is:
[800]	training's auc: 0.999974	valid_1's auc: 0.915932


# Feature idea: SUM_ID_NANs

In [15]:
# get the id cols (more than 6 characters is a col engineered with id)
id_cols = [col for col in X_subsample.columns if 'id' in col and len(col) < 6]

In [16]:
for col in id_cols:
    X_subsample[col] = X_subsample[col].apply(lambda x: np.nan if x == 'nan' else x)

In [17]:
X_subsample['id_nan_count'] = X_subsample[id_cols].isnull().sum(axis=1)
X_valid['id_nan_count'] = X_valid[id_cols].isnull().sum(axis=1)

In [19]:
tr_data = lgb.Dataset(X_subsample, label=y_subsample)
vl_data = lgb.Dataset(X_valid, label=y_valid)

estimator = lgb.train(
                lgb_params,
                tr_data,
                valid_sets=[tr_data, vl_data],
                verbose_eval=100) 

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.964076	valid_1's auc: 0.892608
[200]	training's auc: 0.979792	valid_1's auc: 0.901635
[300]	training's auc: 0.990203	valid_1's auc: 0.909133
[400]	training's auc: 0.995951	valid_1's auc: 0.913507
[500]	training's auc: 0.998506	valid_1's auc: 0.91507
[600]	training's auc: 0.999528	valid_1's auc: 0.916248
[700]	training's auc: 0.999874	valid_1's auc: 0.916457
[800]	training's auc: 0.999974	valid_1's auc: 0.916279
Did not meet early stopping. Best iteration is:
[800]	training's auc: 0.999974	valid_1's auc: 0.916279


# feature idea: C1 third quartile
is this C1 value in the top 25% of the C1 values FOR THIS UID ?

In [21]:
# Try to regroup some variables to find back the users
X_subsample['uid'] = X_subsample['card1'].astype(str) + '_' + \
    X_subsample['card2'].astype(str)
X_valid['uid'] = X_valid['card1'].astype(str) + '_' + \
    X_valid['card2'].astype(str)

X_subsample['uid2'] = X_subsample['uid'].astype(str) + '_' + \
    X_subsample['card3'].astype(str) + '_' + X_subsample['card4'].astype(str)
X_valid['uid2'] = X_valid['uid'].astype(str) + '_' + \
    X_valid['card3'].astype(str) + '_' + X_valid['card4'].astype(str)

X_subsample['uid3'] = X_subsample['uid2'].astype(str) + '_' + \
    X_subsample['addr1'].astype(str) + '_' + X_subsample['addr2'].astype(str)
X_valid['uid3'] = X_valid['uid2'].astype(str) + '_' + \
    X_valid['addr1'].astype(str) + '_' + X_valid['addr2'].astype(str)

In [22]:
X_subsample['C1_third_quartile'] = \
    X_subsample.groupby('uid3')['C1'].transform(lambda x: x > x.quantile(0.75))

X_valid['C1_third_quartile'] = \
    X_valid.groupby('uid3')['C1'].transform(lambda x: x > x.quantile(0.75))

In [28]:
tr_data = lgb.Dataset(X_subsample.drop(['uid', 'uid2', 'uid3'], axis=1),
                      label=y_subsample)
vl_data = lgb.Dataset(X_valid.drop(['uid', 'uid2', 'uid3'], axis=1),
                      label=y_valid)

estimator = lgb.train(
                lgb_params,
                tr_data,
                valid_sets=[tr_data, vl_data],
                verbose_eval=100)

#  0.914075 => 0.91819
#  0.909696 => 0.919665

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.964592	valid_1's auc: 0.893758
[200]	training's auc: 0.979979	valid_1's auc: 0.902955
[300]	training's auc: 0.990349	valid_1's auc: 0.91026
[400]	training's auc: 0.995984	valid_1's auc: 0.914434
[500]	training's auc: 0.998529	valid_1's auc: 0.916136
[600]	training's auc: 0.99954	valid_1's auc: 0.916838
[700]	training's auc: 0.999878	valid_1's auc: 0.916692
Early stopping, best iteration is:
[634]	training's auc: 0.999701	valid_1's auc: 0.916851


# Test same idea for all C features

In [32]:
def quantile_trans(x):
    return x > x.quantile(decile)


C_cols = [col for col in X_subsample.columns
          if col.startswith('C') and '_' not in col]
results_c_cols = {}
for col in tqdm_notebook(C_cols):
    for decile in [0.75, 0.9]:
        X_subsample[col + '_third_quartile'] = \
            X_subsample.groupby('uid3')[col].transform(quantile_trans)
        X_valid[col + '_third_quartile'] = \
            X_valid.groupby('uid3')[col].transform(quantile_trans)

        uids = ['uid', 'uid2', 'uid3']
        tr_data = lgb.Dataset(X_subsample.drop(uids, axis=1),
                              label=y_subsample)
        vl_data = lgb.Dataset(X_valid.drop(uids, axis=1),
                              label=y_valid)

        estimator3 = lgb.train(
                    lgb_params,
                    tr_data,
                    valid_sets=[tr_data, vl_data],
                    verbose_eval=0)

        score_wo_feat = 0.916279
        score_w_feat = estimator3.best_score['valid_1']['auc']
        print(decile, col, score_w_feat, score_w_feat > score_wo_feat)
        key = col + ' ' + str(decile)
        results_c_cols[key] = score_w_feat - score_wo_feat
        del(X_subsample[col+'_third_quartile'])
        del(X_valid[col+'_third_quartile'])

HBox(children=(IntProgress(value=0, max=14), HTML(value='')))

0.75 C1 0.916795425004353 True
0.9 C1 0.9168320630409046 True
0.75 C2 0.9176519936917615 True
0.9 C2 0.9164476430240032 True
0.75 C3 0.9168900372581565 True
0.9 C3 0.9168900372581565 True
0.75 C4 0.9164403794848331 True
0.9 C4 0.9172590958209366 True
0.75 C5 0.9170851657193969 True
0.9 C5 0.9159894291740267 False
0.75 C6 0.9170134764502352 True
0.9 C6 0.9162104568085228 False
0.75 C7 0.9164801613301332 True
0.9 C7 0.9171530481490551 True
0.75 C8 0.9170694243263033 True
0.9 C8 0.9166573358125022 True
0.75 C9 0.9165775933270917 True
0.9 C9 0.9159102379726146 False
0.75 C10 0.9163536565520356 True
0.9 C10 0.9166560246505597 True
0.75 C11 0.9169702192808092 True
0.9 C11 0.9163046742237869 True
0.75 C12 0.916674172323809 True
0.9 C12 0.9169142378807141 True
0.75 C13 0.9160665381608767 False
0.9 C13 0.9167152020078896 True
0.75 C14 0.9164115041229637 True
0.9 C14 0.9173936836144182 True


# Feature idea: same general idea with TransactionAmt
is this amount of transaction unusually high for this user ?

In [35]:
omg, omg2 = 1, 2
del(omg, omg2)

In [37]:
group = X_subsample.groupby('uid3')['TransactionAmt']
X_subsample['AMT_95'] = group.transform(lambda x:  x > x.quantile(0.95))
X_subsample['AMT_85'] = group.transform(lambda x:  x > x.quantile(0.85))
X_subsample['AMT_75'] = group.transform(lambda x:  x > x.quantile(0.75))
X_subsample['AMT_65'] = group.transform(lambda x:  x > x.quantile(0.65))
X_subsample['AMT_55'] = group.transform(lambda x:  x > x.quantile(0.55))
X_subsample['AMT_HH'] = X_subsample['AMT_55'] + X_subsample['AMT_65'] + \
    X_subsample['AMT_75'] + X_subsample['AMT_85'] + X_subsample['AMT_95']
del(X_subsample['AMT_95'], X_subsample['AMT_85'], X_subsample['AMT_75'])
del(X_subsample['AMT_65'], X_subsample['AMT_55'])

group = X_valid.groupby('uid3')['TransactionAmt']
X_valid['AMT_95'] = group.transform(lambda x:  x > x.quantile(0.95))
X_valid['AMT_85'] = group.transform(lambda x:  x > x.quantile(0.85))
X_valid['AMT_75'] = group.transform(lambda x:  x > x.quantile(0.75))
X_valid['AMT_65'] = group.transform(lambda x:  x > x.quantile(0.65))
X_valid['AMT_55'] = group.transform(lambda x:  x > x.quantile(0.55))
X_valid['AMT_HH'] = X_valid['AMT_55'] + X_valid['AMT_65'] + \
    X_valid['AMT_75'] + X_valid['AMT_85'] + X_valid['AMT_95']
del(X_valid['AMT_95'], X_valid['AMT_85'], X_valid['AMT_75'])
del(X_valid['AMT_65'], X_valid['AMT_55'])

In [39]:
tr_data = lgb.Dataset(X_subsample.drop(uids, axis=1), label=y_subsample)
vl_data = lgb.Dataset(X_valid.drop(uids, axis=1), label=y_valid)

estimator2 = lgb.train(
                lgb_params,
                tr_data,
                valid_sets=[tr_data, vl_data],
                verbose_eval=100)

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.964472	valid_1's auc: 0.893614
[200]	training's auc: 0.979942	valid_1's auc: 0.901729
[300]	training's auc: 0.990367	valid_1's auc: 0.909175
[400]	training's auc: 0.995987	valid_1's auc: 0.913352
[500]	training's auc: 0.998531	valid_1's auc: 0.915003
[600]	training's auc: 0.99954	valid_1's auc: 0.915721
[700]	training's auc: 0.999879	valid_1's auc: 0.915657
Early stopping, best iteration is:
[631]	training's auc: 0.999691	valid_1's auc: 0.915833


Worse than without the var (score without the var is 0.9162, here 0.9158

# Add more Target_encoding
didn't work

In [41]:
permut_imp_df = pd.read_csv('permut_imp_df.csv')
permut_imp_df.head()

Unnamed: 0,permut_importances,cols
0,0.020405,TransactionAmt
1,0.502413,ProductCD
2,0.0,card1
3,0.076115,card2
4,0.0,card3


In [43]:
mask = permut_imp_df['permut_importances'] > 0.001
useful_cols = permut_imp_df[mask]['cols'].tolist()

In [45]:
categorical_cols = X_subsample.select_dtypes('category').columns
X_subsample['isFraud'] = y_subsample
for col in categorical_cols:
    group = X_subsample.groupby([col])['isFraud']
    temp_dict = group.agg(['mean']).reset_index().rename(
        columns={'mean': col+'_target_mean'})
    temp_dict.index = temp_dict[col].values
    temp_dict = temp_dict[col+'_target_mean'].to_dict()

    X_subsample[col+'_target_mean'] = X_subsample[col].map(temp_dict)
    X_valid[col+'_target_mean'] = X_valid[col].map(temp_dict)
del(X_subsample['isFraud'])

# ProductCD
# card1 - card6
# addr1, addr2
# Pemaildomain Remaildomain
# M1 - M9

In [None]:
# results worse than without those vars
results_categories = {}
for col in categorical_cols:
    tr_data = lgb.Dataset(X_subsample[useful_cols +[col+'_target_mean']],
                          label=y_subsample)
    vl_data = lgb.Dataset(X_valid[useful_cols+[col+'_target_mean']],
                          label=y_valid)  

    estimator = lgb.train(
                lgb_params,
                tr_data,
                valid_sets=[tr_data, vl_data],
                verbose_eval=0)
    score = estimator.best_score['valid_1']['auc']
    print(col + '_target_mean', score)
    results_categories[col + '_target_mean'] = score - 0.916279

# Next notebook

In [None]:
# ideés: 
# -revoir le uid avec addr1__addr2 + ALLcards
# -revoir le uid avec addr1__addr2 + ALLcards + concatALLIDS
# -Use D columns to figure out better ways to make sure the above makes sense :
#    see https://www.kaggle.com/akasyanama13/eda-what-s-behind-d-features

# that stuff: https://www.kaggle.com/kyakovlev/ieee-fe-with-some-eda
# and here : https://www.kaggle.com/c/ieee-fraud-detection/discussion/108467#624394