In [6]:
%load_ext pycodestyle_magic
%flake8_on

The pycodestyle_magic extension is already loaded. To reload it, use:
  %reload_ext pycodestyle_magic


In [7]:
%matplotlib inline

import gc
from time import time
import datetime
import multiprocessing
import warnings


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ks_2samp


from tqdm import tqdm, tqdm_notebook
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, TimeSeriesSplit, train_test_split
from sklearn.metrics import roc_auc_score

from reduce_mem_usage import reduce_mem_usage

In [8]:
# Silence every Python warning for the rest of the session.
warnings.simplefilter('ignore')
# Apply seaborn's default styling to subsequent plots.
sns.set()

In [10]:
# Load the train and test tables from the local input/ directory.
# NOTE(review): the "_w_fe" suffix presumably means "with feature
# engineering" -- confirm against the notebook that writes these files.
train = pd.read_csv('input/train_w_fe.csv')
test = pd.read_csv('input/test_w_fe.csv')

# Remove unused columns and split into train/validation sets

In [7]:
# Name of the label column; used as y when splitting and training.
TARGET = 'isFraud'

In [8]:
# Columns that must never be fed to the model.
rm_cols = [
    'TransactionID', 'TransactionDT',  # Treated as pure noise for now
    TARGET,                            # The label must not be a feature
    'uid', 'uid2', 'uid3',             # Engineered client uIDs -> very noisy
    'bank_type',                       # Victim's bank can change over time
    'DT', 'DT_M', 'DT_W', 'DT_D',      # Temporary date helper variables
    'DT_hour', 'DT_day_week', 'DT_day',
    'DT_D_total', 'DT_W_total', 'DT_M_total',
    'id_30', 'id_31', 'id_33',
]

# Built BEFORE 'random_noise' is added below, so the noise column is not
# part of X_train / X_valid in this cell.
features_columns = [col for col in list(train) if col not in rm_cols]

# Unseeded Gaussian noise column: its values differ on every run.
train['random_noise'] = np.random.randn(len(train))

# Train ends on the last day of May while test starts on the first day of
# July, i.e. the whole of June is missing between the two sets.
print(train['DT'].max())
print(test['DT'].min())

# Mimic that one-month gap locally: train on data up to the end of March,
# skip April entirely, and validate on May.
# 'DT' values carry a time-of-day part, so an inclusive bound of
# '2018-03-31' would reject every transaction made on 31 March; use an
# exclusive upper bound on 1 April instead.
X_train = train[train['DT'] < '2018-04-01']
y_train = X_train[TARGET]
X_train = X_train[features_columns]
X_valid = train[(train['DT'] >= '2018-05-01')]
y_valid = X_valid[TARGET]
X_valid = X_valid[features_columns]

2018-05-31 23:58:51
2018-07-01 00:00:24


# Predict

In [15]:
def make_predictions_lgb(tr_df, tt_df, features_columns,
                         target, lgb_params, NFOLDS=2):
    """Train one LightGBM model per K-fold split and average test predictions.

    Parameters
    ----------
    tr_df : pandas.DataFrame
        Training data; must contain ``features_columns`` and ``target``.
    tt_df : pandas.DataFrame
        Data to score; must contain ``features_columns``. A ``'prediction'``
        column is added to it IN PLACE.
    features_columns : list of str
        Columns selected from both frames as model inputs.
    target : str
        Name of the label column in ``tr_df``.
    lgb_params : dict
        Parameters forwarded unchanged to ``lgb.train``.
    NFOLDS : int, default 2
        Number of shuffled K-fold splits (fixed ``random_state=42``).

    Returns
    -------
    tuple
        ``(estimator, tt_df)`` where ``estimator`` is the booster trained on
        the LAST fold only and ``tt_df`` is the same object that was passed
        in, now carrying the fold-averaged ``'prediction'`` column.
    """
    folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)

    X, y = tr_df[features_columns], tr_df[target]
    P = tt_df[features_columns]

    # Size the accumulator from the frame being scored rather than from an
    # unrelated global, so it always matches the length of ``P``.
    predictions = np.zeros(len(tt_df))
    # Per-fold permutation importances; collected here but not returned.
    permut_imp_allfolds = []

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print('Fold:', fold_)
        # KFold yields POSITIONAL indices, so both X and y must be sliced
        # with .iloc; label-based y[idx] breaks on a non-default index.
        tr_x, tr_y = X.iloc[trn_idx, :], y.iloc[trn_idx]
        vl_x, vl_y = X.iloc[val_idx, :], y.iloc[val_idx]

        print(len(tr_x), len(vl_x))
        tr_data = lgb.Dataset(tr_x, label=tr_y)

        vl_data = lgb.Dataset(vl_x, label=vl_y)

        estimator = lgb.train(
            lgb_params,
            tr_data,
            valid_sets=[tr_data, vl_data],
            verbose_eval=200,
        )

        # Each fold contributes an equal share to the final test prediction.
        pp_p = estimator.predict(P)
        predictions += pp_p/NFOLDS

        permut_imp_fold = permut_imp(estimator, vl_x, vl_y)
        permut_imp_allfolds.append(permut_imp_fold)

        # Release the fold's data before the next split is materialised.
        del tr_x, tr_y, vl_x, vl_y, tr_data, vl_data
        gc.collect()

    tt_df['prediction'] = predictions

    return estimator, tt_df



In [104]:
# Rebuild the feature list now that the split cell has added
# 'random_noise' to train. Keep only columns that also exist in test:
# make_predictions_lgb selects this same list from both frames, so a
# train-only column would raise a KeyError there.
features_columns = [
    col for col in list(train)
    if col not in rm_cols and col in test.columns
]

In [None]:
# Base LightGBM configuration for the binary fraud classifier.
lgb_params = dict(
    objective='binary',
    boosting_type='gbdt',
    metric='auc',
    n_jobs=-1,
    learning_rate=0.01,
    num_leaves=496,
    max_depth=-1,
    tree_learner='serial',
    colsample_bytree=0.7,
    subsample_freq=1,
    subsample=0.7,
    n_estimators=800,
    max_bin=255,
    verbose=-1,
    seed=24,
    early_stopping_rounds=100,
)

# Overrides for this run: lower learning rate with more boosting rounds.
lgb_params.update(
    learning_rate=0.005,
    n_estimators=1800,
    early_stopping_rounds=100,
)

# Fit on the full training frame with 7 folds and score the test frame.
clf, test_predictions = make_predictions_lgb(
    train, test, features_columns, TARGET, lgb_params, NFOLDS=7,
)

Fold: 0
506177 84363
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.960897	valid_1's auc: 0.94032
[400]	training's auc: 0.984326	valid_1's auc: 0.955628
[600]	training's auc: 0.994611	valid_1's auc: 0.965867


In [None]:
# Expose the averaged fold predictions under the submission column name.
test_predictions['isFraud'] = test_predictions['prediction']
# Index-aligned copy of the transaction ids from the test frame.
test_predictions['TransactionID'] = test['TransactionID']
# Write only the id and the predicted score, without the index.
(
    test_predictions[['TransactionID', 'isFraud']]
    .to_csv('submission.csv', index=False)
)