In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
import sys
import gc

# Put the parent of the current working directory on the import path (once),
# so the project-local `src` package imported below can be resolved.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)
    
#from src.preprocess import *
from src.fe_modeling import *
from numba import jit
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import graphviz

from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from scipy.stats import ks_2samp
from scipy.stats.mstats import gmean

from tqdm import tqdm_notebook as tqdm
from IPython.display import display

import random

%matplotlib inline

# Raise pandas' display limits so wide/long frames and long cell values are
# rendered without truncation.
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_colwidth', 1000)

In [2]:
# Plotting defaults: ggplot theme first, then override figure size and font
# size on top of whatever the style sheet set.
plt.style.use('ggplot')
plt.rcParams.update({
    'figure.figsize': (15, 6),
    'font.size': 12,
})

In [3]:
def seed_everything(seed=0):
    """Seed the global random generators of `random` and NumPy.

    Args:
        seed: Value passed unchanged to both `random.seed` and
            `np.random.seed`. Defaults to 0.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [4]:
# Experiment-wide settings.
SEED = 42                                     # shared by the RNGs and the LightGBM params
TARGET = 'isFraud'                            # label column split off from the training frame
NFOLDS = 5                                    # presumably the CV fold count -- not used in the cells shown here
START_DATE = datetime.datetime(2017, 11, 30)  # reference date; not used in the cells shown here

# Seed the global RNGs once the seed value is fixed.
seed_everything(SEED)

In [5]:
%%time
# Read the pickled train/test frames and the sample submission file.
X = pd.read_pickle('../data/train_reduced.pkl')
X_test = pd.read_pickle('../data/test_reduced.pkl')
sample_submission = pd.read_csv(
    '../data/sample_submission.csv',
    index_col='TransactionID',
)

# Separate the label column from the training features.
y = X[TARGET]
X = X.drop(columns=TARGET)

print(f'X.shape : {X.shape}, X_test.shape : {X_test.shape}')

X.shape : (590540, 434), X_test.shape : (506691, 434)
Wall time: 7.17 s


In [6]:
def train_val_split_by_time(X, y, test_size=0.2):
    """Split features and labels into a leading train part and a trailing validation part.

    No shuffling is performed, so the validation part is the final
    ``test_size`` share of the rows in their existing order. The split is only
    chronological if ``X`` and ``y`` are already ordered by time -- nothing in
    this function sorts them.

    Args:
        X: Feature table.
        y: Labels aligned row-for-row with ``X``.
        test_size: Forwarded to ``train_test_split`` as the validation share.

    Returns:
        ``(X_train, y_train, X_val, y_val)`` -- note this ordering differs
        from the one ``train_test_split`` itself returns.
    """
    pieces = train_test_split(X, y, test_size=test_size, shuffle=False)
    X_train, X_val, y_train, y_val = pieces

    # Report the resulting shapes.
    print(f'train.shape: {X_train.shape}, val.shape: {X_val.shape}')

    return X_train, y_train, X_val, y_val

In [7]:
# Hold out the trailing 20% of rows (the function's default share) for validation.
X_train, y_train, X_val, y_val = train_val_split_by_time(X, y, test_size=0.2)

train.shape: (472432, 434), val.shape: (118108, 434)


In [8]:
# LightGBM training parameters passed to make_val_prediction.
lgb_params = dict(
    objective='binary',
    # The string 'None' disables LightGBM's built-in metrics (previously 'auc');
    # presumably a custom eval function is supplied by the training helper -- confirm.
    metric='None',
    learning_rate=0.01,
    num_leaves=256,
    max_bin=255,
    max_depth=-1,
    # Row subsampling, re-drawn every 5 iterations.
    bagging_freq=5,
    bagging_fraction=0.7,
    bagging_seed=SEED,
    # Column subsampling.
    feature_fraction=0.7,
    feature_fraction_seed=SEED,
    first_metric_only=True,
    # NOTE(review): LightGBM treats verbosity > 1 as debug-level logging; 100 may
    # have been meant as an evaluation print interval -- confirm.
    verbose=100,
    n_jobs=-1,
    seed=SEED,
)

In [9]:
# Column names handed to fe1() as its third argument for the validation run
# (presumably features to exclude -- confirm against fe1). Order is preserved.
cols_to_drop = [
    'D5_DT_W_std_score', 'ProductCD_TransactionAmt_DT_W', 'D4_DT_D_std_score',
    'D15_DT_D_std_score', 'D3_DT_W_std_score', 'D11_DT_W_std_score',
    'card3_card5_DT_W_week_day_dist', 'card5_DT_W_week_day_dist', 'D10_DT_D_std_score',
    'card3_card5_DT_D', 'ProductCD_cents_DT_D', 'D4_DT_W_std_score',
    'D15_DT_W_std_score', 'uid_DT_D', 'card3_DT_W_week_day_dist',
    'D10_DT_W_std_score', 'D8_DT_D_std_score', 'card3_card5_DT_W',
    'ProductCD_cents_DT_W', 'uid_DT_W', 'D8_DT_W_std_score',
]

In [None]:
# Run feature engineering on the train/validation split, passing `cols_to_drop`
# as the third argument. fe1 is not defined in this notebook (presumably it
# comes from the star import of src.fe_modeling); its third return value is
# later passed on as `category_cols`.
# NOTE(review): X_train / X_val are rebound to fe1's outputs, so re-running this
# cell alone feeds already-processed frames back into fe1 -- re-run the split
# cell first.
X_train, X_val, category_cols1 = fe1(X_train, X_val, cols_to_drop)

Rare data card1 5134
No intersection in Train card1 20399
Intersection in Train card1 452033
####################
Rare data ProductCD_card1 10509
No intersection in Train ProductCD_card1 33115
Intersection in Train ProductCD_card1 439317
####################
Rare data card1_addr1 21640
No intersection in Train card1_addr1 57867
Intersection in Train card1_addr1 414565
####################
Rare data TransactionAmt_dist2 18260
No intersection in Train TransactionAmt_dist2 49343
Intersection in Train TransactionAmt_dist2 423089
####################
No intersection in Train card2 6102
Intersection in Train card2 466330
####################
No intersection in Train card3 146
Intersection in Train card3 472286
####################
No intersection in Train card4 0
Intersection in Train card4 472432
####################
No intersection in Train card5 7339
Intersection in Train card5 465093
####################
No intersection in Train card6 45
Intersection in Train card6 472387
###############

In [None]:
# Display the third value returned by fe1 for the train/validation run.
category_cols1

In [None]:
# Fit on the training part and score the validation part. The three results are
# kept as `fi_df`, `best_iteration1` (reused for the final test-set fit) and
# `val_preds` (saved to disk in the next cell).
fi_df, best_iteration1, val_preds = make_val_prediction(
    X_train,
    y_train,
    X_val,
    y_val,
    category_cols=category_cols1,
    lgb_params=lgb_params,
)

In [None]:
# Persist the validation predictions to the current working directory.
np.save(file='val_preds_lgb.npy', arr=val_preds)

In [None]:
# Re-run feature engineering on the full training set and the test set, then
# produce test predictions using `best_iteration1` from the validation run.
# `cols_to_drop` is passed exactly as in the validation run so the final model
# sees the same feature set that `best_iteration1` was obtained on.
# NOTE(review): make_val_prediction received lgb_params explicitly but this call
# does not -- confirm make_test_prediction falls back to the same parameters.
# NOTE(review): X / X_test are rebound to fe1's outputs, so re-running this cell
# alone feeds already-processed frames back into fe1 -- reload the data first.
X, X_test, category_cols = fe1(X, X_test, cols_to_drop)
preds = make_test_prediction(X, y, X_test, best_iteration1, category_cols=category_cols)

In [None]:
# Persist the test-set predictions to the current working directory.
np.save(file='preds_lgb.npy', arr=preds)