In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
plt.style.use('seaborn')
sns.set(font_scale=1)

In [2]:
random_state = 42
np.random.seed(random_state)
df_train = pd.read_csv('../data/raw/train.csv')
df_test = pd.read_csv('../data/raw/test.csv')

In [3]:
def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

In [4]:
x,y = augment(df_train.drop(columns=['ID_code','target']).values,df_train.target)

In [5]:
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.20,random_state = 42)
train_x,test_x  = pd.DataFrame(train_x),pd.DataFrame(test_x)
train_x,test_x = train_x.add_prefix('var_'),test_x.add_prefix('var_')

In [9]:
lgb_params_bin = {
    "objective" : "binary",
    "metric" : "auc",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 13,
    "learning_rate" : 0.005,
    "bagging_freq": 5,
    "bagging_fraction" : 0.4,
    "feature_fraction" : 0.05,
    "min_data_in_leaf": 80,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    #"lambda_l1" : 5,
    #"lambda_l2" : 5,
    "bagging_seed" : random_state,
    "verbosity" : 1,
    "seed": random_state,
}

In [10]:
trn_data_bin = lgb.Dataset(train_x, label=train_y)
val_data_bin = lgb.Dataset(test_x, label=test_y)
evals_result_bin = {}
lgb_clf_bin = lgb.train(
    lgb_params_bin,
    trn_data_bin,
    200000,
    valid_sets = [trn_data_bin, val_data_bin],
    early_stopping_rounds=3000,
    verbose_eval = 1000,
    evals_result=evals_result_bin
)

Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.886495	valid_1's auc: 0.879352
[2000]	training's auc: 0.892437	valid_1's auc: 0.885114
[3000]	training's auc: 0.896973	valid_1's auc: 0.888614
[4000]	training's auc: 0.90073	valid_1's auc: 0.891612
[5000]	training's auc: 0.904227	valid_1's auc: 0.89423
[6000]	training's auc: 0.907256	valid_1's auc: 0.896489
[7000]	training's auc: 0.909844	valid_1's auc: 0.898347
[8000]	training's auc: 0.912082	valid_1's auc: 0.899763
[9000]	training's auc: 0.914041	valid_1's auc: 0.900906
[10000]	training's auc: 0.915731	valid_1's auc: 0.901873
[11000]	training's auc: 0.917288	valid_1's auc: 0.902622
[12000]	training's auc: 0.918714	valid_1's auc: 0.903268
[13000]	training's auc: 0.920038	valid_1's auc: 0.903778
[14000]	training's auc: 0.921293	valid_1's auc: 0.904161
[15000]	training's auc: 0.922507	valid_1's auc: 0.90448
[16000]	training's auc: 0.923658	valid_1's auc: 0.904777
[17000]	training's auc: 0.924796	val

[144000]	training's auc: 0.994354	valid_1's auc: 0.910439
[145000]	training's auc: 0.994509	valid_1's auc: 0.910434
[146000]	training's auc: 0.994658	valid_1's auc: 0.910458
[147000]	training's auc: 0.994803	valid_1's auc: 0.910459
[148000]	training's auc: 0.994946	valid_1's auc: 0.910478
[149000]	training's auc: 0.995086	valid_1's auc: 0.910492
[150000]	training's auc: 0.995226	valid_1's auc: 0.910504
[151000]	training's auc: 0.99536	valid_1's auc: 0.91052
[152000]	training's auc: 0.995489	valid_1's auc: 0.910529
[153000]	training's auc: 0.995616	valid_1's auc: 0.91056
[154000]	training's auc: 0.995741	valid_1's auc: 0.910588
[155000]	training's auc: 0.995861	valid_1's auc: 0.910597
[156000]	training's auc: 0.99598	valid_1's auc: 0.910624
[157000]	training's auc: 0.996097	valid_1's auc: 0.910629
[158000]	training's auc: 0.996214	valid_1's auc: 0.910637
[159000]	training's auc: 0.996323	valid_1's auc: 0.910658
[160000]	training's auc: 0.996431	valid_1's auc: 0.910667
[161000]	training'

In [None]:
train_pred = lgb_clf_bin.predict(train_x, num_iteration=lgb_clf_bin.best_iteration)
test_pred = lgb_clf_bin.predict(test_x, num_iteration=lgb_clf_bin.best_iteration)

In [None]:
new_train_y = train_y-train_pred
new_test_y = test_y-test_pred

In [13]:
lgb_params_reg = {
    "objective" : "regression",
    "metric" : "l2",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 13,
    "learning_rate" : 0.01,
    "bagging_freq": 5,
    "bagging_fraction" : 0.4,
    "feature_fraction" : 0.05,
    "min_data_in_leaf": 80,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    "bagging_seed" : random_state,
    "verbosity" : 1,
    "seed": random_state
}

In [None]:
trn_data_reg = lgb.Dataset(test_x, label=new_test_y)
val_data_reg = lgb.Dataset(train_x, label=new_train_y)
evals_result_reg = {}
lgb_clf_reg = lgb.train(
    lgb_params_reg,
    trn_data_reg,
    100000,
    valid_sets = [trn_data_reg, val_data_reg],
    early_stopping_rounds=3000,
    verbose_eval = 1000,
    evals_result=evals_result_reg
)

In [11]:
predictions_bin = lgb_clf_bin.predict(df_test.drop(columns=['ID_code']), num_iteration=lgb_clf_bin.best_iteration)

In [12]:
submission = pd.DataFrame({'ID_code':df_test.ID_code,'target':predictions_bin})
submission.to_csv('../reports/submission.csv',index=False)