## Santander Customer Transaction Prediction

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, scale
from sklearn.metrics import roc_auc_score

In [3]:
#train=pd.read_csv('data/train.csv')
train=pd.read_csv("data/train_with_FE.csv")

In [4]:
test=pd.read_csv('data/test.csv')

In [6]:
features=[c for c in train.columns if c not in ['ID_code', 'target']]

In [4]:
train.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,test_var_195,test1_var_195,test_var_196,test1_var_196,test_var_197,test1_var_197,test_var_198,test1_var_198,test_var_199,test1_var_199
0,train_0,0,-0.577102,-1.273737,0.451707,-0.833709,0.235571,-0.53643,-0.334926,0.608751,...,-3.068013,-0.811748,1.987122,0.525761,-0.97077,-0.144063,-1.995417,-0.527956,0.295597,0.155122
1,train_1,0,0.269959,-0.622138,1.19036,-0.688846,0.790975,1.5399,0.244461,-0.003525,...,2.745405,0.844147,1.925416,0.59202,-0.368492,-0.04544,,,0.698061,0.366325
2,train_2,0,-0.681113,-0.276066,0.516988,0.536516,-0.305477,-0.511033,1.769839,-0.564749,...,3.17135,1.664246,-2.628296,-0.995938,-1.737201,-0.27816,-0.687909,-0.211516,0.492373,0.258385
3,train_3,0,0.125158,-0.129426,-0.667575,0.195355,0.927992,0.410672,0.500633,-0.474201,...,-1.7705,-0.352069,-1.864428,-0.493298,2.708259,0.832725,0.96232,0.505002,-0.750264,-0.39372
4,train_4,0,-0.277303,0.03561,0.817683,-0.077829,0.738607,0.955574,0.613372,0.791544,...,-2.221023,-0.413625,0.578616,0.153093,1.495873,0.278579,0.97502,0.511667,-0.725242,-0.380589


In [8]:
test.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,-2.1556,11.8495,-1.43,2.4508,13.7112,2.4669,4.3654,10.72,15.4722,-8.7197
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,10.6165,8.8349,0.9403,10.1282,15.5765,0.4773,-1.4852,9.8714,19.1293,-20.976
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,...,-0.7484,10.9935,1.9803,2.18,12.9813,2.1281,-7.1086,7.0618,19.8956,-23.1794
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,9.5702,9.0766,1.658,3.5813,15.1874,3.1656,3.9567,9.2295,13.0168,-4.2108
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,...,4.2259,9.1723,1.2835,3.3778,19.5542,-0.286,-5.1612,7.2882,13.926,-9.1846



## Data exploration analysis
- Thanks to Santander EDA and Prediction from Gabriel Preda (https://www.kaggle.com/gpreda/santander-eda-and-prediction)
- No missing data in both train and test datasets
- Standard deviation is relatively large for both train and test variable data, mean values are distributed over a large range
- Only 10 % train data with a target value of 1
- Barely no correlation among features
- Var 68, 313, 126 have duplicated values of about 1000, 300, 300, respectively.

## Feature engineering
Use effective FEs from `EDA` jupyter file

### 1. Combine train data and real test data

In [9]:
from tqdm import tqdm_notebook as tqdm
df_test=test.drop(['ID_code'], axis=1)
df_test = df_test.values

In [10]:
unique_samples = []
unique_count = np.zeros_like(df_test)
for feature in tqdm(range(df_test.shape[1])):
    _, index_, count_ = np.unique(df_test[:, feature], return_counts=True, return_index=True)
    unique_count[index_[count_ == 1], feature] += 1

# Samples which have unique values are real the others are fake
real_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) > 0)[:, 0]
synthetic_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) == 0)[:, 0]

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [15]:
print(len(real_samples_indexes))
print(len(synthetic_samples_indexes))

100000
100000


In [16]:
df_test_real = df_test[real_samples_indexes].copy()
df_test_real=pd.DataFrame(df_test_real)
df_test_real=df_test_real.add_prefix('var_')
df_test_real.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,7.4578,...,9.5702,9.0766,1.658,3.5813,15.1874,3.1656,3.9567,9.2295,13.0168,-4.2108
1,17.3035,-2.4212,13.3989,8.3998,11.0777,9.6449,5.9596,17.8477,-4.8068,7.4643,...,4.4676,4.4214,0.9303,1.4994,15.2648,-1.7931,6.5316,10.4855,23.4631,0.7283
2,10.6137,-2.1898,8.909,3.8014,13.8602,-5.9802,5.5515,15.4716,-0.1714,7.6178,...,13.1683,4.0625,-0.1537,7.9787,18.4518,0.1,-7.8212,9.2355,15.0721,-7.3475
3,14.8595,-4.5378,13.6483,5.648,9.9144,1.519,5.0358,13.4524,-2.5419,9.445,...,2.6735,5.8526,4.8517,2.502,22.8224,-0.9325,8.6849,10.2848,17.4932,6.08
4,14.1732,-5.149,9.7591,3.7316,10.37,-21.9202,7.713,18.8749,0.468,7.8453,...,0.864,5.9058,1.314,4.8961,20.1087,1.1051,7.7184,9.3406,21.1746,-2.0098


In [17]:
train_value=train.drop(['ID_code', 'target'], axis=1)
df_combined=pd.concat([train_value, df_test_real])
df_combined.shape

(300000, 200)

In [19]:
df_combined.to_csv('data/combined_train_test.csv', index=False)

In [204]:
del [df_test, df_combined, df_test_real]

### 2. count unique numbers

In [211]:
df_combined=pd.read_csv('data/combined_train_test.csv')

In [225]:
for i in range(200):
    var='var_'+str(i)
    if i%25==0:
        print (i)
    
    
    dictionary=df_combined[var].value_counts().to_dict()
    train['count_'+var]=train[var].map(dictionary)
    #test['count_'+var]=test[var].map(dictionary,na_action='ignore')   

0
25
50
75
100
125
150
175


In [26]:
train.to_csv('data/train_with_count.csv', index=False)
test.to_csv('data/test_with_count.csv', index=False)

In [205]:
train=pd.read_csv('data/train_with_count.csv')

### 3. Data scaling and FE based on counts

In [206]:
sc=StandardScaler()
train[train.columns[2:202]]=sc.fit_transform(train[train.columns[2:202]])
#test[test.columns[1:201]]=sc.fit_transform(test[test.columns[1:201]])

In [212]:
df_combined.isnull().sum().sum()

0

In [30]:
for i in range(200):
    var='var_'+str(i)
    if i%25==0:
        print (i)
    train['test_'+var]=train[var]*(np.log2(train['count_'+var]+1)**0.7)
    train['test1_'+var]=train[var]/(np.log2(train['count_'+var]+1)**0.7)
    train.drop('count_'+var, inplace=True, axis=1)
    test['test_'+var]=test[var]*(np.log2(test['count_'+var]+1)**0.7)
    test['test1_'+var]=test[var]/(np.log2(test['count_'+var]+1)**0.7)
    test.drop('count_'+var, inplace=True, axis=1)

0
25
50
75
100
125
150
175


In [32]:
train.to_csv('data/train_with_FE.csv', index=False)
test.to_csv('data/test_with_FE.csv', index=False)

### 4. Data augment by shuffling

In [5]:
def shuffle(x,y,t=2):
    # Shuffle positive samples twice and added to the existing 
    pos_shuffle=[]
    neg_shuffle=[]
    for i in range(t):
        mask=y==1
        x1=x[mask].copy()
        rows=np.arange(x1.shape[0])
        for col in range(x1.shape[1]):
            #np.random.seed(1111)
            np.random.shuffle(rows) # shuffle rows in each column
            x1[:,col]=x1[rows][:,col]
        pos_shuffle.append(x1)
    for i in range(t//2):
        mask=y==0
        x1=x[mask].copy()
        rows=np.arange(x1.shape[0])
        for col in range(x1.shape[1]):
            #np.random.seed(1111)
            np.random.shuffle(rows) # shuffle rows in each column
            x1[:,col]=x1[rows][:,col]
        neg_shuffle.append(x1)
    pos_shuffle=np.vstack(pos_shuffle)  #stack the dataframe, maybe do contatenate is better
    neg_shuffle=np.vstack(neg_shuffle)
    pos_y = np.ones(pos_shuffle.shape[0]) # make labels for the shuffled results
    neg_y = np.zeros(neg_shuffle.shape[0])
    x = np.vstack([x,pos_shuffle,neg_shuffle]) #augmented data. As pos target is around 0.1, pos_s is 0.2x, neg_s is 0.9x, 2.1x size in total
    y = np.concatenate([y,pos_y,neg_y])
    return x,y

In [4]:
y=train['target']
X = train.drop(['target', 'ID_code'], axis=1)

In [62]:
X_test = test.drop(['ID_code'],axis = 1)
X_test= sc.fit_transform(X_test)

In [34]:
#sc = StandardScaler()
#X= sc.fit_transform(X)
#X_test = sc.transform(X_test)
n_splits = 5 # Number of K-fold Splits

splits = list(StratifiedKFold(n_splits=n_splits, shuffle=True).split(X, y))

### LightGBM

In [35]:
import lightgbm as lgb

In [36]:
lgb_param = {
    'bagging_freq': 5,
        'bagging_fraction': 0.35,
        'boost_from_average':'false',
        'boost': 'gbdt',#gbdt
        'feature_fraction': 0.055,#0.045
        'learning_rate': 0.012,#0.0083
        'max_depth': 1, #1
        'metric':'auc',
        'min_data_in_leaf': 80, 
        'min_sum_hessian_in_leaf': 12,#10
        'num_leaves': 2, #8
        #'num_threads': 8,
        'tree_learner': 'serial',
        'objective': 'binary', 
        'verbosity': 1,
        "boost_from_average": "false",
        #'lambda_l1':0.1
        "max_bin":2000
        #'is_unbalance':True #new
}

In [39]:
oof = np.zeros(len(X))
#predictions = np.zeros(len(X_test))
#feature_importance_df=pd.DataFrame()
for i, (train_idx, valid_idx) in enumerate(splits):  
    print(f'Fold {i + 1}')
    x=np.array(X)
    y=np.array(y)
    x_train = x[train_idx]
    y_train = y[train_idx]
    
    trn_data = lgb.Dataset(x_train, label=y_train)
    #trn_data = lgb.Dataset(x_train[train_idx.astype(int)], label=y_train[train_idx.astype(int)])
    val_data = lgb.Dataset(x[valid_idx.astype(int)], label=y[valid_idx.astype(int)])
    watchlist = [(trn_data, 'train'), (val_data, 'valid')]
                 
    #num_round = 15000
    lgb_clf=lgb.train(lgb_param, trn_data, 300000, valid_sets = [trn_data, val_data], early_stopping_rounds=9000, verbose_eval=1000)
    
    oof[valid_idx] = lgb_clf.predict(x[valid_idx], num_iteration=lgb_clf.best_iteration)
        
    #fold_importance_df = pd.DataFrame()
    #fold_importance_df["feature"] = features
    #fold_importance_df["importance"] = lgb_clf.feature_importance()
    #fold_importance_df["fold"] = i+1
    #feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    #predictions += lgb_clf.predict(X_test, num_iteration=lgb_clf.best_iteration) /n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(y, oof)))

Fold 1
Training until validation scores don't improve for 9000 rounds.
[1000]	training's auc: 0.823534	valid_1's auc: 0.815366
[2000]	training's auc: 0.857125	valid_1's auc: 0.846728
[3000]	training's auc: 0.875794	valid_1's auc: 0.863512
[4000]	training's auc: 0.887461	valid_1's auc: 0.874406
[5000]	training's auc: 0.895656	valid_1's auc: 0.882155
[6000]	training's auc: 0.901281	valid_1's auc: 0.887527
[7000]	training's auc: 0.906003	valid_1's auc: 0.891942
[8000]	training's auc: 0.909686	valid_1's auc: 0.895386
[9000]	training's auc: 0.912509	valid_1's auc: 0.898058
[10000]	training's auc: 0.91507	valid_1's auc: 0.900347
[11000]	training's auc: 0.917187	valid_1's auc: 0.902287
[12000]	training's auc: 0.918937	valid_1's auc: 0.90381
[13000]	training's auc: 0.920467	valid_1's auc: 0.905065
[14000]	training's auc: 0.921773	valid_1's auc: 0.906341
[15000]	training's auc: 0.922933	valid_1's auc: 0.907422
[16000]	training's auc: 0.923988	valid_1's auc: 0.908391
[17000]	training's auc: 0.92

[23000]	training's auc: 0.92895	valid_1's auc: 0.91266
[24000]	training's auc: 0.929431	valid_1's auc: 0.912941
[25000]	training's auc: 0.92993	valid_1's auc: 0.913244
[26000]	training's auc: 0.9304	valid_1's auc: 0.913455
[27000]	training's auc: 0.930876	valid_1's auc: 0.913713
[28000]	training's auc: 0.931322	valid_1's auc: 0.914001
[29000]	training's auc: 0.931747	valid_1's auc: 0.914187
[30000]	training's auc: 0.932123	valid_1's auc: 0.91432
[31000]	training's auc: 0.932532	valid_1's auc: 0.914458
[32000]	training's auc: 0.9329	valid_1's auc: 0.914582
[33000]	training's auc: 0.933265	valid_1's auc: 0.914677
[34000]	training's auc: 0.933593	valid_1's auc: 0.914781
[35000]	training's auc: 0.933971	valid_1's auc: 0.914925
[36000]	training's auc: 0.934329	valid_1's auc: 0.915021
[37000]	training's auc: 0.934675	valid_1's auc: 0.915128
[38000]	training's auc: 0.935002	valid_1's auc: 0.915223
[39000]	training's auc: 0.935344	valid_1's auc: 0.915253
[40000]	training's auc: 0.935643	valid_

In [21]:
lgb_clf.save_model('lgb_clf.txt')
# lgb = lgb.Booster(model_file='lgb_clf.txt')  

<lightgbm.basic.Booster at 0x172b4d9bcc0>

### xgboost

In [13]:
import xgboost as xgb

In [28]:
param = {
        'eta': 0.1,
        'max_depth': 15,
        'gamma': 100,
        'objective':'binary:logistic',    
        'eval_metric':'auc',
        'seed':42
        }

In [75]:
xgb_param = {
    'min_child_weight': 10.0,
    'objective': 'binary:logistic',
    'max_depth': 7,
    'max_delta_step': 1.8,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'eta': 0.025,
    'gamma': 0.65,
    'eval_metric':'auc'
        }

In [24]:
oof = np.zeros(len(X))
#predictions = np.zeros(len(X_test))
#feature_importance_df = pd.DataFrame()

for i, (train_idx, valid_idx) in enumerate(splits):  
    print(f'Fold {i + 1}')
    x_train = np.array(X)
    y_train = np.array(y)
    trn_data = xgb.DMatrix(x_train[train_idx.astype(int)], label=y_train[train_idx.astype(int)])
    val_data = xgb.DMatrix(x_train[valid_idx.astype(int)], label=y_train[valid_idx.astype(int)])
    watchlist = [(trn_data, 'train'), (val_data, 'valid')]
                 
    #num_round = 15000
    clf=xgb.train(param, trn_data, 700, evals=watchlist, early_stopping_rounds=250, verbose_eval=500)
    
    oof[valid_idx] = clf.predict(x_train[valid_idx], ntree_limit=clf.best_ntree_limit)
    
    #fold_importance_df = pd.DataFrame()
    #fold_importance_df["feature"] = features
    #fold_importance_df["importance"] = clf.feature_importance()
    #fold_importance_df["fold"] = i + 1
    #feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    #predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / 5

print("CV score: {:<8.5f}".format(roc_auc_score(y, oof)))


Fold 1
[22:18:28] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 142 extra nodes, 0 pruned nodes, max_depth=7
[0]	train-auc:0.636529	valid-auc:0.617924
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 250 rounds.
[22:18:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 142 extra nodes, 0 pruned nodes, max_depth=7
[22:18:30] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 138 extra nodes, 0 pruned nodes, max_depth=7
[22:18:31] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 136 extra nodes, 0 pruned nodes, max_depth=7
[22:18:32] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 158 extra nodes, 0 pruned nodes, max_depth=7
[22:18:33] C:\Users\Administrator\Desktop\xgboo

[22:19:19] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 158 extra nodes, 0 pruned nodes, max_depth=7
[22:19:20] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 168 extra nodes, 0 pruned nodes, max_depth=7
[22:19:21] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 164 extra nodes, 0 pruned nodes, max_depth=7
[22:19:22] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 132 extra nodes, 0 pruned nodes, max_depth=7
[22:19:23] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 180 extra nodes, 0 pruned nodes, max_depth=7
[22:19:24] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 170 extra nodes, 0 pruned nodes, max_depth=7
[22:19:25] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pru

[22:20:14] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 164 extra nodes, 0 pruned nodes, max_depth=7
[22:20:15] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 154 extra nodes, 0 pruned nodes, max_depth=7
[22:20:16] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 170 extra nodes, 2 pruned nodes, max_depth=7
[22:20:17] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 142 extra nodes, 0 pruned nodes, max_depth=7
[22:20:18] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 146 extra nodes, 0 pruned nodes, max_depth=7
[22:20:19] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 156 extra nodes, 0 pruned nodes, max_depth=7
[22:20:20] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pru

[22:21:10] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 150 extra nodes, 0 pruned nodes, max_depth=7
[22:21:11] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 150 extra nodes, 0 pruned nodes, max_depth=7
[22:21:12] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 184 extra nodes, 0 pruned nodes, max_depth=7
[22:21:13] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 136 extra nodes, 0 pruned nodes, max_depth=7
[22:21:14] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 178 extra nodes, 0 pruned nodes, max_depth=7
[22:21:15] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 148 extra nodes, 0 pruned nodes, max_depth=7
[22:21:16] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pru

[22:22:10] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 148 extra nodes, 0 pruned nodes, max_depth=7
[22:22:11] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 174 extra nodes, 0 pruned nodes, max_depth=7
[22:22:12] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 166 extra nodes, 0 pruned nodes, max_depth=7
[22:22:13] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 160 extra nodes, 0 pruned nodes, max_depth=7
[22:22:14] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 148 extra nodes, 0 pruned nodes, max_depth=7
[22:22:15] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 152 extra nodes, 0 pruned nodes, max_depth=7
[22:22:16] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pru

[22:23:09] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 164 extra nodes, 0 pruned nodes, max_depth=7
[22:23:10] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 130 extra nodes, 0 pruned nodes, max_depth=7
[22:23:11] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 172 extra nodes, 0 pruned nodes, max_depth=7
[22:23:12] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 158 extra nodes, 0 pruned nodes, max_depth=7
[22:23:13] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 156 extra nodes, 0 pruned nodes, max_depth=7
[22:23:14] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=7
[22:23:15] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pru

[22:24:07] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 156 extra nodes, 0 pruned nodes, max_depth=7
[22:24:08] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 168 extra nodes, 0 pruned nodes, max_depth=7
[22:24:09] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 188 extra nodes, 0 pruned nodes, max_depth=7
[22:24:10] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 152 extra nodes, 0 pruned nodes, max_depth=7
[22:24:11] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 142 extra nodes, 0 pruned nodes, max_depth=7
[22:24:12] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 150 extra nodes, 0 pruned nodes, max_depth=7
[22:24:13] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pru

[22:25:04] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 142 extra nodes, 0 pruned nodes, max_depth=7
[22:25:05] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 186 extra nodes, 0 pruned nodes, max_depth=7
[22:25:06] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 142 extra nodes, 0 pruned nodes, max_depth=7
[22:25:08] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 180 extra nodes, 0 pruned nodes, max_depth=7
[22:25:09] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 166 extra nodes, 0 pruned nodes, max_depth=7
[22:25:10] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 142 extra nodes, 0 pruned nodes, max_depth=7
[22:25:11] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pru

[22:26:04] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 152 extra nodes, 0 pruned nodes, max_depth=7
[22:26:05] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 116 extra nodes, 0 pruned nodes, max_depth=7
[22:26:06] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 166 extra nodes, 0 pruned nodes, max_depth=7
[22:26:07] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 176 extra nodes, 0 pruned nodes, max_depth=7
[22:26:08] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 164 extra nodes, 0 pruned nodes, max_depth=7
[22:26:09] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 196 extra nodes, 0 pruned nodes, max_depth=7
[22:26:10] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pru

[22:27:02] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 132 extra nodes, 0 pruned nodes, max_depth=7
[22:27:03] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 144 extra nodes, 0 pruned nodes, max_depth=7
[22:27:04] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 166 extra nodes, 0 pruned nodes, max_depth=7
[22:27:05] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 158 extra nodes, 0 pruned nodes, max_depth=7
[22:27:06] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 148 extra nodes, 0 pruned nodes, max_depth=7
[22:27:07] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 120 extra nodes, 0 pruned nodes, max_depth=7
[22:27:08] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pru

[22:28:01] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 160 extra nodes, 0 pruned nodes, max_depth=7
[22:28:02] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 182 extra nodes, 0 pruned nodes, max_depth=7
[22:28:03] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 144 extra nodes, 0 pruned nodes, max_depth=7
[22:28:04] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 0 pruned nodes, max_depth=7
[22:28:05] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 104 extra nodes, 0 pruned nodes, max_depth=7
[22:28:06] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 132 extra nodes, 0 pruned nodes, max_depth=7
[22:28:07] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pru

[22:28:59] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=7
[22:29:00] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 140 extra nodes, 0 pruned nodes, max_depth=7
[22:29:01] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 182 extra nodes, 0 pruned nodes, max_depth=7
[22:29:02] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 182 extra nodes, 0 pruned nodes, max_depth=7
[22:29:03] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 166 extra nodes, 0 pruned nodes, max_depth=7
[22:29:04] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 138 extra nodes, 0 pruned nodes, max_depth=7
[22:29:05] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pru

[22:29:54] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 138 extra nodes, 0 pruned nodes, max_depth=7
[22:29:55] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 0 pruned nodes, max_depth=7
[22:29:57] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 172 extra nodes, 0 pruned nodes, max_depth=7
[22:29:58] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 156 extra nodes, 0 pruned nodes, max_depth=7
[22:29:59] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 152 extra nodes, 0 pruned nodes, max_depth=7
[22:30:00] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 108 extra nodes, 0 pruned nodes, max_depth=7
[22:30:02] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pru

AttributeError: 'numpy.ndarray' object has no attribute 'feature_names'

In [73]:
xgb_clf=clf
xgb_clf.save_model('xgb_clf_raw.model')

### Deep neural network

### Tensorflow

In [5]:
import tensorflow as tf
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import NotFittedError
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau,LearningRateScheduler
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.backend import eval, set_value

In [51]:
class DNNClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, n_hidden_layers=5, n_neurons=100, optimizer_class=tf.train.AdamOptimizer,
                 learning_rate=0.01, batch_size=4000,activation=tf.nn.relu, initializer=tf.contrib.layers.xavier_initializer(),
                 batch_norm_momentum=None, dropout_rate=None, random_state=None):
        """Initialize the DNNClassifier by simply storing all the hyperparameters."""
        self.n_hidden_layers = n_hidden_layers
        self.n_neurons = n_neurons
        self.optimizer_class = optimizer_class
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.activation = activation
        self.initializer = initializer
        self.batch_norm_momentum = batch_norm_momentum
        self.dropout_rate = dropout_rate
        self.random_state = random_state
        self._session = None

    def _dnn(self, inputs):
        """Build the hidden layers, with support for batch normalization and dropout."""
        for layer in range(self.n_hidden_layers):
            if self.dropout_rate:
                inputs = tf.layers.dropout(inputs, self.dropout_rate, training=self._training)
            inputs = tf.layers.dense(inputs, self.n_neurons,
                                     kernel_initializer=self.initializer,
                                     name="hidden%d" % (layer + 1))
            if self.batch_norm_momentum:
                inputs = tf.layers.batch_normalization(inputs, momentum=self.batch_norm_momentum,
                                                       training=self._training)
            inputs = self.activation(inputs, name="hidden%d_out" % (layer + 1))
        return inputs

    def _build_graph(self, n_inputs, n_outputs):
        """Build the same model as earlier"""
        if self.random_state is not None:
            tf.set_random_seed(self.random_state)
            np.random.seed(self.random_state)

        X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
        y = tf.placeholder(tf.int32, shape=(None), name="y")

        if self.batch_norm_momentum or self.dropout_rate:
            self._training = tf.placeholder_with_default(False, shape=(), name='training')
        else:
            self._training = None

        dnn_outputs = self._dnn(X)

        logits = tf.layers.dense(dnn_outputs, n_outputs, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="logits")
        Y_proba = tf.nn.softmax(logits, name="Y_proba")

        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
        loss = tf.reduce_mean(xentropy, name="loss")

        optimizer = self.optimizer_class(learning_rate=self.learning_rate)
        training_op = optimizer.minimize(loss)

        correct = tf.nn.in_top_k(logits, y, 1)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        # Make the important operations available easily through instance variables
        self._X, self._y = X, y
        self._Y_proba, self._loss = Y_proba, loss
        self._training_op, self._accuracy = training_op, accuracy
        self._init, self._saver = init, saver

    def close_session(self):
        if self._session:
            self._session.close()

    def _get_model_params(self):
        """Get all variable values (used for early stopping, faster than saving to disk)"""
        with self._graph.as_default():
            gvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        return {gvar.op.name: value for gvar, value in zip(gvars, self._session.run(gvars))}

    def _restore_model_params(self, model_params):
        """Set all variables to the given values (for early stopping, faster than loading from disk)"""
        gvar_names = list(model_params.keys())
        assign_ops = {gvar_name: self._graph.get_operation_by_name(gvar_name + "/Assign")
                      for gvar_name in gvar_names}
        init_values = {gvar_name: assign_op.inputs[1] for gvar_name, assign_op in assign_ops.items()}
        feed_dict = {init_values[gvar_name]: model_params[gvar_name] for gvar_name in gvar_names}
        self._session.run(assign_ops, feed_dict=feed_dict)

    def fit(self, X, y, n_epochs=100, X_valid=None, y_valid=None):
        """Fit the model to the training set. If X_valid and y_valid are provided, use early stopping."""
        self.close_session()

        # infer n_inputs and n_outputs from the training set.
        n_inputs = X.shape[1]
        self.classes_ = np.unique(y)
        n_outputs = len(self.classes_)
        
        self.class_to_index_ = {label: index
                                for index, label in enumerate(self.classes_)}
        y = np.array([self.class_to_index_[label]
                      for label in y], dtype=np.int32)
        
        self._graph = tf.Graph()
        with self._graph.as_default():
            self._build_graph(n_inputs, n_outputs)
            # extra ops for batch normalization
            extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        # needed in case of early stopping
        max_checks_without_progress = 20
        checks_without_progress = 0
        best_loss = np.infty
        best_params = None
        
        # Now train the model!
        self._session = tf.Session(graph=self._graph)
        with self._session.as_default() as sess:
            self._init.run()
            for epoch in range(n_epochs):
                start_pos=0
                for batches in range(int(len(X)/self.batch_size)):
                    X_batch,y_batch=X[start_pos:start_pos+self.batch_size],y[start_pos:start_pos+self.batch_size]
                    feed_dict = {self._X: X_batch, self._y: y_batch}
                    start_pos+=self.batch_size
                #rnd_idx = np.random.permutation(len(X))
                #for rnd_indices in np.array_split(rnd_idx, len(X) // self.batch_size):
                    #X_batch, y_batch = X[rnd_indices], y[rnd_indices]
                    #feed_dict = {self._X: X_batch, self._y: y_batch}
                    if self._training is not None:
                        feed_dict[self._training] = True
                    sess.run(self._training_op, feed_dict=feed_dict)
                    if extra_update_ops:
                        sess.run(extra_update_ops, feed_dict=feed_dict)
                if X_valid is not None and y_valid is not None:
                    loss_val, acc_val = sess.run([self._loss, self._accuracy],
                                                 feed_dict={self._X: X_valid,
                                                            self._y: y_valid})
                    if loss_val < best_loss:
                        best_params = self._get_model_params()
                        best_loss = loss_val
                        checks_without_progress = 0
                    else:
                        checks_without_progress += 1
                    print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
                        epoch, loss_val, best_loss, acc_val * 100))
                    if checks_without_progress > max_checks_without_progress:
                        print("Early stopping!")
                        break
                else:
                    loss_train, acc_train = sess.run([self._loss, self._accuracy],
                                                     feed_dict={self._X: X_batch,
                                                                self._y: y_batch})
                    print("{}\tLast training batch loss: {:.6f}\tAccuracy: {:.2f}%".format(
                        epoch, loss_train, acc_train * 100))
            # If we used early stopping then rollback to the best model found
            if best_params:
                self._restore_model_params(best_params)
            return self

    def predict_proba(self, X):
        if not self._session:
            raise NotFittedError("This %s instance is not fitted yet" % self.__class__.__name__)
        with self._session.as_default() as sess:
            return self._Y_proba.eval(feed_dict={self._X: X})

    def predict(self, X):
        class_indices = np.argmax(self.predict_proba(X), axis=1)
        return np.array([[self.classes_[class_index]]
                         for class_index in class_indices], np.int32)

    def save(self, path):
        self._saver.save(self._session, path)

In [None]:
dnn_clf = DNNClassifier(random_state=42)
dnn_clf.fit(X_train, y_train, n_epochs=100, X_valid=X_valid, y_valid=y_valid)

In [42]:
dnn_clf.save('dnn_clf')

## Keras

In [6]:
n_splits = 5
epochs = 50
batch_size = 32

In [39]:
def get_model():
    
    #model = tf.keras.models.Sequential()
    #model.add(tf.keras.layers.Dense(64, input_shape=(600,)))
    #model.add(tf.keras.layers.Activation('relu'))
    #model.add(tf.keras.layers.BatchNormalization())

    #model.add(tf.keras.layers.Dense(32))
    #model.add(tf.keras.layers.Activation('relu'))
    #model.add(tf.keras.layers.BatchNormalization())
    
    #model.add(tf.keras.layers.Dense(1, activation='softmax')) 
   
    
    #inputs = tf.keras.layers.Input(shape=(200,3))
    inputs = tf.keras.layers.Input(shape=(600,))
    
    main = tf.keras.layers.Dense(256, activation='relu')(inputs)
    main=tf.keras.layers.BatchNormalization()(main)
    #main=tf.keras.layers.Dropout(0.25)(main)  #Did not see performance improvement
    
    main = tf.keras.layers.Dense(128, activation='relu')(main)
    main=tf.keras.layers.BatchNormalization()(main)
    
    main = tf.keras.layers.Dense(64, activation='relu')(main)
    main=tf.keras.layers.BatchNormalization()(main)
    
    main = tf.keras.layers.Dense(32, activation='relu')(main)
    main=tf.keras.layers.BatchNormalization()(main)
    
    main = tf.keras.layers.Flatten()(main)

    out = tf.keras.layers.Dense(1, activation = 'sigmoid')(main)#softmax performance is not so good

    model = tf.keras.Model(inputs, out)
    model.regularizers = [tf.keras.regularizers.l2(0.0001)]

    return model

In [10]:
class auc_score_monitor(Callback):
    def __init__(self, val_data, val_target, checkpoint_file, min_lr =1e-5, reduce_lr_patience=2, early_stop_patience=4, factor=0.1):
        self.val_data = val_data
        self.val_target = val_target
        self.checkpoint_file = checkpoint_file
        self.reduce_lr_patience = reduce_lr_patience
        self.early_stop_patience = early_stop_patience
        self.best_val_score = 0
        self.epoch_num = 0
        self.factor = factor
        self.unimproved_lr_counter = 0
        self.unimproved_stop_counter = 0
        self.min_lr = min_lr
        
    def on_train_begin(self, logs={}):
        self.val_scores = []
        
    def on_epoch_end(self, epoch, logs={}):
        val_pred = self.model.predict(self.val_data).reshape((-1,))
        val_score = roc_auc_score(self.val_target, val_pred)
        # clip pred
        self.val_scores.append(val_score)
        
        #print(self.val_target, '\n', val_pred)
        print('Epoch {} val_score: {:.5f}'.format(self.epoch_num, val_score))
        self.epoch_num += 1
        
        if val_score > self.best_val_score:
            print ('Val Score improve from {:5f} to {:5f}'.format(self.best_val_score, val_score))
            self.best_val_score = val_score
            self.unimproved_lr_counter = 0
            self.unimproved_stop_counter = 0
            if self.checkpoint_file is not None:
                print('Saving file to', self.checkpoint_file)
                self.model.save_weights(self.checkpoint_file)
        else:
            if val_score<self.best_val_score:
                print('no improve from {:.5f}'.format(self.best_val_score))
                self.unimproved_lr_counter += 1
                self.unimproved_stop_counter += 1
            
        if self.reduce_lr_patience is not None and self.unimproved_lr_counter >= self.reduce_lr_patience:
            current_lr = eval(self.model.optimizer.lr)
            if current_lr > self.min_lr:
                print('Reduce LR from {:.6f} to {:.6f}'.format(current_lr, current_lr*self.factor))
                set_value(self.model.optimizer.lr, current_lr*self.factor)
                #self.model.load_weights(self.checkpoint_file)
            else:
                pass
            
            self.unimproved_lr_counter = 0
            
        if self.early_stop_patience is not None and self.unimproved_stop_counter >= self.early_stop_patience:
            print('Early Stop Criteria Meet')
            self.model.stop_training = True
                
        return

In [42]:
def train_NN(train_data,target_data):
    
    folds = StratifiedKFold(n_splits=n_splits)

    preds_nn_oof = np.zeros(200000)
    #preds_nn_oof = np.zeros(50000)
    #preds_nn_test = np.zeros(X_test.shape[0])
    

    for train_idx, valid_idx in folds.split(train_data, target_data):    
        #x=np.array(train_data)
        #y=np.array(target_data)
        X_train=train_data.iloc[train_idx]
        y_train = target_data.iloc[train_idx]
        
        
        
        X_valid=train_data.iloc[valid_idx]
        y_valid = target_data.iloc[valid_idx]
        
        optimizer = tf.keras.optimizers.Adam(lr = learning_rate_init, decay = 0.00001,clipnorm=1.)
        model = get_model()
        lrs = [0.001]*7+[0.0001]*10+[0.00001]*5
        lr_schd = LearningRateScheduler(lambda ep: lrs[ep], verbose=1)
        loss_monitor = auc_score_monitor(X_valid, y_valid, 
                                                   checkpoint_file=None, reduce_lr_patience=2, early_stop_patience=4, 
                                                   factor=0.1) 
        callbacks = [lr_schd,loss_monitor]
        #callbacks.append(tf.keras.callbacks.LearningRateScheduler(lr_scheduler))
        model.compile(optimizer= optimizer, loss='binary_crossentropy')
        print (model.summary())
        model.fit(X_train, y_train, validation_data=(X_valid,  y_valid), epochs=epochs, verbose=2, batch_size=batch_size, callbacks=callbacks)

        preds_nn_oof[valid_idx]= model.predict(X_valid).ravel()
        #preds_nn_test += model.predict(X_test)/n_splits
        

    
    
    return preds_nn_oof

In [12]:
X.fillna(0,inplace=True)

In [None]:
preds_nn_oof= train_NN(X, y)
print("CV score: {:<8.5f}".format(roc_auc_score(y, preds_nn_oof)))

## Ensemble blending with RF (Unused)

In [49]:
from sklearn.ensemble import RandomForestClassifier
rnd_forest_blender = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
rnd_forest_blender.fit(X_train_predictions, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

In [51]:
X_val_predictions = np.empty((len(X_valid), 3), dtype=np.float32)
X_val_predictions[:, 0] = lgb_clf.predict(X_valid)

In [57]:
X_val_predictions[:, 2] = dnn_clf.predict(X_valid).ravel()

In [58]:
y_pred = rnd_forest_blender.predict(X_val_predictions)

## Ensemble blending

In [None]:
# pred_nn_test from DNN, predictions from lgb
final_pred=pred_nn_test*0.4+predictions*0.6

## Submission

In [None]:
submission=pd.DataFrame()
submission['ID_code']=test['ID_code']
def generate_submission(pred, submission):
    submission['target']=pred
    submission.to_csv('submission/submission_blend.csv', index=False)
    return submission

In [None]:
generate_submission(final_pred, submission)