In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from  sklearn.preprocessing import LabelEncoder
from  sklearn.preprocessing import OneHotEncoder
import scipy 
from scipy.sparse import coo_matrix, hstack

from __future__ import division
from scipy.special import erfinv


from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score


In [2]:
df_train = pd.read_csv("../data/new_train_full.csv").fillna(0.0)
df_test = pd.read_csv("../data/new_test_full.csv").fillna(0.0)

In [3]:
df_train = df_train.drop( ["TARGET"] , axis = 1 ) 
test_ids = df_test["SK_ID_CURR"].values
#df_test = df_test.drop( ["SK_ID_CURR"] , axis = 1 ) 

In [4]:
df_labels = pd.read_csv("../data/labels_train.csv" , header = None )
labels_train = df_labels[1]
labels_train.shape

(307511,)

In [5]:
positive_data_indx = df_labels[ df_labels[1] == 1 ]
positive_data = df_train.iloc[ positive_data_indx.index ]
#print( positive_data.shape )
negative_data_indx = df_labels[  df_labels[1] == 0 ]
negative_data = df_train.iloc[  negative_data_indx.index ]
#print( negative_data.shape )

In [6]:
positive_ratio = float(len(positive_data)) / len(df_train)


In [7]:
cols = [x for x  in list(df_train.columns) if "SK" in x ]
df_train = df_train.drop( cols , axis = 1 )
df_test = df_test.drop( cols , axis = 1 )
print( df_train.shape )
print(df_test.shape)

(307511, 714)
(48744, 714)


In [8]:
labels_train = df_labels[1].values

In [9]:
features_train_num = df_train.select_dtypes(include=[np.number])
features_test_num = df_test.select_dtypes(include=[np.number])
features_train_cat =df_train.select_dtypes(include=[object])
features_test_cat = df_test.select_dtypes(include=[object])

In [10]:
cats_not_doc = [ x for x in features_train_num.columns if not x.startswith("FLAG_DOCUMENT")]
features_train_num = features_train_num[ cats_not_doc ]
features_test_num = features_test_num[ cats_not_doc ]

In [11]:
not_flags = [ x for x in features_train_num.columns if  not x.startswith("FLAG")]
flags = [ x for x in features_train_num.columns if x.startswith("FLAG")]

new_cats_train = features_train_num[ flags ]
new_cats_test = features_test_num[ flags ]

features_train_num = features_train_num.drop( flags , axis = 1 )
features_test_num = features_test_num.drop( flags , axis = 1 )

features_train_cat = pd.concat( [features_train_cat  , new_cats_train ]  , axis = 1 )
features_test_cat = pd.concat( [features_test_cat , new_cats_test] , axis = 1  )


features_train_num.head()


Unnamed: 0,LIVINGAPARTMENTS_MODE,REGION_POPULATION_RELATIVE,DAYS_ID_PUBLISH,DAYS_REGISTRATION,COMMONAREA_AVG,OBS_60_CNT_SOCIAL_CIRCLE,LIVINGAREA_MODE,ENTRANCES_AVG,LIVINGAREA_MEDI,ENTRANCES_MEDI,...,MAX(prev.MAX(card.AMT_DRAWINGS_ATM_CURRENT)),STD(prev.MAX(card.AMT_DRAWINGS_OTHER_CURRENT)),MAX(prev.AMT_GOODS_PRICE),LAST(bureau.CNT_CREDIT_PROLONG),MAX(prev.MEAN(card.AMT_BALANCE)),MAX(prev.MEAN(pos.CNT_INSTALMENT)),MAX(prev.STD(card.AMT_INST_MIN_REGULARITY)),STD(prev.MAX(card.AMT_PAYMENT_TOTAL_CURRENT)),MAX(prev.MAX(card.AMT_RECEIVABLE_PRINCIPAL)),LAST(prev.LAST(instal.NUM_INSTALMENT_VERSION))
0,0.022,0.018801,-2120,-3648.0,0.0143,2.0,0.0198,0.069,0.0193,0.069,...,0.0,0.0,179055.0,0.0,0.0,24.0,0.0,0.0,0.0,1.0
1,0.079,0.003541,-291,-1186.0,0.0605,1.0,0.0554,0.0345,0.0558,0.0345,...,0.0,0.0,900000.0,0.0,0.0,12.0,0.0,0.0,0.0,1.0
2,0.0,0.010032,-2531,-4260.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,24282.0,0.0,0.0,3.75,0.0,0.0,0.0,2.0
3,0.0,0.008019,-2437,-9833.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,688500.0,0.0,0.0,16.666667,0.0,0.0,0.0,0.0
4,0.0,0.028663,-3458,-4311.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,247500.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0


In [12]:
for col in features_train_cat.columns:
    
    df_ = pd.concat( [features_train_cat[col] , features_test_cat[col] ]   )
    
    enc = LabelEncoder()
    enc.fit( df_.values ) 
    features_train_cat[col] = enc.transform( features_train_cat[col] )
    features_test_cat[col] = enc.transform( features_test_cat[col] )
    

In [13]:
features_test_cat.head()

Unnamed: 0,WEEKDAY_APPR_PROCESS_START,ORGANIZATION_TYPE,FLAG_OWN_REALTY,NAME_TYPE_SUITE,FLAG_OWN_CAR,NAME_CONTRACT_TYPE,NAME_HOUSING_TYPE,NAME_FAMILY_STATUS,WALLSMATERIAL_MODE,OCCUPATION_TYPE,...,LAST(prev.NAME_PRODUCT_TYPE),LAST(prev.NAME_PORTFOLIO),LAST(prev.NAME_GOODS_CATEGORY),LAST(prev.NAME_CONTRACT_TYPE),FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_EMAIL,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMP_PHONE
0,5,28,1,7,0,0,1,1,6,0,...,1,4,19,2,1,0,1,1,0,1
1,0,42,1,7,0,0,1,1,0,10,...,1,4,19,2,1,0,0,1,0,1
2,1,54,1,0,1,0,1,1,0,5,...,1,4,19,2,1,0,0,1,0,1
3,6,5,1,7,0,0,1,1,5,15,...,1,4,5,2,1,0,0,1,1,1
4,0,5,0,7,1,0,1,1,0,0,...,1,4,5,2,1,1,0,1,0,1


In [14]:
features_test_cat.shape

(48744, 44)

In [15]:
#features_train_cat = features_train_cat[cats_not_doc]
#features_test_cat = features_test_cat[cats_not_doc]

In [16]:
df_ = pd.concat( [features_train_cat , features_test_cat ] , axis = 0 )
enc = OneHotEncoder()
enc.fit( df_ )
onehot_train = enc.transform( features_train_cat )
onehot_test = enc.transform( features_test_cat )

In [17]:
features_test_cat.shape

(48744, 44)

In [18]:
onehot_test.shape

(48744, 366)

In [19]:
features_train_num.shape

(307511, 650)

In [20]:
def rank_gauss(x):
    # x is numpy vector
    N = x.shape[0]
    temp = x.argsort()
    rank_x = temp.argsort() / N
    rank_x -= rank_x.mean()
    rank_x *= 2 # rank_x.max(), rank_x.min() should be in (-1, 1)
    efi_x = erfinv(rank_x) # np.sqrt(2)*erfinv(rank_x)
    efi_x -= efi_x.mean()
    return efi_x

In [21]:
#features_train_num2 =  features_train_num.apply(  rank_gauss )

In [22]:
#features_test_num2 = features_test_num.apply( rank_gauss )

In [23]:
s = features_train_num.shape[0]
for col in features_train_num.columns:
    #plt.hist( features_train_num2[col].values )
    print( col )
    zeros = np.count_nonzero( features_train_num[col].values )
    print( "Ratios nonzeros/total" , zeros/s )
    #plt.title( fn )
    #plt.show()

LIVINGAPARTMENTS_MODE
('Ratios nonzeros/total', 0.31476272393507876)
REGION_POPULATION_RELATIVE
('Ratios nonzeros/total', 1.0)
DAYS_ID_PUBLISH
('Ratios nonzeros/total', 0.9999479693409341)
DAYS_REGISTRATION
('Ratios nonzeros/total', 0.9997398467046708)
COMMONAREA_AVG
('Ratios nonzeros/total', 0.27382435099882607)
OBS_60_CNT_SOCIAL_CIRCLE
('Ratios nonzeros/total', 0.4611997619597348)
LIVINGAREA_MODE
('Ratios nonzeros/total', 0.4966228850350069)
ENTRANCES_AVG
('Ratios nonzeros/total', 0.4954619509546)
LIVINGAREA_MEDI
('Ratios nonzeros/total', 0.49709441288279116)
ENTRANCES_MEDI
('Ratios nonzeros/total', 0.4954424394574503)
APARTMENTS_AVG
('Ratios nonzeros/total', 0.490060518160326)
TOTALAREA_MODE
('Ratios nonzeros/total', 0.5154222125387384)
REG_CITY_NOT_WORK_CITY
('Ratios nonzeros/total', 0.23045354475124466)
ELEVATORS_AVG
('Ratios nonzeros/total', 0.18829245132694442)
EXT_SOURCE_2
('Ratios nonzeros/total', 0.9978537353135335)
AMT_CREDIT
('Ratios nonzeros/total', 1.0)
DEF_30_CNT_SOCIAL_

('Ratios nonzeros/total', 0.942275235682626)
LAST(prev.STD(card.MONTHS_BALANCE))
('Ratios nonzeros/total', 0.05209894930587849)
MAX(prev.LAST(card.AMT_INST_MIN_REGULARITY))
('Ratios nonzeros/total', 0.10208415308720664)
MAX(prev.LAST(card.CNT_DRAWINGS_ATM_CURRENT))
('Ratios nonzeros/total', 0.04010913430739063)
STD(prev.DAYS_FIRST_DRAWING)
('Ratios nonzeros/total', 0.5894780999704076)
MAX(prev.STD(card.CNT_INSTALMENT_MATURE_CUM))
('Ratios nonzeros/total', 0.16624771146398015)
STD(bureau.AMT_CREDIT_SUM_DEBT)
('Ratios nonzeros/total', 0.5617880335987981)
LAST(prev.LAST(card.MONTHS_BALANCE))
('Ratios nonzeros/total', 0.05256722523747118)
LAST(prev.MEAN(card.MONTHS_BALANCE))
('Ratios nonzeros/total', 0.05256722523747118)
MAX(prev.RATE_DOWN_PAYMENT)
('Ratios nonzeros/total', 0.6394925709974603)
MEAN(prev.LAST(card.AMT_TOTAL_RECEIVABLE))
('Ratios nonzeros/total', 0.11398616634852086)
LAST(prev.MAX(pos.CNT_INSTALMENT_FUTURE))
('Ratios nonzeros/total', 0.6204428459469743)
LAST(prev.MAX(card.CN

('Ratios nonzeros/total', 0.004035627993795344)
LAST(prev.MAX(instal.DAYS_ENTRY_PAYMENT))
('Ratios nonzeros/total', 0.656249695132857)
LAST(prev.MAX(pos.MONTHS_BALANCE))
('Ratios nonzeros/total', 0.6214704514635249)
STD(prev.LAST(card.CNT_DRAWINGS_POS_CURRENT))
('Ratios nonzeros/total', 0.043133416365593424)
MEAN(prev.MAX(pos.CNT_INSTALMENT_FUTURE))
('Ratios nonzeros/total', 0.932978007290796)
MAX(prev.MAX(instal.NUM_INSTALMENT_NUMBER))
('Ratios nonzeros/total', 0.941124057350794)
MAX(prev.STD(pos.CNT_INSTALMENT_FUTURE))
('Ratios nonzeros/total', 0.9321942954886167)
MEAN(prev.MAX(card.CNT_DRAWINGS_CURRENT))
('Ratios nonzeros/total', 0.1706963328141107)
LAST(prev.LAST(pos.CNT_INSTALMENT))
('Ratios nonzeros/total', 0.6168429747228554)
STD(prev.STD(card.AMT_CREDIT_LIMIT_ACTUAL))
('Ratios nonzeros/total', 0.13456104009287473)
MEAN(prev.MAX(instal.AMT_INSTALMENT))
('Ratios nonzeros/total', 0.9411175535184108)
STD(prev.LAST(card.MONTHS_BALANCE))
('Ratios nonzeros/total', 0.2494057123159822)


('Ratios nonzeros/total', 0.011046759302919245)
MAX(prev.STD(card.AMT_DRAWINGS_POS_CURRENT))
('Ratios nonzeros/total', 0.10445805190708625)
STD(bureau.AMT_ANNUITY)
('Ratios nonzeros/total', 0.16404941611844778)
LAST(prev.LAST(card.AMT_RECEIVABLE_PRINCIPAL))
('Ratios nonzeros/total', 0.02291950531850893)
MEAN(prev.MAX(card.CNT_DRAWINGS_POS_CURRENT))
('Ratios nonzeros/total', 0.1052872905359483)
STD(prev.CNT_PAYMENT)
('Ratios nonzeros/total', 0.7396548416154218)
MEAN(prev.MAX(card.CNT_DRAWINGS_ATM_CURRENT))
('Ratios nonzeros/total', 0.1461020906569196)
MEAN(prev.MAX(pos.MONTHS_BALANCE))
('Ratios nonzeros/total', 0.9331926337594427)
LAST(prev.LAST(card.AMT_RECIVABLE))
('Ratios nonzeros/total', 0.024418638682843864)
LAST(bureau.MEAN(bureau_bal.MONTHS_BALANCE))
('Ratios nonzeros/total', 0.29626257272097584)
STD(prev.MAX(card.CNT_DRAWINGS_ATM_CURRENT))
('Ratios nonzeros/total', 0.14304854135299225)
STD(prev.MAX(card.CNT_DRAWINGS_OTHER_CURRENT))
('Ratios nonzeros/total', 0.014184858427828599)

In [24]:
features_train_num_sp = scipy.sparse.csr_matrix(features_train_num.values)
features_test_num_sp = scipy.sparse.csr_matrix(features_test_num.values)


In [25]:
features_test_cat.head()

Unnamed: 0,WEEKDAY_APPR_PROCESS_START,ORGANIZATION_TYPE,FLAG_OWN_REALTY,NAME_TYPE_SUITE,FLAG_OWN_CAR,NAME_CONTRACT_TYPE,NAME_HOUSING_TYPE,NAME_FAMILY_STATUS,WALLSMATERIAL_MODE,OCCUPATION_TYPE,...,LAST(prev.NAME_PRODUCT_TYPE),LAST(prev.NAME_PORTFOLIO),LAST(prev.NAME_GOODS_CATEGORY),LAST(prev.NAME_CONTRACT_TYPE),FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_EMAIL,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMP_PHONE
0,5,28,1,7,0,0,1,1,6,0,...,1,4,19,2,1,0,1,1,0,1
1,0,42,1,7,0,0,1,1,0,10,...,1,4,19,2,1,0,0,1,0,1
2,1,54,1,0,1,0,1,1,0,5,...,1,4,19,2,1,0,0,1,0,1
3,6,5,1,7,0,0,1,1,5,15,...,1,4,5,2,1,0,0,1,1,1
4,0,5,0,7,1,0,1,1,0,0,...,1,4,5,2,1,1,0,1,0,1


In [26]:
#features_train_t = hstack( [ onehot_train , features_train_num_sp ]   )
#features_test_t = hstack( [ onehot_test , features_test_num_sp ]   )

features_train_t = pd.concat( [ features_train_cat , features_train_num ] , axis = 1 )
features_test_t = pd.concat( [ features_test_cat , features_test_num ] , axis = 1 )

In [27]:
#pd.DataFrame( features_train_t.todense() ).to_csv("../data/sparse/train2.csv" , index = False )
#features_train_t.to_csv("../data/sparse/train_new2.csv" , index = False) 

In [28]:
#pd.DataFrame( features_test_t.todense() ).to_csv("../data/sparse/test2.csv" , index = False )
#features_test_t.to_csv("../data/sparse/test_new2.csv" , index = False) 

In [29]:
#scipy.sparse.save_npz("../data/features_train_onehot.bin", features_train_t , compressed=True)
#scipy.sparse.save_npz("../data/features_test_onehot.bin", features_test_t , compressed=True)

In [30]:
x_train , x_val , y_train , y_val = train_test_split( features_train_t , labels_train , test_size = 0.3 , random_state=42)

In [31]:
x_train.shape

(215257, 694)

In [32]:
ncat = features_train_cat.shape[1]
ncat

44

In [33]:
#y_val.sum()/y_val.shape[0]

In [34]:
#lgb_train = lgb.Dataset( x_train , y_train)
#lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)
learning_rate = 0.1
num_leaves = 15
min_data_in_leaf = 2000
feature_fraction = 0.8
num_boost_round = 10000
params = {"objective": "binary",
          "boosting_type": "gbdt",
          "learning_rate": learning_rate,
          "metric":["auc" ,"binary_logloss"] , 
          "num_leaves": num_leaves,
           "max_bin": 256,
          "feature_fraction": feature_fraction,
          "verbosity": 1,
          "drop_rate": 0.1,
          "is_unbalance": False,
          "max_drop": 50,
          "min_child_samples": 10,
          "min_child_weight": 150,
          "min_split_gain": 0,
          "subsample": 0.9 , 
}


In [41]:
NFOLDS = 3
kfold = KFold(n_splits=NFOLDS, shuffle=True, random_state=666)
#X = features_train_t.values
X_test = features_test_t.values

positive_data = features_train_t.iloc[ positive_data_indx.index ]
negative_data = features_train_t.iloc[ negative_data_indx.index ]

X_full = features_train_t.copy()
labels_full = df_labels[1].values
print( X_full.shape )
print( labels_full.shape) 

final_cv_train = np.zeros(len(labels_train))

final_cv_pred = np.zeros(len( test_ids ))
x_score = []

best_trees = []
fold_scores = []

for s in range(5):
    cv_train = np.zeros(len(labels_train))
    cv_pred = np.zeros(len(test_ids))

    params['seed'] = s
    
    negative_data_sample = negative_data.sample( frac= positive_ratio / (1 - positive_ratio), random_state=s)
    positive_data_sample = positive_data.sample(  frac = 0.7 , random_state = s*50 )
    labels_neg = [  0 for x in range( negative_data_sample.shape[0] ) ]
    labels_pos = [ 1 for x in range( positive_data.shape[0] )   ]
    labels_train = labels_pos + labels_neg
    
    X_train = pd.concat( [positive_data , negative_data_sample] , axis = 0  )
    X_train["y"] = labels_train
    X_train = X_train.sample( frac = 1 )
    labels_train = X_train["y"].values
    X = X_train.drop( ["y"] , axis = 1 ).values
    print("Shape Sampled data ")
    print( X.shape )

    #X_train, X_validate, label_train, label_validate = X[train_fold, :], X[validate, :], labels_train[train_fold], labels_train[validate]
    dtrain = lgb.Dataset( X, labels_train ,  categorical_feature= range(ncat))

    dvalid = lgb.Dataset(   X_full , labels_full, reference=dtrain , categorical_feature=range(ncat))
    bst = lgb.train(params, dtrain , num_boost_round , valid_sets=dvalid , verbose_eval = 100 )
    best_trees.append(bst.best_iteration)
    final_cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
    #cv_train[validate] += bst.predict(X_validate)
    y_preds_full += bst.predict( X_full )
    

    score = roc_auc_score( labels_full , y_preds_full )
    print( score )
    fold_scores.append(score)

    #cv_pred /= NFOLDS
    #final_cv_train += cv_train
    #final_cv_pred += y_preds_full


    #x_score.append( y_preds_full )

(307511, 694)
(307511,)
Shape Sampled data 
(49650, 694)
[100]	valid_0's binary_logloss: 0.569094	valid_0's auc: 0.790184
[200]	valid_0's binary_logloss: 0.560619	valid_0's auc: 0.806816
[300]	valid_0's binary_logloss: 0.556708	valid_0's auc: 0.81813
[400]	valid_0's binary_logloss: 0.554308	valid_0's auc: 0.827074
[500]	valid_0's binary_logloss: 0.552127	valid_0's auc: 0.835288
[600]	valid_0's binary_logloss: 0.550512	valid_0's auc: 0.842333
[700]	valid_0's binary_logloss: 0.549594	valid_0's auc: 0.848472
[800]	valid_0's binary_logloss: 0.548631	valid_0's auc: 0.854294
[900]	valid_0's binary_logloss: 0.548138	valid_0's auc: 0.859395
[1000]	valid_0's binary_logloss: 0.547383	valid_0's auc: 0.864613
[1100]	valid_0's binary_logloss: 0.546967	valid_0's auc: 0.86893
[1200]	valid_0's binary_logloss: 0.546571	valid_0's auc: 0.873076
[1300]	valid_0's binary_logloss: 0.546053	valid_0's auc: 0.877125
[1400]	valid_0's binary_logloss: 0.545375	valid_0's auc: 0.881045
[1500]	valid_0's binary_loglos

[2500]	valid_0's binary_logloss: 0.544901	valid_0's auc: 0.909935
[2600]	valid_0's binary_logloss: 0.545137	valid_0's auc: 0.911779
[2700]	valid_0's binary_logloss: 0.545719	valid_0's auc: 0.913426
[2800]	valid_0's binary_logloss: 0.545952	valid_0's auc: 0.915118
[2900]	valid_0's binary_logloss: 0.54674	valid_0's auc: 0.916639
[3000]	valid_0's binary_logloss: 0.547204	valid_0's auc: 0.918038
[3100]	valid_0's binary_logloss: 0.547741	valid_0's auc: 0.919451
[3200]	valid_0's binary_logloss: 0.548309	valid_0's auc: 0.920833
[3300]	valid_0's binary_logloss: 0.548985	valid_0's auc: 0.922094
[3400]	valid_0's binary_logloss: 0.549593	valid_0's auc: 0.92324
[3500]	valid_0's binary_logloss: 0.550272	valid_0's auc: 0.924352
[3600]	valid_0's binary_logloss: 0.551082	valid_0's auc: 0.925426
[3700]	valid_0's binary_logloss: 0.551611	valid_0's auc: 0.926518
[3800]	valid_0's binary_logloss: 0.552389	valid_0's auc: 0.927486
[3900]	valid_0's binary_logloss: 0.55321	valid_0's auc: 0.92845
[4000]	valid_0

[4900]	valid_0's binary_logloss: 0.554794	valid_0's auc: 0.937531
[5000]	valid_0's binary_logloss: 0.555611	valid_0's auc: 0.938163
[5100]	valid_0's binary_logloss: 0.556825	valid_0's auc: 0.938711
[5200]	valid_0's binary_logloss: 0.557912	valid_0's auc: 0.939267
[5300]	valid_0's binary_logloss: 0.559016	valid_0's auc: 0.939795
[5400]	valid_0's binary_logloss: 0.559841	valid_0's auc: 0.940387
[5500]	valid_0's binary_logloss: 0.560759	valid_0's auc: 0.940918
[5600]	valid_0's binary_logloss: 0.561673	valid_0's auc: 0.941479
[5700]	valid_0's binary_logloss: 0.562754	valid_0's auc: 0.941978
[5800]	valid_0's binary_logloss: 0.563796	valid_0's auc: 0.942483
[5900]	valid_0's binary_logloss: 0.564808	valid_0's auc: 0.942926
[6000]	valid_0's binary_logloss: 0.56583	valid_0's auc: 0.943397
[6100]	valid_0's binary_logloss: 0.567109	valid_0's auc: 0.943778
[6200]	valid_0's binary_logloss: 0.568054	valid_0's auc: 0.944261
[6300]	valid_0's binary_logloss: 0.569151	valid_0's auc: 0.94466
[6400]	valid

[7300]	valid_0's binary_logloss: 0.583463	valid_0's auc: 0.948473
[7400]	valid_0's binary_logloss: 0.584504	valid_0's auc: 0.948807
[7500]	valid_0's binary_logloss: 0.585955	valid_0's auc: 0.949098
[7600]	valid_0's binary_logloss: 0.587136	valid_0's auc: 0.949395
[7700]	valid_0's binary_logloss: 0.588368	valid_0's auc: 0.949701
[7800]	valid_0's binary_logloss: 0.5896	valid_0's auc: 0.949992
[7900]	valid_0's binary_logloss: 0.590778	valid_0's auc: 0.950254
[8000]	valid_0's binary_logloss: 0.592065	valid_0's auc: 0.950549
[8100]	valid_0's binary_logloss: 0.593213	valid_0's auc: 0.95083
[8200]	valid_0's binary_logloss: 0.59447	valid_0's auc: 0.951087
[8300]	valid_0's binary_logloss: 0.595714	valid_0's auc: 0.951363
[8400]	valid_0's binary_logloss: 0.596953	valid_0's auc: 0.95161
[8500]	valid_0's binary_logloss: 0.598114	valid_0's auc: 0.951868
[8600]	valid_0's binary_logloss: 0.599328	valid_0's auc: 0.952069
[8700]	valid_0's binary_logloss: 0.600526	valid_0's auc: 0.952319
[8800]	valid_0'

[9700]	valid_0's binary_logloss: 0.612234	valid_0's auc: 0.953607
[9800]	valid_0's binary_logloss: 0.613055	valid_0's auc: 0.953768
[9900]	valid_0's binary_logloss: 0.614288	valid_0's auc: 0.95386
[10000]	valid_0's binary_logloss: 0.615117	valid_0's auc: 0.953989
0.9539893016922211


In [47]:
final = cv_pred/5.0

In [48]:
final

array([0.03405892, 0.17835744, 0.02591243, ..., 0.00540704, 0.04952406,
       0.11942879])

In [44]:
final_cv_pred.sum()

0.0

In [42]:
print( x_score )

[]


In [None]:
print( np.array(x_score).mean() )

In [None]:
preds = final_cv_pred / 16.0

In [None]:
preds

In [None]:
#df_preds_nn = pd.read_csv("../data/preds_nn_new.csv" )

In [None]:
df_preds_nn.head()

In [None]:
plt.hist( final_cv_pred/16.0 )
plt.show()

In [49]:
pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': final }).to_csv('../data/pred_gbm_full.csv', index=False)

In [None]:

best_trees

In [None]:
x_train , x_val , y_train , y_val = train_test_split( features_train_t , labels_train , test_size = 0.3 , random_state=42)

In [None]:
dtrain = lgb.Dataset( x_train, y_train)
dvalid = lgb.Dataset( x_val , y_val, reference=dtrain)
bst = lgb.train(params, dtrain, num_boost_round, valid_sets=dvalid , verbose_eval=100,early_stopping_rounds=100)

In [None]:
print('Plot feature importances...')
ax = lgb.plot_importance( bst , max_num_features=10)
plt.show()