In [166]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action ='ignore')
import os
import gc
pd.options.display.max_rows = 99
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from numba import jit

In [222]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [223]:
train['flag'], test['flag'] = 'train','test'
target = train['target']
full_df = pd.concat([train.drop(['target'],axis =1 ),test])
full_df.head()


Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin,flag
0,7,2,2,5,1,0,0,1,0,0,...,1,5,8,0,1,1,0,0,1,train
1,9,1,1,7,0,0,0,0,1,0,...,1,1,9,0,1,1,0,1,0,train
2,13,5,4,9,1,0,0,0,1,0,...,2,7,7,0,1,1,0,1,0,train
3,16,0,1,2,0,0,1,0,0,0,...,2,4,9,0,0,0,0,0,0,train
4,17,0,2,0,1,0,1,0,0,0,...,1,1,3,0,0,0,1,1,0,train


### Meta Table 구축

In [224]:
data = []
for f in full_df.columns:
    # Defining the role
    if f == 'target':
        role = 'target'
    elif f == 'id':
        role = 'id'
    else:
        role = 'input'
         
    # Defining the level
    if 'bin' in f or f == 'target':
        level = 'binary'
    elif 'cat' in f or f == 'id':
        level = 'nominal'
    elif train[f].dtype == float:
        level = 'interval'
    elif train[f].dtype == 'int64':
        level = 'ordinal'
        
    # Initialize keep to True for all variables except for id
    keep = True
    if f == 'id':
        keep = False
    
    # Defining the data type 
    dtype = full_df[f].dtype
    
    # Creating a Dict that contains all the metadata for the variable
    f_dict = {
        'varname': f,
        'role': role,
        'level': level,
        'keep': keep,
        'dtype': dtype
    }
    data.append(f_dict)
    
meta = pd.DataFrame(data, columns=['varname', 'role', 'level', 'keep', 'dtype'])
meta.set_index('varname', inplace=True)

In [225]:
meta.head()

Unnamed: 0_level_0,role,level,keep,dtype
varname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
id,id,nominal,False,int64
ps_ind_01,input,ordinal,True,int64
ps_ind_02_cat,input,nominal,True,int64
ps_ind_03,input,ordinal,True,int64
ps_ind_04_cat,input,nominal,True,int64


### Feature engineering

- ind / reg / car / calc  데이터
- _cat / _bin 
- 아무것도 없는 컬럼을 type으로 interval / ordinal로 구분


#### Null Value 처리

In [226]:
#  -1-> Null
full_df = full_df.replace(-1,np.nan)

In [227]:
# interval 
cols =list(meta[(meta['level']=='interval') & (meta['keep']==True)].index)
#  Null Value  Portion
# print(full_df[cols].isnull().sum()/full_df[cols].shape[0]*100)
# 평균으로 처리
full_df[cols] = full_df[cols].fillna(full_df[cols].mean())


In [228]:
# ordinal : 순서형 자료 (매우높다, 높다, 낮다 등)
cols =list(meta[(meta['level']=='ordinal') & (meta['keep']==True)].index)
#  Null Value  Portion
# print(full_df[cols].isnull().sum()/full_df[cols].shape[0]*100)
# 최빈값을 넣어두자 
full_df[cols] = full_df[cols].fillna(full_df[cols].mode().iloc[0][0])



In [229]:
# Norminal  : 카테고리 변수 --
cols =list(meta[(meta['level']=='nominal') & (meta['keep']==True)].index)
#  Null Value  Portion
# print(full_df[cols].isnull().sum()/full_df[cols].shape[0]*100)
# Null 값을 하나의 카테고리로 분류  = '-999' 입력
full_df[cols] = full_df[cols].fillna('-999')


In [230]:
# binary 
cols =list(meta[(meta['level']=='binary') & (meta['keep']==True)].index)
#  Null Value  Portion
# print(full_df[cols].isnull().sum()/full_df[cols].shape[0]*100)


#### Freq encoding 
- 각 카테고리 컬럼의 빈도수를 카운트 한 것

In [231]:
# Nominal value에 대해
cols =list(meta[(meta['level']=='nominal') & (meta['keep']==True)].index)

for col in cols :
    col_name = '%s_count_full' % col
    full_df[col_name] = full_df[col].map(full_df[col].value_counts())

#### aggregation

In [34]:
# # Catgory column별로 interval value 평균과 편차는 무슨 의미가 있을까

# # full_df.head()
# cat_cols =list(meta[(meta['level']=='nominal') & (meta['keep']==True)].index)
# cols = list(meta[(meta['level']!='nominal') & (meta['keep']==True)].index)
# cols.remove('flag')
# newcol_name = [x+'_avg' for x in cols]
# for col in tqdm(cat_cols) :
#     tmp = [x+'_for_%s'%col for x in newcol_name]
#     for c,original  in zip(tmp,cols) :
#         full_df[c] = full_df[col].map(full_df.groupby(col)[original].mean())
# #     print(full_df.groupby(col)[cols].mean())
   


#### interaction
- 상관관계를 기반으로 곱하기를 해보자


Car features
ps_car_12 are (with some approximations) square roots (divided by 10) of natural numbers whilst ps_car_15 are square roots of natural numbers. Let's represent the values using pairplot.

In [35]:
# sample = trainset.sample(frac=0.05)
# var = ['ps_car_12', 'ps_car_15', 'target']
# sample = sample[var]
# sns.pairplot(sample,  hue='target', palette = 'Set1', diag_kind='kde')
# plt.show()

### Feature Selection



Thanks to the public kernels (wheel of fortune eg.) that suggest to remove *calc features,

In [232]:
cols = [x for x in meta.index if 'calc' in x]
meta.loc[cols,'keep'] = False


In [37]:
# ## Removing featues with low variance
# from sklearn.feature_selection import VarianceThreshold
# sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
# sel.fit_transform(full_df)

In [38]:
# Tree-based Feature selection
#https://scikit-learn.org/stable/modules/feature_selection.html



### Label_encoding

In [110]:

# # nominal value에 대해
# cols =list(meta[(meta['level']=='nominal') & (meta['keep']==True)].index)
# for col in tqdm(cols) :
#     lbl = LabelEncoder()
#     full_df[col] = lbl.fit_transform(list(full_df[col].values))
    

100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:12<00:00,  1.74it/s]


# Getdummy


In [233]:
cols =list(meta[(meta['level']=='nominal') & (meta['keep']==True)].index)

temp=pd.get_dummies((full_df[cols]),drop_first=True)
full_df = pd.concat([full_df,temp],axis=1)
full_df = full_df.drop(cols,axis=1)

KeyError: "[('ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat')] not found in axis"

In [239]:
full_df = full_df.drop(cols,axis=1)

### DataSet

In [240]:
cols =list(meta[meta['keep']==False].index)
cols  =[x for x in full_df.columns if x not in cols]
full_df = full_df[cols]

In [241]:
full_df.head()
train_df = full_df[full_df['flag']=='train'].drop(['flag'],axis =1 )
test_df = full_df[full_df['flag']=='test'].drop(['flag'],axis =1 )
target = target

In [242]:
train_df.shape

(595212, 75)

### Target encoding
Target encoding with smoothing
min_samples_leaf define a threshold where prior and target mean (for a given category value) have the same weight. Below the threshold prior becomes more important and above mean becomes more important.

How weight behaves against value counts is controlled by smoothing parameter

In [66]:
# def add_noise(series, noise_level):
#     return series * (1 + noise_level * np.random.randn(len(series)))

# def target_encode(trn_series=None, 
#                   tst_series=None, 
#                   target=None, 
#                   min_samples_leaf=1, 
#                   smoothing=1,
#                   noise_level=0):
#     """
#     Smoothing is computed like in the following paper by Daniele Micci-Barreca
#     https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
#     trn_series : training categorical feature as a pd.Series
#     tst_series : test categorical feature as a pd.Series
#     target : target data as a pd.Series
#     min_samples_leaf (int) : minimum samples to take category average into account
#     smoothing (int) : smoothing effect to balance categorical average vs prior  
#     """ 
#     assert len(trn_series) == len(target)
#     assert trn_series.name == tst_series.name
#     temp = pd.concat([trn_series, target], axis=1)
#     # Compute target mean 
#     averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
#     # Compute smoothing
#     smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
#     # Apply average function to all target data
#     prior = target.mean()
#     # The bigger the count the less full_avg is taken into account
#     averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
#     averages.drop(["mean", "count"], axis=1, inplace=True)
#     # Apply averages to trn and tst series
#     ft_trn_series = pd.merge(
#         trn_series.to_frame(trn_series.name),
#         averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
#         on=trn_series.name,
#         how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
#     # pd.merge does not keep the index so restore it
#     ft_trn_series.index = trn_series.index 
#     ft_tst_series = pd.merge(
#         tst_series.to_frame(tst_series.name),
#         averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
#         on=tst_series.name,
#         how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
#     # pd.merge does not keep the index so restore it
#     ft_tst_series.index = tst_series.index
#     return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)


In [44]:
# # Nominal value에 대해
# cols =list(meta[(meta['level']=='nominal') & (meta['keep']==True)].index)

# for col in tqdm(cols):
#     col_name = '%s_target_enc' % col
#     train_df[col_name],test_df[col_name] = target_encode(train_df[col], 
#                                              test_df[col], 
#                                              target=target, 
#                                              min_samples_leaf=100,
#                                              smoothing=10,
#                                              noise_level=0.01)
    

In [45]:
train_df.head()

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_car_02_cat_count_full,ps_car_03_cat_count_full,ps_car_04_cat_count_full,ps_car_05_cat_count_full,ps_car_06_cat_count_full,ps_car_07_cat_count_full,ps_car_08_cat_count_full,ps_car_09_cat_count_full,ps_car_10_cat_count_full,ps_car_11_cat_count_full
0,2,2,5,2,1,0,1,0,0,0,...,1234979,1028142,1241334,431560,77845,1383070,249663,486510,1475460,18326
1,1,1,7,1,1,0,0,1,0,0,...,1234979,1028142,1241334,666910,329890,1383070,1238365,883326,1475460,12535
2,5,4,9,2,1,0,0,1,0,0,...,1234979,1028142,1241334,666910,147714,1383070,1238365,883326,1475460,19943
3,0,1,2,1,1,1,0,0,0,0,...,1234979,183044,1241334,431560,329890,1383070,1238365,36798,1475460,212989
4,0,2,0,2,1,1,0,0,0,0,...,1234979,1028142,1241334,666910,147714,1383070,1238365,883326,1475460,26161


# Feature Selection


In [244]:
# Feature elimination : Xgboost 결과 Feature Importance 0.01 미만 삭제
cols = ['ps_ind_10_bin',
 'ps_ind_11_bin',
 'ps_ind_12_bin',
 'ps_ind_13_bin',
 'ps_ind_14',
#  'ps_car_02_cat',
#  'ps_car_08_cat',
#  'ps_car_10_cat',
 'ps_car_02_cat_count_full',
 'ps_car_03_cat_count_full',
 'ps_car_04_cat_count_full',
 'ps_car_08_cat_count_full',
 'ps_car_10_cat_count_full']

train_df = train_df.drop(cols,axis =1)
test_df = test_df.drop(cols,axis = 1)

# Oversampling 
https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html
https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets

Apart from the random sampling with replacement, there are two popular methods to over-sample minority classes: (i) the Synthetic Minority Oversampling Technique (SMOTE) [CBHK2002] and (ii) the Adaptive Synthetic (ADASYN) [HBGL2008] sampling method


In [36]:
# from imblearn.over_sampling import SMOTE
# sm = SMOTE(random_state = 99)
# train_df_sm, target_sm = sm.fit_sample(train_df, target.ravel())
# # SMOTE.fit_sample(train_df,target)

In [58]:
# print(train_df.shape, train_df_sm.shape)
# print(target.shape, target_sm.shape)
# print(target[target==1].shape,np.sum(target_sm))
# print(sum(target==0),sum(target==1))  # target ==1인 데이터가 26배가 되었다.
# print(sum(target_sm==0),sum(target_sm==1))

(595212, 37) (1147036, 37)
(595212,) (1147036,)
(21694,) 573518
573518 21694
573518 573518


In [86]:
# train_df = train_df_sm.copy()
# target= target_sm.copy()
# train_df['target'] = target
# target= train_df['target']
# train_df = train_df.drop(['target'],axis =1)

### Eval_gini

JIT 컴파일(just-in-time compilation) 또는 동적 번역(dynamic translation)은 프로그램을 실제 실행하는 시점에
기계어로 번역하는 컴파일 기법이다. 이 기법은 프로그램의 실행 속도를 빠르게 하기 위해 사용된다.


출처: https://hamait.tistory.com/476 [HAMA 블로그]

In [202]:
# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
@jit

#
def eval_gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

# https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283



### Modeling

In [203]:
learning_rate = 0.01
early_stopping_round = 30

from sklearn.model_selection import KFold
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedKFold
from bayes_opt import BayesianOptimization


# Hyperparameter range
params_range = {
                'learning_rate' : (.005,.1),
                'max_depth' : (8,13),
                'subsample' : (0.7,0.9),
                'colsample_bytree' : (0.7,0.8),
                'reg_alpha' :( 0.1,1),
                'reg_lambda' : (0.5,1),                
               }


In [204]:
n_folds= 5
kf = KFold(n_splits =n_folds, random_state = 99, shuffle =True)
np.random.seed(99)
seed = 99

In [205]:
def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
#     print(dtrain,labels)
    gini_score = -eval_gini(labels, preds)
    return [('gini', gini_score)]

In [206]:
%%time
model = XGBClassifier( 
            n_estimators = 1000,
            max_depth =4,
    objective = "binary:logistic",
    learing_rateing_rate = learning_rate,
    subsample = .8,
    min_child_weight = 6,
    colsample_bytree = .8,
#     scale_pos_weight = 1.6,
    gamma = 10,
    reg_alpha = 8,
    reg_lambda =1.3,
    tree_method = 'gpu_hist'

)

Wall time: 0 ns


In [207]:
target.value_counts()

0    573518
1     21694
Name: target, dtype: int64

In [245]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42,sampling_strategy={1: target.value_counts()[1], 0: target.value_counts()[1]*3})
train_df_res, target_res = rus.fit_resample(train_df, target)


In [246]:
# Undersampling 5번한 결과를 평균 내기
scores = []
y_test_pred = []
fi =[]

RS = [42,99,199,209,5]
for rs in RS :
    
    rus = RandomUnderSampler(random_state=rs,sampling_strategy={1: target.value_counts()[1], 0: target.value_counts()[1]*10})
    train_df_res, target_res = rus.fit_resample(train_df, target)

    # train_df_res, target_res = train_df,target

    for i, (train_index, test_index) in enumerate(kf.split(train_df_res)):

        x_train,x_valid = train_df_res.iloc[train_index,:], train_df_res.iloc[test_index,:]
        y_train,y_valid = target_res.iloc[train_index], target_res.iloc[test_index]

        eval_set = [(x_valid,y_valid)]
        fit_model = model.fit(x_train,y_train,
                             eval_set =eval_set,
                             eval_metric = gini_xgb,
                            early_stopping_rounds = early_stopping_round,
                              verbose = 50
                             )
        print( "Best N tress = " , model.best_ntree_limit)
        print(" Best gini = ", -model.best_score)
        fi.append(model.feature_importances_)
    #     plot_importance(fit_model)
    #     pyplot.show()
    #     selection = SelectFromModel(fit_model,threshold = 0.15,prefit =True)
    #     sel_x_train = selection.transform(x_train)
    #     sel_x_valid = selection.transform(x_valid)
    #     s_model = XGBClassifier()
    #     eval_set = [(sel_x_valid,y_valid)]
    #     fit_model = s_model.fit(sel_x_train,y_train,
    #                      eval_set =eval_set,
    #                      eval_metric = gini_xgb,
    #                     early_stopping_rounds = early_stopping_round,
    #                       verbose = 50
    #                      )

        #  prediction
        pred = fit_model.predict_proba(x_valid)[:,1]
        print( " Gini = ", eval_gini(y_valid, pred))
        scores.append(eval_gini(y_valid, pred))
        y_test_pred.append(fit_model.predict_proba(test_df)[:,1])


        del x_train,x_valid,y_train,y_valid
        gc.collect()
print(" Cross validadation score, Gini " ,sum(scores)/len(scores))

[0]	validation_0-error:0.089781	validation_0-gini:-0.198655
Multiple eval metrics have been passed: 'validation_0-gini' will be used for early stopping.

Will train until validation_0-gini hasn't improved in 30 rounds.
[50]	validation_0-error:0.089781	validation_0-gini:-0.265492
[100]	validation_0-error:0.08974	validation_0-gini:-0.275089
[150]	validation_0-error:0.089698	validation_0-gini:-0.276127
Stopping. Best iteration:
[141]	validation_0-error:0.08974	validation_0-gini:-0.276445

Best N tress =  142
 Best gini =  0.276445
 Gini =  0.2764448817525018
[0]	validation_0-error:0.089446	validation_0-gini:-0.240748
Multiple eval metrics have been passed: 'validation_0-gini' will be used for early stopping.

Will train until validation_0-gini hasn't improved in 30 rounds.
[50]	validation_0-error:0.089446	validation_0-gini:-0.268455
[100]	validation_0-error:0.089446	validation_0-gini:-0.278029
[150]	validation_0-error:0.089425	validation_0-gini:-0.281183
[200]	validation_0-error:0.089425	

Will train until validation_0-gini hasn't improved in 30 rounds.
[50]	validation_0-error:0.093406	validation_0-gini:-0.258131
[100]	validation_0-error:0.093343	validation_0-gini:-0.268203
[150]	validation_0-error:0.093364	validation_0-gini:-0.268778
Stopping. Best iteration:
[160]	validation_0-error:0.093364	validation_0-gini:-0.269546

Best N tress =  161
 Best gini =  0.269546
 Gini =  0.2695457147318234
[0]	validation_0-error:0.092191	validation_0-gini:-0.141859
Multiple eval metrics have been passed: 'validation_0-gini' will be used for early stopping.

Will train until validation_0-gini hasn't improved in 30 rounds.
[50]	validation_0-error:0.092191	validation_0-gini:-0.27673
[100]	validation_0-error:0.092065	validation_0-gini:-0.289205
[150]	validation_0-error:0.092065	validation_0-gini:-0.293384
[200]	validation_0-error:0.092065	validation_0-gini:-0.293935
Stopping. Best iteration:
[181]	validation_0-error:0.092065	validation_0-gini:-0.294968

Best N tress =  182
 Best gini =  0.

In [249]:
scores = []
y_test_pred = []
fi =[]

rus = RandomUnderSampler(random_state=42)
train_df_res, target_res = rus.fit_resample(train_df, target)

for i, (train_index, test_index) in enumerate(kf.split(train_df)):
    
    x_train,x_valid = train_df.iloc[train_index,:], train_df.iloc[test_index,:]
    y_train,y_valid = target.iloc[train_index], target.iloc[test_index]
    
    eval_set = [(x_valid,y_valid)]
    fit_model = model.fit(x_train,y_train,
                         eval_set =eval_set,
                         eval_metric = gini_xgb,
                        early_stopping_rounds = early_stopping_round,
                          verbose = 50
                         )
    print( "Best N tress = " , model.best_ntree_limit)
    print(" Best gini = ", -model.best_score)
    fi.append(model.feature_importances_)
#     plot_importance(fit_model)
#     pyplot.show()
#     selection = SelectFromModel(fit_model,threshold = 0.15,prefit =True)
#     sel_x_train = selection.transform(x_train)
#     sel_x_valid = selection.transform(x_valid)
#     s_model = XGBClassifier()
#     eval_set = [(sel_x_valid,y_valid)]
#     fit_model = s_model.fit(sel_x_train,y_train,
#                      eval_set =eval_set,
#                      eval_metric = gini_xgb,
#                     early_stopping_rounds = early_stopping_round,
#                       verbose = 50
#                      )
    
    #  prediction
    pred = fit_model.predict_proba(x_valid)[:,1]
    print( " Gini = ", eval_gini(y_valid, pred))
    scores.append(eval_gini(y_valid, pred))
    y_test_pred.append(fit_model.predict_proba(test_df)[:,1])
    
    
    del x_train,x_valid,y_train,y_valid
    gc.collect()
print(" Cross validadation score, Gini " ,sum(scores)/n_folds)

[0]	validation_0-error:0.035903	validation_0-gini:-0.19446
Multiple eval metrics have been passed: 'validation_0-gini' will be used for early stopping.

Will train until validation_0-gini hasn't improved in 30 rounds.
[50]	validation_0-error:0.035903	validation_0-gini:-0.264741
[100]	validation_0-error:0.035903	validation_0-gini:-0.278715
[150]	validation_0-error:0.035911	validation_0-gini:-0.281516
[200]	validation_0-error:0.035911	validation_0-gini:-0.283295
[250]	validation_0-error:0.035911	validation_0-gini:-0.284051
Stopping. Best iteration:
[252]	validation_0-error:0.035911	validation_0-gini:-0.284224

Best N tress =  253
 Best gini =  0.284224
 Gini =  0.28422444118213763
[0]	validation_0-error:0.037012	validation_0-gini:-0.186816
Multiple eval metrics have been passed: 'validation_0-gini' will be used for early stopping.

Will train until validation_0-gini hasn't improved in 30 rounds.
[50]	validation_0-error:0.037012	validation_0-gini:-0.256922
[100]	validation_0-error:0.03701

In [250]:
from datetime import datetime
sub_df = pd.DataFrame()
sub_df['id'] = test['id']
sub_df['target'] = sum(y_test_pred)/len(y_test_pred)

sub_df.to_csv('submission_baseline_XGBOOST_' + str(datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + '.csv', index=False)



## 앙상블

In [259]:
lsts = ['submission_baseline_XGBOOST_2020-02-14_20-43-40.csv','submission_baseline_LGBM_2020-02-21_21-31-46.csv',
       'submission_baseline_XGBOOST_2020-02-21_22-21-37.csv','submission_baseline_XGBOOST_2020-02-21_22-17-22.csv',
       'kagglemix.csv','yckim_stack.csv']
#        'submission_baseline_LGBM_2020-02-09_20-48-06.csv']
preds = []
for lst in lsts:
    preds.append(pd.read_csv(lst)['target'])

from datetime import datetime
sub_df = pd.DataFrame()
sub_df['id'] = test['id']
sub_df['target'] = sum(preds)/len(preds)

sub_df.to_csv('submission_baseline_ENSEMBLE_' + str(datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + '.csv', index=False)

    



### Bayesianoptimization


In [None]:
# %%time
# paramlst = []
# scorelst =[]
# import xgboost as xgb

# def running(learning_rate, max_depth, subsample, colsample_bytree, reg_alpha, reg_lambda):
#     params = {
#     'objective': "binary:logistic",
#     'tree_method': 'gpu_hist',
#     "learning_rate": learning_rate,
#     "max_depth" : int(round(max_depth)),
#     "subsample": max(min(subsample,1),0),    
#     "colsample_bytree": max(min(colsample_bytree,1),0),
#     "reg_alpha": max(min(reg_alpha,1),0),
#     "reg_lambda": max(min(reg_lambda,1),0),
#     "n_jobs" : 10,
#     "missing" : -999,
        
#     }
    
#     print(params)
#     kf = KFold(n_splits=n_folds, shuffle=False, random_state=seed)
#     scores = [] 
#     models = []
#     for i,(train_index, val_index) in enumerate(kf.split(train_df)):
#         train_X = train_df.iloc[train_index]    
#         val_X = train_df.iloc[val_index]
#         train_y = target.iloc[train_index]
#         val_y = target.iloc[val_index]
#         xgb_train = xgb.DMatrix(train_X, train_y)
#         xgb_eval = xgb.DMatrix(val_X, val_y)
#         model = xgb.train(params, xgb_train, num_boost_round = 3000,evals=[(xgb_train, 'train'), (xgb_eval, 'val')],
#                     feval = gini_xgb ,verbose_eval = 100, early_stopping_rounds = 30)
#         print(" Best gini = ", -model.best_score)

#         pred = model.predict(xgb.DMatrix(val_X))#[:,1]
# #         print(pred)
#         print( " Gini = ", eval_gini(val_y, pred))
#         scores.append(eval_gini(val_y, pred))
#         y_test_pred.append(model.predict(xgb.DMatrix(test_df)))
    
#     paramlst.append(params)
#     scorelst.append(sum(scores)/n_folds)
#     print("Cross validation Score : ", sum(scores)/n_folds)
#     return eval_gini(val_y, pred)
    
# Baysian = BayesianOptimization(running, params_range,random_state = seed)
# Baysian.maximize(init_points = 5, n_iter = 20)

# results = pd.DataFrame()
# results['params'] = paramlst
# results['score'] = scorelst
# results.to_csv('resuts.csv',index=False)

|   iter    |  target   | colsam... | learni... | max_depth | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------
{'objective': 'binary:logistic', 'tree_method': 'gpu_hist', 'learning_rate': 0.05136744792785545, 'max_depth': 12, 'subsample': 0.8131234839221061, 'colsample_bytree': 0.7672278558630792, 'reg_alpha': 0.12830174886366832, 'reg_lambda': 0.9040249816824238, 'n_jobs': 10, 'missing': -999}
[0]	train-error:0.03617	val-error:0.037281	train-gini:-0.228663	val-gini:-0.216438
Multiple eval metrics have been passed: 'val-gini' will be used for early stopping.

Will train until val-gini hasn't improved in 30 rounds.
Stopping. Best iteration:
[42]	train-error:0.036252	val-error:0.036785	train-gini:-0.567064	val-gini:-0.258787

 Best gini =  0.258787
 Gini =  0.2511597993516598
[0]	train-error:0.036369	val-error:0.036466	train-gini:-0.22988	val-gini:-0.20749
Multiple eval metrics have been passed: 'val-gin

 Best gini =  0.259873
 Gini =  0.24893935990865912
Cross validation Score :  0.2544432604949478
|  3        |  0.2489   |  0.7377   |  0.05194  |  12.64    |  0.4559   |  0.987    |  0.8049   |
{'objective': 'binary:logistic', 'tree_method': 'gpu_hist', 'learning_rate': 0.08226429921430226, 'max_depth': 9, 'subsample': 0.8632284719974377, 'colsample_bytree': 0.7093613093297753, 'reg_alpha': 0.5989112061316169, 'reg_lambda': 0.6461345579795807, 'n_jobs': 10, 'missing': -999}
[0]	train-error:0.036264	val-error:0.036894	train-gini:-0.250057	val-gini:-0.220698
Multiple eval metrics have been passed: 'val-gini' will be used for early stopping.

Will train until val-gini hasn't improved in 30 rounds.
Stopping. Best iteration:
[28]	train-error:0.036309	val-error:0.036793	train-gini:-0.399438	val-gini:-0.263248

 Best gini =  0.263248
 Gini =  0.2605773751048188
[0]	train-error:0.036437	val-error:0.036407	train-gini:-0.24728	val-gini:-0.211247
Multiple eval metrics have been passed: 'val-gini

[700]	train-error:0.036376	val-error:0.036567	train-gini:-0.401213	val-gini:-0.274087
[800]	train-error:0.036374	val-error:0.036567	train-gini:-0.419862	val-gini:-0.276932
[900]	train-error:0.036371	val-error:0.036567	train-gini:-0.438122	val-gini:-0.279158
[1000]	train-error:0.036371	val-error:0.036567	train-gini:-0.455549	val-gini:-0.280951
[1100]	train-error:0.036361	val-error:0.036567	train-gini:-0.472037	val-gini:-0.281925
[1200]	train-error:0.036357	val-error:0.036575	train-gini:-0.487853	val-gini:-0.282778
Stopping. Best iteration:
[1195]	train-error:0.036357	val-error:0.036575	train-gini:-0.487179	val-gini:-0.28283

 Best gini =  0.28283
 Gini =  0.2826000723685641
[0]	train-error:0.036481	val-error:0.036643	train-gini:-0.241766	val-gini:-0.215691
Multiple eval metrics have been passed: 'val-gini' will be used for early stopping.

Will train until val-gini hasn't improved in 30 rounds.
[100]	train-error:0.036449	val-error:0.036265	train-gini:-0.317119	val-gini:-0.258216
[200]	t

[1000]	train-error:0.036447	val-error:0.036273	train-gini:-0.436422	val-gini:-0.285117
[1100]	train-error:0.036443	val-error:0.036265	train-gini:-0.45045	val-gini:-0.286655
[1200]	train-error:0.036443	val-error:0.036265	train-gini:-0.464314	val-gini:-0.288316
[1300]	train-error:0.036443	val-error:0.036265	train-gini:-0.477871	val-gini:-0.289621
[1400]	train-error:0.036441	val-error:0.036265	train-gini:-0.49008	val-gini:-0.290579
Stopping. Best iteration:
[1446]	train-error:0.036439	val-error:0.036265	train-gini:-0.49591	val-gini:-0.2909

 Best gini =  0.2909
 Gini =  0.2907666255926672
[0]	train-error:0.03639	val-error:0.036575	train-gini:-0.245708	val-gini:-0.214805
Multiple eval metrics have been passed: 'val-gini' will be used for early stopping.

Will train until val-gini hasn't improved in 30 rounds.
[100]	train-error:0.036388	val-error:0.036592	train-gini:-0.320626	val-gini:-0.258651
[200]	train-error:0.036386	val-error:0.036592	train-gini:-0.334808	val-gini:-0.262087
[300]	train

Will train until val-gini hasn't improved in 30 rounds.
[100]	train-error:0.03638	val-error:0.036567	train-gini:-0.296761	val-gini:-0.255305
[200]	train-error:0.03638	val-error:0.036559	train-gini:-0.318335	val-gini:-0.260863
[300]	train-error:0.036376	val-error:0.036567	train-gini:-0.331429	val-gini:-0.263388
[400]	train-error:0.036376	val-error:0.036567	train-gini:-0.342488	val-gini:-0.264796
[500]	train-error:0.036378	val-error:0.036559	train-gini:-0.356843	val-gini:-0.267178
[600]	train-error:0.036376	val-error:0.036559	train-gini:-0.373615	val-gini:-0.270371
[700]	train-error:0.036376	val-error:0.036559	train-gini:-0.392376	val-gini:-0.273237
[800]	train-error:0.036374	val-error:0.036567	train-gini:-0.41115	val-gini:-0.275767
[900]	train-error:0.036374	val-error:0.036567	train-gini:-0.427945	val-gini:-0.27761
[1000]	train-error:0.036374	val-error:0.036567	train-gini:-0.445241	val-gini:-0.279701
[1100]	train-error:0.036374	val-error:0.036575	train-gini:-0.460521	val-gini:-0.281191


[300]	train-error:0.03638	val-error:0.036601	train-gini:-0.432003	val-gini:-0.266905
Stopping. Best iteration:
[321]	train-error:0.036376	val-error:0.036601	train-gini:-0.436467	val-gini:-0.267025

 Best gini =  0.267025
 Gini =  0.2669272336943286
Cross validation Score :  0.2663151883506166
|  11       |  0.2669   |  0.8      |  0.005    |  9.85     |  1.0      |  0.5      |  0.9      |
{'objective': 'binary:logistic', 'tree_method': 'gpu_hist', 'learning_rate': 0.005, 'max_depth': 10, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_alpha': 0.1000000035546548, 'reg_lambda': 1.0, 'n_jobs': 10, 'missing': -999}
[0]	train-error:0.036306	val-error:0.037272	train-gini:-0.220959	val-gini:-0.219055
Multiple eval metrics have been passed: 'val-gini' will be used for early stopping.

Will train until val-gini hasn't improved in 30 rounds.
[100]	train-error:0.036321	val-error:0.036793	train-gini:-0.320054	val-gini:-0.257409
[200]	train-error:0.036317	val-error:0.036793	train-gini:-0.369629	val

In [282]:
# %%time
# model = XGBClassifier( 
#             n_estimators = 1000,
#             max_depth =4,
#     objective = "binary:logistic",
#     learing_rateing_rate = learning_rate,
#     subsample = .8,
#     min_child_weight = 6,
#     colsample_bytree = .8,
#     scale_pos_weight = 1.6,
#     gamma = 10,
#     reg_alpha = 8,
#     reg_lambda =1.3,
#     tree_method = 'gpu_hist'

# )

Wall time: 0 ns


In [285]:
# scores = []
# y_test_pred = []
# fi =[]
# for i, (train_index, test_index) in enumerate(kf.split(train_df)):
#     x_train,x_valid = train_df.iloc[train_index,:], train_df.iloc[test_index,:]
#     y_train,y_valid = target.iloc[train_index], target.iloc[test_index]
    
#     eval_set = [(x_valid,y_valid)]
#     fit_model = model.fit(x_train,y_train,
#                          eval_set =eval_set,
#                          eval_metric = gini_xgb,
#                         early_stopping_rounds = early_stopping_round,
#                           verbose = 50
#                          )
#     print( "Best N tress = " , model.best_ntree_limit)
#     print(" Best gini = ", -model.best_score)
#     fi.append(model.feature_importances_)
# #     plot_importance(fit_model)
# #     pyplot.show()
# #     selection = SelectFromModel(fit_model,threshold = 0.15,prefit =True)
# #     sel_x_train = selection.transform(x_train)
# #     sel_x_valid = selection.transform(x_valid)
# #     s_model = XGBClassifier()
# #     eval_set = [(sel_x_valid,y_valid)]
# #     fit_model = s_model.fit(sel_x_train,y_train,
# #                      eval_set =eval_set,
# #                      eval_metric = gini_xgb,
# #                     early_stopping_rounds = early_stopping_round,
# #                       verbose = 50
# #                      )
    
#     #  prediction
#     pred = fit_model.predict_proba(x_valid)[:,1]
#     print( " Gini = ", eval_gini(y_valid, pred))
#     scores.append(eval_gini(y_valid, pred))
#     y_test_pred.append(fit_model.predict_proba(test_df)[:,1])
    
    
#     del x_train,x_valid,y_train,y_valid
#     gc.collect()
# print(" Cross validadation score, Gini " ,sum(scores)/n_folds)

[0]	validation_0-error:0.036449	validation_0-gini:-0.183907
Multiple eval metrics have been passed: 'validation_0-gini' will be used for early stopping.

Will train until validation_0-gini hasn't improved in 30 rounds.
[50]	validation_0-error:0.036449	validation_0-gini:-0.263271
[100]	validation_0-error:0.036466	validation_0-gini:-0.277449
[150]	validation_0-error:0.036466	validation_0-gini:-0.281548
[200]	validation_0-error:0.036474	validation_0-gini:-0.28276
[250]	validation_0-error:0.036474	validation_0-gini:-0.283151
Stopping. Best iteration:
[240]	validation_0-error:0.036474	validation_0-gini:-0.283763

Best N tress =  241
 Best gini =  0.283763
 Gini =  0.2837628086228603
[0]	validation_0-error:0.036449	validation_0-gini:-0.193886
Multiple eval metrics have been passed: 'validation_0-gini' will be used for early stopping.

Will train until validation_0-gini hasn't improved in 30 rounds.
[50]	validation_0-error:0.036449	validation_0-gini:-0.268073
[100]	validation_0-error:0.036449

In [238]:
# Importance = pd.DataFrame()
# Importance['column'] = train_df.columns
# Importance['Feature_Importance'] = np.mean(fi,axis =0)
# list(Importance[Importance['Feature_Importance']<0.01]['column'])

['ps_ind_10_bin',
 'ps_ind_11_bin',
 'ps_ind_12_bin',
 'ps_ind_13_bin',
 'ps_ind_14',
 'ps_car_02_cat',
 'ps_car_08_cat',
 'ps_car_10_cat',
 'ps_car_02_cat_count_full',
 'ps_car_03_cat_count_full',
 'ps_car_04_cat_count_full',
 'ps_car_08_cat_count_full',
 'ps_car_10_cat_count_full']

In [286]:
from datetime import datetime
sub_df = pd.DataFrame()
sub_df['id'] = test['id']
sub_df['target'] = sum(y_test_pred)/n_folds

sub_df.to_csv('submission_baseline_XGBOOST_' + str(datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + '.csv', index=False)



## LGBM

In [148]:
def gini_lgbm(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = eval_gini(labels, preds)
    return 'gini', gini_score,True

In [151]:
# LGBM
import lightgbm as lgb
seeds = 99
params =  {
        'objective': 'binary',
        'num_threads': 4,
        'learning_rate': 0.01, 
        'num_iterations' : 1000,
        'subsamle' : 0.8,
        'max_depth': -1,
        'reg_alpha': 0.3,
         'reg_lambda': 0.3,
        'bagging_seed' : seeds,
        'verbose' : -1,
        'seed' :seeds,
        'min_child_samples' : 800
        
#     lgb_params['min_child_samples'] = 800
    }


In [255]:
scores = []
y_test_pred= []
for i, (train_index, test_index) in enumerate(kf.split(train_df)):
    x_train,x_valid = train_df.iloc[train_index,:], train_df.iloc[test_index,:]
    y_train,y_valid = target.iloc[train_index], target.iloc[test_index]
    
    lgb_train = lgb.Dataset(data = x_train, label = y_train)
    lgb_valid = lgb.Dataset(data = x_valid, label = y_valid)
    
    lgb_model = lgb.train(params, lgb_train, valid_sets = [lgb_valid], 
                          feval = gini_lgbm,verbose_eval = 100, early_stopping_rounds = early_stopping_round)
    
    pred = lgb_model.predict(x_valid, num_iteration = lgb_model.best_iteration)
#     pred = fit_model.predict_proba(x_valid)[:,1]
    print(pred.shape)
    print( " Gini = ", eval_gini(y_valid, pred))
    scores.append(eval_gini(y_valid, pred))
    y_test_pred.append(lgb_model.predict(test_df))
    
    del x_train,x_valid,y_train,y_valid
    gc.collect()
print(" Cross validadation score, Gini " ,sum(scores)/n_folds)
    

Training until validation scores don't improve for 30 rounds
[100]	valid_0's binary_logloss: 0.151666	valid_0's gini: 0.255909
[200]	valid_0's binary_logloss: 0.150709	valid_0's gini: 0.272039
[300]	valid_0's binary_logloss: 0.150327	valid_0's gini: 0.278417
[400]	valid_0's binary_logloss: 0.150163	valid_0's gini: 0.281016
[500]	valid_0's binary_logloss: 0.150055	valid_0's gini: 0.283402
[600]	valid_0's binary_logloss: 0.150017	valid_0's gini: 0.283978
Early stopping, best iteration is:
[667]	valid_0's binary_logloss: 0.149998	valid_0's gini: 0.284114
(119043,)
 Gini =  0.2841140732025563
Training until validation scores don't improve for 30 rounds
[100]	valid_0's binary_logloss: 0.155287	valid_0's gini: 0.252634
[200]	valid_0's binary_logloss: 0.154423	valid_0's gini: 0.265003
[300]	valid_0's binary_logloss: 0.154047	valid_0's gini: 0.272006
[400]	valid_0's binary_logloss: 0.153874	valid_0's gini: 0.275887
[500]	valid_0's binary_logloss: 0.153807	valid_0's gini: 0.277381
[600]	valid_0

In [257]:
from datetime import datetime

sub_df = pd.DataFrame()
sub_df['id'] = test['id']
sub_df['target'] = sum(y_test_pred)/n_folds

sub_df.to_csv('submission_baseline_LGBM_' + str(datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + '.csv', index=False)



### LGBM_Categorical Feature

In [63]:
def gini_lgbm(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = eval_gini(labels, preds)
    return 'gini', gini_score,True

In [64]:
# nominal value에 대해
cols =list(meta[(meta['level']=='nominal') & (meta['keep']==True)].index)

# LGBM
import lightgbm as lgb
seeds = 99
params =  {
        'objective': 'binary',
        'num_threads': 4,
        'learning_rate': 0.01, 
        'num_iterations' : 1000,
        'max_depth': -1,
        'reg_alpha': 0.3,
         'reg_lambda': 0.3,
        'bagging_seed' : seeds,
        'verbose' : -1,
        'seed' :seeds
    }



In [66]:
scores = []
y_test_pred= []
for i, (train_index, test_index) in enumerate(kf.split(train_df)):
    x_train,x_valid = train_df.iloc[train_index,:], train_df.iloc[test_index,:]
    y_train,y_valid = target.iloc[train_index], target.iloc[test_index]
    
#     x_train[cols] , x_valid[cols]= x_train[cols].astype(object), x_valid[cols].astype(object)
    lgb_train = lgb.Dataset(data = x_train, label = y_train)
    lgb_valid = lgb.Dataset(data = x_valid, label = y_valid)
    
    lgb_model = lgb.train(params, lgb_train, valid_sets = [lgb_valid], 
                          feval = gini_lgbm,verbose_eval = 100, early_stopping_rounds = early_stopping_round,
                         categorical_feature=cols)
        
    pred = lgb_model.predict(x_valid, num_iteration = lgb_model.best_iteration)
#     pred = fit_model.predict_proba(x_valid)[:,1]
    print(pred.shape)
    print( " Gini = ", eval_gini(y_valid, pred))
    scores.append(eval_gini(y_valid, pred))
    y_test_pred.append(lgb_model.predict(test_df))
    
    del x_train,x_valid,y_train,y_valid
    gc.collect()
print(" Cross validadation score, Gini " ,sum(scores)/n_folds)
    

Training until validation scores don't improve for 30 rounds
[100]	valid_0's binary_logloss: 0.151767	valid_0's gini: 0.249186
[200]	valid_0's binary_logloss: 0.150992	valid_0's gini: 0.257855
[300]	valid_0's binary_logloss: 0.150695	valid_0's gini: 0.262629
[400]	valid_0's binary_logloss: 0.150577	valid_0's gini: 0.265408
[500]	valid_0's binary_logloss: 0.150535	valid_0's gini: 0.266414
Early stopping, best iteration is:
[531]	valid_0's binary_logloss: 0.150533	valid_0's gini: 0.266525
(119043,)
 Gini =  0.2665252768135875
Training until validation scores don't improve for 30 rounds
[100]	valid_0's binary_logloss: 0.155468	valid_0's gini: 0.239307
[200]	valid_0's binary_logloss: 0.154779	valid_0's gini: 0.248473
[300]	valid_0's binary_logloss: 0.154578	valid_0's gini: 0.251739
[400]	valid_0's binary_logloss: 0.154536	valid_0's gini: 0.252678
Early stopping, best iteration is:
[384]	valid_0's binary_logloss: 0.154533	valid_0's gini: 0.252729
(119043,)
 Gini =  0.252728996421557
Trainin

In [67]:
from datetime import datetime

sub_df = pd.DataFrame()
sub_df['id'] = test['id']
sub_df['target'] = sum(y_test_pred)/n_folds

sub_df.to_csv('submission_baseline_LGBM_Cat_' + str(datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + '.csv', index=False)



### Catboost

In [None]:
from catboost import CatBoostClassifier

params =    {     #'iterations' : 2000,
        'n_estimators': 2000,
        'learning_rate': 0.05,
        'eval_metric': 'RMSE',
        'loss_function': 'RMSE',
        'random_seed': seed,
        'metric_period': 10,
        'task_type': 'GPU',
        #'subsample' : 0.8,
        'depth': 8,
    }

model = CatBoostClassifier(iterations=2,
                           learning_rate=1,
                           depth=2)
# Fit model
model.fit(train_data, train_labels, cat_features)
# Get predicted classes
preds_class = model.predict(eval_data)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(eval_data)
# Get predicted RawFormulaVal
preds_raw = model.predict(eval_data, prediction_type='RawFormulaVal')


 gbm = CatBoostRegressor(**params)
    print(i)
    gbm.fit(
            train_X, train_y,
             eval_set=(val_X, val_y),
            early_stopping_rounds = 30,
            cat_features=categorical_features,
            verbose=30)

### MLP

've used this to get a network that has a local CV AUC around 0.642, which corresponds to Gini of 0.284. 

the formula GINI = 2 * AUC -1
2*AUC-1 will be same as calculated with gini_normalized


In [21]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras import optimizers
from sklearn.metrics import roc_auc_score
# https://www.kaggle.com/rspadim/gini-keras-callback-earlystopping-validation

Using TensorFlow backend.


In [None]:
# kfold = StratifiedKFold(n_splits = K, 
#                             random_state = 100, 
#                             shuffle = True)    

In [29]:
import tensorflow as tf
import keras.backend as K
# FROM https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/discussion/41108
def jacek_auc(y_true, y_pred):
    score, up_opt = tf.metrics.auc(y_true, y_pred)
    #score, up_opt = tf.contrib.metrics.streaming_auc(y_pred, y_true)    
    K.get_session().run(tf.local_variables_initializer())
    with tf.control_dependencies([up_opt]):
        score = tf.identity(score)
    return score

# FROM https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/discussion/41015
# AUC for a binary classifier
def discussion41015_auc(y_true, y_pred):
    ptas = tf.stack([binary_PTA(y_true,y_pred,k) for k in np.linspace(0, 1, 1000)],axis=0)
    pfas = tf.stack([binary_PFA(y_true,y_pred,k) for k in np.linspace(0, 1, 1000)],axis=0)
    pfas = tf.concat([tf.ones((1,)) ,pfas],axis=0)
    binSizes = -(pfas[1:]-pfas[:-1])
    s = ptas*binSizes
    return K.sum(s, axis=0)

# PFA, prob false alert for binary classifier
def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
    y_pred = K.cast(y_pred >= threshold, 'float32')
    # N = total number of negative labels
    N = K.sum(1 - y_true)
    # FP = total number of false alerts, alerts from the negative class labels
    FP = K.sum(y_pred - y_pred * y_true)
    return FP/N

#----------------
# P_TA prob true alerts for binary classifier
def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
    y_pred = K.cast(y_pred >= threshold, 'float32')
    # P = total number of positive labels
    P = K.sum(y_true)
    # TP = total number of correct alerts, alerts from the positive class labels
    TP = K.sum(y_pred * y_true)
    return TP/P


def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return 'gini', gini_score

In [None]:
# https://www.kaggle.com/tomcwalker/keras-nn-with-custom-loss-function-for-gini-auc

In [44]:
target_mlp = keras.utils.np_utils.to_categorical(target)

In [49]:
batch_size = 50
epochs = 1000
learning_rate = 0.001
scores = []
y_test_pred= []

def mlpmodel(input_shape):
    model = Sequential()
    model.add(Dense(64, activation ='relu', input_dim = input_shape))
    model.add(Dropout(0.5))
#     model.add(Dense(64,activation='relu'))
#     model.add(Dropout(0.5))
#     model.add(Dense(64, activation='relu'))
    model.add(Dense(2,activation='sigmoid'))
    adam = optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
    model.compile(loss ='binary_crossentropy',
                 optimizer = adam,metrics = [jacek_auc])

    return model

callback_early_stopping = keras.callbacks.EarlyStopping(monitor='jacek_auc', patience=10, verbose=2, mode='max')

for i, (train_index, test_index) in enumerate(kf.split(train_df)):
    x_train,x_valid = train_df.iloc[train_index,:], train_df.iloc[test_index,:]
    y_train,y_valid = target_mlp[train_index], target_mlp[test_index]
    
    
    mlp = mlpmodel(x_train.shape[1])
    mlp.fit(x_train,y_train,
       epochs = epochs,
       batch_size = batch_size,
       callbacks = [callback_early_stopping],
       verbose = 2)
    
    
    pred  = mlp.predict(x_valid)
    score = -eval_gini(y_valid, pred[:,0])
    scores.append(score)
    print(i, "Gini ", score )
    
    y_test_pred.append(mlp.predict(test_df))
#     mlp.predict(test_df)
    del x_train,x_valid,y_train,y_valid
    gc.collect()

print(" Cross validadation score, Gini " ,sum(scores)/n_folds)
    
   

Epoch 1/1000
 - 22s - loss: 1211.5840 - jacek_auc: 0.9342
Epoch 2/1000
 - 21s - loss: 0.4911 - jacek_auc: 0.9556
Epoch 3/1000
 - 21s - loss: 0.2010 - jacek_auc: 0.9588
Epoch 4/1000
 - 21s - loss: 0.3525 - jacek_auc: 0.9601
Epoch 5/1000
 - 21s - loss: 0.1726 - jacek_auc: 0.9608
Epoch 6/1000
 - 21s - loss: 0.1830 - jacek_auc: 0.9613
Epoch 7/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.9617
Epoch 8/1000
 - 21s - loss: 0.2207 - jacek_auc: 0.9619
Epoch 9/1000
 - 21s - loss: 0.1582 - jacek_auc: 0.9619
Epoch 10/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.9622
Epoch 11/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.9623
Epoch 12/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.9624
Epoch 13/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.9625
Epoch 14/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.9625
Epoch 15/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.9625
Epoch 16/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.9628
Epoch 17/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.9628
Epoch 18/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.962

KeyboardInterrupt: 

In [None]:
from datetime import datetime

sub_df = pd.DataFrame()
sub_df['id'] = test['id']
sub_df['target'] = sum(y_test_pred)/n_folds

sub_df.to_csv('submission_baseline_MLP_' + str(datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + '.csv', index=False)



In [41]:
y_test_pred[0][:,0]

array([0.03637472, 0.03637472, 0.03637472, ..., 0.03637472, 0.03637472,
       0.03637472], dtype=float32)

In [36]:
K


<module 'keras.backend' from 'C:\\Users\\yseon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\keras\\backend\\__init__.py'>