In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action ='ignore')
import os
import gc
pd.options.display.max_rows = 99
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from numba import jit

In [248]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [249]:
train['flag'], test['flag'] = 'train','test'
target = train['target']
full_df = pd.concat([train.drop(['target'],axis =1 ),test])
full_df.head()


Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin,flag
0,7,2,2,5,1,0,0,1,0,0,...,1,5,8,0,1,1,0,0,1,train
1,9,1,1,7,0,0,0,0,1,0,...,1,1,9,0,1,1,0,1,0,train
2,13,5,4,9,1,0,0,0,1,0,...,2,7,7,0,1,1,0,1,0,train
3,16,0,1,2,0,0,1,0,0,0,...,2,4,9,0,0,0,0,0,0,train
4,17,0,2,0,1,0,1,0,0,0,...,1,1,3,0,0,0,1,1,0,train


### Meta Table 구축

In [250]:
data = []
for f in full_df.columns:
    # Defining the role
    if f == 'target':
        role = 'target'
    elif f == 'id':
        role = 'id'
    else:
        role = 'input'
         
    # Defining the level
    if 'bin' in f or f == 'target':
        level = 'binary'
    elif 'cat' in f or f == 'id':
        level = 'nominal'
    elif train[f].dtype == float:
        level = 'interval'
    elif train[f].dtype == 'int64':
        level = 'ordinal'
        
    # Initialize keep to True for all variables except for id
    keep = True
    if f == 'id':
        keep = False
    
    # Defining the data type 
    dtype = full_df[f].dtype
    
    # Creating a Dict that contains all the metadata for the variable
    f_dict = {
        'varname': f,
        'role': role,
        'level': level,
        'keep': keep,
        'dtype': dtype
    }
    data.append(f_dict)
    
meta = pd.DataFrame(data, columns=['varname', 'role', 'level', 'keep', 'dtype'])
meta.set_index('varname', inplace=True)

In [251]:
meta.head()

Unnamed: 0_level_0,role,level,keep,dtype
varname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
id,id,nominal,False,int64
ps_ind_01,input,ordinal,True,int64
ps_ind_02_cat,input,nominal,True,int64
ps_ind_03,input,ordinal,True,int64
ps_ind_04_cat,input,nominal,True,int64


### Feature engineering

- ind / reg / car / calc  데이터
- _cat / _bin 
- 아무것도 없는 컬럼을 type으로 interval / ordinal로 구분


#### Null Value 처리

In [252]:
#  -1-> Null
full_df = full_df.replace(-1,np.nan)

In [253]:
# interval 
cols =list(meta[(meta['level']=='interval') & (meta['keep']==True)].index)
#  Null Value  Portion
# print(full_df[cols].isnull().sum()/full_df[cols].shape[0]*100)
# 평균으로 처리
full_df[cols] = full_df[cols].fillna(full_df[cols].mean())


In [254]:
# ordinal : 순서형 자료 (매우높다, 높다, 낮다 등)
cols =list(meta[(meta['level']=='ordinal') & (meta['keep']==True)].index)
#  Null Value  Portion
# print(full_df[cols].isnull().sum()/full_df[cols].shape[0]*100)
# 최빈값을 넣어두자 
full_df[cols] = full_df[cols].fillna(full_df[cols].mode().iloc[0][0])



In [255]:
# Norminal  : 카테고리 변수 --
cols =list(meta[(meta['level']=='nominal') & (meta['keep']==True)].index)
#  Null Value  Portion
# print(full_df[cols].isnull().sum()/full_df[cols].shape[0]*100)
# Null 값을 하나의 카테고리로 분류  = '-999' 입력
full_df[cols] = full_df[cols].fillna('-999')


In [256]:
# binary 
cols =list(meta[(meta['level']=='binary') & (meta['keep']==True)].index)
#  Null Value  Portion
# print(full_df[cols].isnull().sum()/full_df[cols].shape[0]*100)


#### Freq encoding 
- 각 카테고리 컬럼의 빈도수를 카운트 한 것

In [257]:
# Nominal value에 대해
cols =list(meta[(meta['level']=='nominal') & (meta['keep']==True)].index)

for col in cols :
    col_name = '%s_count_full' % col
    full_df[col_name] = full_df[col].map(full_df[col].value_counts())

#### aggregation

In [258]:
# # Catgory column별로 interval value 평균과 편차는 무슨 의미가 있을까

# # full_df.head()
# cat_cols =list(meta[(meta['level']=='nominal') & (meta['keep']==True)].index)
# cols = list(meta[(meta['level']!='nominal') & (meta['keep']==True)].index)
# cols.remove('flag')
# newcol_name = [x+'_avg' for x in cols]
# for col in tqdm(cat_cols) :
#     tmp = [x+'_for_%s'%col for x in newcol_name]
#     for c,original  in zip(tmp,cols) :
#         full_df[c] = full_df[col].map(full_df.groupby(col)[original].mean())
# #     print(full_df.groupby(col)[cols].mean())
   


#### interaction
- 상관관계를 기반으로 곱하기를 해보자


Car features
ps_car_12 are (with some approximations) square roots (divided by 10) of natural numbers whilst ps_car_15 are square roots of natural numbers. Let's represent the values using pairplot.

In [259]:
# sample = trainset.sample(frac=0.05)
# var = ['ps_car_12', 'ps_car_15', 'target']
# sample = sample[var]
# sns.pairplot(sample,  hue='target', palette = 'Set1', diag_kind='kde')
# plt.show()

### Feature Selection



Thanks to the public kernels (wheel of fortune eg.) that suggest to remove *calc features,

In [260]:
cols = [x for x in meta.index if 'calc' in x]
meta.loc[cols,'keep'] = False


In [261]:
# ## Removing featues with low variance
# from sklearn.feature_selection import VarianceThreshold
# sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
# sel.fit_transform(full_df)

In [262]:
# Tree-based Feature selection
#https://scikit-learn.org/stable/modules/feature_selection.html



### Label_encoding

In [263]:

# nominal value에 대해
cols =list(meta[(meta['level']=='nominal') & (meta['keep']==True)].index)
for col in tqdm(cols) :
    lbl = LabelEncoder()
    full_df[col] = lbl.fit_transform(list(full_df[col].values))
    


  0%|                                                                                           | 0/14 [00:00<?, ?it/s]
  7%|█████▉                                                                             | 1/14 [00:01<00:16,  1.29s/it]
 14%|███████████▊                                                                       | 2/14 [00:02<00:15,  1.29s/it]
 21%|█████████████████▊                                                                 | 3/14 [00:03<00:13,  1.22s/it]
 29%|███████████████████████▋                                                           | 4/14 [00:05<00:13,  1.33s/it]
 36%|█████████████████████████████▋                                                     | 5/14 [00:06<00:11,  1.30s/it]
 43%|███████████████████████████████████▌                                               | 6/14 [00:06<00:08,  1.02s/it]
 50%|█████████████████████████████████████████▌                                         | 7/14 [00:07<00:05,  1.26it/s]
 57%|██████████████████████████████████

### DataSet

In [264]:
cols =list(meta[meta['keep']==False].index)
cols  =[x for x in full_df.columns if x not in cols]
full_df = full_df[cols]

In [265]:
full_df.head()
train_df = full_df[full_df['flag']=='train'].drop(['flag'],axis =1 )
test_df = full_df[full_df['flag']=='test'].drop(['flag'],axis =1 )
target = target

In [266]:
train_df.shape

(595212, 51)

### Target encoding
Target encoding with smoothing
min_samples_leaf define a threshold where prior and target mean (for a given category value) have the same weight. Below the threshold prior becomes more important and above mean becomes more important.

How weight behaves against value counts is controlled by smoothing parameter

In [207]:
# def add_noise(series, noise_level):
#     return series * (1 + noise_level * np.random.randn(len(series)))

# def target_encode(trn_series=None, 
#                   tst_series=None, 
#                   target=None, 
#                   min_samples_leaf=1, 
#                   smoothing=1,
#                   noise_level=0):
#     """
#     Smoothing is computed like in the following paper by Daniele Micci-Barreca
#     https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
#     trn_series : training categorical feature as a pd.Series
#     tst_series : test categorical feature as a pd.Series
#     target : target data as a pd.Series
#     min_samples_leaf (int) : minimum samples to take category average into account
#     smoothing (int) : smoothing effect to balance categorical average vs prior  
#     """ 
#     assert len(trn_series) == len(target)
#     assert trn_series.name == tst_series.name
#     temp = pd.concat([trn_series, target], axis=1)
#     # Compute target mean 
#     averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
#     # Compute smoothing
#     smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
#     # Apply average function to all target data
#     prior = target.mean()
#     # The bigger the count the less full_avg is taken into account
#     averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
#     averages.drop(["mean", "count"], axis=1, inplace=True)
#     # Apply averages to trn and tst series
#     ft_trn_series = pd.merge(
#         trn_series.to_frame(trn_series.name),
#         averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
#         on=trn_series.name,
#         how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
#     # pd.merge does not keep the index so restore it
#     ft_trn_series.index = trn_series.index 
#     ft_tst_series = pd.merge(
#         tst_series.to_frame(tst_series.name),
#         averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
#         on=tst_series.name,
#         how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
#     # pd.merge does not keep the index so restore it
#     ft_tst_series.index = tst_series.index
#     return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)


In [208]:
# # Nominal value에 대해
# cols =list(meta[(meta['level']=='nominal') & (meta['keep']==True)].index)

# for col in tqdm(cols):
#     col_name = '%s_target_enc' % col
#     train_df[col_name],test_df[col_name] = target_encode(train_df[col], 
#                                              test_df[col], 
#                                              target=target, 
#                                              min_samples_leaf=100,
#                                              smoothing=10,
#                                              noise_level=0.01)
    


  0%|                                                                                           | 0/14 [00:00<?, ?it/s]
  7%|█████▉                                                                             | 1/14 [00:00<00:02,  4.40it/s]
 14%|███████████▊                                                                       | 2/14 [00:00<00:02,  4.48it/s]
 21%|█████████████████▊                                                                 | 3/14 [00:00<00:02,  4.51it/s]
 29%|███████████████████████▋                                                           | 4/14 [00:00<00:02,  4.60it/s]
 36%|█████████████████████████████▋                                                     | 5/14 [00:01<00:01,  4.59it/s]
 43%|███████████████████████████████████▌                                               | 6/14 [00:01<00:01,  4.64it/s]
 50%|█████████████████████████████████████████▌                                         | 7/14 [00:01<00:01,  4.65it/s]
 57%|██████████████████████████████████

In [209]:
train_df.head()

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_car_02_cat_target_enc,ps_car_03_cat_target_enc,ps_car_04_cat_target_enc,ps_car_05_cat_target_enc,ps_car_06_cat_target_enc,ps_car_07_cat_target_enc,ps_car_08_cat_target_enc,ps_car_09_cat_target_enc,ps_car_10_cat_target_enc,ps_car_11_cat_target_enc
0,2,2,5,2,1,0,1,0,0,0,...,0.033271,0.032586,0.033896,0.040799,0.033947,0.035021,0.044502,0.033129,0.036352,0.038279
1,1,1,7,1,1,0,0,1,0,0,...,0.033567,0.032878,0.033816,0.032007,0.031528,0.034469,0.034424,0.035923,0.036617,0.023885
2,5,4,9,2,1,0,0,1,0,0,...,0.03395,0.032409,0.033498,0.031326,0.034721,0.035024,0.034983,0.035938,0.035904,0.031496
3,0,1,2,1,1,1,0,0,0,0,...,0.033346,0.03897,0.033283,0.040539,0.031726,0.034673,0.035028,0.035009,0.035725,0.044677
4,0,2,0,2,1,1,0,0,0,0,...,0.033573,0.032541,0.033655,0.03187,0.03428,0.035131,0.034745,0.036654,0.036667,0.026163


# Feature Selection


In [267]:
# Feature elimination : Xgboost 결과 Feature Importance 0.01 미만 삭제
cols = ['ps_ind_10_bin',
 'ps_ind_11_bin',
 'ps_ind_12_bin',
 'ps_ind_13_bin',
 'ps_ind_14',
 'ps_car_02_cat',
 'ps_car_08_cat',
 'ps_car_10_cat',
 'ps_car_02_cat_count_full',
 'ps_car_03_cat_count_full',
 'ps_car_04_cat_count_full',
 'ps_car_08_cat_count_full',
 'ps_car_10_cat_count_full']

train_df = train_df.drop(cols,axis =1)
test_df = test_df.drop(cols,axis = 1)

# Oversampling 
https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html
https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets

Apart from the random sampling with replacement, there are two popular methods to over-sample minority classes: (i) the Synthetic Minority Oversampling Technique (SMOTE) [CBHK2002] and (ii) the Adaptive Synthetic (ADASYN) [HBGL2008] sampling method


In [36]:
# from imblearn.over_sampling import SMOTE
# sm = SMOTE(random_state = 99)
# train_df_sm, target_sm = sm.fit_sample(train_df, target.ravel())
# # SMOTE.fit_sample(train_df,target)

In [58]:
# print(train_df.shape, train_df_sm.shape)
# print(target.shape, target_sm.shape)
# print(target[target==1].shape,np.sum(target_sm))
# print(sum(target==0),sum(target==1))  # target ==1인 데이터가 26배가 되었다.
# print(sum(target_sm==0),sum(target_sm==1))

(595212, 37) (1147036, 37)
(595212,) (1147036,)
(21694,) 573518
573518 21694
573518 573518


In [86]:
# train_df = train_df_sm.copy()
# target= target_sm.copy()
# train_df['target'] = target
# target= train_df['target']
# train_df = train_df.drop(['target'],axis =1)

### Eval_gini

JIT 컴파일(just-in-time compilation) 또는 동적 번역(dynamic translation)은 프로그램을 실제 실행하는 시점에
기계어로 번역하는 컴파일 기법이다. 이 기법은 프로그램의 실행 속도를 빠르게 하기 위해 사용된다.


출처: https://hamait.tistory.com/476 [HAMA 블로그]

In [268]:
# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
@jit

#
def eval_gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

# https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283



### Modeling

In [279]:
learning_rate = 0.01
early_stopping_round = 30

from sklearn.model_selection import KFold
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedKFold

In [280]:
n_folds= 5
kf = KFold(n_splits =n_folds, random_state = 99, shuffle =True)
np.random.seed(99)

In [281]:
def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = -eval_gini(labels, preds)
    return [('gini', gini_score)]

In [282]:
%%time
model = XGBClassifier( 
            n_estimators = 1000,
            max_depth =4,
    objective = "binary:logistic",
    learing_rateing_rate = learning_rate,
    subsample = .8,
    min_child_weight = 6,
    colsample_bytree = .8,
    scale_pos_weight = 1.6,
    gamma = 10,
    reg_alpha = 8,
    reg_lambda =1.3,
    tree_method = 'gpu_hist'

)

Wall time: 0 ns


In [285]:
scores = []
y_test_pred = []
fi =[]
for i, (train_index, test_index) in enumerate(kf.split(train_df)):
    x_train,x_valid = train_df.iloc[train_index,:], train_df.iloc[test_index,:]
    y_train,y_valid = target.iloc[train_index], target.iloc[test_index]
    
    eval_set = [(x_valid,y_valid)]
    fit_model = model.fit(x_train,y_train,
                         eval_set =eval_set,
                         eval_metric = gini_xgb,
                        early_stopping_rounds = early_stopping_round,
                          verbose = 50
                         )
    print( "Best N tress = " , model.best_ntree_limit)
    print(" Best gini = ", -model.best_score)
    fi.append(model.feature_importances_)
#     plot_importance(fit_model)
#     pyplot.show()
#     selection = SelectFromModel(fit_model,threshold = 0.15,prefit =True)
#     sel_x_train = selection.transform(x_train)
#     sel_x_valid = selection.transform(x_valid)
#     s_model = XGBClassifier()
#     eval_set = [(sel_x_valid,y_valid)]
#     fit_model = s_model.fit(sel_x_train,y_train,
#                      eval_set =eval_set,
#                      eval_metric = gini_xgb,
#                     early_stopping_rounds = early_stopping_round,
#                       verbose = 50
#                      )
    
    #  prediction
    pred = fit_model.predict_proba(x_valid)[:,1]
    print( " Gini = ", eval_gini(y_valid, pred))
    scores.append(eval_gini(y_valid, pred))
    y_test_pred.append(fit_model.predict_proba(test_df)[:,1])
    
    
    del x_train,x_valid,y_train,y_valid
    gc.collect()
print(" Cross validadation score, Gini " ,sum(scores)/n_folds)

[0]	validation_0-error:0.036449	validation_0-gini:-0.183907
Multiple eval metrics have been passed: 'validation_0-gini' will be used for early stopping.

Will train until validation_0-gini hasn't improved in 30 rounds.
[50]	validation_0-error:0.036449	validation_0-gini:-0.263271
[100]	validation_0-error:0.036466	validation_0-gini:-0.277449
[150]	validation_0-error:0.036466	validation_0-gini:-0.281548
[200]	validation_0-error:0.036474	validation_0-gini:-0.28276
[250]	validation_0-error:0.036474	validation_0-gini:-0.283151
Stopping. Best iteration:
[240]	validation_0-error:0.036474	validation_0-gini:-0.283763

Best N tress =  241
 Best gini =  0.283763
 Gini =  0.2837628086228603
[0]	validation_0-error:0.036449	validation_0-gini:-0.193886
Multiple eval metrics have been passed: 'validation_0-gini' will be used for early stopping.

Will train until validation_0-gini hasn't improved in 30 rounds.
[50]	validation_0-error:0.036449	validation_0-gini:-0.268073
[100]	validation_0-error:0.036449

In [238]:
# Importance = pd.DataFrame()
# Importance['column'] = train_df.columns
# Importance['Feature_Importance'] = np.mean(fi,axis =0)
# list(Importance[Importance['Feature_Importance']<0.01]['column'])

['ps_ind_10_bin',
 'ps_ind_11_bin',
 'ps_ind_12_bin',
 'ps_ind_13_bin',
 'ps_ind_14',
 'ps_car_02_cat',
 'ps_car_08_cat',
 'ps_car_10_cat',
 'ps_car_02_cat_count_full',
 'ps_car_03_cat_count_full',
 'ps_car_04_cat_count_full',
 'ps_car_08_cat_count_full',
 'ps_car_10_cat_count_full']

In [276]:
from datetime import datetime

In [286]:
sub_df = pd.DataFrame()
sub_df['id'] = test['id']
sub_df['target'] = sum(y_test_pred)/n_folds

sub_df.to_csv('submission_baseline_XGBOOST_' + str(datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + '.csv', index=False)



## LGBM

In [287]:
def gini_lgbm(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = eval_gini(labels, preds)
    return 'gini', gini_score,True

In [288]:
# LGBM
import lightgbm as lgb
seeds = 99
params =  {
        'objective': 'binary',
        'num_threads': 4,
        'learning_rate': 0.01, 
        'num_iterations' : 1000,
        'max_depth': -1,
        'reg_alpha': 0.3,
         'reg_lambda': 0.3,
        'bagging_seed' : seeds,
        'verbose' : -1,
        'seed' :seeds
    }


In [292]:
scores = []
y_test_pred= []
for i, (train_index, test_index) in enumerate(kf.split(train_df)):
    x_train,x_valid = train_df.iloc[train_index,:], train_df.iloc[test_index,:]
    y_train,y_valid = target.iloc[train_index], target.iloc[test_index]
    
    lgb_train = lgb.Dataset(data = x_train, label = y_train)
    lgb_valid = lgb.Dataset(data = x_valid, label = y_valid)
    
    lgb_model = lgb.train(params, lgb_train, valid_sets = [lgb_valid], 
                          feval = gini_lgbm,verbose_eval = 100, early_stopping_rounds = early_stopping_round)
    
    pred = lgb_model.predict(x_valid, num_iteration = lgb_model.best_iteration)
#     pred = fit_model.predict_proba(x_valid)[:,1]
    print(pred.shape)
    print( " Gini = ", eval_gini(y_valid, pred))
    scores.append(eval_gini(y_valid, pred))
    y_test_pred.append(lgb_model.predict(test_df))
    
    del x_train,x_valid,y_train,y_valid
    gc.collect()
print(" Cross validadation score, Gini " ,sum(scores)/n_folds)
    

Training until validation scores don't improve for 30 rounds
[100]	valid_0's binary_logloss: 0.153469	valid_0's gini: 0.254685
[200]	valid_0's binary_logloss: 0.152624	valid_0's gini: 0.267997
[300]	valid_0's binary_logloss: 0.152298	valid_0's gini: 0.273225
[400]	valid_0's binary_logloss: 0.152162	valid_0's gini: 0.275546
[500]	valid_0's binary_logloss: 0.152089	valid_0's gini: 0.277333
[600]	valid_0's binary_logloss: 0.152059	valid_0's gini: 0.278012
[700]	valid_0's binary_logloss: 0.152036	valid_0's gini: 0.278568
Early stopping, best iteration is:
[706]	valid_0's binary_logloss: 0.152032	valid_0's gini: 0.278685
(119043,)
 Gini =  0.2786854032195609
Training until validation scores don't improve for 30 rounds
[100]	valid_0's binary_logloss: 0.153296	valid_0's gini: 0.261378
[200]	valid_0's binary_logloss: 0.152385	valid_0's gini: 0.270735
[300]	valid_0's binary_logloss: 0.152053	valid_0's gini: 0.274506
[400]	valid_0's binary_logloss: 0.151895	valid_0's gini: 0.277016
[500]	valid_0

In [294]:
from datetime import datetime

sub_df = pd.DataFrame()
sub_df['id'] = test['id']
sub_df['target'] = sum(y_test_pred)/n_folds

sub_df.to_csv('submission_baseline_LGBM_' + str(datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + '.csv', index=False)



### LGBM_Categorical Feature

In [63]:
def gini_lgbm(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = eval_gini(labels, preds)
    return 'gini', gini_score,True

In [64]:
# nominal value에 대해
cols =list(meta[(meta['level']=='nominal') & (meta['keep']==True)].index)

# LGBM
import lightgbm as lgb
seeds = 99
params =  {
        'objective': 'binary',
        'num_threads': 4,
        'learning_rate': 0.01, 
        'num_iterations' : 1000,
        'max_depth': -1,
        'reg_alpha': 0.3,
         'reg_lambda': 0.3,
        'bagging_seed' : seeds,
        'verbose' : -1,
        'seed' :seeds
    }



In [66]:
scores = []
y_test_pred= []
for i, (train_index, test_index) in enumerate(kf.split(train_df)):
    x_train,x_valid = train_df.iloc[train_index,:], train_df.iloc[test_index,:]
    y_train,y_valid = target.iloc[train_index], target.iloc[test_index]
    
#     x_train[cols] , x_valid[cols]= x_train[cols].astype(object), x_valid[cols].astype(object)
    lgb_train = lgb.Dataset(data = x_train, label = y_train)
    lgb_valid = lgb.Dataset(data = x_valid, label = y_valid)
    
    lgb_model = lgb.train(params, lgb_train, valid_sets = [lgb_valid], 
                          feval = gini_lgbm,verbose_eval = 100, early_stopping_rounds = early_stopping_round,
                         categorical_feature=cols)
        
    pred = lgb_model.predict(x_valid, num_iteration = lgb_model.best_iteration)
#     pred = fit_model.predict_proba(x_valid)[:,1]
    print(pred.shape)
    print( " Gini = ", eval_gini(y_valid, pred))
    scores.append(eval_gini(y_valid, pred))
    y_test_pred.append(lgb_model.predict(test_df))
    
    del x_train,x_valid,y_train,y_valid
    gc.collect()
print(" Cross validadation score, Gini " ,sum(scores)/n_folds)
    

Training until validation scores don't improve for 30 rounds
[100]	valid_0's binary_logloss: 0.151767	valid_0's gini: 0.249186
[200]	valid_0's binary_logloss: 0.150992	valid_0's gini: 0.257855
[300]	valid_0's binary_logloss: 0.150695	valid_0's gini: 0.262629
[400]	valid_0's binary_logloss: 0.150577	valid_0's gini: 0.265408
[500]	valid_0's binary_logloss: 0.150535	valid_0's gini: 0.266414
Early stopping, best iteration is:
[531]	valid_0's binary_logloss: 0.150533	valid_0's gini: 0.266525
(119043,)
 Gini =  0.2665252768135875
Training until validation scores don't improve for 30 rounds
[100]	valid_0's binary_logloss: 0.155468	valid_0's gini: 0.239307
[200]	valid_0's binary_logloss: 0.154779	valid_0's gini: 0.248473
[300]	valid_0's binary_logloss: 0.154578	valid_0's gini: 0.251739
[400]	valid_0's binary_logloss: 0.154536	valid_0's gini: 0.252678
Early stopping, best iteration is:
[384]	valid_0's binary_logloss: 0.154533	valid_0's gini: 0.252729
(119043,)
 Gini =  0.252728996421557
Trainin

In [67]:
from datetime import datetime

sub_df = pd.DataFrame()
sub_df['id'] = test['id']
sub_df['target'] = sum(y_test_pred)/n_folds

sub_df.to_csv('submission_baseline_LGBM_Cat_' + str(datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + '.csv', index=False)



### Catboost

In [None]:
from catboost import CatBoostClassifier

params =    {     #'iterations' : 2000,
        'n_estimators': 2000,
        'learning_rate': 0.05,
        'eval_metric': 'RMSE',
        'loss_function': 'RMSE',
        'random_seed': seed,
        'metric_period': 10,
        'task_type': 'GPU',
        #'subsample' : 0.8,
        'depth': 8,
    }

model = CatBoostClassifier(iterations=2,
                           learning_rate=1,
                           depth=2)
# Fit model
model.fit(train_data, train_labels, cat_features)
# Get predicted classes
preds_class = model.predict(eval_data)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(eval_data)
# Get predicted RawFormulaVal
preds_raw = model.predict(eval_data, prediction_type='RawFormulaVal')


 gbm = CatBoostRegressor(**params)
    print(i)
    gbm.fit(
            train_X, train_y,
             eval_set=(val_X, val_y),
            early_stopping_rounds = 30,
            cat_features=categorical_features,
            verbose=30)

### MLP

've used this to get a network that has a local CV AUC around 0.642, which corresponds to Gini of 0.284. 

the formula GINI = 2 * AUC -1
2*AUC-1 will be same as calculated with gini_normalized


In [21]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras import optimizers
from sklearn.metrics import roc_auc_score
# https://www.kaggle.com/rspadim/gini-keras-callback-earlystopping-validation

Using TensorFlow backend.


In [None]:
# kfold = StratifiedKFold(n_splits = K, 
#                             random_state = 100, 
#                             shuffle = True)    

In [29]:
import tensorflow as tf
import keras.backend as K
# FROM https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/discussion/41108
def jacek_auc(y_true, y_pred):
    score, up_opt = tf.metrics.auc(y_true, y_pred)
    #score, up_opt = tf.contrib.metrics.streaming_auc(y_pred, y_true)    
    K.get_session().run(tf.local_variables_initializer())
    with tf.control_dependencies([up_opt]):
        score = tf.identity(score)
    return score

# FROM https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/discussion/41015
# AUC for a binary classifier
def discussion41015_auc(y_true, y_pred):
    ptas = tf.stack([binary_PTA(y_true,y_pred,k) for k in np.linspace(0, 1, 1000)],axis=0)
    pfas = tf.stack([binary_PFA(y_true,y_pred,k) for k in np.linspace(0, 1, 1000)],axis=0)
    pfas = tf.concat([tf.ones((1,)) ,pfas],axis=0)
    binSizes = -(pfas[1:]-pfas[:-1])
    s = ptas*binSizes
    return K.sum(s, axis=0)

# PFA, prob false alert for binary classifier
def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
    y_pred = K.cast(y_pred >= threshold, 'float32')
    # N = total number of negative labels
    N = K.sum(1 - y_true)
    # FP = total number of false alerts, alerts from the negative class labels
    FP = K.sum(y_pred - y_pred * y_true)
    return FP/N

#----------------
# P_TA prob true alerts for binary classifier
def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
    y_pred = K.cast(y_pred >= threshold, 'float32')
    # P = total number of positive labels
    P = K.sum(y_true)
    # TP = total number of correct alerts, alerts from the positive class labels
    TP = K.sum(y_pred * y_true)
    return TP/P


def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return 'gini', gini_score

In [None]:
# https://www.kaggle.com/tomcwalker/keras-nn-with-custom-loss-function-for-gini-auc

In [44]:
target_mlp = keras.utils.np_utils.to_categorical(target)

In [49]:
batch_size = 50
epochs = 1000
learning_rate = 0.001
scores = []
y_test_pred= []

def mlpmodel(input_shape):
    model = Sequential()
    model.add(Dense(64, activation ='relu', input_dim = input_shape))
    model.add(Dropout(0.5))
#     model.add(Dense(64,activation='relu'))
#     model.add(Dropout(0.5))
#     model.add(Dense(64, activation='relu'))
    model.add(Dense(2,activation='sigmoid'))
    adam = optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
    model.compile(loss ='binary_crossentropy',
                 optimizer = adam,metrics = [jacek_auc])

    return model

callback_early_stopping = keras.callbacks.EarlyStopping(monitor='jacek_auc', patience=10, verbose=2, mode='max')

for i, (train_index, test_index) in enumerate(kf.split(train_df)):
    x_train,x_valid = train_df.iloc[train_index,:], train_df.iloc[test_index,:]
    y_train,y_valid = target_mlp[train_index], target_mlp[test_index]
    
    
    mlp = mlpmodel(x_train.shape[1])
    mlp.fit(x_train,y_train,
       epochs = epochs,
       batch_size = batch_size,
       callbacks = [callback_early_stopping],
       verbose = 2)
    
    
    pred  = mlp.predict(x_valid)
    score = -eval_gini(y_valid, pred[:,0])
    scores.append(score)
    print(i, "Gini ", score )
    
    y_test_pred.append(mlp.predict(test_df))
#     mlp.predict(test_df)
    del x_train,x_valid,y_train,y_valid
    gc.collect()

print(" Cross validadation score, Gini " ,sum(scores)/n_folds)
    
   

Epoch 1/1000
 - 22s - loss: 1211.5840 - jacek_auc: 0.9342
Epoch 2/1000
 - 21s - loss: 0.4911 - jacek_auc: 0.9556
Epoch 3/1000
 - 21s - loss: 0.2010 - jacek_auc: 0.9588
Epoch 4/1000
 - 21s - loss: 0.3525 - jacek_auc: 0.9601
Epoch 5/1000
 - 21s - loss: 0.1726 - jacek_auc: 0.9608
Epoch 6/1000
 - 21s - loss: 0.1830 - jacek_auc: 0.9613
Epoch 7/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.9617
Epoch 8/1000
 - 21s - loss: 0.2207 - jacek_auc: 0.9619
Epoch 9/1000
 - 21s - loss: 0.1582 - jacek_auc: 0.9619
Epoch 10/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.9622
Epoch 11/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.9623
Epoch 12/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.9624
Epoch 13/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.9625
Epoch 14/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.9625
Epoch 15/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.9625
Epoch 16/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.9628
Epoch 17/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.9628
Epoch 18/1000
 - 21s - loss: 0.1569 - jacek_auc: 0.962

KeyboardInterrupt: 

In [None]:
from datetime import datetime

sub_df = pd.DataFrame()
sub_df['id'] = test['id']
sub_df['target'] = sum(y_test_pred)/n_folds

sub_df.to_csv('submission_baseline_MLP_' + str(datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + '.csv', index=False)



In [41]:
y_test_pred[0][:,0]

array([0.03637472, 0.03637472, 0.03637472, ..., 0.03637472, 0.03637472,
       0.03637472], dtype=float32)

In [36]:
K


<module 'keras.backend' from 'C:\\Users\\yseon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\keras\\backend\\__init__.py'>