# 1. Pre definition

In [1]:
import pandas as pd
import sklearn
from sklearn import svm
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV



In [2]:
# divid date
import datetime
def date2weekday(date):
    date = str(date)
    year = int(date[0:4])
    month = int(date[4:6])
    day = int(date[6:])
    return datetime.datetime(year,month,day).weekday()

def ant_score(truth,score):
    FNR1 = 0.001
    FNR2 = 0.005
    FNR3 = 0.01
    min1 = min2 = min3 = 1
    for thr in np.arange(0,1+0.001,0.001):
        evaluate_table = pd.DataFrame({'truth':truth,'score':score})
        evaluate_table.loc[evaluate_table['score']>=thr,'score']=1
        evaluate_table.loc[evaluate_table['score']<thr,'score']=0
        TP = evaluate_table.loc[(evaluate_table['score']==1)&(evaluate_table['truth']==1)].shape[0]
        FN = evaluate_table.loc[(evaluate_table['score']==0)&(evaluate_table['truth']==1)].shape[0]
        TN = evaluate_table.loc[(evaluate_table['score']==0)&(evaluate_table['truth']==0)].shape[0]
        FP = evaluate_table.loc[(evaluate_table['score']==1)&(evaluate_table['truth']==0)].shape[0]
        TPR = TP/(TP+FN)
        FNR = FP/(TN+FP)
        if abs(FNR-FNR1)<min1:
            min1 = abs(FNR-FNR1)
            FNR11 = FNR
            TPR1 = TPR
        if abs(FNR-FNR2)<min2:
            min2 = abs(FNR-FNR2)
            FNR22 = FNR
            TPR2 = TPR
        if abs(FNR-FNR3)<min3:
            min3 = abs(FNR-FNR3)
            FNR33 = FNR
            TPR3 = TPR
    return 0.4*TPR1+0.3*TPR2+0.3*TPR3


import numpy as np
from sklearn import metrics
import bisect


def get_tpr_from_fpr(fpr_array, tpr_array, target):
    fpr_index = np.where(fpr_array == target)
    assert target <= 0.01, 'the value of fpr in the custom metric function need lt 0.01'
    if len(fpr_index[0]) > 0:
        return np.mean(tpr_array[fpr_index])
    else:
        tmp_index = bisect.bisect(fpr_array, target)
        fpr_tmp_1 = fpr_array[tmp_index-1]
        fpr_tmp_2 = fpr_array[tmp_index]
        if (target - fpr_tmp_1) > (fpr_tmp_2 - target):
            tpr_index = tmp_index
        else:
            tpr_index = tmp_index - 1
        return tpr_array[tpr_index]


def eval_metric(labels,pred):
    fpr, tpr, _ = metrics.roc_curve(labels, pred, pos_label=1)
    tpr1 = get_tpr_from_fpr(fpr, tpr, 0.001)
    tpr2 = get_tpr_from_fpr(fpr, tpr, 0.005)
    tpr3 = get_tpr_from_fpr(fpr, tpr, 0.01)
    return 0.4*tpr1 + 0.3*tpr2 + 0.3*tpr3

# 2. data preparation

## 2.1 read train and test data

In [3]:
# constant definition
# small_data_path = './data/small_size/sample_atec_anti_fraud_train.csv'
train_path = '../data/full_size/atec_anti_fraud_train.csv'
testb_path='../data/full_size/atec_anti_fraud_test_b.csv'
train_data = pd.read_csv(train_path,index_col = 0)
testb_data = pd.read_csv('../data/full_size/atec_anti_fraud_test_b.csv',index_col = 0)

## 2.2 feature selection

In [4]:
# find no missing value features
no_nan_features = ['date']
i = 1
while i<len(testb_data.columns):
    name = testb_data.columns[i]
    if train_data[name].isnull().sum()==0:
        no_nan_features.append(name)
    i+=1
    
# find small missing features
small_missing_features=[]
i = 1
while i<len(testb_data.columns):
    name = testb_data.columns[i]
    train_missing_rate = train_data[name].isnull().sum()/train_data.shape[0]
    test_missing_rate = testb_data[name].isnull().sum()/testb_data.shape[0]
    if 0<train_missing_rate<0.3 and abs(test_missing_rate-train_missing_rate)<0.1:
        small_missing_features.append(name)
    i+=1
    
filldable_features = small_missing_features+no_nan_features
# feature selection
feature_score_files=['xgb_feature_scores.csv','lgb_feature_scores2.csv']
common_important_features=set()
all_important_features=set()
top=100
for file in feature_score_files:
    features = set(pd.read_csv(file,index_col = 0,header=None).sort_values(by=1,ascending=False).iloc[:top,0].index.tolist())
    all_important_features = all_important_features|features
    if common_important_features:
        common_important_features = common_important_features&features
    else:
        common_important_features=features

## 2.3 fill missing values for fiildable columns

In [5]:
train_data[filldable_features] = train_data[filldable_features].fillna(train_data[filldable_features].mean())
testb_data[filldable_features] = testb_data[filldable_features].fillna(testb_data[filldable_features].mean())

## 2.4 eliminate unlabeled data/convert -1 to 1

In [6]:
# convert -1 to 1
train_data['label'] = train_data['label'].apply(lambda x: 1 if x==-1 else x)
train_data = train_data.sort_values(by=['date'])

In [7]:
from imblearn.over_sampling import SMOTE,ADASYN
def random_subsample(data,target_ratio):
    pos_data = data[data['label']==0]
    neg_data = data[data['label']==1]
    target_pos_num = int(neg_data.shape[0]/target_ratio)
    pos_data = data.iloc[np.random.randint(pos_data.shape[0],size=target_pos_num),:]
    return pd.concat([pos_data,neg_data])
    
def random_oversample(data,target_ratio):
    pos_data = data[data['label']==0]
    neg_data = data[data['label']==1]
    target_new_neg_num = int(pos_data.shape[0]*target_ratio)-neg_data.shape[0]
    new_neg_data = neg_data.iloc[np.random.randint(neg_data.shape[0],size=target_new_neg_num),:]
    neg_data = pd.concat([neg_data,new_neg_data])
    
    return pd.concat([pos_data,neg_data])
    
def mySMOTE(data):
    X = data.drop(columns=['label'])
    Y = data.label
    resampled_x,resampled_y = SMOTE().fit_sample(X,Y)
    resampled_data = resasmpled_x.copy()
    resampled_data['label'] = resampled_y
    return resampled_data
    
def myADASYN(data):
    X = data.drop(columns=['label'])
    Y = data.label
    resampled_x,resampled_y = ADASYN().fit_sample(X,Y)
    resampled_data = resasmpled_x.copy()
    resampled_data['label'] = resampled_y
    return resampled_data
    
    

## 2.6 train test split

In [8]:
# # strategy 1: only filldable
# train_data = train_data[['label']+filldable_features]

# stragey 2: filldable | common_important:158
# selected_features = list(set(filldable_features)|common_important_features)
selected_features = testb_data.columns.tolist()
train_data = train_data[['label']+selected_features]

# # strategy 3: filldable|all_important
# selected_features = list(set(filldable_features)|all_important_features)
# train_data = train_data[['label']+selected_features]


train_num = int(0.8*train_data.shape[0])
test_data = train_data.iloc[train_num:,:]
cut_train_data = train_data.iloc[:train_num,:]
sampled_train_data = cut_train_data.copy()
# sampled_train_data = random_oversample(sampled_train_data,0.05)
train_x = sampled_train_data.drop(columns=['label'])
train_y = sampled_train_data['label']
test_x = test_data.drop(columns=['label'])
test_y = test_data['label']

In [None]:
len(selected_features)

# 3. Build LightGMB model

## 3.1 Grid Search for best parameters

In [None]:
param = {'max_depth':[3,5,7,9],'num_leaves':[6,25,85,350], 'num_trees':[100,300,500,700,900]}
lgb_clf = lgb.LGBMClassifier(boosting_type= 'gbdt', objective='binary',colsample_bytree=0.8,
                   subsample= 0.8)
lgb_random_cv = RandomizedSearchCV(lgb_clf,param,verbose=True,scoring='precision')
lgb_random_cv.fit(train_x,train_y,eval_metric='error',eval_set=[(test_x, test_y)],early_stopping_rounds=20)
lgb_random_cv.best_params_

## 3.2 train lightgbm based on best parameters with cross validation

In [None]:
from sklearn.model_selection import KFold
def kfold_by_date(data,k=5):
    kf = KFold(n_splits=k,random_state=0,shuffle=True)
    dates = np.array(data['date'].value_counts().index.tolist())
    for train_index,test_index in kf.split(dates):
        train_data = data[data['date'].isin(dates[train_index])]
        valid_data = data[data['date'].isin(dates[test_index])]
        yield train_data,valid_data

### 3.2.1 normal k-fold cross validation

In [9]:
train_x = train_x.drop(columns=['date'])
test_x = test_x.drop(columns=['date'])

In [16]:
# train best parameter lgb_random_cv
neg_num = sampled_train_data.label.value_counts()[0]
pos_num = sampled_train_data.label.value_counts()[0]
lgb_300n350le9d01l = lgb.LGBMClassifier(boosting_type= 'dart', objective='binary',colsample_bytree=1,
                   subsample= 1,min_data_in_leaf=150,lambda_l2=0.5,drop_rate = 0.5,max_depth=9,num_leaves=350,
                                        n_estimators=500,random_state = 0,scale_pos_weight=neg_num/pos_num,learning_rate=0.08)

from sklearn.model_selection import KFold
from sklearn import metrics
kf = KFold(n_splits=10,random_state=0,shuffle=True)
lgb_300n350le9d01l_list = []
predict_probas = []
i=1
for train_index,test_index in kf.split(sampled_train_data):
    sub_train_x = train_x.iloc[train_index]
    sub_train_y = train_y.iloc[train_index]
    sub_test_x = train_x.iloc[test_index]
    sub_test_y = train_y.iloc[test_index]
    
    # training
    print('fitting model{}...'.format(i))
    lgb_300n350le9d01l.fit(sub_train_x,sub_train_y,eval_metric='error',eval_set=[(test_x, test_y)],early_stopping_rounds=100)
    print('model{} predicting...'.format(i))
    sub_predict_y = lgb_300n350le9d01l.predict(sub_test_x)
    sub_predict_y_proba = lgb_300n350le9d01l.predict_proba(sub_test_x)[:,1]
    print('sub scoring...')
    print('precision: {},recall:{},ant_score:{}'.format(metrics.precision_score(sub_test_y,sub_predict_y),
                                                           metrics.recall_score(sub_test_y,sub_predict_y),
                                                           eval_metric(sub_test_y,sub_predict_y_proba)))
    print('out scoring...')
    predict_y = lgb_300n350le9d01l.predict(test_x)
    predict_y_proba = lgb_300n350le9d01l.predict_proba(test_x)[:,1]
    print('precision: {}, recall: {}, ant_score: {}'.format(metrics.precision_score(test_y,predict_y),
                                                       metrics.recall_score(test_y,predict_y),
                                                       eval_metric(test_y,predict_y_proba)))
    predict_probas.append(predict_y_proba)
    lgb_300n350le9d01l_list.append(lgb_300n350le9d01l)
    i+=1

fitting model1...
[1]	valid_0's binary_error: 0.0148532
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's binary_error: 0.0146471
[3]	valid_0's binary_error: 0.0146019
[4]	valid_0's binary_error: 0.014622
[5]	valid_0's binary_error: 0.0145114
[6]	valid_0's binary_error: 0.0145315
[7]	valid_0's binary_error: 0.0145617
[8]	valid_0's binary_error: 0.0145818
[9]	valid_0's binary_error: 0.0144963
[10]	valid_0's binary_error: 0.0145667
[11]	valid_0's binary_error: 0.0144511
[12]	valid_0's binary_error: 0.014436
[13]	valid_0's binary_error: 0.0145516
[14]	valid_0's binary_error: 0.0144611
[15]	valid_0's binary_error: 0.0144812
[16]	valid_0's binary_error: 0.014436
[17]	valid_0's binary_error: 0.014436
[18]	valid_0's binary_error: 0.0144109
[19]	valid_0's binary_error: 0.0144159
[20]	valid_0's binary_error: 0.0144561
[21]	valid_0's binary_error: 0.0144762
[22]	valid_0's binary_error: 0.0144762
[23]	valid_0's binary_error: 0.0144812
[24]	valid_0's binary_error: 0.0144

[208]	valid_0's binary_error: 0.0136066
[209]	valid_0's binary_error: 0.0136217
[210]	valid_0's binary_error: 0.0136066
[211]	valid_0's binary_error: 0.0136117
[212]	valid_0's binary_error: 0.0136117
[213]	valid_0's binary_error: 0.0136066
[214]	valid_0's binary_error: 0.013667
[215]	valid_0's binary_error: 0.013677
[216]	valid_0's binary_error: 0.013677
[217]	valid_0's binary_error: 0.0136469
[218]	valid_0's binary_error: 0.0136569
[219]	valid_0's binary_error: 0.0136619
[220]	valid_0's binary_error: 0.013677
[221]	valid_0's binary_error: 0.013677
[222]	valid_0's binary_error: 0.013672
[223]	valid_0's binary_error: 0.0136619
[224]	valid_0's binary_error: 0.0136569
[225]	valid_0's binary_error: 0.013677
[226]	valid_0's binary_error: 0.013677
[227]	valid_0's binary_error: 0.0136619
[228]	valid_0's binary_error: 0.013677
[229]	valid_0's binary_error: 0.0136418
[230]	valid_0's binary_error: 0.0136368
[231]	valid_0's binary_error: 0.0136217
[232]	valid_0's binary_error: 0.0136469
[233]	val

[113]	valid_0's binary_error: 0.0138077
[114]	valid_0's binary_error: 0.0138379
[115]	valid_0's binary_error: 0.0137826
[116]	valid_0's binary_error: 0.0137876
[117]	valid_0's binary_error: 0.0138178
[118]	valid_0's binary_error: 0.0137976
[119]	valid_0's binary_error: 0.0137474
[120]	valid_0's binary_error: 0.0137775
[121]	valid_0's binary_error: 0.0137926
[122]	valid_0's binary_error: 0.0138178
[123]	valid_0's binary_error: 0.0137725
[124]	valid_0's binary_error: 0.0138077
[125]	valid_0's binary_error: 0.0138127
[126]	valid_0's binary_error: 0.0138077
[127]	valid_0's binary_error: 0.0138178
[128]	valid_0's binary_error: 0.0138127
[129]	valid_0's binary_error: 0.0138127
[130]	valid_0's binary_error: 0.0138127
[131]	valid_0's binary_error: 0.0138228
[132]	valid_0's binary_error: 0.0138178
[133]	valid_0's binary_error: 0.0138077
[134]	valid_0's binary_error: 0.0138027
[135]	valid_0's binary_error: 0.0137976
[136]	valid_0's binary_error: 0.0138228
[137]	valid_0's binary_error: 0.0138077


[18]	valid_0's binary_error: 0.0143204
[19]	valid_0's binary_error: 0.0143455
[20]	valid_0's binary_error: 0.0143707
[21]	valid_0's binary_error: 0.0143656
[22]	valid_0's binary_error: 0.0143757
[23]	valid_0's binary_error: 0.0144159
[24]	valid_0's binary_error: 0.0144611
[25]	valid_0's binary_error: 0.0144611
[26]	valid_0's binary_error: 0.0144611
[27]	valid_0's binary_error: 0.0144913
[28]	valid_0's binary_error: 0.0144913
[29]	valid_0's binary_error: 0.0143053
[30]	valid_0's binary_error: 0.0143857
[31]	valid_0's binary_error: 0.0144008
[32]	valid_0's binary_error: 0.0144058
[33]	valid_0's binary_error: 0.0143958
[34]	valid_0's binary_error: 0.0144209
[35]	valid_0's binary_error: 0.0144159
[36]	valid_0's binary_error: 0.0143304
[37]	valid_0's binary_error: 0.0143254
[38]	valid_0's binary_error: 0.0143254
[39]	valid_0's binary_error: 0.0143707
[40]	valid_0's binary_error: 0.0143656
[41]	valid_0's binary_error: 0.0143455
[42]	valid_0's binary_error: 0.0143707
[43]	valid_0's binary_err

[226]	valid_0's binary_error: 0.0135011
[227]	valid_0's binary_error: 0.0135413
[228]	valid_0's binary_error: 0.0135614
[229]	valid_0's binary_error: 0.0135363
[230]	valid_0's binary_error: 0.0135413
[231]	valid_0's binary_error: 0.0135513
[232]	valid_0's binary_error: 0.0135312
[233]	valid_0's binary_error: 0.0135262
[234]	valid_0's binary_error: 0.0135011
[235]	valid_0's binary_error: 0.0135413
[236]	valid_0's binary_error: 0.0135463
[237]	valid_0's binary_error: 0.0135664
[238]	valid_0's binary_error: 0.0135715
[239]	valid_0's binary_error: 0.0135413
[240]	valid_0's binary_error: 0.0135614
[241]	valid_0's binary_error: 0.0135111
[242]	valid_0's binary_error: 0.0135212
[243]	valid_0's binary_error: 0.0135463
[244]	valid_0's binary_error: 0.0135363
[245]	valid_0's binary_error: 0.0135463
[246]	valid_0's binary_error: 0.0135463
[247]	valid_0's binary_error: 0.0135363
[248]	valid_0's binary_error: 0.0135513
[249]	valid_0's binary_error: 0.0135513
[250]	valid_0's binary_error: 0.0135463


[432]	valid_0's binary_error: 0.0131995
[433]	valid_0's binary_error: 0.0132045
[434]	valid_0's binary_error: 0.0132045
[435]	valid_0's binary_error: 0.0132045
[436]	valid_0's binary_error: 0.0132045
[437]	valid_0's binary_error: 0.0131844
[438]	valid_0's binary_error: 0.0132045
[439]	valid_0's binary_error: 0.0132196
[440]	valid_0's binary_error: 0.0132146
[441]	valid_0's binary_error: 0.0132196
[442]	valid_0's binary_error: 0.0132196
[443]	valid_0's binary_error: 0.0132196
[444]	valid_0's binary_error: 0.0132297
[445]	valid_0's binary_error: 0.0132297
[446]	valid_0's binary_error: 0.0132196
[447]	valid_0's binary_error: 0.0132146
[448]	valid_0's binary_error: 0.0132095
[449]	valid_0's binary_error: 0.0131844
[450]	valid_0's binary_error: 0.0131794
[451]	valid_0's binary_error: 0.0131844
[452]	valid_0's binary_error: 0.0132196
[453]	valid_0's binary_error: 0.0132196
[454]	valid_0's binary_error: 0.0132246
[455]	valid_0's binary_error: 0.0132246
[456]	valid_0's binary_error: 0.0132246


[131]	valid_0's binary_error: 0.013863
[132]	valid_0's binary_error: 0.0138479
[133]	valid_0's binary_error: 0.0138429
[134]	valid_0's binary_error: 0.0138379
[135]	valid_0's binary_error: 0.0138429
[136]	valid_0's binary_error: 0.0138429
[137]	valid_0's binary_error: 0.0138379
[138]	valid_0's binary_error: 0.0138429
[139]	valid_0's binary_error: 0.0138429
[140]	valid_0's binary_error: 0.0138328
[141]	valid_0's binary_error: 0.0138429
[142]	valid_0's binary_error: 0.0138429
[143]	valid_0's binary_error: 0.0138328
[144]	valid_0's binary_error: 0.0138278
[145]	valid_0's binary_error: 0.0138379
[146]	valid_0's binary_error: 0.0137976
[147]	valid_0's binary_error: 0.0137675
[148]	valid_0's binary_error: 0.0137574
[149]	valid_0's binary_error: 0.0137725
[150]	valid_0's binary_error: 0.0137273
[151]	valid_0's binary_error: 0.013682
[152]	valid_0's binary_error: 0.0137021
[153]	valid_0's binary_error: 0.0137122
[154]	valid_0's binary_error: 0.0137172
[155]	valid_0's binary_error: 0.0137222
[1

[56]	valid_0's binary_error: 0.0143154
[57]	valid_0's binary_error: 0.0143757
[58]	valid_0's binary_error: 0.0143757
[59]	valid_0's binary_error: 0.0143455
[60]	valid_0's binary_error: 0.0143455
[61]	valid_0's binary_error: 0.0143405
[62]	valid_0's binary_error: 0.0142953
[63]	valid_0's binary_error: 0.0143103
[64]	valid_0's binary_error: 0.0142148
[65]	valid_0's binary_error: 0.0141394
[66]	valid_0's binary_error: 0.0140791
[67]	valid_0's binary_error: 0.0141193
[68]	valid_0's binary_error: 0.0140691
[69]	valid_0's binary_error: 0.0140892
[70]	valid_0's binary_error: 0.0141143
[71]	valid_0's binary_error: 0.0141093
[72]	valid_0's binary_error: 0.0141244
[73]	valid_0's binary_error: 0.014049
[74]	valid_0's binary_error: 0.014059
[75]	valid_0's binary_error: 0.0140138
[76]	valid_0's binary_error: 0.0140289
[77]	valid_0's binary_error: 0.0140289
[78]	valid_0's binary_error: 0.0139484
[79]	valid_0's binary_error: 0.0139032
[80]	valid_0's binary_error: 0.0137725
[81]	valid_0's binary_error

[263]	valid_0's binary_error: 0.0135664
[264]	valid_0's binary_error: 0.0135664
[265]	valid_0's binary_error: 0.0135614
[266]	valid_0's binary_error: 0.0135564
[267]	valid_0's binary_error: 0.0135564
[268]	valid_0's binary_error: 0.0135865
[269]	valid_0's binary_error: 0.0135815
[270]	valid_0's binary_error: 0.0135865
[271]	valid_0's binary_error: 0.0135865
[272]	valid_0's binary_error: 0.0135664
[273]	valid_0's binary_error: 0.0135715
[274]	valid_0's binary_error: 0.0135564
[275]	valid_0's binary_error: 0.0135614
[276]	valid_0's binary_error: 0.0135664
[277]	valid_0's binary_error: 0.0135664
[278]	valid_0's binary_error: 0.0135463
[279]	valid_0's binary_error: 0.0135413
[280]	valid_0's binary_error: 0.0135513
[281]	valid_0's binary_error: 0.0135111
[282]	valid_0's binary_error: 0.0135111
[283]	valid_0's binary_error: 0.0135564
[284]	valid_0's binary_error: 0.0135513
[285]	valid_0's binary_error: 0.0135262
[286]	valid_0's binary_error: 0.0135212
[287]	valid_0's binary_error: 0.0135212


[175]	valid_0's binary_error: 0.0135212
[176]	valid_0's binary_error: 0.0135463
[177]	valid_0's binary_error: 0.0135162
[178]	valid_0's binary_error: 0.0135162
[179]	valid_0's binary_error: 0.0135061
[180]	valid_0's binary_error: 0.0135111
[181]	valid_0's binary_error: 0.0135664
[182]	valid_0's binary_error: 0.0135513
[183]	valid_0's binary_error: 0.0135513
[184]	valid_0's binary_error: 0.0135664
[185]	valid_0's binary_error: 0.0135513
[186]	valid_0's binary_error: 0.0135312
[187]	valid_0's binary_error: 0.0135011
[188]	valid_0's binary_error: 0.0135061
[189]	valid_0's binary_error: 0.0135212
[190]	valid_0's binary_error: 0.0135212
[191]	valid_0's binary_error: 0.0135262
[192]	valid_0's binary_error: 0.0135463
[193]	valid_0's binary_error: 0.0135865
[194]	valid_0's binary_error: 0.0136016
[195]	valid_0's binary_error: 0.0136368
[196]	valid_0's binary_error: 0.0136318
[197]	valid_0's binary_error: 0.0136117
[198]	valid_0's binary_error: 0.0136066
[199]	valid_0's binary_error: 0.0135865


[86]	valid_0's binary_error: 0.0139685
[87]	valid_0's binary_error: 0.0139283
[88]	valid_0's binary_error: 0.0139233
[89]	valid_0's binary_error: 0.0138429
[90]	valid_0's binary_error: 0.0138529
[91]	valid_0's binary_error: 0.0138831
[92]	valid_0's binary_error: 0.013858
[93]	valid_0's binary_error: 0.0139233
[94]	valid_0's binary_error: 0.0139334
[95]	valid_0's binary_error: 0.0138781
[96]	valid_0's binary_error: 0.0138931
[97]	valid_0's binary_error: 0.013863
[98]	valid_0's binary_error: 0.013858
[99]	valid_0's binary_error: 0.013868
[100]	valid_0's binary_error: 0.0138982
[101]	valid_0's binary_error: 0.013868
[102]	valid_0's binary_error: 0.0139032
[103]	valid_0's binary_error: 0.0138077
[104]	valid_0's binary_error: 0.0138228
[105]	valid_0's binary_error: 0.0138278
[106]	valid_0's binary_error: 0.0138379
[107]	valid_0's binary_error: 0.0138479
[108]	valid_0's binary_error: 0.0138228
[109]	valid_0's binary_error: 0.0138278
[110]	valid_0's binary_error: 0.0137574
[111]	valid_0's bin

[14]	valid_0's binary_error: 0.014431
[15]	valid_0's binary_error: 0.0144561
[16]	valid_0's binary_error: 0.0144762
[17]	valid_0's binary_error: 0.0144712
[18]	valid_0's binary_error: 0.0144812
[19]	valid_0's binary_error: 0.0144712
[20]	valid_0's binary_error: 0.0144812
[21]	valid_0's binary_error: 0.0144812
[22]	valid_0's binary_error: 0.0145013
[23]	valid_0's binary_error: 0.0145064
[24]	valid_0's binary_error: 0.0144662
[25]	valid_0's binary_error: 0.0144611
[26]	valid_0's binary_error: 0.0143556
[27]	valid_0's binary_error: 0.0143405
[28]	valid_0's binary_error: 0.0143204
[29]	valid_0's binary_error: 0.014426
[30]	valid_0's binary_error: 0.0143405
[31]	valid_0's binary_error: 0.0143606
[32]	valid_0's binary_error: 0.0143757
[33]	valid_0's binary_error: 0.0143606
[34]	valid_0's binary_error: 0.0144159
[35]	valid_0's binary_error: 0.0143606
[36]	valid_0's binary_error: 0.0143857
[37]	valid_0's binary_error: 0.0143757
[38]	valid_0's binary_error: 0.0143958
[39]	valid_0's binary_error

[222]	valid_0's binary_error: 0.013667
[223]	valid_0's binary_error: 0.0136519
[224]	valid_0's binary_error: 0.0136418
[225]	valid_0's binary_error: 0.0136368
[226]	valid_0's binary_error: 0.0136318
[227]	valid_0's binary_error: 0.0136469
[228]	valid_0's binary_error: 0.0136267
[229]	valid_0's binary_error: 0.0136066
[230]	valid_0's binary_error: 0.0136167
[231]	valid_0's binary_error: 0.0136217
[232]	valid_0's binary_error: 0.013667
[233]	valid_0's binary_error: 0.0136519
[234]	valid_0's binary_error: 0.0136569
[235]	valid_0's binary_error: 0.0136619
[236]	valid_0's binary_error: 0.013672
[237]	valid_0's binary_error: 0.013672
[238]	valid_0's binary_error: 0.013667
[239]	valid_0's binary_error: 0.0136368
[240]	valid_0's binary_error: 0.0136368
[241]	valid_0's binary_error: 0.0136519
[242]	valid_0's binary_error: 0.0136519
[243]	valid_0's binary_error: 0.0136368
[244]	valid_0's binary_error: 0.0136418
[245]	valid_0's binary_error: 0.0136117
[246]	valid_0's binary_error: 0.0136267
[247]

MemoryError: 

### 3.2.2 date-based kfold-validation

In [None]:
# train best parameter lgb_random_cv

lgb_300n350le9d01l = lgb.LGBMClassifier(boosting_type= 'dart', objective='binary',colsample_bytree=0.8,
                   subsample= 0.8,max_depth=10,num_leaves=800,n_estimators=100)

from sklearn.model_selection import KFold
from sklearn import metrics
kf = KFold(n_splits=5,random_state=0,shuffle=True)
lgb_300n350le9d01l_list = []
predict_probas = []
i=1
for sub_train_data,sub_test_data in kfold_by_date(sampled_train_data,k=5):
    sub_train_x = sub_train_data.drop(columns=['label'])
    sub_train_y = sub_train_data['label']
    sub_test_x = sub_test_data.drop(columns=['label'])
    sub_test_y = sub_test_data['label']
    # training
    print('fitting model{}...'.format(i))
    lgb_300n350le9d01l.fit(sub_train_x,sub_train_y,eval_metric='error',eval_set=[(test_x, test_y)],early_stopping_rounds=50)
    print('model{} predicting...'.format(i))
    sub_predict_y = lgb_300n350le9d01l.predict(sub_test_x)
    sub_predict_y_proba = lgb_300n350le9d01l.predict_proba(sub_test_x)[:,1]
    print('sub scoring...')
    print('precision: {},recall:{},ant_score:{}'.format(metrics.precision_score(sub_test_y,sub_predict_y),
                                                           metrics.recall_score(sub_test_y,sub_predict_y),
                                                           eval_metric(sub_test_y,sub_predict_y)))
    print('out scoring...')
    predict_y = lgb_300n350le9d01l.predict(test_x)
    predict_y_proba = lgb_300n350le9d01l.predict_proba(test_x)[:,1]
    print('precision: {}, recall: {}, ant_score: {}'.format(metrics.precision_score(test_y,predict_y),
                                                       metrics.recall_score(test_y,predict_y),
                                                       eval_metric(test_y,predict_y)))
    predict_probas.append(predict_y_proba)
    lgb_300n350le9d01l_list.append(lgb_300n350le9d01l)
    i+=1

In [None]:
lgb_300n350le9d01l

## 3.3 evaluation on test set

In [17]:
predict_probas = pd.DataFrame(predict_probas).T
predict_scores = predict_probas.mean(axis = 1).values
print('ant_score:{}'.format(eval_metric(test_y,predict_scores)))

ant_score:0.4193946517778431


In [None]:
train_x.shape

## 3.4 prediction on testb set

In [19]:
# # predict on testb
testb_data = pd.read_csv('../data/full_size/atec_anti_fraud_test_b.csv',index_col = 0)

# predict
# testb_data[filldable_features] = testb_data[filldable_features].fillna(testb_data[filldable_features].mean())
testb_data = testb_data[selected_features].drop(columns=['date'])
# testb_data['date'] = testb_data['date'].apply(lambda x:int(str(x)[6:]))
print('predicting on final outer testset.....')
scores = [] # store score predicted by every cv model
for model in lgb_300n350le9d01l_list:
    score = model.predict_proba(testb_data)[:,1]
    scores.append(score)

predicting on final outer testset.....


In [20]:
final_scores = pd.DataFrame(scores).T.mean(axis=1).values
print(final_scores.mean(),final_scores.std())

0.115136787162 0.0553133613047


In [21]:
print('writing result to file...')
final_result = pd.DataFrame({'score':final_scores},index=testb_data.index)
final_result.to_csv('../submission/testb_lgb_500n350le9d008l_fillnan_sf157_normalcv8_original_date_scale_pos_weight.csv')

writing result to file...
