In [19]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import svm
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
import xgboost as xgb
from sklearn import metrics

In [52]:
# divid date
import datetime
def date2weekday(date):
    date = str(date)
    year = int(date[0:4])
    month = int(date[4:6])
    day = int(date[6:])
    return datetime.datetime(year,month,day).weekday()

def ant_score(truth,score):
    FNR1 = 0.001
    FNR2 = 0.005
    FNR3 = 0.01
    min1 = min2 = min3 = 1
    for thr in np.arange(0,1+0.001,0.001):
        evaluate_table = pd.DataFrame({'truth':truth,'score':score})
        evaluate_table.loc[evaluate_table['score']>=thr,'score']=1
        evaluate_table.loc[evaluate_table['score']<thr,'score']=0
        TP = evaluate_table.loc[(evaluate_table['score']==1)&(evaluate_table['truth']==1)].shape[0]
        FN = evaluate_table.loc[(evaluate_table['score']==0)&(evaluate_table['truth']==1)].shape[0]
        TN = evaluate_table.loc[(evaluate_table['score']==0)&(evaluate_table['truth']==0)].shape[0]
        FP = evaluate_table.loc[(evaluate_table['score']==1)&(evaluate_table['truth']==0)].shape[0]
        TPR = TP/(TP+FN)
        FNR = FP/(TN+FP)
        if abs(FNR-FNR1)<min1:
            min1 = abs(FNR-FNR1)
            FNR11 = FNR
            TPR1 = TPR
        if abs(FNR-FNR2)<min2:
            min2 = abs(FNR-FNR2)
            FNR22 = FNR
            TPR2 = TPR
        if abs(FNR-FNR3)<min3:
            min3 = abs(FNR-FNR3)
            FNR33 = FNR
            TPR3 = TPR
    return 0.4*TPR1+0.3*TPR2+0.3*TPR3

In [21]:
# constant definition
full_data_path = '../data/full_size/atec_anti_fraud_train.csv'

# write sample data to file
df = pd.read_csv(full_data_path,index_col = 0)
df = df[df['label']!=-1] # drop no label
df = df.sort_values(by=['date']) # sort by date
df['day'] = df['date'].apply(lambda x:int(str(x)[6:]))
df['weekday'] =df['date'].apply(date2weekday)
df = df.drop(columns=['date'])

In [50]:
# train test split
X = df.iloc[:,1:]
Y = df.iloc[:,0]
train_ratio = 0.8
train_num = int(train_ratio*X.shape[0])
train_x = X.iloc[:train_num,:]
train_y = Y.iloc[:train_num]
test_x = X.iloc[train_num:,:]
test_y = Y.iloc[train_num:]

counts = train_y.value_counts()
neg_num = counts[0]
pos_num = counts[1]

In [79]:
from xgboost import XGBClassifier
xgb_500n9d01l = XGBClassifier(learning_rate=0.1,max_delt_step=1,calsample_bytree=0.8,
                                subsample=0.8,n_estimators=500,max_depth=9)
xgb_500n9d01l_list = []
from sklearn.model_selection import KFold
kf = KFold(n_splits=4,random_state=0,shuffle=True)

i = 1
predict_probas = [] # save each sub model's out scoring results
for train_index,test_index in kf.split(train_x):
    sub_train_x = train_x.iloc[train_index]
    sub_train_y = train_y.iloc[train_index]
    sub_test_x = train_x.iloc[test_index]
    sub_test_y = train_y.iloc[test_index]
    
    # training
    print('fitting model{}...'.format(i))
    xgb_500n9d01l.fit(sub_train_x,sub_train_y,eval_metric='error',verbose=True,
                      eval_set=[(test_x, test_y)],early_stopping_rounds=100)
    print('model{} predicting....'.format(i))
    sub_predict_y = xgb_500n9d01l.predict(sub_test_x)
    sub_predict_y_proba = xgb_500n9d01l.predict_proba(sub_test_x)[:,1]
    print('sub scoring....')
    print('precision: {}, recall: {}, ant_score: {}'.format(metrics.precision_score(sub_test_y,sub_predict_y),
                                                           metrics.recall_score(sub_test_y,sub_predict_y),
                                                           ant_score(sub_test_y,sub_predict_y_proba)))
    print('out scoring...')
    predict_y = xgb_500n9d01l.predict(test_x)
    predict_y_proba = xgb_500n9d01l.predict_proba(test_x)[:,1]
    print('precision: {}, recall: {}, ant_score: {}'.format(metrics.precision_score(test_y,predict_y),
                                                       metrics.recall_score(test_y,predict_y),
                                                       ant_score(test_y,predict_y_proba)))
    predict_probas.append(predict_y_proba)
    xgb_500n9d01l_list.append(xgb_500n9d01l)
    i+=1

fitting model1...
[0]	validation_0-error:0.010924
Will train until validation_0-error hasn't improved in 100 rounds.
[1]	validation_0-error:0.010939
[2]	validation_0-error:0.010747
[3]	validation_0-error:0.010672
[4]	validation_0-error:0.010742
[5]	validation_0-error:0.010651
[6]	validation_0-error:0.010555
[7]	validation_0-error:0.010535
[8]	validation_0-error:0.010454
[9]	validation_0-error:0.01048
[10]	validation_0-error:0.01048
[11]	validation_0-error:0.010449
[12]	validation_0-error:0.010439
[13]	validation_0-error:0.010424
[14]	validation_0-error:0.010409
[15]	validation_0-error:0.010358
[16]	validation_0-error:0.010374
[17]	validation_0-error:0.010338
[18]	validation_0-error:0.010313
[19]	validation_0-error:0.010338
[20]	validation_0-error:0.010303
[21]	validation_0-error:0.010263
[22]	validation_0-error:0.010288
[23]	validation_0-error:0.010222
[24]	validation_0-error:0.010207
[25]	validation_0-error:0.010207
[26]	validation_0-error:0.010222
[27]	validation_0-error:0.010227
[28

  if diff:


sub scoring....
precision: 0.8147321428571429, recall: 0.611646418098031, ant_score: 0.7030582320904901
out scoring...


  if diff:


precision: 0.6926658905704307, recall: 0.4524714828897338, ant_score: 0.5042585551330798
fitting model2...
[0]	validation_0-error:0.011242
Will train until validation_0-error hasn't improved in 100 rounds.
[1]	validation_0-error:0.011172
[2]	validation_0-error:0.01103
[3]	validation_0-error:0.010752
[4]	validation_0-error:0.010667
[5]	validation_0-error:0.010768
[6]	validation_0-error:0.010737
[7]	validation_0-error:0.010732
[8]	validation_0-error:0.010712
[9]	validation_0-error:0.010667
[10]	validation_0-error:0.010656
[11]	validation_0-error:0.010591
[12]	validation_0-error:0.010611
[13]	validation_0-error:0.010581
[14]	validation_0-error:0.010601
[15]	validation_0-error:0.010586
[16]	validation_0-error:0.010566
[17]	validation_0-error:0.010545
[18]	validation_0-error:0.010545
[19]	validation_0-error:0.010485
[20]	validation_0-error:0.010439
[21]	validation_0-error:0.010454
[22]	validation_0-error:0.010459
[23]	validation_0-error:0.010444
[24]	validation_0-error:0.010475
[25]	validat

[240]	validation_0-error:0.009884
[241]	validation_0-error:0.009874
[242]	validation_0-error:0.009889
[243]	validation_0-error:0.009879
[244]	validation_0-error:0.009909
[245]	validation_0-error:0.009909
[246]	validation_0-error:0.009914
[247]	validation_0-error:0.009959
[248]	validation_0-error:0.009959
[249]	validation_0-error:0.009944
[250]	validation_0-error:0.009965
[251]	validation_0-error:0.009959
[252]	validation_0-error:0.009944
[253]	validation_0-error:0.009954
[254]	validation_0-error:0.009939
[255]	validation_0-error:0.009975
[256]	validation_0-error:0.009995
[257]	validation_0-error:0.009949
[258]	validation_0-error:0.009975
[259]	validation_0-error:0.00997
[260]	validation_0-error:0.009954
[261]	validation_0-error:0.009965
[262]	validation_0-error:0.00997
[263]	validation_0-error:0.009985
[264]	validation_0-error:0.00998
[265]	validation_0-error:0.009995
[266]	validation_0-error:0.009965
[267]	validation_0-error:0.009965
[268]	validation_0-error:0.009975
[269]	validation_

  if diff:


sub scoring....
precision: 0.8067928730512249, recall: 0.6192307692307693, ant_score: 0.6918376068376068
out scoring...


  if diff:


precision: 0.6795366795366795, recall: 0.46844106463878327, ant_score: 0.49885931558935365
fitting model3...
[0]	validation_0-error:0.011146
Will train until validation_0-error hasn't improved in 100 rounds.
[1]	validation_0-error:0.010909
[2]	validation_0-error:0.010717
[3]	validation_0-error:0.010631
[4]	validation_0-error:0.010611
[5]	validation_0-error:0.01055
[6]	validation_0-error:0.010545
[7]	validation_0-error:0.010495
[8]	validation_0-error:0.0105
[9]	validation_0-error:0.010459
[10]	validation_0-error:0.010384
[11]	validation_0-error:0.010384
[12]	validation_0-error:0.010374
[13]	validation_0-error:0.010439
[14]	validation_0-error:0.010384
[15]	validation_0-error:0.010419
[16]	validation_0-error:0.010414
[17]	validation_0-error:0.010364
[18]	validation_0-error:0.010293
[19]	validation_0-error:0.010283
[20]	validation_0-error:0.010263
[21]	validation_0-error:0.010338
[22]	validation_0-error:0.010303
[23]	validation_0-error:0.010293
[24]	validation_0-error:0.010288
[25]	validat

[240]	validation_0-error:0.009773
[241]	validation_0-error:0.009773
[242]	validation_0-error:0.009813
[243]	validation_0-error:0.009778
[244]	validation_0-error:0.009757
[245]	validation_0-error:0.009747
[246]	validation_0-error:0.009737
[247]	validation_0-error:0.009727
[248]	validation_0-error:0.009763
[249]	validation_0-error:0.009773
[250]	validation_0-error:0.009798
[251]	validation_0-error:0.009823
[252]	validation_0-error:0.009747
[253]	validation_0-error:0.009778
[254]	validation_0-error:0.009768
[255]	validation_0-error:0.009763
[256]	validation_0-error:0.009783
[257]	validation_0-error:0.009788
[258]	validation_0-error:0.009768
[259]	validation_0-error:0.009763
[260]	validation_0-error:0.009757
[261]	validation_0-error:0.009742
[262]	validation_0-error:0.009757
[263]	validation_0-error:0.009768
[264]	validation_0-error:0.009783
[265]	validation_0-error:0.009803
[266]	validation_0-error:0.009783
[267]	validation_0-error:0.009783
[268]	validation_0-error:0.009793
[269]	validati

  if diff:


sub scoring....
precision: 0.806573275862069, recall: 0.6209041891331398, ant_score: 0.6976358357527996
out scoring...


  if diff:


precision: 0.6843766438716465, recall: 0.49467680608365017, ant_score: 0.5075285171102661
fitting model4...
[0]	validation_0-error:0.011192
Will train until validation_0-error hasn't improved in 100 rounds.
[1]	validation_0-error:0.011035
[2]	validation_0-error:0.010995
[3]	validation_0-error:0.010929
[4]	validation_0-error:0.010813
[5]	validation_0-error:0.010707
[6]	validation_0-error:0.010571
[7]	validation_0-error:0.010626
[8]	validation_0-error:0.010545
[9]	validation_0-error:0.010495
[10]	validation_0-error:0.01049
[11]	validation_0-error:0.01047
[12]	validation_0-error:0.010495
[13]	validation_0-error:0.010525
[14]	validation_0-error:0.010475
[15]	validation_0-error:0.010348
[16]	validation_0-error:0.010348
[17]	validation_0-error:0.010364
[18]	validation_0-error:0.010328
[19]	validation_0-error:0.010323
[20]	validation_0-error:0.010318
[21]	validation_0-error:0.010303
[22]	validation_0-error:0.010278
[23]	validation_0-error:0.010278
[24]	validation_0-error:0.010232
[25]	validat

  if diff:


sub scoring....
precision: 0.8029445073612684, recall: 0.6023789294817332, ant_score: 0.7019541206457094
out scoring...


  if diff:


precision: 0.6774734488541084, recall: 0.46083650190114067, ant_score: 0.49490494296577947


In [80]:
import pickle
with open("xgd_500n9d01l_cv4.bat","wb") as f:
    pickle.dump(xgb_500n9d01l_list,f)

In [85]:
metrics.precision_score(test_y,predict_y),metrics.recall_score(test_y,predict_y)

(0.6774734488541084, 0.46083650190114067)

In [81]:
predict_probas = pd.DataFrame(predict_probas).T
predict_scores = predict_probas.mean(axis = 1).values

# retest the cv_model
ant_score(test_y,predict_scores)

0.5168441064638783

In [82]:
# conduct final outer test
testa_data = pd.read_csv('../data/full_size/atec_anti_fraud_test_a.csv',index_col = 0)

# split time
testa_data['day'] = testa_data['date'].apply(lambda x:int(str(x)[6:]))
testa_data['weekday'] =testa_data['date'].apply(date2weekday)
testa_data = testa_data.drop(columns=['date'])

# predict
print('predicting on final outer testset.....')
scores = [] # store score predicted by every cv model
for model in xgb_500n9d01l_list:
    score = model.predict_proba(testa_data)[:,1]
    scores.append(score)

final_result = pd.DataFrame({'score':pd.DataFrame(scores).T.mean(axis=1).values},index=testa_data.index)
final_result.to_csv('../submission/time_split_xgb500n9d0.1l.csv')

predicting on final outer testset.....


In [78]:
final_result = pd.DataFrame({'score':pd.DataFrame(scores).T.mean(axis=1).values},index=testa_data.index)
final_result.to_csv('../submission/time_split_xgb500n9d0.1l.csv')