# 1. Preliminary definitions

In [2]:
import pandas as pd
import sklearn
from sklearn import svm
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [3]:
# convert a YYYYMMDD date into its weekday index
import datetime
def date2weekday(date):
    """Return the weekday index (Monday=0 ... Sunday=6) of a YYYYMMDD date.

    ``date`` may be an int or a string. It is converted with ``str`` and
    split by position: characters 0-3 are the year, 4-5 the month and
    everything from position 6 onward the day.
    """
    text = str(date)
    pieces = (text[0:4], text[4:6], text[6:])
    year, month, day = (int(piece) for piece in pieces)
    return datetime.datetime(year, month, day).weekday()

def ant_score(truth,score):
    """Weighted true-positive rate at three target false-positive rates.

    Thresholds 0, 0.001, ..., 1 are swept. At each threshold a sample is
    flagged when ``score >= thr``; TPR = TP/(TP+FN) and FPR = FP/(FP+TN) are
    computed from the resulting confusion counts. For each target FPR
    (0.001, 0.005, 0.01) the TPR at the first threshold whose FPR is closest
    to the target is kept, and the result is
    ``0.4*TPR@0.001 + 0.3*TPR@0.005 + 0.3*TPR@0.01``.

    Parameters
    ----------
    truth : sequence of labels; 1 marks a positive, 0 a negative.
    score : sequence of scores, same length as ``truth``.
        NOTE(review): assumed to lie in [0, 1] (probabilities) - confirm.

    Raises
    ------
    ZeroDivisionError
        If no row with label 1 (or no row with label 0) has a comparable score.
    """
    # Build the table once; doing it inside the sweep repeated the same
    # construction for every threshold.
    table = pd.DataFrame({'truth': truth, 'score': score})
    labels = table['truth'].values
    scores = table['score'].values
    is_positive = labels == 1
    is_negative = labels == 0

    target_fprs = (0.001, 0.005, 0.01)
    best_gaps = [1, 1, 1]
    best_tprs = [None, None, None]
    for thr in np.arange(0, 1 + 0.001, 0.001):
        flagged = scores >= thr
        cleared = scores < thr
        # Plain ints so the divisions behave like ordinary Python arithmetic.
        TP = int(np.count_nonzero(flagged & is_positive))
        FN = int(np.count_nonzero(cleared & is_positive))
        TN = int(np.count_nonzero(cleared & is_negative))
        FP = int(np.count_nonzero(flagged & is_negative))
        TPR = TP / (TP + FN)
        # This ratio is the false-positive rate (it was previously named FNR).
        FPR = FP / (TN + FP)
        for slot, target in enumerate(target_fprs):
            gap = abs(FPR - target)
            # Strict '<' keeps the lowest threshold among equally close ones.
            if gap < best_gaps[slot]:
                best_gaps[slot] = gap
                best_tprs[slot] = TPR
    return 0.4*best_tprs[0]+0.3*best_tprs[1]+0.3*best_tprs[2]


import numpy as np
from sklearn import metrics
import bisect


def get_tpr_from_fpr(fpr_array, tpr_array, target):
    """Return the TPR of the ROC point whose FPR is closest to ``target``.

    Parameters
    ----------
    fpr_array : ascending false-positive rates (as returned by ``roc_curve``).
    tpr_array : true-positive rates aligned with ``fpr_array``.
    target : the false-positive rate to look up; must be <= 0.01.

    Returns
    -------
    The mean TPR of all points whose FPR equals ``target`` exactly; otherwise
    the TPR of the nearer of the two points that bracket ``target`` (the
    lower one on a tie). If ``target`` lies outside the curve the nearest end
    point is used.

    Raises
    ------
    AssertionError
        If ``target`` is greater than 0.01.
    IndexError
        If the arrays are empty.
    """
    # Validate the precondition before doing any work.
    assert target <= 0.01, 'the value of fpr in the custom metric function need lt 0.01'

    exact_hits = np.where(fpr_array == target)
    if len(exact_hits[0]) > 0:
        return np.mean(tpr_array[exact_hits])

    right = bisect.bisect(fpr_array, target)
    # Target below the first point: without this guard index right-1 == -1
    # wrapped around and the TPR of the *last* point was returned.
    if right == 0:
        return tpr_array[0]
    # Target above the last point: fpr_array[right] would be out of range.
    if right == len(fpr_array):
        return tpr_array[-1]

    gap_below = target - fpr_array[right - 1]
    gap_above = fpr_array[right] - target
    nearest = right if gap_below > gap_above else right - 1
    return tpr_array[nearest]


def eval_metric(labels,pred):
    """Competition score: weighted TPR at FPR 0.001, 0.005 and 0.01.

    The ROC curve is computed with label 1 as the positive class, the TPR at
    each target FPR is looked up with ``get_tpr_from_fpr`` and the three
    values are combined with weights 0.4, 0.3 and 0.3.
    """
    roc_points = metrics.roc_curve(labels, pred, pos_label=1)
    fpr_points, tpr_points = roc_points[0], roc_points[1]
    tpr_at = [get_tpr_from_fpr(fpr_points, tpr_points, wanted_fpr)
              for wanted_fpr in (0.001, 0.005, 0.01)]
    return 0.4*tpr_at[0] + 0.3*tpr_at[1] + 0.3*tpr_at[2]

# 2. data preparation

## 2.1 read train and test data

In [4]:
# constant definition
# small_data_path = './data/small_size/sample_atec_anti_fraud_train.csv'
train_path = '../data/full_size/atec_anti_fraud_train.csv'
testb_path = '../data/full_size/atec_anti_fraud_test_b.csv'

# Both files are read with their first column as the row index.
# testb_path is reused here instead of repeating the literal path.
train_data = pd.read_csv(train_path, index_col=0)
testb_data = pd.read_csv(testb_path, index_col=0)

## 2.2 feature selection

In [None]:
# Candidate features are the test-B columns from position 1 onward; position 0
# is skipped and 'date' is seeded by hand below.
# NOTE(review): presumably column 0 is 'date' itself - confirm.
candidate_columns = testb_data.columns[1:]

# find no missing value features (judged on the training data only)
no_nan_features = ['date']
for name in candidate_columns:
    null_count = train_data[name].isnull().sum()
    if null_count == 0:
        no_nan_features.append(name)

# find small missing features: under 30% missing in train and a train/test
# missing-rate difference below 0.1
small_missing_features = []
train_rows = train_data.shape[0]
testb_rows = testb_data.shape[0]
for name in candidate_columns:
    train_missing_rate = train_data[name].isnull().sum() / train_rows
    test_missing_rate = testb_data[name].isnull().sum() / testb_rows
    rate_gap = abs(test_missing_rate - train_missing_rate)
    if 0 < train_missing_rate < 0.3 and rate_gap < 0.1:
        small_missing_features.append(name)

filldable_features = small_missing_features + no_nan_features
# feature selection
# Each score file is read without a header, with the feature name as the
# index and the score in the column labelled 1. The `top` best-scored names
# of every file are collected into:
#   all_important_features    - union over all files
#   common_important_features - intersection over all files
feature_score_files=['xgb_feature_scores.csv','lgb_feature_scores2.csv']
common_important_features=set()
all_important_features=set()
top=100
for file_position, file in enumerate(feature_score_files):
    score_table = pd.read_csv(file, index_col=0, header=None)
    ranked = score_table.sort_values(by=1, ascending=False)
    features = set(ranked.iloc[:top, 0].index.tolist())
    all_important_features = all_important_features | features
    # Seed the intersection from the first file only. Testing the set for
    # emptiness instead would re-seed it whenever the running intersection
    # became empty, silently discarding the earlier files.
    if file_position == 0:
        common_important_features = features
    else:
        common_important_features = common_important_features & features

## 2.3 fill missing values for fillable columns

In [None]:
# Mean imputation on the fillable columns; each frame is filled with its own
# column means.
train_fill_values = train_data[filldable_features].mean()
train_data[filldable_features] = train_data[filldable_features].fillna(train_fill_values)

testb_fill_values = testb_data[filldable_features].mean()
testb_data[filldable_features] = testb_data[filldable_features].fillna(testb_fill_values)

## 2.4 eliminate unlabeled data/convert -1 to 1

In [None]:
# convert -1 to 1: rows labelled -1 are treated as label 1, every other label
# is kept as is
raw_labels = train_data['label']
train_data['label'] = raw_labels.mask(raw_labels == -1, 1)
# order rows by date so that the positional split further down is chronological
train_data = train_data.sort_values(by=['date'])

In [None]:
from imblearn.over_sampling import SMOTE,ADASYN
def random_subsample(data,target_ratio):
    """Down-sample the label-0 rows to reach a given label-1 / label-0 ratio.

    Parameters
    ----------
    data : DataFrame with a 'label' column holding 0 and 1.
    target_ratio : desired (#label-1 rows) / (#label-0 rows); must be non-zero.

    Returns
    -------
    DataFrame with ``int(n_label1 / target_ratio)`` label-0 rows, drawn at
    random with replacement via ``np.random.randint``, followed by all
    label-1 rows.
    """
    majority_rows = data[data['label']==0]
    minority_rows = data[data['label']==1]
    target_majority_num = int(minority_rows.shape[0]/target_ratio)
    picks = np.random.randint(majority_rows.shape[0],size=target_majority_num)
    # Index into the label-0 subset. Indexing `data` here picked from the
    # first rows of the whole frame, so label-1 rows leaked into the sample.
    sampled_majority = majority_rows.iloc[picks,:]
    return pd.concat([sampled_majority,minority_rows])
    
def random_oversample(data,target_ratio):
    """Duplicate random label-1 rows to reach a given label-1 / label-0 ratio.

    Parameters
    ----------
    data : DataFrame with a 'label' column holding 0 and 1.
    target_ratio : desired (#label-1 rows) / (#label-0 rows).

    Returns
    -------
    DataFrame with all label-0 rows followed by the label-1 rows and enough
    randomly repeated label-1 rows (drawn with replacement) to total
    ``int(n_label0 * target_ratio)``. When the data already meets or exceeds
    the ratio no rows are added.
    """
    majority_rows = data[data['label']==0]
    minority_rows = data[data['label']==1]
    extra_needed = int(majority_rows.shape[0]*target_ratio)-minority_rows.shape[0]
    # A non-positive count means there is nothing to add; passing a negative
    # size to np.random.randint raises ValueError.
    if extra_needed > 0:
        picks = np.random.randint(minority_rows.shape[0],size=extra_needed)
        minority_rows = pd.concat([minority_rows,minority_rows.iloc[picks,:]])

    return pd.concat([majority_rows,minority_rows])
    
def mySMOTE(data):
    """Over-sample ``data`` with SMOTE and return features plus 'label'.

    Parameters
    ----------
    data : DataFrame with a 'label' column; all other columns are features.

    Returns
    -------
    DataFrame with the original feature column names and a 'label' column
    holding the resampled targets.
    """
    X = data.drop(columns=['label'])
    Y = data.label
    sampler = SMOTE()
    # imbalanced-learn renamed fit_sample to fit_resample; prefer the new name
    # and fall back to the old one on releases that only have fit_sample.
    resample = getattr(sampler, 'fit_resample', None)
    if resample is None:
        resample = sampler.fit_sample
    resampled_x,resampled_y = resample(X,Y)
    # The sampler may hand back a bare array, so restore the column names.
    # (The previous code also referenced a misspelled name, `resasmpled_x`.)
    resampled_data = pd.DataFrame(resampled_x, columns=X.columns)
    # Assign by position so an index on resampled_y cannot misalign the rows.
    resampled_data['label'] = np.asarray(resampled_y)
    return resampled_data
    
def myADASYN(data):
    """Over-sample ``data`` with ADASYN and return features plus 'label'.

    Parameters
    ----------
    data : DataFrame with a 'label' column; all other columns are features.

    Returns
    -------
    DataFrame with the original feature column names and a 'label' column
    holding the resampled targets.
    """
    X = data.drop(columns=['label'])
    Y = data.label
    sampler = ADASYN()
    # imbalanced-learn renamed fit_sample to fit_resample; prefer the new name
    # and fall back to the old one on releases that only have fit_sample.
    resample = getattr(sampler, 'fit_resample', None)
    if resample is None:
        resample = sampler.fit_sample
    resampled_x,resampled_y = resample(X,Y)
    # The sampler may hand back a bare array, so restore the column names.
    # (The previous code also referenced a misspelled name, `resasmpled_x`.)
    resampled_data = pd.DataFrame(resampled_x, columns=X.columns)
    # Assign by position so an index on resampled_y cannot misalign the rows.
    resampled_data['label'] = np.asarray(resampled_y)
    return resampled_data
    

## 2.6 train test split

In [None]:
# # strategy 1: only filldable
# train_data = train_data[['label']+filldable_features]

# strategy 2: filldable | common_important:158
selected_features = list(set(filldable_features) | common_important_features)
# NOTE(review): the next assignment replaces the strategy-2 list with every
# test-B column, so all columns are used as features - confirm this is intended.
selected_features = testb_data.columns.tolist()
label_and_features = ['label'] + selected_features
train_data = train_data[label_and_features]

# # strategy 3: filldable|all_important
# selected_features = list(set(filldable_features)|all_important_features)
# train_data = train_data[['label']+selected_features]


# Positional 80/20 split: the first 80% of rows train, the rest is held out.
train_num = int(0.8 * train_data.shape[0])
cut_train_data = train_data.iloc[:train_num]
test_data = train_data.iloc[train_num:]

# Work on a copy so resampling experiments never touch cut_train_data.
sampled_train_data = cut_train_data.copy()
# sampled_train_data = random_oversample(sampled_train_data,0.05)

train_x, train_y = sampled_train_data.drop(columns=['label']), sampled_train_data['label']
test_x, test_y = test_data.drop(columns=['label']), test_data['label']

In [None]:
# number of feature columns selected for modelling
len(selected_features)

# 3. Build LightGBM model

## 3.1 Randomized search for best parameters

In [None]:
# Candidate values sampled by the randomized search.
param = {'max_depth':[3,5,7,9],'num_leaves':[6,25,85,350], 'num_trees':[100,300,500,700,900]}
lgb_clf = lgb.LGBMClassifier(boosting_type= 'gbdt', objective='binary',colsample_bytree=0.8,
                   subsample= 0.8)
# random_state pins which parameter combinations are drawn, so re-running the
# cell explores the same candidates (0 matches the seed used elsewhere here).
lgb_random_cv = RandomizedSearchCV(lgb_clf,param,verbose=True,scoring='precision',random_state=0)
# The hold-out split is passed as eval_set and drives early stopping.
lgb_random_cv.fit(train_x,train_y,eval_metric='error',eval_set=[(test_x, test_y)],early_stopping_rounds=20)
lgb_random_cv.best_params_

## 3.2 train lightgbm based on best parameters with cross validation

In [None]:
from sklearn.model_selection import KFold
def kfold_by_date(data,k=5):
    """Yield ``k`` (train, validation) frame pairs split on whole dates.

    The distinct values of ``data['date']`` are divided into ``k`` shuffled
    folds (seed 0). Every row goes to the validation frame of the fold that
    holds its date and to the training frame of all other folds, so no date
    is shared between the two frames of a pair.
    """
    splitter = KFold(n_splits=k,random_state=0,shuffle=True)
    # value_counts() fixes the order of the distinct dates (most frequent
    # first); that order determines which dates land in which fold.
    distinct_dates = np.array(data['date'].value_counts().index.tolist())
    for fit_positions,holdout_positions in splitter.split(distinct_dates):
        in_fit = data['date'].isin(distinct_dates[fit_positions])
        in_holdout = data['date'].isin(distinct_dates[holdout_positions])
        yield data[in_fit],data[in_holdout]

### 3.2.1 normal k-fold cross validation

In [None]:
# train best parameter lgb_random_cv
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.base import clone

# Class counts of the training part; value_counts() is indexed by label value.
label_counts = sampled_train_data.label.value_counts()
neg_num = label_counts[0]
# Count of label 1. Reading index 0 here as well made neg_num/pos_num == 1,
# which disabled the class weighting below.
pos_num = label_counts[1]
# Template estimator: each fold fits its own clone of this configuration.
lgb_300n350le9d01l = lgb.LGBMClassifier(boosting_type= 'dart', objective='binary',colsample_bytree=0.6,
                   subsample= 0.6,max_depth=9,num_leaves=400,n_estimators=200,random_state = 0,scale_pos_weight=neg_num/pos_num)

kf = KFold(n_splits=5,random_state=0,shuffle=True)
lgb_300n350le9d01l_list = []   # one fitted model per fold
predict_probas = []            # hold-out probabilities, one array per fold
i=1
for train_index,test_index in kf.split(sampled_train_data):
    sub_train_x = train_x.iloc[train_index]
    sub_train_y = train_y.iloc[train_index]
    sub_test_x = train_x.iloc[test_index]
    sub_test_y = train_y.iloc[test_index]

    # training
    print('fitting model{}...'.format(i))
    # A fresh clone per fold: refitting and appending the same object left the
    # list holding several references to the last fitted model only.
    fold_model = clone(lgb_300n350le9d01l)
    fold_model.fit(sub_train_x,sub_train_y,eval_metric='error',eval_set=[(test_x, test_y)],early_stopping_rounds=100)
    print('model{} predicting...'.format(i))
    sub_predict_y = fold_model.predict(sub_test_x)
    sub_predict_y_proba = fold_model.predict_proba(sub_test_x)[:,1]
    print('sub scoring...')
    # eval_metric is ROC based, so it is given probabilities; precision and
    # recall keep using the hard labels.
    print('precision: {},recall:{},ant_score:{}'.format(metrics.precision_score(sub_test_y,sub_predict_y),
                                                           metrics.recall_score(sub_test_y,sub_predict_y),
                                                           eval_metric(sub_test_y,sub_predict_y_proba)))
    print('out scoring...')
    predict_y = fold_model.predict(test_x)
    predict_y_proba = fold_model.predict_proba(test_x)[:,1]
    print('precision: {}, recall: {}, ant_score: {}'.format(metrics.precision_score(test_y,predict_y),
                                                       metrics.recall_score(test_y,predict_y),
                                                       eval_metric(test_y,predict_y_proba)))
    predict_probas.append(predict_y_proba)
    lgb_300n350le9d01l_list.append(fold_model)
    i+=1

### 3.2.2 date-based kfold-validation

In [None]:
# train best parameter lgb_random_cv
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.base import clone

# Template estimator: each fold fits its own clone of this configuration.
lgb_300n350le9d01l = lgb.LGBMClassifier(boosting_type= 'dart', objective='binary',colsample_bytree=0.8,
                   subsample= 0.8,max_depth=10,num_leaves=800,n_estimators=100)

lgb_300n350le9d01l_list = []   # one fitted model per fold
predict_probas = []            # hold-out probabilities, one array per fold
i=1
# Folds are built from whole dates by kfold_by_date.
for sub_train_data,sub_test_data in kfold_by_date(sampled_train_data,k=5):
    sub_train_x = sub_train_data.drop(columns=['label'])
    sub_train_y = sub_train_data['label']
    sub_test_x = sub_test_data.drop(columns=['label'])
    sub_test_y = sub_test_data['label']
    # training
    print('fitting model{}...'.format(i))
    # A fresh clone per fold: refitting and appending the same object left the
    # list holding several references to the last fitted model only.
    fold_model = clone(lgb_300n350le9d01l)
    fold_model.fit(sub_train_x,sub_train_y,eval_metric='error',eval_set=[(test_x, test_y)],early_stopping_rounds=50)
    print('model{} predicting...'.format(i))
    sub_predict_y = fold_model.predict(sub_test_x)
    sub_predict_y_proba = fold_model.predict_proba(sub_test_x)[:,1]
    print('sub scoring...')
    # eval_metric is ROC based, so it is given probabilities; precision and
    # recall keep using the hard labels.
    print('precision: {},recall:{},ant_score:{}'.format(metrics.precision_score(sub_test_y,sub_predict_y),
                                                           metrics.recall_score(sub_test_y,sub_predict_y),
                                                           eval_metric(sub_test_y,sub_predict_y_proba)))
    print('out scoring...')
    predict_y = fold_model.predict(test_x)
    predict_y_proba = fold_model.predict_proba(test_x)[:,1]
    print('precision: {}, recall: {}, ant_score: {}'.format(metrics.precision_score(test_y,predict_y),
                                                       metrics.recall_score(test_y,predict_y),
                                                       eval_metric(test_y,predict_y_proba)))
    predict_probas.append(predict_y_proba)
    lgb_300n350le9d01l_list.append(fold_model)
    i+=1

In [None]:
# display the estimator and its parameters
lgb_300n350le9d01l

## 3.3 evaluation on test set

In [None]:
# Average the per-fold hold-out probabilities into one score per row (after
# .T each column is one fold). predict_probas itself is not rebound, so the
# cell gives the same result when it is run again; overwriting it with the
# transposed frame made a second run average over the wrong axis.
predict_scores = pd.DataFrame(predict_probas).T.mean(axis = 1).values
print('ant_score:{}'.format(eval_metric(test_y,predict_scores)))

In [None]:
# (rows, feature columns) of the training matrix
train_x.shape

## 3.4 prediction on testb set

In [None]:
# # predict on testb
# Re-read the raw test-B file so this cell always starts from untouched data.
testb_data = pd.read_csv('../data/full_size/atec_anti_fraud_test_b.csv',index_col = 0)

# predict
# Impute with the test-B column means, then keep only the model's feature columns.
testb_fill_values = testb_data[filldable_features].mean()
testb_data[filldable_features] = testb_data[filldable_features].fillna(testb_fill_values)
testb_data = testb_data[selected_features]
# testb_data['date'] = testb_data['date'].apply(lambda x:int(str(x)[6:]))
print('predicting on final outer testset.....')
# store score predicted by every cv model
scores = [model.predict_proba(testb_data)[:,1] for model in lgb_300n350le9d01l_list]

In [None]:
# One column per model after the transpose; the row-wise mean is the ensemble score.
stacked_scores = pd.DataFrame(scores).T
final_scores = stacked_scores.mean(axis=1).values
# quick sanity check of the score distribution
print(final_scores.mean(),final_scores.std())

In [None]:
print('writing result to file...')
# One 'score' column, indexed like the test-B frame.
submission_path = '../submission/testb_lgb_500n400le9d01l_fillnan_sf158_normalcv5_original_date_scale_pos_weight.csv'
final_result = pd.DataFrame({'score':final_scores},index=testb_data.index)
final_result.to_csv(submission_path)