In [1]:
# import packages
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn import metrics
import matplotlib.pyplot as plt

In [2]:
def Normalization(x):
    """Min-max scale the values of ``x`` into the range [0, 1].

    Parameters
    ----------
    x : iterable of numbers
        Values to rescale (list, 1-D array, Series, ...).

    Returns
    -------
    list
        ``(v - min) / (max - min)`` for every ``v`` in ``x``, in input order.
        An empty input yields an empty list. If all values are equal the
        range is zero and every element is mapped to ``0.0`` instead of
        dividing by zero.
    """
    values = list(x)
    if not values:
        return []

    min_val = min(values)
    span = max(values) - min_val
    if span == 0:
        # Constant input: the scaled value is undefined, use 0.0 for all.
        return [0.0 for _ in values]

    return [(v - min_val) / span for v in values]

In [3]:
def ROC_curve(y, prediction):
    """Compute ROC curve points from binary labels and prediction scores.

    Samples are ranked by score from highest to lowest. Point ``i`` of the
    curve counts the positives / negatives among the samples ranked strictly
    before position ``i``, so the curve starts at (0, 0) and has exactly one
    point per sample.

    Parameters
    ----------
    y : array-like
        Ground-truth labels; entries equal to 1 count as positive and
        entries equal to 0 count as negative. Any shape, flattened internally.
    prediction : array-like
        Predicted scores, one per label. Any shape, flattened internally.

    Returns
    -------
    (tpr, fpr) : tuple of list
        True-positive rates and false-positive rates, in that order.
        If there are no positives (or no negatives) the corresponding
        rates are NaN because of the division by zero.
    """
    scores = np.asarray(prediction).reshape(-1)
    labels = np.asarray(y).reshape(-1)
    n_samples = scores.shape[0]

    pos = np.sum(labels == 1)
    neg = np.sum(labels == 0)

    # Rank samples by descending score. The reversal must act on the 1-D
    # index vector; reversing a (1, n) array along axis 0 is a no-op and
    # would leave the order ascending.
    order = np.argsort(scores)[::-1]
    labels_sorted = labels[order]

    # Exclusive prefix counts: element i holds the number of positives /
    # negatives among the first i ranked samples.
    tp_before = np.concatenate(([0], np.cumsum(labels_sorted == 1)))[:n_samples]
    fp_before = np.concatenate(([0], np.cumsum(labels_sorted == 0)))[:n_samples]

    tpr = (tp_before / pos).tolist()
    fpr = (fp_before / neg).tolist()
    return tpr, fpr

In [4]:
def weight_lost_cal(real, pre):
    """Print the AUC and return a weighted TPR score for the predictions.

    The score is ``0.4 * TPR@0.001 + 0.3 * TPR@0.005 + 0.3 * TPR@0.01``,
    where ``TPR@t`` is the true-positive rate at the first ROC point whose
    false-positive rate is greater than or equal to ``t``.

    Parameters
    ----------
    real : array-like
        Ground-truth binary labels (1 is the positive class).
    pre : array-like
        Predicted scores for the positive class.

    Returns
    -------
    float
        The weighted TPR score. The AUC is only printed, not returned.
    """
    test_auc = metrics.roc_auc_score(real, pre)
    fpr, tpr, _ = metrics.roc_curve(real, pre, pos_label=1)

    def tpr_at(fpr_floor):
        # First ROC point whose FPR reaches the requested level.
        first_idx = np.where(fpr >= fpr_floor)[0][0]
        return tpr[first_idx]

    score = 0.4 * tpr_at(0.001) + 0.3 * tpr_at(0.005) + 0.3 * tpr_at(0.01)

    print("AUC value = " + str(test_auc))
    print("Score = " + str(score))
    return score
    
#     # auc
#     test_auc = metrics.roc_auc_score(real, pre)  # 验证集上的auc值
    

#     # print(tpr)
#     # print(fpr)
#     fpr, tpr = ROC_curve(real, pre)
#     # plt.plot(fpr,tpr)

#     tmp = [np.abs(ite - 0.001) for ite in fpr]
#     tpr1 = tpr[tmp.index(min(tmp))]

#     tmp = [np.abs(ite - 0.005) for ite in fpr]
#     tpr2 = tpr[tmp.index(min(tmp))]

#     tmp = [np.abs(ite - 0.01) for ite in fpr]
#     tpr3 = tpr[tmp.index(min(tmp))]
#     res_val = 0.4 * tpr1 + 0.3 * tpr2 + 0.3 * tpr3
#     print("AUC value = " + str(test_auc))
#     print("TRP value = " + str(res_val))
#     print("------------------------- ")
#     return res_val

In [5]:
# Load the raw training and test tables from the local data directory.
root = "./data/"
data_train, data_test = (
    pd.read_csv(root + csv_name)
    for csv_name in ("atec_anti_fraud_train.csv", "atec_anti_fraud_test_a.csv")
)
# Optional extra training data (disabled):
# data_train_add = pd.read_csv(root + "atec_anti_fraud_train_added.csv")


In [6]:
# data preprocessing
# Optional sub-sampling / removal of rows labelled -1 (both disabled):
# data_train = data_train.sample(100000)
# data_train = data_train[data_train['label'] != -1]

# `id` and `date` are not used as model features.
# Use the `columns=` keyword: passing the axis positionally (`drop(labels, 1)`)
# was deprecated in pandas 1.3 and raises a TypeError in pandas 2.x.
data_test = data_test.drop(columns=['id', 'date'])
data_train = data_train.drop(columns=['id', 'date'])
# data_train = pd.concat([data_train,data_train_add], sort=True)

# np.abs folds label -1 into 1, so rows labelled -1 are trained on as positives.
data_train['label'] = np.abs(data_train['label'])
train_y = data_train['label']
data_train = data_train.drop(columns=['label'])

# Feature columns to remove from both the training and the test frame.
b = ['20','22','24','26','28','30','32','34','46','47','48','50','52','53']
c = [str(i) for i in range(64,72)]
d = [str(i) for i in range(111,155)]

drop_col = ['f'+i for i in (b+c+d)]

data_test = data_test.drop(columns=drop_col)
data_train = data_train.drop(columns=drop_col)

In [7]:
# Convert the preprocessed frames and labels to plain NumPy arrays.
train_x = np.array(data_train)
test_x = np.array(data_test)
train_y = np.array(train_y).reshape(-1, 1)  # labels as a single column

# Short aliases used by the cross-validation cell.
X, Y, X_test = train_x, train_y, test_x

In [8]:
# train
# k-fold cross-validation: the validation score and the test-set predictions
# are averaged over the k folds.
k = 5
kf = KFold(n_splits = k, shuffle=True, random_state=2)
# kf = KFold(n_splits=k)

eta = 0.05
result = 0       # running sum of per-fold validation scores
xgb_result = 0   # running sum of per-fold test-set predictions
importances = 0

# Fold-independent objects are built once instead of once per fold.
data_rtest = xgb.DMatrix(X_test)
param = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',  # binary classification
    'max_depth': 7,  # tree depth; larger values overfit more easily
    'lambda':0.1,  # L2 regularisation weight; larger values make overfitting less likely
    'gamma':0.1,
    'subsample':0.8,
    'max_leaves':63,
    'colsample_bytree':0.8,
    'seed' : 1,
    'scale_pos_weight':1,
    # 'min_child_weight':4,
    # Defaults to 1. It is the minimum sum of h (second-order gradient) required in
    # each leaf. For imbalanced 0-1 classification, if h is around 0.01, a value of 1
    # means a leaf must contain at least 100 samples.
    # This parameter strongly affects the result; the smaller it is, the easier it is
    # to overfit.
    'silent': 0,  # 1 suppresses run-time messages; 0 keeps them
    'eta': eta,  # acts like a learning rate
    'eval_metric': 'auc'
}

for train_index, test_index in kf.split(X):
    X1_train, X1_test = X[train_index], X[test_index]
    Y1_train, Y1_test = Y[train_index], Y[test_index]

    # Named `fold_train` rather than `data_train` so the preprocessed
    # DataFrame of that name is not overwritten and earlier cells stay re-runnable.
    fold_train = xgb.DMatrix(X1_train, label=Y1_train)
    data_vtest = xgb.DMatrix(X1_test, label=Y1_test)

    bst = xgb.train(param, fold_train, num_boost_round=100)

#     print("TRAIN")
#     xgb_tra = bst.predict(fold_train)  # training fold
#     weight_lost_cal(Y1_train.reshape(-1, 1), xgb_tra.reshape(-1, 1))

    print("TEST：")
    xgb_val = bst.predict(data_vtest)  # held-out validation fold

    xgb_result += bst.predict(data_rtest)  # test set
#     xgb_result += 1-cur_result  # test set

    result += weight_lost_cal(Y1_test.reshape(-1, 1), xgb_val.reshape(-1, 1))

    print("done...")

# Average the accumulated score and predictions over the folds.
result /= k
xgb_result /= k


[09:58:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 222 extra nodes, 0 pruned nodes, max_depth=7
[09:58:03] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 192 extra nodes, 0 pruned nodes, max_depth=7
[09:58:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 194 extra nodes, 2 pruned nodes, max_depth=7
[09:58:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 206 extra nodes, 0 pruned nodes, max_depth=7
[09:58:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 188 extra nodes, 0 pruned nodes, max_depth=7
[09:58:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 194 extra nodes, 0 pruned nodes, max_depth=7
[09:58:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 206 extra nodes, 0 pruned nodes, max_depth=7
[09:58:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 210 extra nodes, 0 pruned nodes, max_depth=7
[09:58:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 214 extra nodes, 0 pruned no

[10:01:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 226 extra nodes, 2 pruned nodes, max_depth=7
[10:01:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 218 extra nodes, 0 pruned nodes, max_depth=7
[10:01:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 222 extra nodes, 0 pruned nodes, max_depth=7
[10:01:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 216 extra nodes, 0 pruned nodes, max_depth=7
[10:01:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 224 extra nodes, 0 pruned nodes, max_depth=7
[10:01:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 196 extra nodes, 0 pruned nodes, max_depth=7
[10:01:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 230 extra nodes, 0 pruned nodes, max_depth=7
[10:01:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 226 extra nodes, 2 pruned nodes, max_depth=7
[10:01:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 222 extra nodes, 0 pruned no

[10:04:50] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 200 extra nodes, 0 pruned nodes, max_depth=7
[10:04:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 208 extra nodes, 2 pruned nodes, max_depth=7
[10:04:56] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 238 extra nodes, 0 pruned nodes, max_depth=7
[10:04:59] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 226 extra nodes, 0 pruned nodes, max_depth=7
[10:05:02] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 246 extra nodes, 0 pruned nodes, max_depth=7
[10:05:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 236 extra nodes, 0 pruned nodes, max_depth=7
[10:05:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 224 extra nodes, 0 pruned nodes, max_depth=7
[10:05:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 212 extra nodes, 4 pruned nodes, max_depth=7
[10:05:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 222 extra nodes, 0 pruned no

[10:08:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 200 extra nodes, 2 pruned nodes, max_depth=7
[10:08:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 196 extra nodes, 0 pruned nodes, max_depth=7
[10:08:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 208 extra nodes, 0 pruned nodes, max_depth=7
[10:08:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 234 extra nodes, 0 pruned nodes, max_depth=7
[10:08:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 218 extra nodes, 0 pruned nodes, max_depth=7
[10:08:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 220 extra nodes, 2 pruned nodes, max_depth=7
[10:08:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 232 extra nodes, 0 pruned nodes, max_depth=7
[10:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 230 extra nodes, 0 pruned nodes, max_depth=7
[10:08:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 218 extra nodes, 0 pruned no

[10:11:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 216 extra nodes, 0 pruned nodes, max_depth=7
[10:11:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 200 extra nodes, 2 pruned nodes, max_depth=7
[10:11:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 228 extra nodes, 0 pruned nodes, max_depth=7
[10:11:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 212 extra nodes, 0 pruned nodes, max_depth=7
[10:11:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 222 extra nodes, 4 pruned nodes, max_depth=7
[10:11:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 224 extra nodes, 0 pruned nodes, max_depth=7
[10:11:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 218 extra nodes, 0 pruned nodes, max_depth=7
[10:11:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 232 extra nodes, 0 pruned nodes, max_depth=7
[10:11:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 238 extra nodes, 0 pruned no

[10:14:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 236 extra nodes, 0 pruned nodes, max_depth=7
[10:14:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 220 extra nodes, 0 pruned nodes, max_depth=7
[10:14:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 222 extra nodes, 0 pruned nodes, max_depth=7
[10:14:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 232 extra nodes, 0 pruned nodes, max_depth=7
[10:14:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 224 extra nodes, 0 pruned nodes, max_depth=7
[10:14:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 234 extra nodes, 0 pruned nodes, max_depth=7
[10:14:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 214 extra nodes, 2 pruned nodes, max_depth=7
[10:14:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 224 extra nodes, 0 pruned nodes, max_depth=7
[10:14:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 206 extra nodes, 0 pruned no

[10:17:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 238 extra nodes, 0 pruned nodes, max_depth=7
[10:17:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 234 extra nodes, 0 pruned nodes, max_depth=7
[10:17:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 230 extra nodes, 2 pruned nodes, max_depth=7
[10:17:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 238 extra nodes, 0 pruned nodes, max_depth=7
[10:17:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 230 extra nodes, 0 pruned nodes, max_depth=7
[10:17:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 192 extra nodes, 0 pruned nodes, max_depth=7
[10:17:45] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 228 extra nodes, 0 pruned nodes, max_depth=7
[10:17:48] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 232 extra nodes, 2 pruned nodes, max_depth=7
[10:17:50] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 228 extra nodes, 0 pruned no

In [9]:
#     print("TRAIN")
#     xgb_tra = bst.predict(data_train)  # 训练集
#     weight_lost_cal(Y1_train.reshape(-1, 1), xgb_tra.reshape(-1, 1))
    
#     print("TEST：")
#     xgb_val = bst.predict(data_vtest)  # 交叉验证集
#     weight_lost_cal(Y1_test.reshape(-1, 1), xgb_val.reshape(-1, 1))

In [10]:
# Report the fold-averaged validation score, then the largest and smallest
# averaged test-set prediction, one value per line.
# real_result = max_abs_scaler_y.inverse_transform(result.reshape(-1,1))
# print(xgb_result)
print(result, max(xgb_result), min(xgb_result), sep="\n")
# Optional min-max normalisation of the predictions (disabled):
# xgb_result_scaled = Normalization(xgb_result)

0.45807025920021793
0.83065623
0.0036241114


In [11]:
# Output: write one averaged score per test `id` to the submission CSV.
# The test CSV is read again because the `id` column was dropped from
# `data_test` during preprocessing.
data_test = pd.read_csv(root + "atec_anti_fraud_test_a.csv")
data_test['score'] = xgb_result
# Series.round returns a new Series; assign it back so the rounding takes effect.
data_test['score'] = data_test['score'].round(decimals=5)
data = data_test[['id', 'score']]  # keep only the two submission columns
data.to_csv(root + "result_717_1000_V3.csv", index=None, encoding='utf8')