In [165]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score, roc_auc_score
import xgboost as xgb
import seaborn as sns
from utils import *

ModuleNotFoundError: No module named 'tensorflow'

### 静态信息

In [2]:
user = pd.read_csv("../../data/账户静态信息.csv")
print(user.shape)
user.head()

(6000, 5)


Unnamed: 0,zhdh,khrq,khjgdh,xb,nl
0,DDF394282B1E1508,2018-04-13,577BCC91,1,25
1,CAE68290A37CC77D,2016-04-02,34ED066D,1,27
2,41E4A8AECE47E5F3,2014-09-28,30BB3825,1,44
3,163C42F2A3FD518E,2010-06-11,34ED066D,1,55
4,6FBFEB03252FDB9F,2015-08-20,D64A340B,0,44


In [13]:
def build_static_feats(df : pd.DataFrame):
    def label_encode(series):
        unique = list(series.unique())
        return series.map(dict(zip(
            unique, range(series.nunique()))))

    df = df.set_index("zhdh")
    df['khrq']  = pd.to_datetime(df['khrq'], format='%Y-%m-%d')
    df['year']  = df['khrq'].dt.year
    df['month'] = df['khrq'].dt.month
    df['day']   = df['khrq'].dt.day
    
    for col in ['khjgdh']:
        df[col] = label_encode(df[col])

    date = df.pop("khrq")
    
    return df

### 交易信息

In [4]:
trade= pd.read_csv("../../data/账户交易信息.csv")
print(trade.shape)
trade.head()

(816270, 12)


Unnamed: 0,jylsxh,zhdh,dfzh,jdbj,jyje,zhye,dfhh,jyrq,jysj,jyqd,zydh,dfmccd
0,5D252156AE9F6B6595A1C56F56D4F91C,86C379D938234BAA,14BEFED1370B730A,0,310.0,57806.83,834E1F06,2020-03-01,00:18:06,E96ED478,4E0CB6FB,45
1,8BB3D82CA8E5F95577CA3E2DF432DF64,8EB373F073727157,FD7F11B33576339B,1,599.99,7099.73,B3D461D4,2020-03-01,00:18:17,621461AF,A3C65C29,6
2,412B7E903BC06882EEB9FB6A484D0773,997DED969A377D40,014F2782648E7FDA,1,4000.0,34448.04,A71C76B8,2020-03-01,00:18:30,621461AF,A3C65C29,6
3,F1122F893AC75DC8751190C67E1C3DB6,8EB373F073727157,129FAF9FD9D03346,1,299.98,7399.71,8A1BC467,2020-03-01,00:19:06,621461AF,A3C65C29,6
4,8BD9575EA55E67D4E99AC43B2A444172,8EB373F073727157,3B9CD92F13274EBA,1,999.96,8399.67,A8DA3378,2020-03-01,00:19:17,091D584F,2618045A,6


In [5]:
def build_dynamic_feats(df : pd.DataFrame):
    # 用户维度单独聚合
    feats = pd.DataFrame() # init features

    # 用户和其他1个维度交叉聚合
    feats = aggregate_trade(df,feats)
    feats = aggregate_trade_people(df,feats)
    feats = aggregate_trade_bank(df,feats)
    feats = aggregate_trade_days(df,feats)
    feats = aggregate_trade_channel(df,feats)
    feats = aggregate_trade_summary(df,feats)
    feats = aggregate_trade_period(df,feats,freq=3600*1)

    # 用户和其他 2 个维度交叉聚合
    feats = aggregate_trade_label_money(df,feats) # jdbj & jyje


    feats.fillna(value=-1,inplace=True)

    return feats

In [162]:
def cross_val_predict(X_train, y_train, X_test, folds : int=5):
    spliter = KFold(n_splits=folds)
    oof_pred = np.zeros_like(y_train,dtype=np.float64)
    y_test = np.zeros(X_test.shape[0])
    cv_scores = []
    for i, (train_idx, valid_idx) in enumerate(spliter.split(X_train,y_train)):
        print("Training Fold %d"%(i+1))
        train_data = xgb.DMatrix(X_train.iloc[train_idx],label=y_train[train_idx])
        valid_data = xgb.DMatrix(X_train.iloc[valid_idx],label=y_train[valid_idx])
        test_data = xgb.DMatrix(X_test)

        # 创建模型
        params = {
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'min_child_weight': 1.5,
            'max_depth': 6,
            'lambda': 10,
            'subsample': 0.7,
            'colsample_bytree': 0.7,
            'colsample_bylevel': 0.7,
            'eta': 0.3,
            'tree_method': 'exact',
        }
        model = xgb.train(
            params, train_data,
            evals=[(train_data,"train"),(valid_data,"valid")],
            num_boost_round=5000,
            verbose_eval=100, 
            early_stopping_rounds=200
        )

        oof_pred[valid_idx] = model.predict(valid_data,ntree_limit=model.best_ntree_limit)
        y_test += model.predict(test_data,ntree_limit=model.best_ntree_limit) / folds
        cv_scores.append(roc_auc_score(y_train[valid_idx],oof_pred[valid_idx]))
        print("CV score: %.4f"%(cv_scores[-1]))
    
    return oof_pred, y_test, cv_scores

In [114]:
static = build_static_feats(user)
dynamic = build_dynamic_feats(trade)

In [146]:
X_train, y_train, X_test = merge_feats(static,dynamic)

In [150]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train,y_train,test_size=0.3,random_state=42)
cols = X_train.columns
model = GradientBoostingClassifier(
    n_estimators=300,learning_rate=0.02,max_depth=3)
model.fit(X_train[cols],y_train)
output = evaluation_model(
    model,X_train,y_train,X_valid,y_valid,
    cols=cols,verbose=True)
cols = output[2].index[0:30]

f1 score on train: 0.9931
f1 score on valid: 0.8077


In [163]:
oof, y_test, cv_scores = cross_val_predict(X_train[cols],y_train,X_test[cols])

Training Fold 1
[0]	train-auc:0.94690	valid-auc:0.86281
[100]	train-auc:1.00000	valid-auc:0.90293
[200]	train-auc:1.00000	valid-auc:0.90603
[225]	train-auc:1.00000	valid-auc:0.90603
CV score: 0.9087
Training Fold 2
[0]	train-auc:0.91003	valid-auc:0.92517
[100]	train-auc:0.99988	valid-auc:0.96405
[200]	train-auc:1.00000	valid-auc:0.96112
[252]	train-auc:1.00000	valid-auc:0.96028
CV score: 0.9649
Training Fold 3
[0]	train-auc:0.91785	valid-auc:0.81264
[100]	train-auc:0.99983	valid-auc:0.96628
[200]	train-auc:1.00000	valid-auc:0.96169
[278]	train-auc:1.00000	valid-auc:0.95939
CV score: 0.9678
Training Fold 4
[0]	train-auc:0.90882	valid-auc:0.80901
[100]	train-auc:0.99992	valid-auc:0.98323
[200]	train-auc:1.00000	valid-auc:0.98225
[247]	train-auc:1.00000	valid-auc:0.98225
CV score: 0.9849
Training Fold 5
[0]	train-auc:0.90473	valid-auc:0.88663
[100]	train-auc:0.99976	valid-auc:0.98683
[200]	train-auc:1.00000	valid-auc:0.98354
[240]	train-auc:1.00000	valid-auc:0.98395
CV score: 0.9885


In [164]:
scores, thresholds = [], []
best_score, best_threshold = 0, 0

for threshold in np.arange(0.3,0.7,0.01):
    preds = (oof > threshold).astype(int)
    f1 = f1_score(y_train, preds)   
    scores.append(f1)
    thresholds.append(threshold)
    if f1 > best_score:
        best_score = f1
        best_threshold = threshold
print("oof best score: %.4f"%(best_score))

oof best score: 0.8367


In [135]:
submit = pd.Series(index=X_test.index,name="black_flag",
                   data=(y_test > best_threshold).astype(int))
submit.to_csv("../../data/test_submit.csv")

In [143]:
y_pred = model.predict(X_test)
submit = pd.Series(index=X_test.index,name="black_flag",
                   data=y_pred)
submit.to_csv("../../data/test_submit.csv")