In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import seaborn as sns
from utils import *

### 静态信息

In [2]:
# Load the static account table and preview it: print its (rows, columns)
# shape, then show the first rows as the cell output.
data = pd.read_csv("../data/账户静态信息.csv")
print(data.shape)
data.head()

(6000, 5)


Unnamed: 0,zhdh,khrq,khjgdh,xb,nl
0,DDF394282B1E1508,2018-04-13,577BCC91,1,25
1,CAE68290A37CC77D,2016-04-02,34ED066D,1,27
2,41E4A8AECE47E5F3,2014-09-28,30BB3825,1,44
3,163C42F2A3FD518E,2010-06-11,34ED066D,1,55
4,6FBFEB03252FDB9F,2015-08-20,D64A340B,0,44


### 交易信息

In [3]:
# Load the transaction table and preview it the same way.
# Note: this rebinds `data`, so from here on `data` is the transaction table,
# not the static account table loaded earlier.
data = pd.read_csv("../data/账户交易信息.csv")
print(data.shape)
data.head()

(816270, 12)


Unnamed: 0,jylsxh,zhdh,dfzh,jdbj,jyje,zhye,dfhh,jyrq,jysj,jyqd,zydh,dfmccd
0,5D252156AE9F6B6595A1C56F56D4F91C,86C379D938234BAA,14BEFED1370B730A,0,310.0,57806.83,834E1F06,2020-03-01,00:18:06,E96ED478,4E0CB6FB,45
1,8BB3D82CA8E5F95577CA3E2DF432DF64,8EB373F073727157,FD7F11B33576339B,1,599.99,7099.73,B3D461D4,2020-03-01,00:18:17,621461AF,A3C65C29,6
2,412B7E903BC06882EEB9FB6A484D0773,997DED969A377D40,014F2782648E7FDA,1,4000.0,34448.04,A71C76B8,2020-03-01,00:18:30,621461AF,A3C65C29,6
3,F1122F893AC75DC8751190C67E1C3DB6,8EB373F073727157,129FAF9FD9D03346,1,299.98,7399.71,8A1BC467,2020-03-01,00:19:06,621461AF,A3C65C29,6
4,8BD9575EA55E67D4E99AC43B2A444172,8EB373F073727157,3B9CD92F13274EBA,1,999.96,8399.67,A8DA3378,2020-03-01,00:19:17,091D584F,2618045A,6


In [49]:
def aggregate_trade(data: pd.DataFrame, feats: pd.DataFrame):
    """Add basic per-account transaction summaries to ``feats``.

    Parameters
    ----------
    data : pd.DataFrame
        Transaction rows; the columns ``zhdh`` (grouping key), ``jdbj`` and
        ``dfzh`` are read.
    feats : pd.DataFrame
        Feature table to extend. It is modified in place and also returned.

    Returns
    -------
    pd.DataFrame
        ``feats`` with the columns ``trade_cnts``, ``income_ratio`` and
        ``trade_people_cnts`` assigned, each keyed by ``zhdh``.
    """
    flag_by_account = data.groupby("zhdh")["jdbj"]
    n_trades = flag_by_account.count()

    # Total number of transactions per account.
    feats["trade_cnts"] = n_trades
    # Sum of the jdbj flag over the transaction count; the original author
    # labels this the incoming-transaction ratio.
    feats["income_ratio"] = flag_by_account.sum() / n_trades
    # Number of distinct counterparties (dfzh) per account.
    feats["trade_people_cnts"] = data.groupby("zhdh")["dfzh"].nunique()

    return feats

In [50]:
def aggregate_trade_people(data: pd.DataFrame, feats: pd.DataFrame):
    """Add per-account statistics about counterparties to ``feats``.

    Transactions are first counted per (``zhdh``, ``dfzh``) pair; those
    per-pair counts are then reduced per ``zhdh``.

    Parameters
    ----------
    data : pd.DataFrame
        Transaction rows; the columns ``zhdh``, ``dfzh`` and ``jdbj`` are read.
    feats : pd.DataFrame
        Feature table to extend. It is modified in place and also returned.

    Returns
    -------
    pd.DataFrame
        ``feats`` with four counterparty columns assigned.
    """
    # Number of transactions for every (account, counterparty) pair.
    pair_counts = data.groupby(["zhdh", "dfzh"])["jdbj"].count()
    per_account = pair_counts.groupby("zhdh")

    # (output column, reducer) in assignment order. The "pelple" spelling is
    # kept as-is because these strings are the resulting column names.
    reducers = (
        ("trade_people_cnts", "count"),     # how many counterparties
        ("trade_pelple_cnts_max", "max"),   # most transactions with one counterparty
        ("trade_pelple_cnts_avg", "mean"),  # mean transactions per counterparty
        ("trade_pelple_cnts_std", "std"),   # spread of transactions per counterparty
    )
    for column, how in reducers:
        feats[column] = per_account.agg(how)

    return feats

def aggregate_trade_bank(data: pd.DataFrame, feats: pd.DataFrame):
    """Add per-account statistics about counterparty banks to ``feats``.

    Transactions are first counted per (``zhdh``, ``dfhh``) pair; those
    per-pair counts are then reduced per ``zhdh``.

    Parameters
    ----------
    data : pd.DataFrame
        Transaction rows; the columns ``zhdh``, ``dfhh`` and ``jdbj`` are read.
    feats : pd.DataFrame
        Feature table to extend. It is modified in place and also returned.

    Returns
    -------
    pd.DataFrame
        ``feats`` with four bank-related columns assigned.
    """
    # Number of transactions for every (account, bank) pair.
    per_bank = data.groupby(["zhdh", "dfhh"])["jdbj"].count()
    by_account = per_bank.groupby("zhdh")

    column_to_reducer = {
        "trade_bank_cnts": "count",     # how many banks were involved
        "trade_bank_cnts_max": "max",   # most transactions with one bank
        "trade_bank_cnts_avg": "mean",  # mean transactions per bank
        "trade_bank_cnts_std": "std",   # spread of transactions per bank
    }
    for column, reducer in column_to_reducer.items():
        feats[column] = by_account.agg(reducer)

    return feats

def aggregate_trade_days(data: pd.DataFrame, feats: pd.DataFrame):
    """Add per-account statistics about daily transaction activity to ``feats``.

    Transactions are first counted per (``zhdh``, ``jyrq``) pair; those daily
    counts are then reduced per ``zhdh``.

    Parameters
    ----------
    data : pd.DataFrame
        Transaction rows; the columns ``zhdh``, ``jyrq`` and ``jdbj`` are read.
    feats : pd.DataFrame
        Feature table to extend. It is modified in place and also returned.

    Returns
    -------
    pd.DataFrame
        ``feats`` with four daily-activity columns assigned.
    """
    # Transactions per account per date.
    daily_counts = data.groupby(["zhdh", "jyrq"])["jdbj"].count()
    grouped_days = daily_counts.groupby("zhdh")

    # Number of dates on which the account had transactions.
    feats["trade_days_cnts"] = grouped_days.agg("count")
    # Largest number of transactions on a single date.
    feats["trade_day_cnts_max"] = grouped_days.agg("max")
    # Mean number of transactions per active date.
    feats["trade_day_cnts_avg"] = grouped_days.agg("mean")
    # Spread of the daily transaction counts.
    feats["trade_day_cnts_std"] = grouped_days.agg("std")

    return feats

In [51]:
def build_stats_feats(df: pd.DataFrame):
    """Build the per-account aggregate feature table from transaction rows.

    Runs the four aggregation steps in order on an initially empty frame and
    replaces every missing value in the result with -1.

    Parameters
    ----------
    df : pd.DataFrame
        Transaction rows passed unchanged to each aggregation step.

    Returns
    -------
    pd.DataFrame
        The combined feature table with NaN replaced by -1.
    """
    pipeline = (
        aggregate_trade,
        aggregate_trade_people,
        aggregate_trade_bank,
        aggregate_trade_days,
    )

    feats = pd.DataFrame()  # start with no features
    for step in pipeline:
        feats = step(df, feats)

    # -1 marks values the aggregations left undefined (NaN).
    return feats.fillna(-1)

In [52]:
def merge_feats(static_feats : pd.DataFrame, dynamic_feats : pd.DataFrame,
                train_path: str = "../data/训练集标签.csv",
                test_path: str = "../data/test_dataset.csv"):
    """Join the label/test account lists with the static and dynamic features.

    Each CSV is read with its first column as the index and then merged with
    ``static_feats`` and ``dynamic_feats`` on the index (pandas' default
    inner merge, so accounts missing from either feature table are dropped).

    Parameters
    ----------
    static_feats : pd.DataFrame
        Static account features, indexed by the same key as the CSVs.
    dynamic_feats : pd.DataFrame
        Aggregated transaction features, indexed by the same key.
    train_path : str, optional
        CSV with the training accounts; must contain a ``black_flag`` column.
        Defaults to the path originally hard-coded here.
    test_path : str, optional
        CSV with the test accounts. Defaults to the path originally
        hard-coded here.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test)`` where ``y_train`` is the ``black_flag``
        column removed from the merged training frame.

    Raises
    ------
    KeyError
        If the training CSV has no ``black_flag`` column.
    """
    def _load_with_feats(path):
        # Read one account list and attach both feature tables by index.
        frame = pd.read_csv(path, index_col=0)
        for table in (static_feats, dynamic_feats):
            frame = frame.merge(right=table, left_index=True, right_index=True)
        return frame

    # Training set: split the label column off the merged frame.
    X_train = _load_with_feats(train_path)
    y_train = X_train.pop("black_flag")

    # Test set: features only.
    X_test = _load_with_feats(test_path)

    return X_train, y_train, X_test

In [53]:
# Reload the static table with its first CSV column as the index so that
# merge_feats can join it on the index, read the raw transaction table, and
# build the per-account aggregate features from it.
static = pd.read_csv("../data/账户静态信息.csv",index_col=0)
df = pd.read_csv("../data/账户交易信息.csv")
dynamic = build_stats_feats(df)

In [63]:
# Assemble the labelled and test feature matrices, then hold out 30% of the
# labelled rows for validation (seeded with random_state=42).
X_train, y_train, X_test = merge_feats(static,dynamic)
X_train, X_valid, y_train, y_valid = train_test_split(X_train,y_train,test_size=0.3,random_state=42)
# Model inputs: every column except the first two.
# NOTE(review): presumably this skips the non-numeric static fields
# khrq/khjgdh — confirm against the column order of the merged frame.
cols = X_train.columns[2:]

In [64]:
# Gradient-boosting baseline on the selected feature columns.
# random_state is pinned (same seed as the train/validation split) so that a
# fresh run of the notebook reproduces the fitted model.
model = GradientBoostingClassifier(n_estimators=100,learning_rate=0.05,max_depth=3,random_state=42)
model.fit(X_train[cols],y_train)
# evaluation_model is presumably provided by `from utils import *`; it is
# called here to report scores on the train and validation splits.
output = evaluation_model(model,X_train,y_train,X_valid,y_valid,cols=cols,verbose=True)

f1 score on train: 0.9062
f1 score on valid: 0.8539


In [65]:
# Random-forest baseline on the selected feature columns.
# A random forest draws bootstrap samples and feature subsets at random, so
# random_state is pinned (same seed as the train/validation split) to make the
# reported scores reproducible across runs.
model = RandomForestClassifier(n_estimators=100,max_depth=9,random_state=42)
model.fit(X_train[cols],y_train)
# evaluation_model is presumably provided by `from utils import *`; it is
# called here to report scores on the train and validation splits.
output = evaluation_model(model,X_train,y_train,X_valid,y_valid,cols=cols,verbose=True)

f1 score on train: 0.9490
f1 score on valid: 0.8492


In [39]:
# Count the non-null values of every column for each (zhdh, dfhh) pair.
# NOTE(review): this rebinds `df`, which another cell uses for the raw
# transaction table, and its execution count (39) is lower than that of the
# cells above it — confirm it still belongs in a top-to-bottom run.
df = data.groupby(by=["zhdh","dfhh"]).count()

In [40]:
# Display the grouped counts as the cell output.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,jylsxh,dfzh,jdbj,jyje,zhye,jyrq,jysj,jyqd,zydh,dfmccd
zhdh,dfhh,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
00037295453A928A,1D03DB92,5,5,5,5,5,5,5,5,5,5
00037295453A928A,26C2E7EF,1,1,1,1,1,1,1,1,1,1
00037295453A928A,28E7286D,2,2,2,2,2,2,2,2,2,2
00037295453A928A,32931AA4,3,3,3,3,3,3,3,3,3,3
00037295453A928A,3B604737,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...
FFEC4454D961DEB1,9807F0FF,4,4,4,4,4,4,4,4,4,4
FFEC4454D961DEB1,98CB458A,1,1,1,1,1,1,1,1,1,1
FFEC4454D961DEB1,C89C9E84,1,1,1,1,1,1,1,1,1,1
FFEC4454D961DEB1,CAD56B44,1,1,1,1,1,1,1,1,1,1
