In [1]:
import gc
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## 1. 读取数据

In [2]:
path = "./data/"
train_file = "gbdt_data_2015_2019_train.csv"
test_file = "gbdt_data_2015_2019_test.csv"

trainDf = pd.read_csv(path + train_file)

In [4]:
pos_trainDf = trainDf[trainDf['target'] == 1]
pos_trainDf.shape

(21694, 59)

In [5]:
neg_trainDf = trainDf[trainDf['target'] == 0]
neg_trainDf.shape

(573518, 59)

In [6]:
neg_trainDf = trainDf[trainDf['target'] == 0].sample(n=20000, random_state=2018)
trainDf = pd.concat([pos_trainDf, neg_trainDf], axis=0).sample(frac=1.0, random_state=2018)
del pos_trainDf; del neg_trainDf; gc.collect();

In [7]:
print(trainDf.shape, trainDf['target'].mean())

(41694, 59) 0.5203146735741354


In [8]:
trainDf, testDf, _, _ = train_test_split(trainDf, trainDf['target'], test_size=0.25, random_state=2018)

print(trainDf['target'].mean(), trainDf.shape)
print(testDf['target'].mean(), testDf.shape)

0.5215542053086025 (31270, 59)
0.5165963161933999 (10424, 59)


In [9]:
"""
一共59个特征，包括id， target
bin特征17个;cat特征14个;连续特征26个;
"""

columns = trainDf.columns.tolist()
bin_feats = []
cat_feats = []
con_feats = []
for col in  columns:
    if 'bin' in col:
        bin_feats.append(col)
        continue
    if 'cat' in col:
        cat_feats.append(col)
        continue
    if 'id' != col and 'target' != col:
        con_feats.append(col)
        
print(len(bin_feats), bin_feats)
print(len(cat_feats), cat_feats)
print(len(con_feats), con_feats)

17 ['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin']
14 ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat']
26 ['ps_ind_01', 'ps_ind_03', 'ps_ind_14', 'ps_ind_15', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03', 'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14']


## 2. 特征处理

In [10]:
trainDf = trainDf.fillna(0)
testDf = testDf.fillna(0)

train_sz = trainDf.shape[0]
combineDf = pd.concat([trainDf, testDf], axis=0)
del trainDf
del testDf
gc.collect()

31

### 2.1 连续特征全部归一化

In [11]:
from sklearn.preprocessing import MinMaxScaler
for col in con_feats:
    scaler = MinMaxScaler()
    combineDf[col] = scaler.fit_transform(np.array(combineDf[col].values.tolist()).reshape(-1,1))




### 2.2 离散特征one-hot

In [12]:
for col in bin_feats + cat_feats:
    onehotret = pd.get_dummies(combineDf[col], prefix=col)
    combineDf = pd.concat([combineDf, onehotret], axis=1)

In [13]:
combineDf.columns.tolist()

['id',
 'target',
 'ps_ind_01',
 'ps_ind_02_cat',
 'ps_ind_03',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_ind_06_bin',
 'ps_ind_07_bin',
 'ps_ind_08_bin',
 'ps_ind_09_bin',
 'ps_ind_10_bin',
 'ps_ind_11_bin',
 'ps_ind_12_bin',
 'ps_ind_13_bin',
 'ps_ind_14',
 'ps_ind_15',
 'ps_ind_16_bin',
 'ps_ind_17_bin',
 'ps_ind_18_bin',
 'ps_reg_01',
 'ps_reg_02',
 'ps_reg_03',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat',
 'ps_car_11',
 'ps_car_12',
 'ps_car_13',
 'ps_car_14',
 'ps_car_15',
 'ps_calc_01',
 'ps_calc_02',
 'ps_calc_03',
 'ps_calc_04',
 'ps_calc_05',
 'ps_calc_06',
 'ps_calc_07',
 'ps_calc_08',
 'ps_calc_09',
 'ps_calc_10',
 'ps_calc_11',
 'ps_calc_12',
 'ps_calc_13',
 'ps_calc_14',
 'ps_calc_15_bin',
 'ps_calc_16_bin',
 'ps_calc_17_bin',
 'ps_calc_18_bin',
 'ps_calc_19_bin',
 'ps_calc_20_bin',
 'ps_ind_06_bin_0',
 'ps_ind_06_bin_1',


In [14]:
combineDf.shape

(41694, 276)

## 3. 训练模型

In [15]:
label = 'target'
onehot_feats = [col for col in combineDf.columns if col not in ['id', 'target'] + con_feats + cat_feats + bin_feats]


In [16]:
np.shape(onehot_feats)

(217,)

In [17]:
train = combineDf[:train_sz]
test = combineDf[train_sz:]
print("Train.shape: {0}, Test.shape: {0}".format(train.shape, test.shape))
del combineDf
gc.collect()

Train.shape: (31270, 276), Test.shape: (31270, 276)


21

### 3.1 LR模型

In [18]:
lr_feats = con_feats + onehot_feats
lr = LogisticRegression(penalty='l2', C=1)
lr.fit(train[lr_feats], train[label].values)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
from sklearn.metrics import roc_auc_score,accuracy_score

def do_model_metric(y_true, y_pred, y_pred_prob):
    print("Predict 1 percent: {0}".format(np.mean(y_pred)))
    print("Label 1 percent: {0}".format(train[label].mean()))
    print("AUC: {0:.3}".format(roc_auc_score(y_true=y_true, y_score=y_pred_prob[:,1])))
    print("Accuracy: {0}".format(accuracy_score(y_true=y_true, y_pred=y_pred)))

In [20]:
print("Train............")
do_model_metric(y_true=train[label], y_pred=lr.predict(train[lr_feats]), y_pred_prob=lr.predict_proba(train[lr_feats]))


Train............
Predict 1 percent: 0.5394627438439399
Label 1 percent: 0.5215542053086025
AUC: 0.637
Accuracy: 0.594755356571794


In [21]:
print("\n\n")
print("Test.............")
do_model_metric(y_true=test[label], y_pred=lr.predict(test[lr_feats]), y_pred_prob=lr.predict_proba(test[lr_feats]))





Test.............
Predict 1 percent: 0.5304105909439755
Label 1 percent: 0.5215542053086025
AUC: 0.629
Accuracy: 0.5909439754412893


## 3.2 GBDT

In [22]:
lgb_feats = con_feats + cat_feats + bin_feats
categorical_feature_list = cat_feats + bin_feats


In [23]:
np.shape(lgb_feats)

(57,)

In [24]:
np.shape(categorical_feature_list)

(31,)

In [25]:
import lightgbm as lgb


lgb_params ={
    'objective':'binary',
    'boosting_type': 'gbdt',
    'metric':'auc',
    'learning_rate': 0.01,
    'num_leaves': 5,
    'max_depth': 4,
    'min_data_in_leaf': 100,
    'bagging_fraction': 0.8,
    'feature_fraction':0.8,
    'bagging_freq':10,
    'lambda_l1':0.2,
    'lambda_l2':0.2,
    'scale_pos_weight':1,
}


In [26]:
lgbtrain = lgb.Dataset(train[lgb_feats].values, label=train[label].values,
                          feature_name=lgb_feats,
                          categorical_feature=categorical_feature_list
                          )
lgbvalid = lgb.Dataset(test[lgb_feats].values, label=test[label].values,
                          feature_name=lgb_feats,
                          categorical_feature=categorical_feature_list
                          )

In [27]:
evals_results = {}
print('train.............')
lgb_model = lgb.train(lgb_params,
                 lgbtrain,
                 valid_sets=lgbvalid,
                 evals_result=evals_results,
                 num_boost_round=1000,
                 early_stopping_rounds=60,
                 verbose_eval=50,
                 categorical_feature=categorical_feature_list,
                 )

train.............
Training until validation scores don't improve for 60 rounds.




[50]	valid_0's auc: 0.607672
[100]	valid_0's auc: 0.614587
[150]	valid_0's auc: 0.619941
[200]	valid_0's auc: 0.621069
[250]	valid_0's auc: 0.622795
[300]	valid_0's auc: 0.62422
[350]	valid_0's auc: 0.625249
[400]	valid_0's auc: 0.625787
[450]	valid_0's auc: 0.626351
[500]	valid_0's auc: 0.626982
[550]	valid_0's auc: 0.627196
[600]	valid_0's auc: 0.62759
[650]	valid_0's auc: 0.627911
[700]	valid_0's auc: 0.627982
[750]	valid_0's auc: 0.628257
[800]	valid_0's auc: 0.628453
[850]	valid_0's auc: 0.628526
Early stopping, best iteration is:
[818]	valid_0's auc: 0.628596


### 3.3 LR + GBDT

In [28]:
train_sz = train.shape[0]
combineDf = pd.concat([train, test], axis=0, ignore_index=True)

In [29]:
#得到叶节点编号 Feature Transformation
gbdt_feats_vals = lgb_model.predict(combineDf[lgb_feats], pred_leaf=True)
gbdt_feats_vals

array([[3, 4, 4, ..., 2, 2, 2],
       [4, 0, 0, ..., 2, 2, 2],
       [4, 0, 0, ..., 0, 0, 0],
       ...,
       [3, 4, 4, ..., 4, 4, 2],
       [0, 0, 0, ..., 3, 3, 2],
       [2, 3, 3, ..., 2, 2, 3]])

In [30]:
# 因为是818轮结束的LightGBM训练，所以LightGBM一共有818棵树，对应每行样本有818个索引值（一个索引对应一棵树的叶子节点索引）

np.shape(gbdt_feats_vals)


(41694, 818)

In [31]:
gbdt_feats_vals[:10,:10]

array([[3, 4, 4, 2, 1, 3, 4, 1, 1, 2],
       [4, 0, 0, 0, 0, 0, 0, 0, 0, 4],
       [4, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [4, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [2, 3, 3, 4, 3, 4, 3, 4, 4, 3],
       [4, 0, 0, 0, 0, 0, 0, 3, 3, 3],
       [1, 1, 1, 3, 4, 1, 4, 1, 1, 4],
       [3, 4, 4, 3, 1, 3, 4, 1, 1, 3],
       [1, 1, 1, 4, 4, 1, 4, 1, 1, 3],
       [2, 3, 3, 4, 3, 0, 3, 3, 3, 4]])

In [32]:
gbdt_columns = ["gbdt_leaf_indices_" + str(i) for i in range(0, gbdt_feats_vals.shape[1])]

In [34]:
gbdt_columns

['gbdt_leaf_indices_0',
 'gbdt_leaf_indices_1',
 'gbdt_leaf_indices_2',
 'gbdt_leaf_indices_3',
 'gbdt_leaf_indices_4',
 'gbdt_leaf_indices_5',
 'gbdt_leaf_indices_6',
 'gbdt_leaf_indices_7',
 'gbdt_leaf_indices_8',
 'gbdt_leaf_indices_9',
 'gbdt_leaf_indices_10',
 'gbdt_leaf_indices_11',
 'gbdt_leaf_indices_12',
 'gbdt_leaf_indices_13',
 'gbdt_leaf_indices_14',
 'gbdt_leaf_indices_15',
 'gbdt_leaf_indices_16',
 'gbdt_leaf_indices_17',
 'gbdt_leaf_indices_18',
 'gbdt_leaf_indices_19',
 'gbdt_leaf_indices_20',
 'gbdt_leaf_indices_21',
 'gbdt_leaf_indices_22',
 'gbdt_leaf_indices_23',
 'gbdt_leaf_indices_24',
 'gbdt_leaf_indices_25',
 'gbdt_leaf_indices_26',
 'gbdt_leaf_indices_27',
 'gbdt_leaf_indices_28',
 'gbdt_leaf_indices_29',
 'gbdt_leaf_indices_30',
 'gbdt_leaf_indices_31',
 'gbdt_leaf_indices_32',
 'gbdt_leaf_indices_33',
 'gbdt_leaf_indices_34',
 'gbdt_leaf_indices_35',
 'gbdt_leaf_indices_36',
 'gbdt_leaf_indices_37',
 'gbdt_leaf_indices_38',
 'gbdt_leaf_indices_39',
 'gbdt_lea

In [35]:
combineDf = pd.concat([combineDf, pd.DataFrame(data=gbdt_feats_vals, index=range(0, gbdt_feats_vals.shape[0]),columns=gbdt_columns)], axis=1)


In [36]:
"""
因为是采用GBDT+LR方案：将GBDT的输出节点index做出LR的输入，必须将LR的节点index进行one-hot编码；
如果是采用GBDT+FM方案：则无需将GBDT的输出index进行编码，而是需要按照FM的形式进行组织！
"""

# onehotencoder(gbdt_feats)
origin_columns = combineDf.columns
for col in gbdt_columns:
    combineDf = pd.concat([combineDf, pd.get_dummies(combineDf[col], prefix=col)],axis=1)
gbdt_onehot_feats = [col for col in combineDf.columns if col not in origin_columns]


In [37]:
# 恢复train, test
train = combineDf[:train_sz]
test = combineDf[train_sz:]
del combineDf; gc.collect();

In [38]:
"""
这里是将在原特征（缩放后的连续特征 + one-hot编码后的（bin特征以及类别特征））与
利用GBDT生成的特征进行联立，其中# lr_feats = lr_feats = con_feats + onehot_feats
"""


lr_gbdt_feats = lr_feats + gbdt_onehot_feats

In [39]:
lr_gbdt_model = LogisticRegression(penalty='l2', C=1)
lr_gbdt_model.fit(train[lr_gbdt_feats], train[label])

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [40]:
print("Train................")
do_model_metric(y_true=train[label], y_pred=lr_gbdt_model.predict(train[lr_gbdt_feats]), y_pred_prob=lr_gbdt_model.predict_proba(train[lr_gbdt_feats]))

print("Test..................")
do_model_metric(y_true=test[label], y_pred=lr_gbdt_model.predict(test[lr_gbdt_feats]), y_pred_prob=lr_gbdt_model.predict_proba(test[lr_gbdt_feats]))


Train................
Predict 1 percent: 0.5304445155100735
Label 1 percent: 0.5215542053086025
AUC: 0.739
Accuracy: 0.6719539494723377
Test..................
Predict 1 percent: 0.5292594013814275
Label 1 percent: 0.5215542053086025
AUC: 0.589
Accuracy: 0.5635072908672295
