# 构建模型&预测

In [1]:
import pandas as pd

# 加载已经完成特征工程的训练集和测试集
# 特征工程见feature_engineering.ipynb
split_train_data_feature = pd.read_csv("/home/brian/Projects/Repurchase/trainset_final.csv")
split_test_data_feature = pd.read_csv("/home/brian/Projects/Repurchase/testset_final.csv")
split_test_data = pd.read_csv('/home/brian/Projects/Repurchase/data_format1/test_format1.csv')

## 一、调参确定最佳参数

Note：调参时将加载的训练集拆分为train, validation两部分。而当确定参数后将完整训练集用来训练已获得更强的拟合能力，具体见二。

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split

# 准备训练集与测试集
label = split_train_data_feature['label']
data = split_train_data_feature.drop(columns=['user_id','merchant_id','label'],axis=1)

# 训练集和测试集切分,确保生成随机数种子，并且训练和测试的样本分配尽量均衡
train_data, validation_data, train_label, validation_label = train_test_split(data,label, test_size = 0.2,random_state = 10)
print(train_data.shape)
print(train_label.shape)
print(validation_data.shape)
print(validation_label.shape)
# 观察训练和测试集的两个数据分布是否均等
print(np.sum(train_label==1)/len(train_label))
print(np.sum((validation_label)==1)/len(validation_label))

(221452, 72)
(221452,)
(55364, 72)
(55364,)
0.1148330112168777
0.11693519254389134


In [3]:
# 构建模型并训练，根据验证集指标调参
"""
参数意义：
min_child_weight：子节点的权重阈值。它刻画的是：对于一个叶子节点，当对它采取划分之后，它的所有子节点的权重之和的阈值。值越大算法越保守。
gamma： 也称作最小划分损失min_split_loss。 它刻画的是：对于一个叶子节点，当对它采取划分之后，损失函数的降低值的阈值。值越大算法越保守。
subsample： 对训练样本的采样比例。取值范围为 (0,1]。小的值有助于缓解过拟合。
colsample_bytree： 构建子树时，对特征的采样比例。取值范围为 (0,1]。小的值有助于缓解过拟合。
reg_lambda： L2 正则化系数（基于weights的正则化），默认为 1。 该值越大则模型越简单。
reg_alpha： L1 正则化系数（基于weights的正则化），默认为 0。 该值越大则模型越简单。
scale_pos_weight：=负样本/正样本，正负样本偏差较大时调整该值，使得预测label为正(=1)数量接近真实情况。
"""
import xgboost as xgb

# XGBOOST算法做分类器
xgb_predict_model = xgb.XGBClassifier(
    eta = 0.1,
    n_estimators = 200,
    max_depth = 5, # default=6
    min_child_weight = 300, # default=1
    gamma = 10.3, # default=0
    subsample = 0.8, # default=1
    colsample_bytree = 0.75, # default=1
    reg_alpha = 1.2,
    reg_lambda = 8.16,
    eval_metric = 'auc',
    scale_pos_weight = 4.5
)
xgb_predict_model.fit(
    train_data, 
    train_label, 
    eval_set=[(train_data, train_label),(validation_data, validation_label)],
    verbose = True,
    early_stopping_rounds = 40,
)



[0]	validation_0-auc:0.63932	validation_1-auc:0.63031
[1]	validation_0-auc:0.64923	validation_1-auc:0.63960
[2]	validation_0-auc:0.65980	validation_1-auc:0.64868
[3]	validation_0-auc:0.66377	validation_1-auc:0.65092
[4]	validation_0-auc:0.66600	validation_1-auc:0.65274
[5]	validation_0-auc:0.66713	validation_1-auc:0.65331
[6]	validation_0-auc:0.66774	validation_1-auc:0.65409
[7]	validation_0-auc:0.66887	validation_1-auc:0.65440
[8]	validation_0-auc:0.66950	validation_1-auc:0.65498
[9]	validation_0-auc:0.67192	validation_1-auc:0.65675
[10]	validation_0-auc:0.67311	validation_1-auc:0.65787
[11]	validation_0-auc:0.67439	validation_1-auc:0.65891
[12]	validation_0-auc:0.67569	validation_1-auc:0.66006
[13]	validation_0-auc:0.67746	validation_1-auc:0.66172
[14]	validation_0-auc:0.67821	validation_1-auc:0.66250
[15]	validation_0-auc:0.67968	validation_1-auc:0.66365
[16]	validation_0-auc:0.68033	validation_1-auc:0.66393
[17]	validation_0-auc:0.68197	validation_1-auc:0.66524
[18]	validation_0-au

[149]	validation_0-auc:0.73464	validation_1-auc:0.69930
[150]	validation_0-auc:0.73494	validation_1-auc:0.69934
[151]	validation_0-auc:0.73531	validation_1-auc:0.69956
[152]	validation_0-auc:0.73555	validation_1-auc:0.69970
[153]	validation_0-auc:0.73585	validation_1-auc:0.69977
[154]	validation_0-auc:0.73625	validation_1-auc:0.69999
[155]	validation_0-auc:0.73674	validation_1-auc:0.70025
[156]	validation_0-auc:0.73699	validation_1-auc:0.70038
[157]	validation_0-auc:0.73706	validation_1-auc:0.70034
[158]	validation_0-auc:0.73761	validation_1-auc:0.70051
[159]	validation_0-auc:0.73769	validation_1-auc:0.70059
[160]	validation_0-auc:0.73789	validation_1-auc:0.70075
[161]	validation_0-auc:0.73831	validation_1-auc:0.70099
[162]	validation_0-auc:0.73861	validation_1-auc:0.70118
[163]	validation_0-auc:0.73878	validation_1-auc:0.70132
[164]	validation_0-auc:0.73892	validation_1-auc:0.70142
[165]	validation_0-auc:0.73935	validation_1-auc:0.70176
[166]	validation_0-auc:0.73944	validation_1-auc:

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.75, eta=0.1,
              eval_metric='auc', gamma=10.3, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.100000001,
              max_delta_step=0, max_depth=5, min_child_weight=300, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=56,
              num_parallel_tree=1, random_state=0, reg_alpha=1.2,
              reg_lambda=8.16, scale_pos_weight=4.5, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

## 二、Recycle挑选特征，强化模型

在已确定近似最佳参数的情况下，本节使用全部训练集的数据。本节使用了**recycle技巧**，先将训练集全部放到最佳模型中训练，由第一次训练后的模型获得所有特征的重要性(feature_importance)，然后挑选重要性最高的50个特征(原来共有72维特征, 选50维是经过查看重要性大小和反复多次实验确定的)，根据这个**循环反馈**，再次使用这50维特征再次建立模型，此模型除subsample由0.8增大到0.9外其余参数完全一致。

In [4]:
# 使用全部训练数据，不再拆分train、validation
train_label = split_train_data_feature['label']
train_data = split_train_data_feature.drop(columns=['user_id','merchant_id','label'],axis=1)

In [5]:
# 第一次建模训练，获取最佳参数下各个特征的重要性，为下一步挑选做准备
xgb_predict_model = xgb.XGBClassifier(
    eta = 0.1,
    n_estimators = 200,
    max_depth = 5, # default=6
    min_child_weight = 300, # default=1
    gamma = 10.3, # default=0
    subsample = 0.8, # default=1
    colsample_bytree = 0.75, # default=1
    reg_alpha = 1.2,
    reg_lambda = 8.16,
    eval_metric = 'auc',
    scale_pos_weight = 4.5
)
xgb_predict_model.fit(
    train_data, 
    train_label, 
    eval_set=[(train_data, train_label)],
    verbose = True,
    early_stopping_rounds = 40,
)

[0]	validation_0-auc:0.63937
[1]	validation_0-auc:0.65001
[2]	validation_0-auc:0.65760
[3]	validation_0-auc:0.66165
[4]	validation_0-auc:0.66445
[5]	validation_0-auc:0.66515
[6]	validation_0-auc:0.66552
[7]	validation_0-auc:0.66717
[8]	validation_0-auc:0.66885
[9]	validation_0-auc:0.67083
[10]	validation_0-auc:0.67260
[11]	validation_0-auc:0.67346
[12]	validation_0-auc:0.67531
[13]	validation_0-auc:0.67693
[14]	validation_0-auc:0.67784
[15]	validation_0-auc:0.67891
[16]	validation_0-auc:0.68001
[17]	validation_0-auc:0.68111
[18]	validation_0-auc:0.68208
[19]	validation_0-auc:0.68317
[20]	validation_0-auc:0.68390
[21]	validation_0-auc:0.68465
[22]	validation_0-auc:0.68588
[23]	validation_0-auc:0.68685
[24]	validation_0-auc:0.68741
[25]	validation_0-auc:0.68801
[26]	validation_0-auc:0.68887
[27]	validation_0-auc:0.68972
[28]	validation_0-auc:0.69040
[29]	validation_0-auc:0.69161
[30]	validation_0-auc:0.69221
[31]	validation_0-auc:0.69343
[32]	validation_0-auc:0.69427
[33]	validation_0-au

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.75, eta=0.1,
              eval_metric='auc', gamma=10.3, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.100000001,
              max_delta_step=0, max_depth=5, min_child_weight=300, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=56,
              num_parallel_tree=1, random_state=0, reg_alpha=1.2,
              reg_lambda=8.16, scale_pos_weight=4.5, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [7]:
# 获取特征重要性矩阵
importance_df = pd.DataFrame(split_train_data_feature.columns.drop(['user_id', 'merchant_id', 'label']),
                             columns=['feature'])
importance_df['importance'] = xgb_predict_model.feature_importances_
importance_df.sort_values(by='importance', ascending=False, inplace=True)
# importance_df.to_csv('feature_importance.csv', index=False)
high_imp_f = importance_df['feature'][:50] # top 50

Note: 第二次建模将subsample(训练采样率)由0.8增大到0.9，大量对比实验表明这样可以提升拟合效果。**筛掉低重要性特征并增大训练采样率共同提升效果的可解释性：筛掉低重要性特征可以消除其中带有的噪声信号，如此小幅增大训练采样率并不会使模型过拟合，反而会过滤掉噪声并收敛更快。**

Note: 通过recyle技巧，算法提交结果有0.6862提升到0.6876，由排行榜101名提升到91名，提升明显。

In [8]:
# 第二次建模训练，使用高重要性特征，生成本算法最终模型

# 挑选高重要性特征
train_data = split_train_data_feature.drop(columns=['user_id','merchant_id','label'],axis=1)[high_imp_f]

xgb_predict_model = xgb.XGBClassifier(
    eta = 0.1,
    n_estimators = 200,
    max_depth = 5,
    min_child_weight = 300,
    gamma = 10.3,
    subsample = 0.9, # 滤除噪声后，训练采样率由0.8增大到0.9
    colsample_bytree = 0.75,
    reg_alpha = 1.2,
    reg_lambda = 8.16,
    eval_metric = 'auc',
    scale_pos_weight = 4.5
)
xgb_predict_model.fit(
    train_data, 
    train_label, 
    eval_set=[(train_data, train_label)],
    verbose = True,
    early_stopping_rounds = 40,
)

[0]	validation_0-auc:0.64300
[1]	validation_0-auc:0.65035
[2]	validation_0-auc:0.65429
[3]	validation_0-auc:0.65592
[4]	validation_0-auc:0.65710
[5]	validation_0-auc:0.66196
[6]	validation_0-auc:0.66278
[7]	validation_0-auc:0.66580
[8]	validation_0-auc:0.66797
[9]	validation_0-auc:0.66954
[10]	validation_0-auc:0.67127
[11]	validation_0-auc:0.67237
[12]	validation_0-auc:0.67321
[13]	validation_0-auc:0.67468
[14]	validation_0-auc:0.67667
[15]	validation_0-auc:0.67833
[16]	validation_0-auc:0.67944
[17]	validation_0-auc:0.68039
[18]	validation_0-auc:0.68138
[19]	validation_0-auc:0.68211
[20]	validation_0-auc:0.68321
[21]	validation_0-auc:0.68368
[22]	validation_0-auc:0.68446
[23]	validation_0-auc:0.68507
[24]	validation_0-auc:0.68629
[25]	validation_0-auc:0.68720
[26]	validation_0-auc:0.68798
[27]	validation_0-auc:0.68866
[28]	validation_0-auc:0.68961
[29]	validation_0-auc:0.69053
[30]	validation_0-auc:0.69163
[31]	validation_0-auc:0.69254
[32]	validation_0-auc:0.69327
[33]	validation_0-au

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.75, eta=0.1,
              eval_metric='auc', gamma=10.3, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.100000001,
              max_delta_step=0, max_depth=5, min_child_weight=300, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=56,
              num_parallel_tree=1, random_state=0, reg_alpha=1.2,
              reg_lambda=8.16, scale_pos_weight=4.5, subsample=0.9,
              tree_method='exact', validate_parameters=1, verbosity=None)

## 三、预测，保存

In [9]:
# 放到模型中进行预测，生成结果
# 预测
test_predict_prob = xgb_predict_model.predict_proba(split_test_data_feature[high_imp_f])
# 整合结合
submission_df = split_test_data.copy().drop(columns=['prob'])
submission_df['xgb_boost_prob'] = test_predict_prob[:,1]
submission_df.rename(columns={'xgb_boost_prob':'prob'},inplace=True)
submission_df.head(10)

Unnamed: 0,user_id,merchant_id,prob
0,163968,4605,0.357426
1,360576,1581,0.551531
2,98688,1964,0.382418
3,98688,3645,0.220104
4,295296,3361,0.34057
5,33408,98,0.156705
6,230016,1742,0.457133
7,164736,598,0.281444
8,164736,1963,0.097014
9,164736,2634,0.325702


In [10]:
# 观察预测结果统计分布，给效果一个instinct
submission_df['prob'].min(), submission_df['prob'].max(), submission_df['prob'].mean(), submission_df['prob'].sum(), (submission_df['prob']>=0.5).sum()

(0.05096617341041565, 0.9223620295524597, 0.3290819823741913, 86047.23, 32241)

In [11]:
# 写入本地文件保存
submission_df.to_csv(path_or_buf='./repurchase_submission.csv', index=False)