# Stacking

## 先说结论，该数据集（fetch_covtype）Stacking的方法比线性加权更好
比赛中我们常用线性加权作为最终的融合方式，自然也会好奇怎样的权重组合更好，下文会给出寻找最优权重的例子。
参考：https://github.com/rushter/heamy/tree/master/examples

通过对训练集进行五折验证，将验证结果作为第二层的训练和测试集合
<img src="assets/stacking.jpg" width="50%">

In [1]:
pip install heamy  # 安装相关包

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting heamy
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/20/32/2f3e1efa38a8e34f790d90b6d49ef06ab812181ae896c50e89b8750fa5a0/heamy-0.0.7.tar.gz (30 kB)
Building wheels for collected packages: heamy
  Building wheel for heamy (setup.py): started
  Building wheel for heamy (setup.py): finished with status 'done'
  Created wheel for heamy: filename=heamy-0.0.7-py2.py3-none-any.whl size=15353 sha256=e3ba65b34e2bdee3b90b45b637e28836afdbdb0c9547f76b36fe10d17f8aba8f
  Stored in directory: c:\users\administrator\appdata\local\pip\cache\wheels\6e\f1\7d\048e558da94f495a0ed0d9c09d312e73eb176a092e36774ec2
Successfully built heamy
Installing collected packages: heamy
Successfully installed heamy-0.0.7
Note: you may need to restart the kernel to use updated packages.


In [7]:
import sys
print(sys.version)  # Python interpreter version used to run this notebook

3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]


In [61]:
import numpy as np
import time

from heamy.dataset import Dataset
from heamy.estimator import Classifier 
from heamy.pipeline import ModelsPipeline
# 导入相关模型，没有的pip install xxx 即可
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb 
import lightgbm as lgb 

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import log_loss

## 准备数据集

In [15]:
from sklearn.datasets import fetch_covtype
# Load the covertype dataset; the result is indexed with 'data' and 'target' in the preprocessing cell.
data = fetch_covtype()

In [47]:
# Preprocessing
X, y = data['data'], data['target']
# The models expect class labels that start at 0, so the original labels
# (1..7) are re-encoded to 0..6.
print('七分类任务，处理前：',np.unique(y))
print(y)
# Named `encoder` (not `ord`) so the built-in ord() is not shadowed.
encoder = OrdinalEncoder()
# The encoder is fed a column vector; keep its 2-D result in `y_enc` so the
# next line can flatten it (the original read `y_enc` without ever assigning it).
y_enc = encoder.fit_transform(y.reshape(-1, 1))
y = y_enc.reshape(-1, )
print('七分类任务，处理后：',np.unique(y))
print(y)

七分类任务，处理前： [1 2 3 4 5 6 7]
[5 5 2 ... 3 3 3]
七分类任务，处理后： [0. 1. 2. 3. 4. 5. 6.]
[4. 4. 1. ... 2. 2. 2.]


In [48]:
# Split into training and test sets (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
for split_part in (X_train, X_test):
    print(split_part.shape)

(435759, 54)
(145253, 54)


In [49]:
# Build the heamy dataset.
# From the heamy docs -- use_cache : bool, default True
#     If use_cache=True then preprocessing step will be cached until function code is changed.
# y_test is deliberately None: the test labels are never handed to the
# pipeline, so they cannot leak into training.
dataset = Dataset(X_train=X_train, y_train=y_train, X_test=X_test, y_test=None, use_cache=True)
print(dataset)

Dataset(5c3ccfb5c81451d098565ef5e7e36ac5)


In [50]:
# Training features as held by the heamy Dataset (displayed as the cell output)
dataset.X_train

array([[2.833e+03, 2.580e+02, 2.600e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [3.008e+03, 4.500e+01, 2.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.949e+03, 0.000e+00, 1.100e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [3.153e+03, 2.870e+02, 1.700e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [3.065e+03, 3.480e+02, 2.100e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [3.021e+03, 2.600e+01, 1.600e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

## 定义模型

In [40]:
def xgb_model(X_train, y_train, X_test, y_test):
    """Train an XGBoost booster and return its predictions for ``X_test``.

    The signature must be exactly (X_train, y_train, X_test, y_test) and the
    return value must be the prediction for ``X_test``. ``y_test`` is not
    used. Hyper-parameters are fixed inside the function.
    """
    booster_params = {
        'objective': 'multi:softprob',
        "eval_metric": 'mlogloss',
        "verbosity": 0,
        'num_class': 7,
        'nthread': -1,
    }
    train_matrix = xgb.DMatrix(X_train, y_train)
    booster = xgb.train(booster_params, train_matrix, num_boost_round=300)
    return booster.predict(xgb.DMatrix(X_test))


def lgb_model(X_train, y_train, X_test, y_test, **parameters):
    """Train a LightGBM booster and return its predictions for ``X_test``.

    Unlike ``xgb_model``, the hyper-parameters are left open: any keyword
    arguments are passed to ``lgb.train`` as ``params``. ``y_test`` is not
    used.
    """
    # `**parameters` always arrives as a dict (empty when nothing is passed),
    # so the former `is None` guard could never fire and has been removed.
    lgb_train = lgb.Dataset(X_train, y_train)
    model = lgb.train(params=parameters, train_set=lgb_train, num_boost_round=300)
    return model.predict(X_test)


def rf_model(X_train, y_train, X_test, y_test):
    """Fit a 100-tree random forest and return ``predict_proba`` for ``X_test``.

    ``y_test`` is not used.
    """
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    forest.fit(X_train, y_train)
    return forest.predict_proba(X_test)

## 构建和训练模型

In [52]:
# LightGBM settings handed to the 'lgb' Classifier via `parameters=`
# (presumably forwarded to lgb_model as keyword arguments -- confirm in heamy).
params = dict(
    objective="multiclass",
    num_class=7,
    n_jobs=-1,
    verbose=-4,
    metric=("multi_logloss",),
)

# One heamy Classifier per first-layer model, all sharing the same dataset.
model_xgb = Classifier(dataset=dataset, estimator=xgb_model, name='xgb', use_cache=False)
model_lgb = Classifier(
    dataset=dataset,
    estimator=lgb_model,
    name='lgb',
    parameters=params,
    use_cache=False,
)
model_rf = Classifier(dataset=dataset, estimator=rf_model, name='rf', use_cache=False)

pipeline = ModelsPipeline(model_xgb, model_lgb, model_rf)

In [53]:
# Search for the best blending weights (scored by log loss); prints and returns them.
pipeline.find_weights(scorer=log_loss)

Best Score (log_loss): 0.18744137777851164
Best Weights: [0.36556831 0.00303401 0.63139768]


array([0.36556831, 0.00303401, 0.63139768])

In [55]:
# Build the second-layer feature set from 5-fold predictions of the three models; this step is slow.
# NOTE(review): per the heamy docs, full_test=True evaluates the test set with models
# fit on the full training data, while full_test=False takes the mean of the per-fold
# predictions -- confirm against the installed heamy version.
stack_ds = pipeline.stack(k=5,stratify=False,seed=42,full_test=False)  # full_test selects how the test-set features are produced

In [56]:
# Stacked training set: each first-layer model contributes 7 columns
# (<model>_0 .. <model>_6), one predicted probability per class.
print(stack_ds.X_train)

           xgb_0     xgb_1         xgb_2         xgb_3         xgb_4  \
0       0.177179  0.818728  2.185222e-07  9.264143e-09  4.090067e-03   
1       0.005155  0.994845  7.055579e-10  1.326343e-08  6.331572e-09   
2       0.293492  0.706508  3.650662e-10  1.017633e-09  8.823530e-09   
3       0.478112  0.521816  3.207779e-06  2.878019e-08  1.076500e-08   
4       0.992430  0.006652  1.233117e-05  1.887496e-07  1.569583e-06   
...          ...       ...           ...           ...           ...   
435754  0.988518  0.011477  3.190797e-09  5.645121e-08  2.940739e-09   
435755  0.969212  0.030723  2.142020e-08  1.572054e-05  4.321913e-07   
435756  0.415850  0.584142  4.283793e-08  7.367601e-08  6.148067e-07   
435757  0.602601  0.397399  6.606462e-10  1.015894e-09  7.221973e-08   
435758  0.834587  0.165411  3.267833e-09  2.057172e-08  2.078704e-08   

               xgb_5         xgb_6     lgb_0     lgb_1          lgb_2  ...  \
0       1.725062e-06  1.048052e-06  0.172406  0.812678   

In [59]:
# Stacked test set: each first-layer model contributes 7 columns
# (<model>_0 .. <model>_6), one predicted probability per class.
print(stack_ds.X_test) 

               xgb_0     xgb_1         xgb_2         xgb_3         xgb_4  \
0       9.876224e-01  0.000789  2.774616e-06  4.129093e-07  1.311387e-06   
1       5.139124e-02  0.929659  1.852793e-03  1.518293e-07  1.692924e-02   
2       7.695035e-04  0.973729  6.878623e-04  1.573823e-07  2.408167e-02   
3       3.376913e-02  0.966229  2.024872e-07  7.321523e-08  1.071163e-06   
4       1.013981e-03  0.998553  3.794874e-06  8.755425e-08  4.243054e-04   
...              ...       ...           ...           ...           ...   
145248  9.615189e-01  0.038480  6.486028e-08  1.744931e-08  1.069370e-06   
145249  3.055384e-02  0.969440  2.475371e-07  5.530033e-08  4.299908e-06   
145250  8.224608e-06  0.058361  9.212288e-01  9.705171e-08  5.440121e-05   
145251  9.183387e-01  0.081601  5.612090e-08  1.088283e-08  5.225256e-07   
145252  9.203915e-07  0.003578  2.372825e-01  1.582836e-06  3.307252e-07   

               xgb_5         xgb_6     lgb_0     lgb_1         lgb_2  ...  \
0       7.

In [60]:
# Final layer: logistic regression trained on the stacked first-layer predictions.
stacker = Classifier(
    dataset=stack_ds,
    estimator=LogisticRegression,
    parameters={"solver": 'lbfgs', "max_iter": 1000},
    use_cache=False,
)
predict_stack = stacker.predict()

In [64]:
print(predict_stack)  # output of the stacking layer

[[9.95173402e-01 2.67623709e-03 4.23846755e-08 ... 3.15435935e-05
  5.66194220e-06 2.11044140e-03]
 [2.23612439e-02 9.70927685e-01 1.23929922e-03 ... 4.49727904e-03
  8.73983383e-04 9.97020226e-05]
 [6.22588197e-03 9.89402233e-01 9.81655972e-04 ... 2.83331258e-03
  5.22139184e-04 3.45071569e-05]
 ...
 [5.36335125e-06 2.06267200e-03 9.90604140e-01 ... 8.55252386e-04
  4.18405061e-03 1.64678945e-05]
 [9.96602824e-01 2.15991442e-03 7.27481581e-08 ... 3.63552051e-05
  6.80942632e-06 1.19199377e-03]
 [5.89156494e-05 1.15333400e-03 1.09178439e-02 ... 3.09244417e-04
  9.85167196e-01 2.21261408e-05]]


## 验证结果

### 单模分数

In [65]:
# Test-set accuracy of each first-layer model on its own.
# Column layout of stack_ds.X_test: 0-6 XGB, 7-13 LGB, 14 onwards RF.
# accuracy_score takes (y_true, y_pred); the ground truth now goes first.
for model_cols in (slice(0, 7), slice(7, 14), slice(14, None)):  # XGB, LGB, RF
    model_pred = np.argmax(stack_ds.X_test.iloc[:, model_cols].values, axis=1)
    print(accuracy_score(y_test, model_pred))

0.9284696357390209
0.8890005714167694
0.9511404239499356


### 线性加权分数

In [70]:
# Blending (linear weighting) scores.
# Per-model probability blocks taken from the stacked test set.
xgb_t = stack_ds.X_test.iloc[:, :7].values
lgb_t = stack_ds.X_test.iloc[:, 7:14].values
rf_t = stack_ds.X_test.iloc[:, 14:].values

# Hand-picked weights, chosen subjectively from the single-model scores.
result = 0.3*xgb_t+0.2*lgb_t+0.5*rf_t
# accuracy_score takes (y_true, y_pred); the ground truth goes first.
print('主观根据结果blending：', accuracy_score(y_test, np.argmax(result, axis=1)))
# Weights reported by find_weights above -- Best Weights: [0.36556831 0.00303401 0.63139768]
result =  0.36556831*xgb_t+0.00303401*lgb_t+0.63139768*rf_t
print('根据最优权重的blending：',accuracy_score(y_test, np.argmax(result, axis=1)))

主观根据结果blending： 0.9425209806337908
根据最优权重的blending： 0.9488616414118813


可以观察到，最优权重的融合结果（0.9489）优于主观选取权重的结果（0.9425），不过仍略低于RF单模的0.9511

### stacking的分数

In [71]:
# Accuracy of the stacked model; accuracy_score takes (y_true, y_pred).
print(accuracy_score(y_test, np.argmax(predict_stack, axis=1)))

0.957439777491687


## 再说结论，该数据集（fetch_covtype）Stacking的方法更好