In this example, the stacker was handled with cross-validation instead of a train-test-split.
The whole training set was used in both of the two stacking phases.

In [21]:
from __future__ import division

%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_california_housing

cali_housing = fetch_california_housing()

X = cali_housing.data
y = cali_housing.target

bins = np.arange(6)
 

from sklearn.model_selection import train_test_split

binned_y = np.digitize(y, bins)

from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
 
from sklearn.model_selection import GridSearchCV

X_train_prin, X_test_prin, y_train_prin, y_test_prin = train_test_split(X, y,test_size=0.2,stratify=binned_y,random_state=7)

binned_y_train_prin = np.digitize(y_train_prin, bins)

In [25]:
from sklearn.model_selection import StratifiedKFold

# 初始化 StratifiedKFold，不再包含 random_state 参数
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=7)

# 使用 StratifiedKFold 进行分层交叉验证
for train_index, test_index in skf.split(X_train_prin, binned_y_train_prin):
    print("TRAIN:", train_index, "TEST:", test_index)


TRAIN: [    0     1     2 ... 16507 16509 16511] TEST: [    3     4     5 ... 16500 16508 16510]
TRAIN: [    3     4     5 ... 16508 16509 16510] TEST: [    0     1     2 ... 16505 16506 16511]
TRAIN: [    0     1     2 ... 16508 16510 16511] TEST: [    8    16    20 ... 16504 16507 16509]


In [31]:
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV

# 定义参数分布
param_dist = {
    'max_samples': [0.5, 1.0],
    'max_features': [0.5, 1.0],
    'oob_score': [True, False],
    'estimator__n_neighbors': [3, 5],  # 使用 'estimator__' 代替 'base_estimator__'
    'n_estimators': [100]
}

# 基学习器
single_estimator = KNeighborsRegressor()

# 集成模型
ensemble_estimator = BaggingRegressor(estimator=single_estimator, bootstrap=True)

# KFold 交叉验证
kf = KFold(n_splits=3, shuffle=True, random_state=7)

# 随机搜索
pre_gs_inst_bag = RandomizedSearchCV(
    ensemble_estimator,
    param_distributions=param_dist,
    cv=kf,  # 使用 KFold 而非 StratifiedKFold
    n_iter=5,
    n_jobs=-1,
    random_state=7
)

# 拟合模型
pre_gs_inst_bag.fit(X_train_prin, y_train_prin)


In [33]:
pre_gs_inst_bag.best_params_

{'oob_score': True,
 'n_estimators': 100,
 'max_samples': 1.0,
 'max_features': 0.5,
 'estimator__n_neighbors': 3}

In [37]:
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor

# 正确的参数使用
rs_bag = BaggingRegressor(**{
    'max_features': 0.5,
    'max_samples': 1.0,
    'n_estimators': 3000,
    'oob_score': True, 
    'estimator': KNeighborsRegressor(n_neighbors=5)  # 使用 'estimator' 替代 'base_estimator'
})


In [41]:
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor

# 配置 BaggingRegressor
rs_bag = BaggingRegressor(
    max_features=0.5,
    max_samples=1.0,
    n_estimators=3000,
    oob_score=True, 
    estimator=KNeighborsRegressor(n_neighbors=5)
)

# 使用 KFold 进行交叉验证
kf = KFold(n_splits=3, shuffle=True, random_state=7)

# 使用 cross_val_predict 进行预测
bag_predicted = cross_val_predict(rs_bag, X_train_prin, y_train_prin, cv=kf, n_jobs=-1)


In [45]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold

# 定义参数分布
param_dist = {
    'max_features': ['log2', 0.4, 0.5, 0.6, 1.0],
    'max_depth': [2, 3, 4, 5, 6, 7, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5, 10],
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.05, 0.1, 0.25, 0.275, 0.3, 0.325],
    'loss': ['squared_error', 'huber']  # 使用 'squared_error' 替代 'ls'
}

# 使用 KFold 作为交叉验证器
kf = KFold(n_splits=3, shuffle=True, random_state=7)

# 配置 RandomizedSearchCV
pre_gs_inst_gb = RandomizedSearchCV(
    GradientBoostingRegressor(warm_start=True),
    param_distributions=param_dist,
    cv=kf,  # 使用 KFold 而不是 StratifiedKFold
    n_iter=30,
    n_jobs=-1,
    random_state=7
)

# 训练模型
pre_gs_inst_gb.fit(X_train_prin, y_train_prin)


In [47]:
pre_gs_inst_gb.best_estimator_

In [57]:
from sklearn.ensemble import GradientBoostingRegressor

# 实例化 GradientBoostingRegressor
gbt_inst = GradientBoostingRegressor(
    learning_rate=0.25,
    loss='huber',
    max_depth=6,
    max_features=1.0,
    min_samples_leaf=10,
    n_estimators=3000,
    warm_start=True
)

# 拟合模型
gbt_inst.fit(X_train_prin, y_train_prin)

# 预测
y_pred = gbt_inst.predict(X_test_prin)


In [58]:
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.ensemble import GradientBoostingRegressor

# 假设 gbt_inst 已经按照之前的设置进行了实例化
# 配置 KFold
kf = KFold(n_splits=3, shuffle=True, random_state=7)

# 生成交叉验证预测结果
gbt_predicted = cross_val_predict(gbt_inst, X_train_prin, y_train_prin, cv=kf, n_jobs=-1)


In [61]:
preds_df = pd.DataFrame(X_train_prin.copy(),columns = cali_housing .feature_names )#pd.DataFrame(columns = ['bag', 'gbt'])

preds_df['bag'] = bag_predicted
preds_df['gbt'] = gbt_predicted


preds_df[['bag','gbt']].corr()

Unnamed: 0,bag,gbt
bag,1.0,0.888023
gbt,0.888023,1.0


In [63]:
preds_df.shape

(16512, 10)

In [67]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold

# 定义参数分布
param_dist = {
    'max_features': ['sqrt', 'log2', 1.0],
    'min_samples_leaf': [1, 2, 3, 7, 11],
    'n_estimators': [50, 100],
    'oob_score': [True, False]  # 注意 oob_score 只有在 bootstrap=True 时才有效
}

# 使用 KFold 作为交叉验证器
kf = KFold(n_splits=3, shuffle=True, random_state=7)

# 配置 RandomizedSearchCV
pre_gs_inst_etr = RandomizedSearchCV(
    ExtraTreesRegressor(bootstrap=True),  # 默认不使用 warm_start
    param_distributions=param_dist,
    cv=kf,  # 使用 KFold 作为交叉验证器
    n_iter=15,
    random_state=7
)

# 拟合模型
pre_gs_inst_etr.fit(preds_df.values, y_train_prin)


In [69]:
pre_gs_inst_etr.best_params_

{'oob_score': False,
 'n_estimators': 100,
 'min_samples_leaf': 11,
 'max_features': 1.0}

In [71]:
final_etr = ExtraTreesRegressor(**{'max_features': 1.0,
 'min_samples_leaf': 11,
 'n_estimators': 2000,
 'oob_score': False})
final_etr.fit(preds_df.values, y_train_prin)

In [73]:
rs_bag.fit(X_train_prin, y_train_prin)

In [75]:
gbt_inst.fit(X_train_prin, y_train_prin)

In [77]:
def handle_X_set(X_set):
    X_copy = X_set.copy()
    
    y_pred_bag = rs_bag.predict(X_copy)
    y_pred_gbt = gbt_inst.predict(X_copy)
    preds_df = pd.DataFrame(X_copy, columns = cali_housing .feature_names)

    preds_df['bag'] = y_pred_bag
    preds_df['gbt'] = y_pred_gbt
 
    return preds_df.values

def predict_from_X_set(X_set):
    return final_etr.predict(handle_X_set(X_set)) 

y_pred = predict_from_X_set(X_test_prin)

In [79]:
def mase(y_test, y_pred):
    y_avg = y_test.mean()
    denominator = np.abs(y_test - y_avg).mean()
    numerator = y_test - y_pred
    
    return np.abs(numerator/denominator).mean()

In [81]:
# https://www.otexts.org/fpp/2/5 : contains SMAPE (attributed to Armstrong) and MASE (Hyndman and Koehler)
from sklearn.metrics import r2_score, mean_absolute_error

print ("R-squared",r2_score(y_test_prin, y_pred))
print ("MAE   :  ",mean_absolute_error(y_test_prin, y_pred))
print ("MAPE  :  ",(np.abs(y_test_prin- y_pred)/y_test_prin).mean())
print ("SMAPE :  ",(np.abs(y_test_prin- y_pred)/((y_test_prin + y_pred)/2)).mean())
print ("MASE  :  ",mase(y_test_prin, y_pred)) 

R-squared 0.8472724788371242
MAE   :   0.29485382787560005
MAPE  :   0.16444220061854956
SMAPE :   0.151974292792966
MASE  :   0.32409068890667836
