In [1]:
import numpy as np
import numpy.ma as ma
import matplotlib.pyplot as plt
# 基学习器
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
# 次级学习器
from sklearn.ensemble import RandomForestClassifier
# 预处理
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
#from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin, clone
import sys
import warnings

warnings.simplefilter('ignore')

Stacking先从初始数据集训练出初级学习器,然后'生成'一个新数据集用于训练次级学习器,在这个新数据集中,初级学习器的输出被当做样例输入特征,而初始样本标记仍被当做样例标记.

初级学习器是异质的.

在训练阶段,次级训练集是利用初级学习器产生的,若直接用初级学习器的训练集来产生次级训练集,则过拟合风险会比较大;因此,一般是通过使用交叉验证或留一法这样的方式,

In [2]:
def load_data():
    data = load_digits()
    return data.data,data.target

In [3]:
def load_data_split():
    data = load_digits()
    X_train,X_test,y_train,y_test = train_test_split(X,y)
    return X_train,X_test,y_train,y_test

In [4]:
X,y = load_data()
X_train,X_test,y_train,y_test = train_test_split(X,y)

def standardize(X_):
    X_ = StandardScaler().fit_transform(X_)
    return X_

print('-'*30 ,'before','-'*30)
print(X_train.min())
print(X_train.max())
X_train = standardize(X_train)
X_test = standardize(X_test)
print('-'*30 ,'after','-'*30)
print(X_train.min())
print(X_train.max())

------------------------------ before ------------------------------
0.0
16.0
------------------------------ after ------------------------------
-3.018816281385554
36.68787265568844


In [5]:
def load_data_split_standarlize():
    X_train,X_test,y_train,y_test = load_data_split()
    return standardize(X_train),standardize(X_test),y_train,y_test
X_1,X_2,_,_=load_data_split()
print(X_1.min())
print(X_2.max())
X_train,X_test,y_train,y_test = load_data_split_standarlize()
print(X_train.min())
print(X_train.max())

0.0
16.0
-2.9612382936006596
36.68787265568888


In [6]:
def obtain_rand_seed():
    return np.random.randint(2**32 - 1)

In [7]:
X,y = load_data()
X[:2]

array([[ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,  0.,  0., 13., 15., 10.,
        15.,  5.,  0.,  0.,  3., 15.,  2.,  0., 11.,  8.,  0.,  0.,  4.,
        12.,  0.,  0.,  8.,  8.,  0.,  0.,  5.,  8.,  0.,  0.,  9.,  8.,
         0.,  0.,  4., 11.,  0.,  1., 12.,  7.,  0.,  0.,  2., 14.,  5.,
        10., 12.,  0.,  0.,  0.,  0.,  6., 13., 10.,  0.,  0.,  0.],
       [ 0.,  0.,  0., 12., 13.,  5.,  0.,  0.,  0.,  0.,  0., 11., 16.,
         9.,  0.,  0.,  0.,  0.,  3., 15., 16.,  6.,  0.,  0.,  0.,  7.,
        15., 16., 16.,  2.,  0.,  0.,  0.,  0.,  1., 16., 16.,  3.,  0.,
         0.,  0.,  0.,  1., 16., 16.,  6.,  0.,  0.,  0.,  0.,  1., 16.,
        16.,  6.,  0.,  0.,  0.,  0.,  0., 11., 16., 10.,  0.,  0.]])

In [8]:
def kfold_indexes(X,y,n_fold=5):
    kf = KFold(shuffle=True,n_splits=n_fold)
#   return [(train_index.shape,test_index.shape) for train_index,test_index in kf.split(X,y)] 
    return [(train_index,test_index) for train_index,test_index in kf.split(X,y)] 
# kfold_indexes(X,y)
#X,y = load_data()
#kf_indexes = kfold_indexes(X,y)

#hole_i = np.array([],dtype=np.int32)
#for i,kf_sub in enumerate(kf_indexes):
#    hole_i = np.hstack([hole_i,kf_sub[1]])
#print(len(np.unique(hole_i)))
#print(len(y))

In [9]:
def score(y_test,y_predict):
    return np.sum(y_predict == y_test) / len(y_test)

In [10]:
class SimpleStacking :#(BaseEstimator, RegressorMixin, TransformerMixin): sklearn中的超类.

    def __init__(
        self
        , models_stack_1 # 初级学习算法
        , models_stack_2 # 次级学习算法
        , n_fold = 6
    ):
        self.models_stack_1_ = models_stack_1
        self.models_stack_2_ = models_stack_2
        self.T_ = len(self.models_stack_1_)
        self.n_fold_ = n_fold
        self.X_2_train_ = None
        self.X_2_test_ = None

    def __kfold_indexes(
        self
        , X
        ,y
    ):
        '''
        K折交叉验证分成 self.n_folds_ 个集合
        '''
        kf = KFold(
            shuffle=True
            ,n_splits=self.n_fold_
        )
        return [(train_index,test_index) for train_index,test_index in kf.split(X,y)] 
    
    def train_stack_1(
        self
        ,X
        ,y
    ):
        '''
        训练初级学习器
        @return 初级学习器的学习结果
        '''
        n = len(y)
        # 避免过拟合,将训练集分为k折
        kf_indexes = self.__kfold_indexes(X ,y)
        # 生成的新的训练集X_2.shape = (n,T)
        X_2 = np.empty(shape=(n, self.T_) ,dtype=np.int32)
        # 训练初级学习器
        models = self.models_stack_1_
        for t in range(self.T_):
            # 第t个学习器的训练结果
            y_res_t = np.empty(shape=n, dtype=np.int32)
            # 第j折的数据
            for kf_j in kf_indexes:
                # 第j折数据的训练集和测试集的索引
                kf_j_train,kf_j_test = kf_j[0],kf_j[1]
                X_j_train,X_j_test,y_j_train,y_j_test = X[kf_j_train],X[kf_j_test],y[kf_j_train],y[kf_j_test]
                models[t].fit(X_j_train,y_j_train)
                y_t_p = models[t].predict(X_j_test)
                # X_2对应新的特征t的值
                y_res_t[kf_j_test] = y_t_p
            # 给特征t即第t个学习器的训练结果
            X_2[:,t] = y_res_t
        self.X_2_train_= X_2
        return X_2
    
    def train_stack_2(
        self
        ,X
        ,y
    ):
        '''
        训练次级学习器
        @return 
        '''
        return self.models_stack_2_.fit(X,y)
    
    def fit(
        self
        , X
        , y
    ):
        '''
        先以初级学习器生成新的训练集
        然后用次级学习器进行学习.
        '''
        self.train_stack_2(self.train_stack_1(X,y),y)
        return self

    def __stack1_X(self, X):
        '''
        用初级学习器生成新的训练集X_2
        '''
        # 样本数量
        n = len(X)
        # 生成的新的训练集X_2.shape = (n,T)
        X_2 = np.empty(shape=(n, self.T_), dtype=np.int32)
        # 初级学习器
        models = self.models_stack_1_
        for t in range(self.T_):
            # 第t个学习器的训练结果
            y_res = models[t].predict(X)
            X_2[:,t] = y_res
            
        self.X_2_test_ = X_2
        return X_2
        
    def predict(self, X):
        X_2 = self.__stack1_X(X)
        return self.models_stack_2_.predict(X_2)
    
    def score(self,X_test,y_test):
        return np.sum(self.predict(X_test) == y_test) / len(y_test)
        
X,y = load_data()

'''
stack1_models = [
    SVC(C=0.025)
    ,LogisticRegression(C=0.01001,penalty='l1', max_iter=10)
    ,GaussianNB(var_smoothing=0.000035)
    ,MLPClassifier(hidden_layer_sizes=3)
    ,DecisionTreeClassifier(max_depth=5)
]
'''

stack1_models = [
    SVC()
    ,LogisticRegression()
    ,GaussianNB()
    ,MLPClassifier()
    ,DecisionTreeClassifier()
]

stack2_model = DecisionTreeClassifier()

X_train,X_test,y_train,y_test = load_data_split_standarlize()

stack1_scores = []
model_names = []
X = standardize(X)

for model in stack1_models:
    model_names.append(model.__class__.__name__)
    model.fit(X_train,y_train)
    stack1_scores.append(model.score(X_test,y_test))

print('stack1 mean scores : ', np.mean(stack1_scores))
model_name_cor_score = dict(zip(model_names,stack1_scores))
print('stack1 scores: ', model_name_cor_score)
        
print('-'*20 , 'Stacking score' , '-'*20)
ss = SimpleStacking(stack1_models,stack2_model)
ss.fit(X_train,y_train)
print(ss.score(X_test,y_test))

print('-' * 20, 'stack1_scores', '-'*20)
for t in range(5):
    print(score(y_test, ss.X_2_test_[:, t]))

stack1 mean scores :  0.7586666666666667
stack1 scores:  {'SVC': 0.9866666666666667, 'LogisticRegression': 0.9533333333333334, 'GaussianNB': 0.1111111111111111, 'MLPClassifier': 0.9777777777777777, 'DecisionTreeClassifier': 0.7644444444444445}
-------------------- Stacking score --------------------
0.9755555555555555
-------------------- stack1_scores --------------------
0.9822222222222222
0.9555555555555556
0.1111111111111111
0.9711111111111111
0.7577777777777778


In [11]:
X,y = load_data()

stack1_models = [
    SVC(C=0.025)
    ,LogisticRegression(C=0.01001,penalty='l1', max_iter=10)
    ,GaussianNB(var_smoothing=0.000035)
    ,MLPClassifier(hidden_layer_sizes=3)
    ,DecisionTreeClassifier(max_depth=5)
]


'''
stack1_models = [
    SVC()
    ,LogisticRegression()
    ,GaussianNB()
    ,MLPClassifier()
    ,DecisionTreeClassifier()
]
'''
stack2_model = DecisionTreeClassifier()

X_train,X_test,y_train,y_test = load_data_split_standarlize()

stack1_scores = []
model_names = []
X = standardize(X)

for model in stack1_models:
    model_names.append(model.__class__.__name__)
    model.fit(X_train,y_train)
    stack1_scores.append(model.score(X_test,y_test))
    
print(dict(zip(model_names,stack1_scores)))
print('*'*50)

for m,s,n in zip(stack1_models,stack1_scores,model_names):
    if s < 0.7:
        stack1_models.remove(m)
        stack1_scores.remove(s)
        model_names.remove(n)
    
print(dict(zip(model_names,stack1_scores)))

print('stack1 mean scores : ', np.mean(stack1_scores))
model_name_cor_score = dict(zip(model_names,stack1_scores))
print('stack1 scores: ', model_name_cor_score)
        
print('-'*20 , 'Stacking score' , '-'*20)
ss = SimpleStacking(stack1_models,stack2_model)
ss.fit(X_train,y_train)
print(ss.score(X_test,y_test))

print('-' * 20, 'stack1_scores', '-'*20)
for t in range(len(stack1_models)):
    print(score(y_test, ss.X_2_test_[:, t]))

{'SVC': 0.7244444444444444, 'LogisticRegression': 0.7288888888888889, 'GaussianNB': 0.5644444444444444, 'MLPClassifier': 0.6777777777777778, 'DecisionTreeClassifier': 0.6044444444444445}
**************************************************
{'SVC': 0.7244444444444444, 'LogisticRegression': 0.7288888888888889, 'MLPClassifier': 0.6777777777777778}
stack1 mean scores :  0.7103703703703704
stack1 scores:  {'SVC': 0.7244444444444444, 'LogisticRegression': 0.7288888888888889, 'MLPClassifier': 0.6777777777777778}
-------------------- Stacking score --------------------
0.7844444444444445
-------------------- stack1_scores --------------------
0.5844444444444444
0.64
0.7444444444444445


# 有不解之处,现在看起来stacking与基学习器的准确率有关系,结合以后效果更差了

可能写错了,需要引用一些更详尽的资料