In [1]:
import numpy as np
import numpy.ma as ma
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
import sys
import warnings

warnings.simplefilter('ignore')

In [2]:
def load_data():
    digits = load_digits()
    return digits.data,digits.target
X,y = load_data()
print(X.shape)
print(y.shape)

(1797, 64)
(1797,)


In [3]:
X_train,X_test,y_train,y_test = train_test_split(X,y)
lor = DecisionTreeClassifier(max_depth=5,splitter='random').fit(X_train,y_train)
lor.score(X_test,y_test)

0.7

In [4]:
# 自助采样法
def bs_test(X,y,m):
    '''
    bootstrap m次
    '''
    lens = len(y)
    args = np.arange(0,lens,1)
    # 放回抽样
    samp_args = np.random.choice(args, m, True)
    uniq_args = np.unique(samp_args) 
    return X[uniq_args],y[uniq_args],uniq_args
    
X_samp,y_samp,args = bs_test(X,y, len(y)//10)
print(X_samp.shape)
print(y_samp.shape)
uniq ,cs = np.unique(args,return_counts=True)
print(uniq.shape)

(172, 64)
(172,)
(172,)


In [5]:
# 算下oob,放回抽样
n = 200

# 样本集D 包含n个样本
X_bs = X.copy()[:n]
y_bs = y.copy()[:n]

# 每次从样本集中D有放回的挑选1
_, _, args = bs_test(X_bs,y_bs,1)

# 重复执行n次
for i in range(n - 1):
    _, _, args_new = bs_test(X_bs,y_bs,1)
    args = np.concatenate((args,args_new))
    args = np.unique(args)

# 得到样本子集D'
oob_score = (n - args.shape[0]) / n

# 样本数量
print('m:',  n)
# D'中的样本数量
print('bag :',args.shape[0])
# 袋外数据占比
print('oob :',oob_score)

m: 200
bag : 118
oob : 0.41


### boosstrap产生的数据集改变了初始数据的分布,这会引入估计偏差

In [6]:
def bootstrap_sampling(X,y):
    n = len(y)
    args = np.arange(0,n,1)
    # 放回抽样
    samp_args = np.random.choice(args, n, True)
    uniq_args = np.unique(samp_args) 
    return X[uniq_args],y[uniq_args],uniq_args

# 试一试这样能否达到boosttrap的效果
#oob_score = []
#for _ in range(1000):
#    _,_,samples = bootstrap_sampling(X,y)
#    oob_score.append(1 - samples.shape[0] / y.shape[0])
#np.mean(oob_score)# 约等于0.368,正确.

### 我们采样出T个含m个样本的采样集,然后基于每个采样集训练处一个基学习器,再将这些基学习器进行结合.这就是Bagging的基本流程,Bagging通常对分类任务使用简单投票法,对回归任务使用简单平均法.

In [7]:
class SimpleBagging:
    
    def __init__(
        self
        #,base_estimator
        ,T = 30
    ):
        '''
        @params T 循环的次数
        @params 基分类器
        @params X_ 训练数据
        @params y_ 训练数据对应的标签
        @params es_inbag_sample_args 每个基分类器对应的袋内索引
        @params es_oobag_sample_args 每个基分类器对应的袋外索引
        '''
        self.T_ = T
        # self.base_estimator_ = base_estimator
        self.estimators_ = None
        self.X_ = None
        self.y_ = None
        self.es_inbag_sample_args_ = None
        self.es_oobag_sample_args_ = None

        
    def bootstrap_sampling(self,X,y):
        n = len(y)
        args = np.arange(0,n,1)
        # 放回抽样
        samp_args = np.random.choice(args, n, True)
        uniq_args = np.unique(samp_args) 
        return X[uniq_args] ,y[uniq_args] ,uniq_args

    def oob_sample_args(self,n,inb_sample_args):
        args = np.arange(0 ,n ,1)
        args[inb_sample_args] = -1
        return args[ args >= 0 ]
        
    def fit(self ,X ,y):
        # 初始化成员变量
        n = len(y)
        self.X_ = X
        self.y_ = y
        self.estimators_ = []
        self.es_inbag_sample_args_ = []
        self.es_oobag_sample_args_ = []
        for i in range(self.T_):
            # bootstrap采样
            X_sample,y_sample,sample_args = self.bootstrap_sampling(X,y)
            # print(sample_args.shape)
            # 用采样数据训练数据
            rnd_seed = np.random.randint(2**32 - 1)
            clf = DecisionTreeClassifier(
                max_depth = 5
                , splitter='random'
                , random_state=rnd_seed
            ).fit(X_sample,y_sample)
            # 更新成员变量
            self.es_inbag_sample_args_.append(sample_args)
            self.es_oobag_sample_args_.append(self.oob_sample_args(n, sample_args))
            self.estimators_.append(clf)
        return self
    
    def predict(self,X_test):
        n = len(X_test)
        y_p_set = np.zeros((self.T_,n),dtype=np.int32)
        # For vote
        for i , est in enumerate(self.estimators_):
            yp = est.predict(X_test)
            y_p_set[i]=yp
        # 初始化预测结果
        y_res = np.zeros(n, dtype=np.int32)
        for i in range(n):
            # 对每个样本各个预测器的投票结果
            uniqs, uniq_cnts = np.unique(y_p_set[:,i], return_counts=True)
            # 获取投票数量最大的索引
            cnt_argmax = np.argmax(uniq_cnts)
            #print('uniqs : ',uniqs, 'cnts : ',uniq_cnts,'argmax:',cnt_argmax)
            y_res[i] = uniqs[cnt_argmax]
        return y_res
    
    def __oob_predict(self):
        # 仅考虑未使用x的分类器进行预测.
        
        # 投票的稀疏矩阵,行对应m个样本,列对应投票结果
        y_res = []
        # 用于预测的袋外数据
        y_args = []
        for i , x in enumerate(self.X_):
            # 第i个样本的投票结果
            y_i = []
            # 遍历 T个基分类器
            for t in range(self.T_):
                # 第t个分类器的袋内数据索引.
                es_inbag_args = self.es_inbag_sample_args_[t]
                # 判断当前数据x是否参与了分类器t的训练.
                if np.any(es_inbag_args == i):
                    # 如果已经参与就终结循环,遍历后面的分类器.
                    continue
                # 当前分类没有用到当前数据x = X[i],用当前分类器来预测数据x.
                y_p = self.estimators_[t].predict(x.reshape(1,-1))
                # 记录投票结果
                y_i.append(y_p.ravel()[0])
            # 没有任何分类器没用到当前数据
            if len(y_i) == 0:
                continue
            y_res.append(y_i)
            y_args.append(i)
        n = len(y_args)
        y_final_p = np.full(n, -1)
        
        for i in range(n):
            # 对每个样本各个预测器的投票结果
            uniqs, uniq_cnts = np.unique(np.array(y_res[i]), return_counts=True)
            # 获取投票数量最大的索引
            cnt_argmax = np.argmax(uniq_cnts)
            # print('uniqs : ',uniqs, 'cnts : ',uniq_cnts,'argmax:',cnt_argmax)
            y_final_p[i] = uniqs[cnt_argmax]
        return y_final_p, y_args
    
    def oob_score(self):
        y_p , y_args = self.__oob_predict()
        return np.sum(self.y_[y_args]==y_p) / len(y_args)
    
X,y = load_data()
X_train,X_test,y_train,y_test = train_test_split(X,y)
# sklearn学习器的训练
sk_bgclf = BaggingClassifier(DecisionTreeClassifier(max_depth=5, splitter='random')).fit(X_train,y_train)
sk_bagging_score = sk_bgclf.score(X_test,y_test)
# 我的Simple Bagging 训练
sb = SimpleBagging().fit(X_train,y_train)
# 留出法
print('my test score : ' , np.sum(sb.predict(X_test)==y_test) / len(y_test))
print('sklearn bagging test score : ' , sk_bagging_score)

print("-"*100)
# 自采样oob
sk_bgclf = BaggingClassifier(DecisionTreeClassifier(max_depth=5, splitter='random'),oob_score=True).fit(X,y)
#print('my oob score : ' , sb.oob_score())
print('my oob score: ', sb.oob_score())
print('sklearn bagging oob score' , sk_bgclf.oob_score_)

#sb.oob_predict()
#np.argpartition

my test score :  0.9244444444444444
sklearn bagging test score :  0.9155555555555556
----------------------------------------------------------------------------------------------------
my oob score:  0.8663697104677061
sklearn bagging oob score 0.8057874234835838


In [9]:
idxs = np.arange(0,11,1)
mask = np.array([1,2,3])
print(idxs)
print(mask)
#np.argwhere(mask)
#idxs mask
# Kanx
idxs[mask] = -1
idxs[idxs != -1]
#idxs[np.isinmask]
#idxs==mask
#idxs[idxs==mask]

[ 0  1  2  3  4  5  6  7  8  9 10]
[1 2 3]


array([ 0,  4,  5,  6,  7,  8,  9, 10])