In [160]:
import platform
import matplotlib.pyplot as plt
system = platform.system()
if system == "Linux":
    plt.rcParams['font.sans-serif'] = ["AR PL UKai CN"] #["Noto Sans CJK JP"]
elif system == "Darwin":
    plt.rcParams['font.sans-serif'] = ["Kaiti SC"]
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from hyperopt import hp, fmin, tpe, Trials, partial
from MyLogColor import  log,LogLevel
import time
import datetime
from hyperopt.early_stop import no_progress_loss
from sklearn.metrics import classification_report
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from OptMetrics import MyMetric


class BayesOptGBDT(MyMetric):
    
    def __init__(self,x,y
                 ,Folds=6 #如果采用分层k折交叉验证的时候则写出分多少折
                 ,TEST_SPLIT=0.2 #测试集的比例
                 ,EARLY_STOP_BAYES =100 #当参数优化多少次没有进步的时候就停止搜索
                 ,NUM_EVALS=600 #最大优化参数的搜索次数
                 ,MAX_SHUFFLE=100 #数据洗牌的次数
                 ,x_train=[], x_val=[], y_train=[], y_val=[]
                 ,metrics_class = "pr-auc" #"[pr-auc],[roc-auc],[f1-score],[recall],[precision],[accuracy],[roc-auc-recall],[roc-auc-accuracy]"
                 # all=[pr_auc,roc_auc,accuracy,precision,recall,false_alarm,miss_rate,specificity,f1score]
                 ,metrics_weight = [0.5,0.5]
                 ,min_recall = 0.5 #召回率的最小值
                 ,cost_wight = 0.1 #对召回率不满足的情况下权重值的惩罚值
                 ,StratifiedKFoldShuffle=True
                #  ,is_verbose = False # 是否查看详细信息
                ):
        self.Folds = Folds
        self.TEST_SPLIT =TEST_SPLIT
        self.EARLY_STOP_BAYES = EARLY_STOP_BAYES
        self.NUM_EVALS = NUM_EVALS
        self.MAX_SHUFFLE = MAX_SHUFFLE
        self.metrics_class = metrics_class
        self.metrics_weight = np.array(metrics_weight)
        self.metrics_weight = self.metrics_weight/self.metrics_weight.sum()
        self.min_recall = min_recall
        self.cost_wight = cost_wight
        self.StratifiedKFoldShuffle = StratifiedKFoldShuffle
        
        self.Bayes_start_time = None
        self.NOW_FUC_RUN_ITER = 0
        self.Trials = None
        self.bayes_opt_parser = None
        self.PARAMS_BEST = None
        self.historical_metrics = np.zeros(self.NUM_EVALS)
        self.historical_params = {}
        
        self.all_metrice_names = ['pr_auc',
                            'roc_auc',
                            'accuracy',
                            'precision',
                            'recall',
                            'false_alarm',
                            'miss_rate',
                            'specificity',
                            'f1score']
        
        self.losss = [ 'deviance', 'exponential']
        self.criterions = ['friedman_mse', 'squared_error', 'absolute_error']
        # self.n_features = x.shape[-1]
        self.Max_features = ['auto', 'sqrt', 'log2',None ]#+list(np.arange(1,x.shape[-1],1))
                        # ,hp.randint("max_features_int",0,x_train.shape[-1])
                        # ,hp.uniform("max_features_float",0,1)]
        self.warm_starts = [True,False]
        self.param_grid_hp = {
            "loss":hp.choice("loss",self.losss)
            ,"learning_rate":hp.uniform("learning_rate",0,1)
            ,'n_estimators': hp.quniform("n_estimators",10,1000,1)
            ,'subsample':hp.uniform("subsample",0,1)
            ,"criterion":hp.choice("criterion",self.criterions)
            ,"min_samples_leaf":hp.uniform("min_samples_leaf",0,0.5)
            ,"min_samples_split":hp.uniform("min_samples_split",0,1)
            ,"min_weight_fraction_leaf":hp.uniform("min_weight_fraction_leaf",0,0.5)
            ,"max_depth":hp.quniform("max_depth",1,1000,1)
            ,"min_impurity_decrease":hp.uniform("min_impurity_decrease",0,1)
            # ,"min_impurity_split":hp.uniform("min_impurity_split",0,1)
            ,"random_state":hp.randint("random_state",100)
            ,"max_features":hp.choice("max_features",self.Max_features)
            ,"max_leaf_nodes":hp.quniform("max_leaf_nodes",2,1000,1)
            ,"warm_start":hp.choice("warm_start",self.warm_starts)
        }
        
        self.x = x
        self.y = y
        self.m,self.n = x.shape
        self.y = np.array(y,dtype=int)
        
        
        if len(x_train) and len(x_val) and len(y_train) and len(y_val):
            self.x_train, self.x_val, self.y_train, self.y_val = x_train,x_val,y_train,y_val
        else:
            if self.MAX_SHUFFLE > 0:
                self.shuffle_x,self.shuffle_y = self.shuffle_data(x,y)
                self.x_train, self.x_val, self.y_train, self.y_val = train_test_split(self.shuffle_x, self.shuffle_y, test_size=self.TEST_SPLIT, random_state = 42)
            else:
                self.x_train, self.x_val, self.y_train, self.y_val = train_test_split(x, y, test_size=self.TEST_SPLIT, random_state = 42)
        self.train_positive = (self.y_train==1).sum()
        self.train_negative = (self.y_train==0).sum()
        self.train_y_counter = self.y_train.size
        self.alpha = self.train_negative/self.train_y_counter

        log(f"""训练数据中,正例有【{self.train_positive}】个占比【{self.train_positive/self.train_y_counter}】
            ，负例有【{self.train_negative}】个占比【{self.train_negative/self.train_y_counter}】
            ，alpha值为【{self.alpha}】，""",LogLevel.INFO)
        
        self.test_positive = (self.y_val==1).sum()
        self.test_negative = (self.y_val==0).sum()
        self.test_y_counter = self.y_val.size

        log(f"""测试数据中,正例有【{self.test_positive}】个占比【{self.test_positive/self.test_y_counter }】
            ，负例有【{self.test_negative}】个占比【{self.test_negative/self.test_y_counter }】
            ，alpha值为【{self.test_negative/self.test_y_counter}】，""",LogLevel.INFO)


    def shuffle_data(self,x,y):
        """
        数据洗牌
        """
        xy = np.c_[x,y]
        for i in tqdm(range(self.MAX_SHUFFLE),desc="数据洗牌"):
            np.random.shuffle(xy)
        x,y = xy[:,0:self.n],xy[:,self.n:self.n+1]
        y = np.ravel(y)
        return x,y
    
    def hyperopt_objective(self,params):
        
        func_start = time.time()
        # log(f"本次参数:{params}",LogLevel.INFO) 
        try:
            if isinstance(self.Folds,(int,float)) and self.Folds > 0:
                # print(self.Folds)
                # self.StratifiedKFoldShuffle
                strkf = StratifiedKFold(n_splits=self.Folds, shuffle=self.StratifiedKFoldShuffle)
                '''
                n_splits=6（默认5）：将数据集分成6个互斥子集，每次用5个子集数据作为训练集，1个子集为测试集，得到6个结果

                shuffle=True（默认False）：每次划分前数据重新洗牌，每次的运行结果不同；shuffle=False：每次运行结果相同，相当于random_state=整数
                random_state=1（默认None）：随机数设置为1，使得每次运行的结果一致
                '''
                # roc_aucs = np.zeros(self.Folds)
                metrics_ = np.zeros(self.Folds)
                log(f"k={self.Folds}折分层交叉验证",LogLevel.PASS)
                for i,index_ in enumerate(strkf.split(self.x,self.y)):
                    train_index,test_index = index_
                    # print(train_index,test_index)
                    X_train_KFold, X_test_KFold = self.x[train_index],self.x[test_index]
                    y_train_KFold, y_test_KFold = self.y[train_index],self.y[test_index]
                    gbdtc = GradientBoostingClassifier(
                            loss = params["loss"]
                            ,learning_rate = params["learning_rate"]
                            ,n_estimators= int(params["n_estimators"])
                            ,subsample = params["subsample"]
                            ,criterion = params["criterion"]
                            ,min_samples_leaf = params["min_samples_leaf"]
                            ,min_samples_split = params["min_samples_split"]
                            ,min_weight_fraction_leaf = params["min_weight_fraction_leaf"]
                            ,max_depth = int(params["max_depth"])
                            ,min_impurity_decrease = params["min_impurity_decrease"]
                            # ,min_impurity_split = params["min_impurity_split"]
                            ,random_state = params["random_state"]
                            ,max_features = params["max_features"]
                            ,max_leaf_nodes = int(params["max_leaf_nodes"])
                            ,warm_start = params["warm_start"]
                        )
                    gbdtc = gbdtc.fit(X_train_KFold, y_train_KFold)
                    gbdtc_proba = gbdtc.predict_proba(X_test_KFold)[:,1]
                    gbdtc_pred = gbdtc.predict(X_test_KFold)
                    
                    if self.metrics_class == "pr-auc":
                        metrics_[i] = self.PR_AUC(y_test_KFold, gbdtc_proba,gbdtc_pred)
                    elif self.metrics_class == "roc-auc":
                        metrics_[i] = self.ROC_AUC(y_test_KFold, gbdtc_proba)
                    elif self.metrics_class == "f1-score":
                        metrics_[i] = f1_score(y_test_KFold,gbdtc_pred) 
                    elif self.metrics_class == "recall":
                        metrics_[i] = self.TPRN_Score(y_test_KFold,gbdtc_pred)["召回率|recall｜真阳率｜命中率"] 
                    elif self.metrics_class == "precision":
                        metrics_[i] = self.TPRN_Score(y_test_KFold,gbdtc_pred)["精确率|precision"] 
                    elif self.metrics_class == "accuracy":
                        metrics_[i] = self.TPRN_Score(y_test_KFold,gbdtc_pred)["ACC正确率|accuracy"]
                    elif self.metrics_class == "roc-auc-recall":
                        roc_auc = self.ROC_AUC(y_test_KFold, gbdtc_proba)
                        recall = self.TPRN_Score(y_test_KFold,gbdtc_pred)["召回率|recall｜真阳率｜命中率"]
                        metrics_[i] = roc_auc*self.metrics_weight[0]+recall*self.metrics_weight[-1]
                    elif self.metrics_class == "roc-auc-recall-accuracy":
                        roc_auc = self.ROC_AUC(y_test_KFold, gbdtc_proba)
                        recall = self.TPRN_Score(y_test_KFold,gbdtc_pred)["召回率|recall｜真阳率｜命中率"]
                        accuracy = self.TPRN_Score(y_test_KFold,gbdtc_pred)["ACC正确率|accuracy"]
                        metrics_[i] = roc_auc*self.metrics_weight[0]+recall*self.metrics_weight[1]+accuracy*self.metrics_weight[-1] 
                    if self.metrics_class == "all":
                        TPRN = self.TPRN_Score(y_test_KFold,gbdtc_pred)
                        pr_auc = self.PR_AUC(y_test_KFold, gbdtc_proba,gbdtc_pred)
                        roc_auc = self.ROC_AUC(y_test_KFold, gbdtc_proba)
                        accuracy = TPRN["ACC正确率|accuracy"]
                        precision = TPRN['精确率|precision']
                        recall = TPRN["召回率|recall｜真阳率｜命中率"]
                        false_alarm = TPRN['误报率|false alarm｜假阳率｜虚警率｜误检率'] 
                        miss_rate = TPRN['漏报率|miss rate|也称为漏警率|漏检率']
                        specificity = TPRN['特异度|specificity']
                        f1score = f1_score(y_test_KFold,gbdtc_pred) 
                        
                        metrics_values = np.array([pr_auc,roc_auc,accuracy,precision,recall,-false_alarm,-miss_rate,specificity,f1score])
                        
                        metrics_values = np.nan_to_num(metrics_values,0)
                        if recall > self.min_recall:
                            metrics_weight = np.nan_to_num(self.metrics_weight,0)
                        else:
                            metrics_weight = np.nan_to_num(self.metrics_weight,0)*self.cost_wight
                        metrics_[i] = metrics_values.dot(metrics_weight)                    
                # roc_auc  = roc_aucs.mean()
                metrics_value = metrics_.mean()
                    
            else: 
                gbdtc = GradientBoostingClassifier(
                    loss = params["loss"]
                    ,learning_rate = params["learning_rate"]
                    ,n_estimators= int(params["n_estimators"])
                    ,subsample = params["subsample"]
                    ,criterion = params["criterion"]
                    ,min_samples_leaf = params["min_samples_leaf"]
                    ,min_samples_split = params["min_samples_split"]
                    ,min_weight_fraction_leaf = params["min_weight_fraction_leaf"]
                    ,max_depth = int(params["max_depth"])
                    ,min_impurity_decrease = params["min_impurity_decrease"]
                    # ,min_impurity_split = params["min_impurity_split"]
                    ,random_state = params["random_state"]
                    ,max_features = params["max_features"]
                    ,max_leaf_nodes = int(params["max_leaf_nodes"])
                    ,warm_start = params["warm_start"]
                )
                gbdtc =gbdtc.fit(self.x_train, self.y_train)
                gbdtc_proba = gbdtc.predict_proba(self.x_val)[:,1]
                gbdtc_pred = gbdtc.predict(self.x_val)
                if self.metrics_class == "pr-auc":
                    metrics_value = self.PR_AUC(self.y_val, gbdtc_proba,gbdtc_pred)
                elif self.metrics_class == "roc-auc":
                    metrics_value = self.ROC_AUC(self.y_val, gbdtc_proba)
                elif self.metrics_class == "f1-score":
                    metrics_value = f1_score(self.y_val,gbdtc_pred)
                elif self.metrics_class == "recall":
                    metrics_value = self.TPRN_Score(self.y_val,gbdtc_pred)["召回率|recall｜真阳率｜命中率"] 
                elif self.metrics_class == "precision":
                    metrics_value = self.TPRN_Score(self.y_val,gbdtc_pred)["精确率|precision"] 
                elif self.metrics_class == "accuracy":
                    metrics_value = self.TPRN_Score(self.y_val,gbdtc_pred)["ACC正确率|accuracy"]
                elif self.metrics_class == "roc-auc-recall":
                    roc_auc = self.ROC_AUC(self.y_val, gbdtc_proba)
                    recall = self.TPRN_Score(self.y_val,gbdtc_pred)["召回率|recall｜真阳率｜命中率"]
                    metrics_value = roc_auc*self.metrics_weight[0]+recall*self.metrics_weight[-1]
                elif self.metrics_class == "roc-auc-recall-accuracy":
                    roc_auc = self.ROC_AUC(self.y_val, gbdtc_proba)
                    recall = self.TPRN_Score(self.y_val,gbdtc_pred)["召回率|recall｜真阳率｜命中率"]
                    accuracy = self.TPRN_Score(self.y_val,gbdtc_pred)["ACC正确率|accuracy"]
                    metrics_value = roc_auc*self.metrics_weight[0]+recall*self.metrics_weight[1]+accuracy*self.metrics_weight[-1]                      
                if self.metrics_class == "all":
                    TPRN = self.TPRN_Score(self.y_val, gbdtc_pred)
                    pr_auc = self.PR_AUC(self.y_val, gbdtc_proba,gbdtc_pred)
                    roc_auc = self.ROC_AUC(self.y_val, gbdtc_proba)
                    accuracy = TPRN["ACC正确率|accuracy"]
                    precision = TPRN['精确率|precision']
                    recall = TPRN["召回率|recall｜真阳率｜命中率"]
                    false_alarm = TPRN['误报率|false alarm｜假阳率｜虚警率｜误检率'] 
                    miss_rate = TPRN['漏报率|miss rate|也称为漏警率|漏检率']
                    specificity = TPRN['特异度|specificity']
                    f1score = f1_score(self.y_val,gbdtc_pred) 
                    metrics_values = np.array([pr_auc,roc_auc,accuracy,precision,recall,-false_alarm,-miss_rate,specificity,f1score])
                    metrics_values = np.nan_to_num(metrics_values,0)
                    if recall > self.min_recall:
                        metrics_weight = np.nan_to_num(self.metrics_weight,0)
                    else:
                        metrics_weight = np.nan_to_num(self.metrics_weight,0)*self.cost_wight
                    metrics_value = metrics_values.dot(metrics_weight)
        except Exception as e:
            log(f"{str(e)},{params['max_features']}",LogLevel.ERROR)
            metrics_value = self.historical_metrics.mean()   
        # metrics_value = MyMetric.PR_AUC(self.y_val,gbdtc_proba,gbdtc_pred)
        
        func_end = time.time()
        # global NOW_FUC_RUN_ITER
        self.NOW_FUC_RUN_ITER += 1
        self.historical_params.update({self.NOW_FUC_RUN_ITER-1:params})
        self.historical_metrics[self.NOW_FUC_RUN_ITER-1]=metrics_value
        
        # log(f"""本次迭代{self.metrics_class}分数为:[{metrics_value}],
        # 用时:[{func_end-func_start}]秒,
        # 当前优化第:[{self.NOW_FUC_RUN_ITER}]次,
        # 已运行:[{self.NOW_FUC_RUN_ITER}]次，
        # 用时总计:[{datetime.timedelta(seconds=(func_end-self.Bayes_start_time))}]秒,
        # """,LogLevel.PASS)
        return -metrics_value
    
    def param_hyperopt(self,max_evals=100):
        """
        """
        #保存迭代过程
        trials = Trials()

        #设置提前停止
        ## 如果损失没有增加，将在 X 次迭代后停止的停止函数
        early_stop_fn = no_progress_loss(self.EARLY_STOP_BAYES)

        #定义代理模型
        #algo = partial(tpe.suggest, n_startup_jobs=20, n_EI_candidates=50)
        # global hyperopt_params
        # hyperopt_params = self.param_grid_hp
        params_best = fmin(self.hyperopt_objective #目标函数
                        , space = self.param_grid_hp #参数空间
                        , algo = tpe.suggest #代理模型
                        #, algo = algo
                        , max_evals = max_evals #允许的迭代次数
                        , verbose=True
                        , trials = trials
                        , early_stop_fn = early_stop_fn
                        
                        )

        #打印最优参数，fmin会自动打印最佳分数
        print("\n","\n","best params: ", params_best,
            "\n")
        return params_best, trials
    
    def parsing_bayes_params_for_GBDT(self,params):
        max_features_params = self.Max_features[int(params["max_features"])]
        
        return {"loss":self.losss[int(params["loss"])]
            ,"learning_rate":params["learning_rate"]
            ,"n_estimators": int(params["n_estimators"])
            ,"subsample": params["subsample"]
            ,"criterion" : self.criterions[int(params["criterion"])]
            ,"min_samples_leaf" : params["min_samples_leaf"]
            ,"min_samples_split" :  params["min_samples_split"]
            ,"min_weight_fraction_leaf" : params["min_weight_fraction_leaf"]
            ,"max_depth" : int(params["max_depth"])
            ,"min_impurity_decrease": params["min_impurity_decrease"]
            # ,min_impurity_split = params["min_impurity_split"]
            ,"random_state" : params["random_state"]
            ,"max_features": max_features_params
            ,"max_leaf_nodes": int(params["max_leaf_nodes"])
            ,"warm_start" : self.warm_starts[params["warm_start"]]
            }
        
    def run(self):
        t = 0
        for params_name,obj in self.param_grid_hp.items():
            t += 1
            log(f"已准备优化的第{t}个参数,名称:{params_name},类型:{obj}",LogLevel.PASS)

        self.Bayes_start_time = time.time()
        self.NOW_FUC_RUN_ITER = 0
        self.PARAMS_BEST, self.Trials = self.param_hyperopt(self.NUM_EVALS)
        self.bayes_opt_parser = self.parsing_bayes_params_for_GBDT(self.PARAMS_BEST)
        log(f"解析参数得到结果:{self.bayes_opt_parser}",LogLevel.PASS)
        
    def test_params(self):
        params = self.bayes_opt_parser
        # if int(params["max_features"]) - params["max_features"]== 0:
        # max_features_params = self.Max_features[int(params["max_features"])]
        # else:
        #     max_features_params = params["max_features"]
            
        gbdtc = GradientBoostingClassifier(
            loss = params["loss"]
            ,learning_rate = params["learning_rate"]
            ,n_estimators= params["n_estimators"]
            ,subsample = params["subsample"]
            ,criterion = params["criterion"]
            ,min_samples_leaf = params["min_samples_leaf"]
            ,min_samples_split = params["min_samples_split"]
            ,min_weight_fraction_leaf = params["min_weight_fraction_leaf"]
            ,max_depth = params["max_depth"]
            ,min_impurity_decrease = params["min_impurity_decrease"]
            # ,min_impurity_split = params["min_impurity_split"]
            ,random_state = params["random_state"]
            ,max_features = params["max_features"]
            ,max_leaf_nodes = params["max_leaf_nodes"]
            ,warm_start = params["warm_start"]
        )
        gbdtc =gbdtc.fit(self.x_train, self.y_train)
        gbdtc_proba = gbdtc.predict_proba(self.x_val)[:,1]
        gbdtc_pred = gbdtc.predict(self.x_val)
        # metric = MyMetric.PR_AUC(self.y_val,gbdtc_proba,gbdtc_pred)
        log(f'模型的评估报告1：\n,{classification_report(self.y_val, gbdtc_pred)}\n',LogLevel.SUCCESS)
        log(f'模型的评估报告2：\n,{self.TPRN_Score(self.y_val, gbdtc_pred)}\n',LogLevel.SUCCESS)
        
        # self.plot_roc(self.y_val, gbdtc_proba[:,1])
        if self.metrics_class == "pr-auc":
            metrics = self.PR_AUC(self.y_val, gbdtc_proba,gbdtc_pred)
        elif self.metrics_class == "roc-auc":
            metrics = self.ROC_AUC(self.y_val, gbdtc_proba)
        elif self.metrics_class == "f1-score":
            metrics = f1_score(self.y_val,gbdtc_pred)
        elif self.metrics_class == "recall":
            metrics = self.TPRN_Score(self.y_val,gbdtc_pred)["召回率|recall｜真阳率｜命中率"] 
        elif self.metrics_class == "precision":
            metrics = self.TPRN_Score(self.y_val,gbdtc_pred)["精确率|precision"] 
        elif self.metrics_class == "accuracy":
            metrics = self.TPRN_Score(self.y_val,gbdtc_pred)["ACC正确率|accuracy"]
        elif self.metrics_class == "roc-auc-recall":
            roc_auc = self.ROC_AUC(self.y_val, gbdtc_proba)
            recall = self.TPRN_Score(self.y_val,gbdtc_pred)["召回率|recall｜真阳率｜命中率"]
            metrics = roc_auc*self.metrics_weight[0]*recall*self.metrics_weight[-1]
        elif self.metrics_class == "roc-auc-recall-accuracy":
            roc_auc = self.ROC_AUC(self.y_val, gbdtc_proba)
            recall = self.TPRN_Score(self.y_val,gbdtc_pred)["召回率|recall｜真阳率｜命中率"]
            accuracy = self.TPRN_Score(self.y_val,gbdtc_pred)["ACC正确率|accuracy"]
            metrics = roc_auc*self.metrics_weight[0]+recall*self.metrics_weight[1]+accuracy*self.metrics_weight[-1]                      
        if self.metrics_class == "all":
            TPRN = self.TPRN_Score(self.y_val, gbdtc_pred)
            pr_auc = self.PR_AUC(self.y_val, gbdtc_proba,gbdtc_pred)
            roc_auc = self.ROC_AUC(self.y_val, gbdtc_proba)
            accuracy = TPRN["ACC正确率|accuracy"]
            precision = TPRN['精确率|precision']
            recall = TPRN["召回率|recall｜真阳率｜命中率"]
            false_alarm = TPRN['误报率|false alarm｜假阳率｜虚警率｜误检率'] 
            miss_rate = TPRN['漏报率|miss rate|也称为漏警率|漏检率']
            specificity = TPRN['特异度|specificity']
            f1score = f1_score(self.y_val,gbdtc_pred) 
            metrics_values = np.array([pr_auc,roc_auc,accuracy,precision,recall,false_alarm,miss_rate,specificity,f1score])
            #metrics_weight = np.array(self.metrics_weight)
            metrics_values = np.nan_to_num(metrics_values,0)
            metrics_weight = np.nan_to_num(self.metrics_weight,0)
            for metrice_name,metrics_value,weight in zip(self.all_metrice_names,metrics_values,metrics_weight):
                log(f'测试{metrice_name}值为:{metrics_value},权重为{weight}',LogLevel.SUCCESS)
            metrics = metrics_values.dot(metrics_weight) 
        log(f'测试优化参数的:【{self.metrics_class}】得分为:【{metrics}】,优化过程中的最高分为:【{self.historical_metrics.max()}】',LogLevel.SUCCESS)
        return metrics
            
        
if __name__ == '__main__':
    from sklearn import datasets
    cancer=datasets.load_breast_cancer()
    x=cancer.data
    y=cancer.target

    def Rollover(x):
        x = x.astype(bool)
        x = ~x
        x = x.astype(int)
        return x
    ####TODO:将少数变成正例
    y = Rollover(y)
    boG = BayesOptGBDT(x,y
                       ,MAX_SHUFFLE=100
                        ,Folds=0
                        ,metrics_class="all"
                        #all=[1.pr_auc,2.roc_auc,3.accuracy,4.precision,5.recall,6.false_alarm,7.miss_rate,8.specificity,9.f1score]
                        ,metrics_weight=[0,0.5,0.5,0,0,0,0,0,0]
                        ,EARLY_STOP_BAYES=200
                        ,NUM_EVALS=1000
                        ,min_recall=0
                        ,cost_wight=1
                       )
    boG.run()
    boG.test_params()

数据洗牌: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1775.47it/s]

[0;34m2023-12-24 18:29:24.558895 - [INFO] - 训练数据中,正例有【169】个占比【0.37142857142857144】
            ，负例有【286】个占比【0.6285714285714286】
            ，alpha值为【0.6285714285714286】，[0m
[0;34m2023-12-24 18:29:24.558980 - [INFO] - 测试数据中,正例有【43】个占比【0.37719298245614036】
            ，负例有【71】个占比【0.6228070175438597】
            ，alpha值为【0.6228070175438597】，[0m
[0;32m2023-12-24 18:29:24.559045 - [PASS] - 已准备优化的第1个参数,名称:loss,类型:0 switch
1   hyperopt_param
2     Literal{loss}
3     randint
4       Literal{2}
5   Literal{deviance}
6   Literal{exponential}[0m
[0;32m2023-12-24 18:29:24.559073 - [PASS] - 已准备优化的第2个参数,名称:learning_rate,类型:0 float
1   hyperopt_param
2     Literal{learning_rate}
3     uniform
4       Literal{0}
5       Literal{1}[0m
[0;32m2023-12-24 18:29:24.559097 - [PASS] - 已准备优化的第3个参数,名称:n_estimators,类型:0 float
1   hyperopt_param
2     Literal{n_estimators}
3     quniform
4       Literal{10}
5       Literal{1000}
6       Literal{1}[0m
[0;32m2023-12-24 18:29:24.559119 - [PASS] - 已准备优化的第




 24%|███████████████████████████████████████████████████▏                                                                                                                                                                   | 238/1000 [01:46<05:41,  2.23trial/s, best loss: -1.0]

 
 best params:  {'criterion': 0, 'learning_rate': 0.23265593628618167, 'loss': 1, 'max_depth': 625.0, 'max_features': 3, 'max_leaf_nodes': 885.0, 'min_impurity_decrease': 0.9137615753951835, 'min_samples_leaf': 0.11266315666906594, 'min_samples_split': 0.30033118853347274, 'min_weight_fraction_leaf': 0.23734129000876486, 'n_estimators': 119.0, 'random_state': 58, 'subsample': 0.863391305855304, 'warm_start': 1} 

[0;32m2023-12-24 18:31:11.384723 - [PASS] - 解析参数得到结果:{'loss': 'exponential', 'learning_rate': 0.23265593628618167, 'n_estimators': 119, 'subsample': 0.863391305855304, 'criterion': 'friedman_mse', 'min_samples_leaf': 0.11266315666906594, 'min_samples_split': 0.30033118853347274, 'min_weight_fraction_lea

In [161]:
# idx = np.argmin(boG.Trials.losses())
# boG.Trials.trials[idx]['misc']['vals']

In [162]:
# x.shape

In [163]:
# len(boG.Trials.losses())

In [165]:
# boG.PARAMS_BEST

In [166]:
idx = np.argmin([-boG.historical_metrics])
boG.historical_params[idx]

{'criterion': 'friedman_mse',
 'learning_rate': 0.23265593628618167,
 'loss': 'exponential',
 'max_depth': 625.0,
 'max_features': None,
 'max_leaf_nodes': 885.0,
 'min_impurity_decrease': 0.9137615753951835,
 'min_samples_leaf': 0.11266315666906594,
 'min_samples_split': 0.30033118853347274,
 'min_weight_fraction_leaf': 0.23734129000876486,
 'n_estimators': 119.0,
 'random_state': 58,
 'subsample': 0.863391305855304,
 'warm_start': False}

In [169]:
ol = {'loss': 'exponential', 'learning_rate': 0.23265593628618167, 'n_estimators': 119
      , 'subsample': 0.863391305855304, 'criterion': 'friedman_mse'
      , 'min_samples_leaf': 0.11266315666906594, 'min_samples_split': 0.30033118853347274
      , 'min_weight_fraction_leaf': 0.23734129000876486, 'max_depth': 625, 'min_impurity_decrease': 0.9137615753951835
      , 'random_state': 58, 'max_features': None, 'max_leaf_nodes': 885, 'warm_start': False}

ol == boG.historical_params[idx]

True

In [168]:
gbdtc = GradientBoostingClassifier(**boG.bayes_opt_parser)
gbdtc =gbdtc.fit(boG.x_train, boG.y_train)
gbdtc_proba = gbdtc.predict_proba(boG.x_val)[:,1]
gbdtc_pred = gbdtc.predict(boG.x_val)
# metric = MyMetric.PR_AUC(self.y_val,gbdtc_proba,gbdtc_pred)
log(f'模型的评估报告1：\n,{classification_report(boG.y_val, gbdtc_pred)}\n',LogLevel.SUCCESS)
log(f'模型的评估报告2：\n,{boG.TPRN_Score(boG.y_val, gbdtc_pred)}\n',LogLevel.SUCCESS)

[0;36m2023-12-24 18:34:42.868609 - [SUCCESS] - 模型的评估报告1：
,              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        71
         1.0       1.00      1.00      1.00        43

    accuracy                           1.00       114
   macro avg       1.00      1.00      1.00       114
weighted avg       1.00      1.00      1.00       114

[0m
[0;36m2023-12-24 18:34:42.868905 - [SUCCESS] - 模型的评估报告2：
,{'混淆矩阵': matrix([[43,  0],
        [ 0, 71]]), 'ACC正确率|accuracy': 1.0, '精确率|precision': 1.0, '召回率|recall｜真阳率｜命中率': 1.0, '误报率|false alarm｜假阳率｜虚警率｜误检率': 0.0, '漏报率|miss rate|也称为漏警率|漏检率': 0.0, '特异度|specificity': 1.0, 'F1-score:': 1.0, '真实正样本数': 43, '真实负样本数': 71}
[0m


In [93]:
boG.bayes_opt_parser['max_features']

In [94]:
print(boG.bayes_opt_parser['max_features'])

None


In [138]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.metrics import precision_recall_curve,auc,f1_score,roc_curve,auc
cancer=datasets.load_breast_cancer()
x=cancer.data
y=cancer.target


def Rollover(x):
    x = x.astype(bool)
    x = ~x
    x = x.astype(int)
    return x
####TODO:将少数变成正例
y = Rollover(y)

def ROC_AUC(test_y, proba):
    fpr,tpr,threshold = roc_curve(test_y, proba)
    roc_auc_ = auc(fpr,tpr)
    return roc_auc_

train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=42) 

rfc = RandomForestClassifier()
rfc = rfc.fit(train_x,train_y,)
rfc_pred = rfc.predict(test_x)
rfc_proba = rfc.predict_proba(test_x)[:,1]
(ROC_AUC(test_y,rfc_proba)+sum(rfc_pred==test_y)/test_y.size)/2

0.9803270869607691

In [153]:
import platform
import matplotlib.pyplot as plt
system = platform.system()
if system == "Linux":
    plt.rcParams['font.sans-serif'] = ["AR PL UKai CN"] #["Noto Sans CJK JP"]
elif system == "Darwin":
    plt.rcParams['font.sans-serif'] = ["Kaiti SC"]
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from hyperopt import hp, fmin, tpe, Trials, partial
from MyLogColor import  log,LogLevel
import time
import datetime
from hyperopt.early_stop import no_progress_loss
from sklearn.metrics import classification_report
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from OptMetrics import MyMetric


class BayesOptRF(MyMetric):
    
    def __init__(self,x,y
                 ,Folds=6 #如果采用分层k折交叉验证的时候则写出分多少折
                 ,TEST_SPLIT=0.2 #测试集的比例
                 ,EARLY_STOP_BAYES =100 #当参数优化多少次没有进步的时候就停止搜索
                 ,NUM_EVALS=600 #最大优化参数的搜索次数
                 ,MAX_SHUFFLE=100 #数据洗牌的次数
                 ,x_train=[], x_val=[], y_train=[], y_val=[]
                 ,metrics_class = "pr-auc" #"[pr-auc],[roc-auc],[f1-score],[recall],[precision],[accuracy],[roc-auc-recall],[roc-auc-accuracy]"
                 # all=[pr_auc,roc_auc,accuracy,precision,recall,false_alarm,miss_rate,specificity,f1score]
                 ,metrics_weight = [0.5,0.5]
                 ,min_recall = 0.5 #召回率的最小值
                 ,cost_wight = 0.1 #对召回率不满足的情况下权重值的惩罚值
                 ,StratifiedKFoldShuffle=True
                #  ,is_verbose = False # 是否查看详细信息
                ):
        self.Folds = Folds
        self.TEST_SPLIT =TEST_SPLIT
        self.EARLY_STOP_BAYES = EARLY_STOP_BAYES
        self.NUM_EVALS = NUM_EVALS
        self.MAX_SHUFFLE = MAX_SHUFFLE
        self.metrics_class = metrics_class
        self.metrics_weight = np.array(metrics_weight)
        self.metrics_weight = self.metrics_weight/self.metrics_weight.sum()
        self.min_recall = min_recall
        self.cost_wight = cost_wight
        self.StratifiedKFoldShuffle = StratifiedKFoldShuffle
        
        self.Bayes_start_time = None
        self.NOW_FUC_RUN_ITER = 0
        self.Trials = None
        self.bayes_opt_parser = None
        self.PARAMS_BEST = None
        self.historical_metrics = np.zeros(self.NUM_EVALS)
        self.historical_params = {}
        
        self.all_metrice_names = ['pr_auc',
                            'roc_auc',
                            'accuracy',
                            'precision',
                            'recall',
                            'false_alarm',
                            'miss_rate',
                            'specificity',
                            'f1score']
        
        
        self.losss = [ 'deviance', 'exponential']
        self.criterions = ['friedman_mse', 'squared_error', 'absolute_error']
        self.Max_features = ['auto', 'sqrt', 'log2',None]#+list(np.arange(0,1,0.001))
                        # ,hp.randint("max_features_int",0,x_train.shape[-1])
                        # ,hp.uniform("max_features_float",0,1)]
        self.criterions = ["gini", "entropy"]
        self.max_depths = [None,hp.randint("max_depth_int",0,1000)]
        self.Max_features = ['auto', 'sqrt', 'log2',None]#,hp.quniform("max_features_int",1,1000,1),hp.uniform("max_features_float",1e-10,1)]
        self.Max_leaf_nodes = [None,hp.quniform("max_leaf_nodes_int",1,1000,1)]
        self.bootstraps = [True,False]
        self.oob_scores = [True,False]
        self.warm_starts = [True,False]
        self.class_weights = [None,'balanced']
        self.param_grid_hp = {
            "n_estimators":hp.quniform("n_estimators",10,1000,1)
            ,"criterion":hp.choice("criterion",self.criterions)
            ,"max_depth":hp.choice("max_depth",self.max_depths)
            ,"min_samples_split":hp.quniform("min_samples_split",2,1000,1)
            ,"min_samples_leaf":hp.quniform("min_samples_leaf",1,1000,1)
            ,"min_weight_fraction_leaf":hp.uniform("min_weight_fraction_leaf",0,0.5)
            ,"max_features":hp.choice("max_features",self.Max_features)
            ,"max_leaf_nodes":hp.choice("max_leaf_nodes",self.Max_leaf_nodes)
            ,"min_impurity_decrease":hp.uniform("min_impurity_decrease",0,1)
            ,"bootstrap":hp.choice("bootstrap",self.bootstraps)
            ,"oob_score":hp.choice("oob_score",self.oob_scores)
            ,"random_state":hp.randint("random_state",100)
            # ,"verbose":hp.quniform("verbose",0,1000,1)
            ,"warm_start":hp.choice("warm_start",self.warm_starts)
            ,"class_weight":hp.choice("class_weight",self.class_weights)
            ,"max_samples":hp.uniform("max_samples",0,1)
        }
        self.x = x
        self.y = y
        self.m,self.n = x.shape
        self.y = np.array(y,dtype=int)
        
        
        if len(x_train) and len(x_val) and len(y_train) and len(y_val):
            self.x_train, self.x_val, self.y_train, self.y_val = x_train,x_val,y_train,y_val
        else:
            if self.MAX_SHUFFLE > 0:
                self.shuffle_x,self.shuffle_y = self.shuffle_data(x,y)
                self.x_train, self.x_val, self.y_train, self.y_val = train_test_split(self.shuffle_x, self.shuffle_y, test_size=self.TEST_SPLIT, random_state = 42)
            else:
                self.x_train, self.x_val, self.y_train, self.y_val = train_test_split(x, y, test_size=self.TEST_SPLIT, random_state = 42)
        self.train_positive = (self.y_train==1).sum()
        self.train_negative = (self.y_train==0).sum()
        self.train_y_counter = self.y_train.size
        self.alpha = self.train_negative/self.train_y_counter

        log(f"""训练数据中,正例有【{self.train_positive}】个占比【{self.train_positive/self.train_y_counter}】
            ，负例有【{self.train_negative}】个占比【{self.train_negative/self.train_y_counter}】
            ，alpha值为【{self.alpha}】，""",LogLevel.INFO)
        
        self.test_positive = (self.y_val==1).sum()
        self.test_negative = (self.y_val==0).sum()
        self.test_y_counter = self.y_val.size

        log(f"""测试数据中,正例有【{self.test_positive}】个占比【{self.test_positive/self.test_y_counter }】
            ，负例有【{self.test_negative}】个占比【{self.test_negative/self.test_y_counter }】
            ，alpha值为【{self.test_negative/self.test_y_counter}】，""",LogLevel.INFO)


    def shuffle_data(self,x,y):
        """
        数据洗牌
        """
        xy = np.c_[x,y]
        for i in tqdm(range(self.MAX_SHUFFLE),desc="数据洗牌"):
            np.random.shuffle(xy)
        x,y = xy[:,0:self.n],xy[:,self.n:self.n+1]
        y = np.ravel(y)
        return x,y
    
    def hyperopt_objective(self,hyperopt_params):
        
        func_start = time.time()
        params = {
            "n_estimators":int(hyperopt_params["n_estimators"])
            ,"criterion":hyperopt_params["criterion"]
            ,"max_depth":hyperopt_params["max_depth"]
            ,"min_samples_split":int(hyperopt_params["min_samples_split"])
            ,"min_samples_leaf":int(hyperopt_params["min_samples_leaf"])
            ,"min_weight_fraction_leaf":hyperopt_params["min_weight_fraction_leaf"]
            ,"max_features":hyperopt_params["max_features"]
            ,"max_leaf_nodes":type(hyperopt_params["max_leaf_nodes"])==float and int(hyperopt_params["max_leaf_nodes"]) or None
            ,"min_impurity_decrease":hyperopt_params["min_impurity_decrease"]
            ,"bootstrap":hyperopt_params["bootstrap"]
            ,"oob_score":hyperopt_params["bootstrap"] == True and hyperopt_params["oob_score"] or False
            ,"random_state":int(hyperopt_params["random_state"])
            # ,"verbose":int(hyperopt_params["verbose"])
            ,"warm_start":hyperopt_params["warm_start"]
            ,"class_weight":hyperopt_params["class_weight"]
            ,"max_samples":hyperopt_params["bootstrap"] == True and hyperopt_params["max_samples"] or None
            ,'n_jobs':-1
        }
        # log(f"本次参数:{params}",LogLevel.INFO) 
        try:
            if isinstance(self.Folds,(int,float)) and self.Folds > 0:
                # print(self.Folds)
                # self.StratifiedKFoldShuffle
                strkf = StratifiedKFold(n_splits=self.Folds, shuffle=self.StratifiedKFoldShuffle)
                '''
                n_splits=6（默认5）：将数据集分成6个互斥子集，每次用5个子集数据作为训练集，1个子集为测试集，得到6个结果

                shuffle=True（默认False）：每次划分前数据重新洗牌，每次的运行结果不同；shuffle=False：每次运行结果相同，相当于random_state=整数
                random_state=1（默认None）：随机数设置为1，使得每次运行的结果一致
                '''
                # roc_aucs = np.zeros(self.Folds)
                metrics_ = np.zeros(self.Folds)
                log(f"k={self.Folds}折分层交叉验证",LogLevel.PASS)
                for i,index_ in enumerate(strkf.split(self.x,self.y)):
                    train_index,test_index = index_
                    # print(train_index,test_index)
                    X_train_KFold, X_test_KFold = self.x[train_index],self.x[test_index]
                    y_train_KFold, y_test_KFold = self.y[train_index],self.y[test_index]
                    gbdtc = RandomForestClassifier(**params)
                    gbdtc = gbdtc.fit(X_train_KFold, y_train_KFold)
                    gbdtc_proba = gbdtc.predict_proba(X_test_KFold)[:,1]
                    gbdtc_pred = gbdtc.predict(X_test_KFold)
                    
                    if self.metrics_class == "pr-auc":
                        metrics_[i] = self.PR_AUC(y_test_KFold, gbdtc_proba,gbdtc_pred)
                    elif self.metrics_class == "roc-auc":
                        metrics_[i] = self.ROC_AUC(y_test_KFold, gbdtc_proba)
                    elif self.metrics_class == "f1-score":
                        metrics_[i] = f1_score(y_test_KFold,gbdtc_pred) 
                    elif self.metrics_class == "recall":
                        metrics_[i] = self.TPRN_Score(y_test_KFold,gbdtc_pred)["召回率|recall｜真阳率｜命中率"] 
                    elif self.metrics_class == "precision":
                        metrics_[i] = self.TPRN_Score(y_test_KFold,gbdtc_pred)["精确率|precision"] 
                    elif self.metrics_class == "accuracy":
                        metrics_[i] = self.TPRN_Score(y_test_KFold,gbdtc_pred)["ACC正确率|accuracy"]
                    elif self.metrics_class == "roc-auc-recall":
                        roc_auc = self.ROC_AUC(y_test_KFold, gbdtc_proba)
                        recall = self.TPRN_Score(y_test_KFold,gbdtc_pred)["召回率|recall｜真阳率｜命中率"]
                        metrics_[i] = roc_auc*self.metrics_weight[0]+recall*self.metrics_weight[-1]
                    elif self.metrics_class == "roc-auc-recall-accuracy":
                        roc_auc = self.ROC_AUC(y_test_KFold, gbdtc_proba)
                        recall = self.TPRN_Score(y_test_KFold,gbdtc_pred)["召回率|recall｜真阳率｜命中率"]
                        accuracy = self.TPRN_Score(y_test_KFold,gbdtc_pred)["ACC正确率|accuracy"]
                        metrics_[i] = roc_auc*self.metrics_weight[0]+recall*self.metrics_weight[1]+accuracy*self.metrics_weight[-1] 
                    if self.metrics_class == "all":
                        TPRN = self.TPRN_Score(y_test_KFold,gbdtc_pred)
                        pr_auc = self.PR_AUC(y_test_KFold, gbdtc_proba,gbdtc_pred)
                        roc_auc = self.ROC_AUC(y_test_KFold, gbdtc_proba)
                        accuracy = TPRN["ACC正确率|accuracy"]
                        precision = TPRN['精确率|precision']
                        recall = TPRN["召回率|recall｜真阳率｜命中率"]
                        false_alarm = TPRN['误报率|false alarm｜假阳率｜虚警率｜误检率'] 
                        miss_rate = TPRN['漏报率|miss rate|也称为漏警率|漏检率']
                        specificity = TPRN['特异度|specificity']
                        f1score = f1_score(y_test_KFold,gbdtc_pred) 
                        
                        metrics_values = np.array([pr_auc,roc_auc,accuracy,precision,recall,-false_alarm,-miss_rate,specificity,f1score])
                        
                        metrics_values = np.nan_to_num(metrics_values,0)
                        if recall > self.min_recall:
                            metrics_weight = np.nan_to_num(self.metrics_weight,0)
                        else:
                            metrics_weight = np.nan_to_num(self.metrics_weight,0)*self.cost_wight
                        metrics_[i] = metrics_values.dot(metrics_weight)                    
                # roc_auc  = roc_aucs.mean()
                metrics_value = metrics_.mean()
                    
            else: 
                gbdtc = RandomForestClassifier(**params)
                gbdtc =gbdtc.fit(self.x_train, self.y_train)
                gbdtc_proba = gbdtc.predict_proba(self.x_val)[:,1]
                gbdtc_pred = gbdtc.predict(self.x_val)
                if self.metrics_class == "pr-auc":
                    metrics_value = self.PR_AUC(self.y_val, gbdtc_proba,gbdtc_pred)
                elif self.metrics_class == "roc-auc":
                    metrics_value = self.ROC_AUC(self.y_val, gbdtc_proba)
                elif self.metrics_class == "f1-score":
                    metrics_value = f1_score(self.y_val,gbdtc_pred)
                elif self.metrics_class == "recall":
                    metrics_value = self.TPRN_Score(self.y_val,gbdtc_pred)["召回率|recall｜真阳率｜命中率"] 
                elif self.metrics_class == "precision":
                    metrics_value = self.TPRN_Score(self.y_val,gbdtc_pred)["精确率|precision"] 
                elif self.metrics_class == "accuracy":
                    metrics_value = self.TPRN_Score(self.y_val,gbdtc_pred)["ACC正确率|accuracy"]
                elif self.metrics_class == "roc-auc-recall":
                    roc_auc = self.ROC_AUC(self.y_val, gbdtc_proba)
                    recall = self.TPRN_Score(self.y_val,gbdtc_pred)["召回率|recall｜真阳率｜命中率"]
                    metrics_value = roc_auc*self.metrics_weight[0]+recall*self.metrics_weight[-1]
                elif self.metrics_class == "roc-auc-recall-accuracy":
                    roc_auc = self.ROC_AUC(self.y_val, gbdtc_proba)
                    recall = self.TPRN_Score(self.y_val,gbdtc_pred)["召回率|recall｜真阳率｜命中率"]
                    accuracy = self.TPRN_Score(self.y_val,gbdtc_pred)["ACC正确率|accuracy"]
                    metrics_value = roc_auc*self.metrics_weight[0]+recall*self.metrics_weight[1]+accuracy*self.metrics_weight[-1]                      
                elif self.metrics_class == "all":
                    TPRN = self.TPRN_Score(self.y_val, gbdtc_pred)
                    pr_auc = self.PR_AUC(self.y_val, gbdtc_proba,gbdtc_pred)
                    roc_auc = self.ROC_AUC(self.y_val, gbdtc_proba)
                    accuracy = TPRN["ACC正确率|accuracy"]
                    precision = TPRN['精确率|precision']
                    recall = TPRN["召回率|recall｜真阳率｜命中率"]
                    false_alarm = TPRN['误报率|false alarm｜假阳率｜虚警率｜误检率'] 
                    miss_rate = TPRN['漏报率|miss rate|也称为漏警率|漏检率']
                    specificity = TPRN['特异度|specificity']
                    f1score = f1_score(self.y_val,gbdtc_pred) 
                    metrics_values = np.array([pr_auc,roc_auc,accuracy,precision,recall,-false_alarm,-miss_rate,specificity,f1score])
                    metrics_values = np.nan_to_num(metrics_values,0)
                    if recall > self.min_recall:
                        metrics_weight = np.nan_to_num(self.metrics_weight,0)
                    else:
                        metrics_weight = np.nan_to_num(self.metrics_weight,0)*self.cost_wight
                    metrics_value = metrics_values.dot(metrics_weight)
        except Exception as e:
            log(str(e),LogLevel.ERROR)
            metrics_value = self.historical_metrics.mean()   
        # metrics_value = MyMetric.PR_AUC(self.y_val,gbdtc_proba,gbdtc_pred)
        
        func_end = time.time()
        # global NOW_FUC_RUN_ITER
        self.NOW_FUC_RUN_ITER += 1
        self.historical_params.update({self.NOW_FUC_RUN_ITER-1:params})
        self.historical_metrics[self.NOW_FUC_RUN_ITER-1] = metrics_value
        
#         log(f"""本次迭代{self.metrics_class}分数为:[{metrics_value}],
#         用时:[{func_end-func_start}]秒,
#         当前优化第:[{self.NOW_FUC_RUN_ITER}]次,
#         已运行:[{self.NOW_FUC_RUN_ITER}]次，
#         用时总计:[{datetime.timedelta(seconds=(func_end-self.Bayes_start_time))}]秒,
#         """,LogLevel.PASS)
        return -metrics_value
    
    def param_hyperopt(self,max_evals=100):
        """
        """
        #保存迭代过程
        trials = Trials()

        #设置提前停止
        ## 如果损失没有增加，将在 X 次迭代后停止的停止函数
        early_stop_fn = no_progress_loss(self.EARLY_STOP_BAYES)

        #定义代理模型
        #algo = partial(tpe.suggest, n_startup_jobs=20, n_EI_candidates=50)
        # global hyperopt_params
        # hyperopt_params = self.param_grid_hp
        params_best = fmin(self.hyperopt_objective #目标函数
                        , space = self.param_grid_hp #参数空间
                        , algo = tpe.suggest #代理模型
                        #, algo = algo
                        , max_evals = max_evals #允许的迭代次数
                        , verbose=True
                        , trials = trials
                        , early_stop_fn = early_stop_fn
                        
                        )

        #打印最优参数，fmin会自动打印最佳分数
        print("\n","\n","best params: ", params_best,
            "\n")
        return params_best, trials
    
    def run(self):
        t = 0
        for params_name,obj in self.param_grid_hp.items():
            t += 1
            log(f"已准备优化的第{t}个参数,名称:{params_name},类型:{obj}",LogLevel.PASS)

        self.Bayes_start_time = time.time()
        self.NOW_FUC_RUN_ITER = 0
        self.PARAMS_BEST, self.Trials = self.param_hyperopt(self.NUM_EVALS)
        idx = np.argmin([-self.historical_metrics])
        # self.historical_metrics[idx]
        self.bayes_opt_parser = self.historical_params[idx]
        log(f"解析参数得到结果:{self.bayes_opt_parser}",LogLevel.PASS)
        
    def test_params(self):
        params = self.bayes_opt_parser
        gbdtc = RandomForestClassifier(**params)
        gbdtc =gbdtc.fit(self.x_train, self.y_train)
        gbdtc_proba = gbdtc.predict_proba(self.x_val)[:,1]
        gbdtc_pred = gbdtc.predict(self.x_val)
        # metric = MyMetric.PR_AUC(self.y_val,gbdtc_proba,gbdtc_pred)
        log(f'模型的评估报告1：\n,{classification_report(self.y_val, gbdtc_pred)}\n',LogLevel.SUCCESS)
        log(f'模型的评估报告2：\n,{self.TPRN_Score(self.y_val, gbdtc_pred)}\n',LogLevel.SUCCESS)
        
        # self.plot_roc(self.y_val, gbdtc_proba[:,1])
        if self.metrics_class == "pr-auc":
            metrics = self.PR_AUC(self.y_val, gbdtc_proba,gbdtc_pred)
        elif self.metrics_class == "roc-auc":
            metrics = self.ROC_AUC(self.y_val, gbdtc_proba)
        elif self.metrics_class == "f1-score":
            metrics = f1_score(self.y_val,gbdtc_pred)
        elif self.metrics_class == "recall":
            metrics = self.TPRN_Score(self.y_val,gbdtc_pred)["召回率|recall｜真阳率｜命中率"] 
        elif self.metrics_class == "precision":
            metrics = self.TPRN_Score(self.y_val,gbdtc_pred)["精确率|precision"] 
        elif self.metrics_class == "accuracy":
            metrics = self.TPRN_Score(self.y_val,gbdtc_pred)["ACC正确率|accuracy"]
        elif self.metrics_class == "roc-auc-recall":
            roc_auc = self.ROC_AUC(self.y_val, gbdtc_proba)
            recall = self.TPRN_Score(self.y_val,gbdtc_pred)["召回率|recall｜真阳率｜命中率"]
            metrics = roc_auc*self.metrics_weight[0]*recall*self.metrics_weight[-1]
        elif self.metrics_class == "roc-auc-recall-accuracy":
            roc_auc = self.ROC_AUC(self.y_val, gbdtc_proba)
            recall = self.TPRN_Score(self.y_val,gbdtc_pred)["召回率|recall｜真阳率｜命中率"]
            accuracy = self.TPRN_Score(self.y_val,gbdtc_pred)["ACC正确率|accuracy"]
            metrics = roc_auc*self.metrics_weight[0]+recall*self.metrics_weight[1]+accuracy*self.metrics_weight[-1]                      
        elif self.metrics_class == "all":
            TPRN = self.TPRN_Score(self.y_val, gbdtc_pred)
            pr_auc = self.PR_AUC(self.y_val, gbdtc_proba,gbdtc_pred)
            roc_auc = self.ROC_AUC(self.y_val, gbdtc_proba)
            accuracy = TPRN["ACC正确率|accuracy"]
            precision = TPRN['精确率|precision']
            recall = TPRN["召回率|recall｜真阳率｜命中率"]
            false_alarm = TPRN['误报率|false alarm｜假阳率｜虚警率｜误检率'] 
            miss_rate = TPRN['漏报率|miss rate|也称为漏警率|漏检率']
            specificity = TPRN['特异度|specificity']
            f1score = f1_score(self.y_val,gbdtc_pred) 
            metrics_values = np.array([pr_auc,roc_auc,accuracy,precision,recall,false_alarm,miss_rate,specificity,f1score])
            #metrics_weight = np.array(self.metrics_weight)
            metrics_values = np.nan_to_num(metrics_values,0)
            metrics_weight = np.nan_to_num(self.metrics_weight,0)
            for metrice_name,metrics_value,weight in zip(self.all_metrice_names,metrics_values,metrics_weight):
                log(f'测试{metrice_name}值为:{metrics_value},权重为{weight}',LogLevel.SUCCESS)
            metrics = metrics_values.dot(metrics_weight) 
        log(f'测试优化参数的:【{self.metrics_class}】得分为:【{metrics}】,优化过程中的最高分为:【{self.historical_metrics.max()}】',LogLevel.SUCCESS)
        return metrics
            
        
if __name__ == '__main__':
    from sklearn import datasets
    cancer=datasets.load_breast_cancer()
    x=cancer.data
    y=cancer.target

    def Rollover(x):
        x = x.astype(bool)
        x = ~x
        x = x.astype(int)
        return x
    ####TODO:将少数变成正例
    y = Rollover(y)
    boRF = BayesOptRF(x,y
                       ,MAX_SHUFFLE=100
                        ,Folds=0
                        ,metrics_class="pr-auc"
                        #all=[1.pr_auc,2.roc_auc,3.accuracy,4.precision,5.recall,6.false_alarm,7.miss_rate,8.specificity,9.f1score]
                        ,metrics_weight=[0,0.5,0.5,0,0,0,0,0,0]
                        ,EARLY_STOP_BAYES=200
                        ,NUM_EVALS=1000
                        ,min_recall=0
                        ,cost_wight=1
                       )
    boRF.run()
    # boG.test_params()

数据洗牌: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1908.75it/s]

[0;34m2023-12-24 18:17:29.489942 - [INFO] - 训练数据中,正例有【174】个占比【0.3824175824175824】
            ，负例有【281】个占比【0.6175824175824176】
            ，alpha值为【0.6175824175824176】，[0m
[0;34m2023-12-24 18:17:29.490106 - [INFO] - 测试数据中,正例有【38】个占比【0.3333333333333333】
            ，负例有【76】个占比【0.6666666666666666】
            ，alpha值为【0.6666666666666666】，[0m
[0;32m2023-12-24 18:17:29.490154 - [PASS] - 已准备优化的第1个参数,名称:n_estimators,类型:0 float
1   hyperopt_param
2     Literal{n_estimators}
3     quniform
4       Literal{10}
5       Literal{1000}
6       Literal{1}[0m
[0;32m2023-12-24 18:17:29.490181 - [PASS] - 已准备优化的第2个参数,名称:criterion,类型:0 switch
1   hyperopt_param
2     Literal{criterion}
3     randint
4       Literal{2}
5   Literal{gini}
6   Literal{entropy}[0m
[0;32m2023-12-24 18:17:29.490211 - [PASS] - 已准备优化的第3个参数,名称:max_depth,类型:0 switch
1   hyperopt_param
2     Literal{max_depth}
3     randint
4       Literal{2}
5   Literal{None}
6   hyperopt_param
7     Literal{max_depth_int}
8     randint
9 




[0;31m2023-12-24 18:19:49.806773 - [ERROR] - max_leaf_nodes 1 must be either None or larger than 1[0m                                                                                                                                                                            
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                               | 845/1000 [08:44<01:36,  1.61trial/s, best loss: -0.9641271038528311]

 
 best params:  {'bootstrap': 1, 'class_weight': 0, 'criterion': 0, 'max_depth': 1, 'max_depth_int': 286, 'max_features': 0, 'max_leaf_nodes': 0, 'max_samples': 0.050605447156075296, 'min_impurity_decrease': 0.14211011160987627, 'min_samples_leaf': 106.0, 'min_samples_split': 416.0, 'min_weight_fraction_leaf': 0.28522148572414313, 'n_estimators': 526.0, 'oob_score': 0, 'random_state': 56, 'warm_start': 1} 

[0;32m2023-12-24 18:26:13.865009 -

In [176]:
boG.test_params()

[0;36m2023-12-24 18:53:35.307463 - [SUCCESS] - 模型的评估报告1：
,              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        71
         1.0       1.00      1.00      1.00        43

    accuracy                           1.00       114
   macro avg       1.00      1.00      1.00       114
weighted avg       1.00      1.00      1.00       114

[0m
[0;36m2023-12-24 18:53:35.307730 - [SUCCESS] - 模型的评估报告2：
,{'混淆矩阵': matrix([[43,  0],
        [ 0, 71]]), 'ACC正确率|accuracy': 1.0, '精确率|precision': 1.0, '召回率|recall｜真阳率｜命中率': 1.0, '误报率|false alarm｜假阳率｜虚警率｜误检率': 0.0, '漏报率|miss rate|也称为漏警率|漏检率': 0.0, '特异度|specificity': 1.0, 'F1-score:': 1.0, '真实正样本数': 43, '真实负样本数': 71}
[0m
[0;36m2023-12-24 18:53:35.311417 - [SUCCESS] - 测试pr_auc值为:1.0,权重为0.0[0m
[0;36m2023-12-24 18:53:35.311458 - [SUCCESS] - 测试roc_auc值为:1.0,权重为0.5[0m
[0;36m2023-12-24 18:53:35.311471 - [SUCCESS] - 测试accuracy值为:1.0,权重为0.5[0m
[0;36m2023-12-24 18:53:35.311480 - [SUCCESS] - 测试precision值为

1.0

In [170]:
idx = np.argmin([-boRF.historical_metrics])
boRF.historical_metrics[idx]

0.9641271038528311

In [171]:
boRF.historical_params[idx]

{'n_estimators': 526,
 'criterion': 'gini',
 'max_depth': 286,
 'min_samples_split': 416,
 'min_samples_leaf': 106,
 'min_weight_fraction_leaf': 0.28522148572414313,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.14211011160987627,
 'bootstrap': False,
 'oob_score': False,
 'random_state': 56,
 'warm_start': False,
 'class_weight': None,
 'max_samples': None,
 'n_jobs': -1}

In [175]:

rfc = RandomForestClassifier(**boRF.historical_params[idx])
rfc = rfc.fit(train_x,train_y,)
rfc_pred = rfc.predict(test_x)
rfc_proba = rfc.predict_proba(test_x)[:,1]
(ROC_AUC(test_y,rfc_proba)+sum(rfc_pred==test_y)/test_y.size)/2

0.9688528970641475

In [173]:
boRF.PR_AUC(test_y,rfc_proba,rfc_pred)

0.9871925558368324

In [279]:
from hyperopt import hp
from hyperopt import hp, fmin, tpe, Trials, partial
from hyperopt.early_stop import no_progress_loss
import warnings
warnings.filterwarnings("ignore")
# from OptMetrics import MyMetric
# from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn import datasets
from MyLogColor import log,LogLevel
import time
from sklearn.metrics import precision_recall_curve,auc,f1_score,roc_curve,auc

cancer=datasets.load_breast_cancer()
x=cancer.data
y=cancer.target

def Rollover(x):
    x = x.astype(bool)
    x = ~x
    x = x.astype(int)
    return x
####TODO:将少数变成正例
y = Rollover(y)

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state = 42)


def ROC_AUC(test_y, proba):
    fpr,tpr,threshold = roc_curve(test_y, proba)
    roc_auc_ = auc(fpr,tpr)
    return roc_auc_


historical_metrics = []
historical_params = {}

boosters = ['gbtree','gblinear','dart']
sampling_methods = ['uniform','gradient_based']
tree_methods = ["auto","exact","approx","hist"]
refresh_leafs = [0,1]
# process_types = ["default","update"]#,"refresh","prune"]
grow_policys = ["depthwise","lossguide"]
# sample_types = ["uniform","weighted"]
normalize_types = ["tree","forest"]
rate_drops = []



param_grid_hp = {
    'boosters':hp.choice('boosters',boosters)
    ,"n_estimators":hp.quniform("n_estimators",50,1000,1)
    ,"learning_rate":hp.uniform("learning_rate",1e-5,1)
    ,"gamma":hp.quniform("gamma",0,100,1)
    ,"max_depth":hp.quniform("max_depth",6,200,1)
    ,"min_child_weight":hp.quniform("min_child_weight",0,100,1)
    ,"max_delta_step":hp.quniform("max_delta_step",0,100,1)
    ,"subsample":hp.uniform("subsample",0,1)
    # ,"sampling_method":hp.choice("sampling_method",sampling_methods)
    ,"colsample_bytree":hp.uniform("colsample_bytree",0,1)
    ,"colsample_bylevel":hp.uniform("colsample_bylevel",0,1)
    ,"colsample_bynode":hp.uniform("colsample_bynode",0,1)
    ,"lambda":hp.quniform("lambda",0,200,1)
    ,"alpha":hp.quniform("alpha",0,200,1)
    ,"tree_method":hp.choice("tree_method",tree_methods)
    # ,"scale_pos_weight":hp.uniform("scale_pos_weight",0,1000)
    ,"refresh_leaf":hp.choice("refresh_leaf",refresh_leafs)
    # ,"process_type":hp.choice("process_type",process_types)
    ,"grow_policy":hp.choice("grow_policy",grow_policys)
    ,"max_leaves":hp.quniform("max_leaves",0,10000,1)
    ,"max_bin":hp.quniform("max_bin",256,1000,1)
    ,"num_parallel_tree":hp.quniform("num_parallel_tree",1,100,1)   
}
# booster_dart_params = {
#     "sample_type":hp.choice("sample_type",sample_types)
#     ,"normalize_type":hp.choice("normalize_type",normalize_types)
#     ,"rate_drop":hp.uniform("rate_drop",0,1)
#     ,"one_drop":hp.quniform("one_drop",0,1000,1)
#     ,"skip_drop":hp.uniform("skip_drop",0,1)
# }

booster_gblinear_params = {
    
}

def PR_AUC(test_y,proba,pred):
    precision,recall,_ = precision_recall_curve(test_y,proba)
    f1 ,pr_auc = f1_score(test_y,pred),auc(recall,precision)
    return pr_auc

def hyperopt_objective(hyperopt_params): 
    params = {
        "objective":"binary:logistic"
        ,'boosters':hyperopt_params['boosters']
        ,"n_estimators":int(hyperopt_params["n_estimators"])
        ,"learning_rate":hyperopt_params["learning_rate"]
        ,"gamma":hyperopt_params["gamma"]
        ,"max_depth":int(hyperopt_params["max_depth"])
        ,"min_child_weight":int(hyperopt_params["min_child_weight"])
        ,"max_delta_step":int(hyperopt_params["max_delta_step"])
        ,"subsample":hyperopt_params["subsample"]
        ,"verbosity":0
        # ,"sampling_method":hyperopt_params["sampling_method"]
        ,"colsample_bytree":hyperopt_params["colsample_bytree"]
        ,"colsample_bylevel":hyperopt_params["colsample_bylevel"]
        ,"colsample_bynode":hyperopt_params["colsample_bynode"]
        ,"lambda":int(hyperopt_params["lambda"])
        ,"alpha":int(hyperopt_params["alpha"])
        ,"tree_method":hyperopt_params["tree_method"]
        ,"scale_pos_weight":(y_train==0).sum()/(y_train==1).sum()
        ,"refresh_leaf":hyperopt_params["refresh_leaf"]
        # ,"process_type":hyperopt_params["process_type"]
        ,"grow_policy":hyperopt_params["grow_policy"]
        ,"max_leaves":int(hyperopt_params["max_leaves"])
        ,"max_bin":int(hyperopt_params["max_bin"])
        ,"num_parallel_tree":int(hyperopt_params["num_parallel_tree"])   
    }
    # booster_dart_params = {
    #     "sample_type":hyperopt_params["sample_type"]
    #     ,"normalize_type":hp.choice("normalize_type",normalize_types)
    #     ,"rate_drop":hyperopt_params["rate_drop"]
    #     ,"one_drop":int(hyperopt_params["one_drop"])
    #     ,"skip_drop":hyperopt_params["skip_drop"]
    # }
    dtrain = xgb.DMatrix(x_train,label=y_train)
    clf = xgb.train(params=params
                   ,dtrain=dtrain
                   ,num_boost_round=100
                   ,evals=[(dtrain,"train")]
                   ,verbose_eval=False # 不显示训练信息就改False
                   # ,obj=logistic_obj
                   )
    dtest = xgb.DMatrix(x_val,label=y_val)
    xgboost_proba = clf.predict(dtest)
    # xgbosst_proba = np.nan_to_num(xgboost_proba,0)
    
    global NOW_FUC_RUN_ITER
    NOW_FUC_RUN_ITER += 1
    metric = ROC_AUC(y_val,xgboost_proba)
    historical_metrics.append(metric)
    historical_params.update({NOW_FUC_RUN_ITER-1:params})
    return - metric

def param_hyperopt(max_evals=100):

    #保存迭代过程
    trials = Trials()

    #设置提前停止
    early_stop_fn = no_progress_loss(100)

    #定义代理模型
    #algo = partial(tpe.suggest, n_startup_jobs=20, n_EI_candidates=50)
    params_best = fmin(hyperopt_objective #目标函数
                       , space = param_grid_hp #参数空间
                       , algo = tpe.suggest #代理模型
                       #, algo = algo
                       , max_evals = max_evals #允许的迭代次数
                       , verbose=True
                       , trials = trials
                       , early_stop_fn = early_stop_fn
                      )

    #打印最优参数，fmin会自动打印最佳分数
    print("\n","\n","best params: ", params_best,
          "\n")
    return params_best, trials

NOW_FUC_RUN_ITER = 0
PARAMS_BEST, Trials = param_hyperopt(600)

historical_metrics = np.array(historical_metrics)
idx = np.argmax(historical_metrics)
params = historical_params[idx]
dtrain = xgb.DMatrix(x_train,label=y_train)
clf = xgb.train(params=params
               ,dtrain=dtrain
               ,num_boost_round=100
               ,evals=[(dtrain,"train")]
               ,verbose_eval=False # 不显示训练信息就改False
               # ,obj=logistic_obj
               )
dtest = xgb.DMatrix(x_val,label=y_val)
xgboost_proba = clf.predict(dtest)

 44%|████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                | 265/600 [02:51<03:37,  1.54trial/s, best loss: -0.9990173599737964]

 
 best params:  {'alpha': 15.0, 'boosters': 1, 'colsample_bylevel': 0.6584435050605241, 'colsample_bynode': 0.0727297864386035, 'colsample_bytree': 0.9388869249397702, 'gamma': 0.0, 'grow_policy': 1, 'lambda': 23.0, 'learning_rate': 0.05930788431305209, 'max_bin': 517.0, 'max_delta_step': 57.0, 'max_depth': 90.0, 'max_leaves': 529.0, 'min_child_weight': 25.0, 'n_estimators': 814.0, 'num_parallel_tree': 71.0, 'refresh_leaf': 1, 'subsample': 0.7533073644285923, 'tree_method': 1} 



164

In [289]:
# historical_metrics

In [290]:
params = historical_params[idx]
params

{'objective': 'binary:logistic',
 'boosters': 'gblinear',
 'n_estimators': 814,
 'learning_rate': 0.05930788431305209,
 'gamma': 0.0,
 'max_depth': 90,
 'min_child_weight': 25,
 'max_delta_step': 57,
 'subsample': 0.7533073644285923,
 'verbosity': 0,
 'colsample_bytree': 0.9388869249397702,
 'colsample_bylevel': 0.6584435050605241,
 'colsample_bynode': 0.0727297864386035,
 'lambda': 23,
 'alpha': 15,
 'tree_method': 'exact',
 'scale_pos_weight': 1.6923076923076923,
 'refresh_leaf': 1,
 'grow_policy': 'lossguide',
 'max_leaves': 529,
 'max_bin': 517,
 'num_parallel_tree': 71}

In [291]:

# params = {
#     'booster': 'gbtree',
#     "objective":"binary:logistic",  # 多分类的问题
#     # 'num_class': 10,               # 类别数，与 multisoftmax 并用
#     'gamma': 0.1,                  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
#     'max_depth': 12,               # 构建树的深度，越大越容易过拟合
#     'lambda': 2,                   # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
#     'subsample': 0.7,              # 随机采样训练样本
#     'colsample_bytree': 0.7,       # 生成树时进行的列采样
#     'min_child_weight': 3,
#     'silent': 1,                   # 设置成1则没有运行信息输出，最好是设置为0.
#     'eta': 0.007,                  # 如同学习率
#     'seed': 1000,
#     'nthread': 4,                  # cpu 线程数
# }
dtrain = xgb.DMatrix(x_train,label=y_train)
clf = xgb.train(params=params
               ,dtrain=dtrain
               ,num_boost_round=100
               ,evals=[(dtrain,"train")]
               ,verbose_eval=False # 不显示训练信息就改False
               # ,obj=logistic_obj
               )
dtest = xgb.DMatrix(x_val,label=y_val)
xgboost_proba = clf.predict(dtest)

In [298]:
(xgboost_proba>0.5).astype(int)==y_val

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

In [292]:
ROC_AUC(y_val,xgboost_proba)

0.9990173599737964

In [293]:
xgboost_proba

array([0.3361772 , 0.7981541 , 0.8203545 , 0.2562623 , 0.19996049,
       0.86203134, 0.84072673, 0.7630306 , 0.49083444, 0.16953701,
       0.18439867, 0.7572008 , 0.28317615, 0.78244716, 0.16682358,
       0.81705254, 0.2380926 , 0.17154232, 0.18120608, 0.84068966,
       0.41492915, 0.1975207 , 0.8623697 , 0.17965862, 0.2001096 ,
       0.30956447, 0.21091042, 0.2258969 , 0.18100636, 0.8585446 ,
       0.21630582, 0.15445712, 0.2917523 , 0.2086175 , 0.18125324,
       0.18176576, 0.50360596, 0.24452032, 0.82379895, 0.33353013,
       0.184684  , 0.80987316, 0.23368864, 0.17071378, 0.29894045,
       0.24427083, 0.23724045, 0.22725883, 0.2851912 , 0.24281749,
       0.7790351 , 0.8583394 , 0.38251728, 0.4090937 , 0.27092102,
       0.16486017, 0.23074426, 0.8536615 , 0.57294697, 0.16257402,
       0.16642316, 0.8546891 , 0.8358565 , 0.21942048, 0.17521901,
       0.27275497, 0.8614003 , 0.861268  , 0.1881125 , 0.30207688,
       0.7300041 , 0.8388176 , 0.17340842, 0.82658404, 0.20059