In [8]:
import xgboost as xgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import random
import os

seed = 1
random.seed(seed)
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)

# 加载数据集
data = load_breast_cancer()
X = data.data
y = data.target

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 转换数据格式为DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


# 设置模型参数
params = {
    'objective': 'binary:logistic',  # 二分类逻辑回归
#     'eval_metric': 'error',  # 评估指标为错误率
    'seed': seed
}

# 训练模型
num_rounds = 100  # 迭代次数
model = xgb.train(params, dtrain, num_rounds)

# 预测
y_pred = model.predict(dtest)
y_pred_binary = [1 if p > 0.5 else 0 for p in y_pred]  # 将概率转换为类别

# 计算准确率
accuracy = accuracy_score(y_test, y_pred_binary)
print("Accuracy:", accuracy)


Accuracy: 0.9649122807017544


## 自定义损失函数

In [21]:
import xgboost as xgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import random
import os

seed = 1
random.seed(seed)
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)

# 加载数据集
data = load_breast_cancer()
X = data.data
y = data.target

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 转换数据格式为DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

def custom_loss(preds, dtrain):
    labels = dtrain.get_label()

    preds = 1.0 / (1.0 + np.exp(-preds))

    def binary_cross_entropy_gradient(y_pred, y_true):
        eps = 1e-15  # 避免除零错误的常数

        # 预测概率取值范围限制在 [eps, 1-eps] 内
        y_pred = np.clip(y_pred, eps, 1 - eps)

        # 二分类交叉熵损失函数的一阶导数（梯度）
        gradient = - y_true / y_pred + (1 - y_true) / (1 - y_pred)

        return gradient

    def binary_cross_entropy_hessian(y_pred, y_true):
        eps = 1e-15  # 避免除零错误的常数

        # 预测概率取值范围限制在 [eps, 1-eps] 内
        y_pred = np.clip(y_pred, eps, 1 - eps)

        # 二分类交叉熵损失函数的二阶导数（海森矩阵）
        hessian = y_true / (y_pred ** 2) + (1 - y_true) / ((1 - y_pred) ** 2)

        return hessian

    gradient = binary_cross_entropy_gradient(preds, labels)
    hessian = binary_cross_entropy_hessian(preds, labels)

    # 防止梯度爆炸做剪裁
    l2 = np.linalg.norm(gradient)
    max_norm = 0.43
    if l2>max_norm:
        gradient = gradient*(max_norm/l2)
    l2 = np.linalg.norm(hessian)
    if l2>max_norm:
        hessian = hessian*(max_norm/l2)

    return gradient, hessian

# 设置模型参数
params = {
    'eval_metric': 'error',  # 评估指标为错误率
    'seed': seed
}

# 训练模型
num_rounds = 100  # 迭代次数
model = xgb.train(params, dtrain, num_rounds,obj=custom_loss)

# 预测
y_pred = model.predict(dtest)
y_pred_binary = [1 if p > 0.1 else 0 for p in y_pred]  # 将概率转换为类别

# 计算准确率
accuracy = accuracy_score(y_test, y_pred_binary)
print("Accuracy:", accuracy)

Accuracy: 0.9035087719298246


In [26]:
def maxRecall(preds,dtrain): #preds是结果（概率值），dtrain是个带label的DMatrix
    labels=dtrain.get_label() #提取label
    preds=1-preds
    precision,recall,threshold=precision_recall_curve(labels,preds,pos_label=0)
    pr=pd.DataFrame({'precision':precision,'recall':recall})
    return 'Max Recall:',pr[pr.precision>=0.97].recall.max()


import numpy as np
import xgboost as xgb
 
gamma = 0
alpha = 0.5
 

# y指的是true label
# p指的是预测出来的概率值（是经过sigmoid转换之后  p = 1.0 / (1.0 + np.exp(-p))
# gamma指的是local foss的参数
# alpha指的是用来处理非平衡的参数, label为1的样本的损失是label为0的多少倍
def logistic_obj(p, dtrain):
    
    y = dtrain.get_label()
    p = 1.0 / (1.0 + np.exp(-p))
    
    grad = p * (1 - p) * (alpha * gamma * y * (1 - p) ** gamma * np.log(p) / (1 - p) - alpha * y * (
                1 - p) ** gamma / p - gamma * p ** gamma * (1 - alpha) * (1 - y) * np.log(1 - p) / p + p ** gamma * (
                                      1 - alpha) * (1 - y) / (1 - p))
    hess = p * (1 - p) * (p * (1 - p) * (
                -alpha * gamma ** 2 * y * (1 - p) ** gamma * np.log(p) / (1 - p) ** 2 + alpha * gamma * y * (
                    1 - p) ** gamma * np.log(p) / (1 - p) ** 2 + 2 * alpha * gamma * y * (1 - p) ** gamma / (
                            p * (1 - p)) + alpha * y * (1 - p) ** gamma / p ** 2 - gamma ** 2 * p ** gamma * (
                            1 - alpha) * (1 - y) * np.log(1 - p) / p ** 2 + 2 * gamma * p ** gamma * (1 - alpha) * (
                            1 - y) / (p * (1 - p)) + gamma * p ** gamma * (1 - alpha) * (1 - y) * np.log(
            1 - p) / p ** 2 + p ** gamma * (1 - alpha) * (1 - y) / (1 - p) ** 2) - p * (
                                      alpha * gamma * y * (1 - p) ** gamma * np.log(p) / (1 - p) - alpha * y * (
                                          1 - p) ** gamma / p - gamma * p ** gamma * (1 - alpha) * (1 - y) * np.log(
                                  1 - p) / p + p ** gamma * (1 - alpha) * (1 - y) / (1 - p)) + (1 - p) * (
                                      alpha * gamma * y * (1 - p) ** gamma * np.log(p) / (1 - p) - alpha * y * (
                                          1 - p) ** gamma / p - gamma * p ** gamma * (1 - alpha) * (1 - y) * np.log(
                                  1 - p) / p + p ** gamma * (1 - alpha) * (1 - y) / (1 - p)))
    
    return grad, hess
 

# dtrain = xgb.DMatrix('../demo_data/agaricus.txt.train')
# dtest = xgb.DMatrix('../demo_data/agaricus.txt.test')
 
# print("---------no focal loss-----------")
 
# watchlist = [(dtrain, 'train'), (dtest, 'eval')]
 
# params = {'max_depth': 2, 'eta': 0.01, 'silent': 1, 'eval_metric': 'auc', "objective": "binary:logistic"}
# xgb.train(params=params, dtrain=dtrain, num_boost_round=3, early_stopping_rounds=50,
#           evals=[(dtrain, 'train'), (dtest, 'test')], verbose_eval=1)
 
# print("---------focal loss-----------")
# params = {'max_depth': 2, 'eta': 0.01, 'silent': 1, 'eval_metric': 'auc', "objective": "binary:logitraw"}
# xgb.train(params=params, dtrain=dtrain, num_boost_round=3, early_stopping_rounds=50,
#           evals=[(dtrain, 'train'), (dtest, 'test')], verbose_eval=1, obj=logistic_obj)


num_rounds = 100  # 迭代次数
model = xgb.train(params, dtrain, num_rounds,custom_metric=maxRecall,obj=logistic_obj)

# 预测
y_pred = model.predict(dtest)
y_pred_binary = [1 if p > 0.1 else 0 for p in y_pred]  # 将概率转换为类别

# 计算准确率
accuracy = accuracy_score(y_test, y_pred_binary)
print("Accuracy:", accuracy)

Accuracy: 0.956140350877193
