In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

In [35]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [12]:
# Shared random seed, reused by the CV splitter and the gradient-boosting models.
seed = 563_453_451

In [20]:
# Read the training table, then split it by (zero-based) column position:
# columns 1-9 are used as the features and column 10 as the label.
train = pd.read_csv('train.csv')
X_ini, y_ini = train.iloc[:, 1:10], train.iloc[:, 10]



In [22]:

# Show the feature matrix as the cell output for a quick visual check.
X_ini




Unnamed: 0,FSCN1,ZYX,ARHGEF15,BGN,TMEM184B,CDC42BPB,RRBP1,ELK3,AASS
0,6.791531,6.724139,6.590631,8.890153,6.945259,5.853870,6.885485,5.896114,5.479162
1,6.383779,6.275617,6.179703,8.082869,6.867253,5.727341,6.681693,5.868821,5.683885
2,6.963827,6.691952,6.549625,8.082869,7.148347,6.199279,6.794402,6.104782,5.225188
3,6.830712,6.881589,6.758582,8.507135,7.222653,6.060062,7.019101,6.275418,5.662077
4,6.596160,6.622638,6.654165,8.646988,7.160156,6.005119,6.776268,6.452779,5.457526
...,...,...,...,...,...,...,...,...,...
106,6.024392,5.923116,4.880000,7.968938,6.693820,5.990882,6.683807,5.543087,6.167974
107,6.774447,7.425636,5.771695,7.947256,7.460036,5.683899,7.451995,5.669623,5.615562
108,8.028248,7.471907,5.178493,9.044273,6.873181,5.913611,7.392649,5.732670,5.989609
109,6.692110,6.514133,6.257951,7.509148,6.788111,5.258840,5.256959,5.288180,5.308249


In [23]:
# Stratified 5-fold cross-validation repeated 3 times, seeded so the folds
# are the same on every run.
sp = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=seed)

In [25]:
import hyperopt
from hyperopt import hp

In [26]:
def objective(param):
    """Return the negated mean cross-validated AUC for one hyper-parameter set.

    Parameters
    ----------
    param : dict
        Must provide 'n_estimators', 'max_depth', 'min_samples_split',
        'min_samples_leaf' and 'learning_rate'; each value is passed straight
        to GradientBoostingClassifier.

    Returns
    -------
    float
        ``-mean(AUC)`` over every split produced by ``sp`` on
        ``(X_ini, y_ini)``. The sign is flipped so that a minimiser
        maximises the AUC.
    """
    aucs = []
    for train_index, test_index in sp.split(X_ini, y_ini):
        # split() yields positional row numbers, so both X and y are selected
        # with .iloc. A plain y_ini[...] is a label lookup and would pick the
        # wrong rows (or raise) whenever the index is not exactly 0..n-1.
        X_train = X_ini.iloc[train_index, :]
        X_vali = X_ini.iloc[test_index, :]
        y_train = y_ini.iloc[train_index]
        y_vali = y_ini.iloc[test_index]
        model = GradientBoostingClassifier(random_state=seed,
                                           n_estimators=param['n_estimators'],
                                           max_depth=param['max_depth'],
                                           min_samples_split=param['min_samples_split'],
                                           min_samples_leaf=param['min_samples_leaf'],
                                           learning_rate=param['learning_rate'])
        model.fit(X_train, y_train)
        # Score the held-out fold on the positive-class probability.
        pro_vali = model.predict_proba(X_vali)[:, 1]
        auc_vali = roc_auc_score(y_vali, pro_vali)
        aucs.append(auc_vali)
    return -np.mean(aucs)

In [27]:
# Hyper-parameter search space; adjust the ranges to suit the dataset.
# The integer parameters are picked with hp.choice from a range of candidates,
# and learning_rate is drawn with hp.uniform between 0 and 1.
space = dict(
    n_estimators=hp.choice('n_estimators', range(2, 50)),
    max_depth=hp.choice('max_depth', range(1, 3)),
    min_samples_split=hp.choice('min_samples_split', range(2, 50)),
    min_samples_leaf=hp.choice('min_samples_leaf', range(2, 50)),
    learning_rate=hp.uniform('learning_rate', 0, 1),
)

In [28]:
# Minimise `objective` over `space` with the TPE algorithm for 100 evaluations.
# NOTE(review): no `rstate` is passed, so the search itself is presumably not
# seeded — confirm whether a reproducible search is required.
best_param = hyperopt.fmin(
    fn=objective,
    space=space,
    algo=hyperopt.tpe.suggest,
    max_evals=100,
)

100%|████████████████████████████████████████████████████████████| 100/100 [00:43<00:00,  2.29trial/s, best loss: -1.0]


In [29]:
# Show the result returned by fmin.
# NOTE(review): the hp.choice entries appear to be indices into the option
# ranges rather than the chosen values (the model-building cell maps them
# back through range()) — confirm before reading them as hyper-parameters.
best_param

{'learning_rate': 0.6804823599172526,
 'max_depth': 1,
 'min_samples_leaf': 15,
 'min_samples_split': 11,
 'n_estimators': 17}

In [30]:
# Decode fmin's result into concrete hyper-parameter values by evaluating the
# search space itself. This replaces hand-copied range(...)[index] lookups,
# which silently go wrong as soon as `space` is edited and the copies are not.
best_values = hyperopt.space_eval(space, best_param)

# Refit on the full training set with the selected hyper-parameters.
model = GradientBoostingClassifier(random_state=seed,
                                   n_estimators=best_values['n_estimators'],
                                   max_depth=best_values['max_depth'],
                                   min_samples_split=best_values['min_samples_split'],
                                   min_samples_leaf=best_values['min_samples_leaf'],
                                   learning_rate=best_values['learning_rate'])
model.fit(X_ini, y_ini)

# Positive-class probability for every training row (second predict_proba column).
pro_train = model.predict_proba(X_ini)[:, 1]

In [31]:
# Report the AUC on the training data itself (not a held-out estimate).
print(f'训练集AUC={roc_auc_score(y_ini, pro_train):.3f}')

训练集AUC=1.000


In [32]:
# Export ID, true label and predicted positive-class probability for the training rows.
df_train = pd.DataFrame({'ID': train['ID'], 'True': y_ini, 'Pre': pro_train})
df_train.to_csv(path_or_buf='GBDT_train.csv', index=False)

In [39]:
# Hard class predictions for the training rows.
y_pred_train = model.predict(X_ini)

# AUC is computed from the predicted probabilities (pro_train), not the hard labels.
auc_train = roc_auc_score(y_ini, pro_train)

# Accuracy, precision, recall and F1 all compare the true labels with the
# hard predictions, so they are evaluated in one pass over the metric functions.
accuracy_train, precision_train, recall_train, f1_train = (
    score(y_ini, y_pred_train)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

# Confusion matrix, flattened into TN, FP, FN, TP.
confusion_train = confusion_matrix(y_ini, y_pred_train)
tn, fp, fn, tp = confusion_train.ravel()

# Specificity: share of true negatives among all actual negatives.
specificity_train = tn / (tn + fp)

In [40]:
# Print every training-set metric to three decimals.
print(f'训练集AUC={auc_train:.3f}')
print(f'训练集Accuracy={accuracy_train:.3f}')
print(f'训练集Precision={precision_train:.3f}')
print(f'训练集Sensitivity (Recall)={recall_train:.3f}')
print(f'训练集Specificity={specificity_train:.3f}')
print(f'训练集F1={f1_train:.3f}')

训练集AUC=1.000
训练集Accuracy=1.000
训练集Precision=1.000
训练集Sensitivity (Recall)=1.000
训练集Specificity=1.000
训练集F1=1.000


In [46]:
# 导入必要库
import pandas as pd
from sklearn.metrics import (roc_auc_score, accuracy_score, 
                           precision_score, recall_score, 
                           f1_score, confusion_matrix)

# Test-set files to evaluate: test1.csv ... test8.csv, as paths relative to
# the working directory.
test_files = [f'test{number}.csv' for number in range(1, 9)]

# Score the fitted model on each test file in turn.
for test_file in test_files:
    # Load the file and split it by (zero-based) column position:
    # columns 1-9 are used as features and column 10 as the true label.
    test = pd.read_csv(test_file)
    X_test, y_test = test.iloc[:, 1:10], test.iloc[:, 10]

    # Positive-class probability plus the hard class prediction.
    pro_test = model.predict_proba(X_test)[:, 1]
    y_pred_test = model.predict(X_test)

    # AUC uses the probabilities; the remaining scores use the hard labels.
    auc_test = roc_auc_score(y_test, pro_test)
    accuracy_test, precision_test, recall_test, f1_test = (
        score(y_test, y_pred_test)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # Specificity = TN / (TN + FP), read off the flattened confusion matrix.
    confusion_test = confusion_matrix(y_test, y_pred_test)
    tn, fp, fn, tp = confusion_test.ravel()
    specificity_test = tn / (tn + fp)

    # Report all metrics for this file, one per line.
    print(f'\nResults for {test_file}:',
          f'测试集 AUC = {auc_test:.3f}',
          f'测试集 Accuracy = {accuracy_test:.3f}',
          f'测试集 Precision = {precision_test:.3f}',
          f'测试集 Sensitivity (Recall) = {recall_test:.3f}',
          f'测试集 Specificity = {specificity_test:.3f}',
          f'测试集 F1 = {f1_test:.3f}',
          sep='\n')

    # Save ID, true label and predicted probability for this file.
    # NOTE(review): test_file still carries its '.csv' suffix, so the output
    # is named e.g. 'GBDT_test1.csv_predictions.csv' — confirm this is intended.
    df_test = pd.DataFrame({'ID': test['ID'], 'True': y_test, 'Pre': pro_test})
    df_test.to_csv(f'GBDT_{test_file}_predictions.csv', index=False)


Results for test1.csv:
测试集 AUC = 0.710
测试集 Accuracy = 0.742
测试集 Precision = 0.727
测试集 Sensitivity (Recall) = 1.000
测试集 Specificity = 0.172
测试集 F1 = 0.842

Results for test2.csv:
测试集 AUC = 0.612
测试集 Accuracy = 0.754
测试集 Precision = 0.758
测试集 Sensitivity (Recall) = 0.980
测试集 Specificity = 0.111
测试集 F1 = 0.855

Results for test3.csv:
测试集 AUC = 0.824
测试集 Accuracy = 0.571
测试集 Precision = 0.526
测试集 Sensitivity (Recall) = 1.000
测试集 Specificity = 0.182
测试集 F1 = 0.690

Results for test4.csv:
测试集 AUC = 0.998
测试集 Accuracy = 0.987
测试集 Precision = 1.000
测试集 Sensitivity (Recall) = 0.986
测试集 Specificity = 1.000
测试集 F1 = 0.993

Results for test5.csv:
测试集 AUC = 0.418
测试集 Accuracy = 0.884
测试集 Precision = 0.916
测试集 Sensitivity (Recall) = 0.962
测试集 Specificity = 0.000
测试集 F1 = 0.938

Results for test6.csv:
测试集 AUC = 0.487
测试集 Accuracy = 0.534
测试集 Precision = 0.534
测试集 Sensitivity (Recall) = 1.000
测试集 Specificity = 0.000
测试集 F1 = 0.696

Results for test7.csv:
测试集 AUC = 0.428
测试集 Accuracy = 0.638
测试集 Preci

In [48]:
# Persist the fitted classifier to the working directory with joblib.
import joblib

model_filename = 'gradient_boosting_model.joblib'
# `model` must already be a fitted estimator at this point.
joblib.dump(value=model, filename=model_filename)
print(f"模型已保存到当前目录下的 {model_filename}")

模型已保存到当前目录下的 gradient_boosting_model.joblib


In [None]:
# One import statement per line: fusing the two onto a single line is a
# SyntaxError, so this cell could not run as written.
from sklearn.ensemble import GradientBoostingClassifier
import joblib