In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

In [2]:
# Single random seed shared by the CV splitter and every SVC instance below.
seed = 564_563_451

In [3]:
# Load the training table. Positional columns 1..3 are used as the feature
# matrix and positional column 4 as the target passed to model.fit below.
train = pd.read_csv('train.csv')
X_ini, y_ini = train.iloc[:, 1:4], train.iloc[:, 4]

In [4]:
# Cross-validation scheme used by the tuning objective: 5 stratified folds,
# repeated 3 times, seeded for repeatable splits.
sp = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=3,
    random_state=seed,
)

In [5]:
import hyperopt
from hyperopt import hp

In [6]:
def objective(param):
    """Return the hyperopt loss for one hyperparameter sample.

    For every split produced by ``sp.split(X_ini, y_ini)`` an ``SVC`` is
    fitted on the training rows and scored with ROC AUC on the held-out rows,
    using column 1 of ``predict_proba`` as the score.

    Args:
        param: Mapping with keys ``'C'`` and ``'gamma'``, forwarded to ``SVC``.

    Returns:
        The negated mean AUC over all splits (negated so that a minimiser
        maximises AUC).
    """
    aucs = []
    for train_index, test_index in sp.split(X_ini, y_ini):
        # The splitter yields *positional* indices, so rows must be selected by
        # position for both X and y. Plain ``y_ini[idx]`` is label-based on a
        # Series and only happens to work with a default 0..n-1 index.
        X_train = X_ini.iloc[train_index, :]
        X_vali = X_ini.iloc[test_index, :]
        y_train = y_ini.iloc[train_index]
        y_vali = y_ini.iloc[test_index]
        model = SVC(random_state=seed,
                    C=param['C'],
                    gamma=param['gamma'],
                    probability=True)
        model.fit(X_train, y_train)
        # Column 1 of predict_proba is used as the ranking score for AUC.
        pro_vali = model.predict_proba(X_vali)[:, 1]
        aucs.append(roc_auc_score(y_vali, pro_vali))
    return -np.mean(aucs)

In [7]:
# Hyperparameter search space; adjust the bounds to suit the dataset.
# NOTE(review): both parameters are drawn uniformly from [0, 1] - confirm that
# a lower bound of exactly 0 is acceptable for SVC's C and gamma.
space = dict(
    C=hp.uniform('C', 0, 1),
    gamma=hp.uniform('gamma', 0, 1),
)

In [8]:
# Run 100 TPE trials minimising `objective` over `space`.
# The sampler is seeded from the notebook-wide `seed` so that the search, and
# therefore `best_param`, is reproducible after a kernel restart. Passing a
# numpy Generator as `rstate` requires hyperopt >= 0.2.7.
best_param = hyperopt.fmin(
    fn=objective,
    space=space,
    algo=hyperopt.tpe.suggest,
    max_evals=100,
    rstate=np.random.default_rng(seed),
)

100%|█████████████████████████████████████████████| 100/100 [00:12<00:00,  8.20trial/s, best loss: -0.8377645502645503]


In [9]:
# Display the hyperparameter values returned by the search.
best_param

{'C': 0.6440129769327001, 'gamma': 0.00804304738703121}

In [10]:
# Refit on the full training set with the tuned hyperparameters. The search
# returns the sampled C and gamma values themselves, so they are used directly.
model = SVC(
    C=best_param['C'],
    gamma=best_param['gamma'],
    probability=True,
    random_state=seed,
).fit(X_ini, y_ini)
# Column 1 of predict_proba is kept as the score for the training rows.
pro_train = model.predict_proba(X_ini)[:, 1]

In [11]:
# Report ROC AUC on the training data (same rows the model was fitted on).
print('训练集AUC={:.3f}'.format(roc_auc_score(y_ini,pro_train)))

训练集AUC=0.845


In [12]:
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

# Hard class predictions for the training rows.
y_pred_train = model.predict(X_ini)

# Confusion matrix of true vs. predicted labels; unpacking into four values
# assumes a binary (2x2) matrix, flattened as TN, FP, FN, TP.
confusion_train = confusion_matrix(y_ini, y_pred_train)
tn, fp, fn, tp = confusion_train.ravel()

# Specificity: share of true negatives among all actual negatives.
specificity_train = tn / (tn + fp)

# AUC is computed from the scores in pro_train rather than the hard labels.
auc_train = roc_auc_score(y_ini, pro_train)

# Label-based metrics: accuracy, precision, recall (sensitivity) and F1.
accuracy_train = accuracy_score(y_ini, y_pred_train)
precision_train = precision_score(y_ini, y_pred_train)
recall_train = recall_score(y_ini, y_pred_train)
f1_train = f1_score(y_ini, y_pred_train)

print('训练集AUC={:.3f}'.format(auc_train))
print('训练集Accuracy={:.3f}'.format(accuracy_train))
print('训练集Precision={:.3f}'.format(precision_train))
print('训练集Sensitivity (Recall)={:.3f}'.format(recall_train))
print('训练集Specificity={:.3f}'.format(specificity_train))
print('训练集F1={:.3f}'.format(f1_train))

训练集AUC=0.845
训练集Accuracy=0.736
训练集Precision=0.702
训练集Sensitivity (Recall)=0.857
训练集Specificity=0.606
训练集F1=0.772


In [13]:
# Export per-row training results: ID, true label and predicted score.
df_train = pd.DataFrame({'ID': train['ID'], 'True': y_ini, 'Pre': pro_train})
df_train.to_csv('SVM_train.csv', index=False)

In [14]:
# Imports used by this cell.
import pandas as pd
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

# External test sets, read from the current working directory.
test_files = [
    'test1.csv', 'test2.csv', 'test3.csv', 'test4.csv',
    'test5.csv', 'test6.csv', 'test7.csv', 'test8.csv',
]

# Evaluate the fitted model on each test file in turn.
for test_file in test_files:
    test = pd.read_csv(test_file)

    # Same layout as the training table: positional columns 1..3 are the
    # features and positional column 4 is the true label.
    X_test = test.iloc[:, 1:4]
    y_test = test.iloc[:, 4]

    # Scores (column 1 of predict_proba) and hard class predictions.
    pro_test = model.predict_proba(X_test)[:, 1]
    y_pred_test = model.predict(X_test)

    # Confusion matrix; the four-way unpack assumes a binary (2x2) matrix,
    # flattened as TN, FP, FN, TP. Specificity = TN / (TN + FP).
    confusion_test = confusion_matrix(y_test, y_pred_test)
    tn, fp, fn, tp = confusion_test.ravel()
    specificity_test = tn / (tn + fp)

    # AUC from the scores; the remaining metrics from the hard labels.
    auc_test = roc_auc_score(y_test, pro_test)
    accuracy_test = accuracy_score(y_test, y_pred_test)
    precision_test = precision_score(y_test, y_pred_test)
    recall_test = recall_score(y_test, y_pred_test)
    f1_test = f1_score(y_test, y_pred_test)

    # Report the metrics for this file.
    print(f'\nResults for {test_file}:')
    print(f'测试集 AUC = {auc_test:.3f}')
    print(f'测试集 Accuracy = {accuracy_test:.3f}')
    print(f'测试集 Precision = {precision_test:.3f}')
    print(f'测试集 Sensitivity (Recall) = {recall_test:.3f}')
    print(f'测试集 Specificity = {specificity_test:.3f}')
    print(f'测试集 F1 = {f1_test:.3f}')

    # Persist per-row results: ID, true label and predicted score.
    # NOTE(review): test_file already ends in ".csv", so the output name
    # contains ".csv" twice (e.g. "SVM_test1.csv_predictions.csv"); kept as-is
    # in case downstream code depends on it.
    df_test = pd.DataFrame({'ID': test['ID'], 'True': y_test, 'Pre': pro_test})
    df_test.to_csv(f'SVM_{test_file}_predictions.csv', index=False)

# Persist the fitted model to the current working directory.
import joblib

model_filename = 'SVM_model.joblib'
joblib.dump(model, model_filename)  # `model` must already be fitted at this point
print(f"模型已保存到当前目录下的 {model_filename}")


Results for test1.csv:
测试集 AUC = 0.693
测试集 Accuracy = 0.595
测试集 Precision = 0.615
测试集 Sensitivity (Recall) = 0.762
测试集 Specificity = 0.375
测试集 F1 = 0.681

Results for test2.csv:
测试集 AUC = 0.850
测试集 Accuracy = 0.800
测试集 Precision = 0.750
测试集 Sensitivity (Recall) = 0.900
测试集 Specificity = 0.700
测试集 F1 = 0.818

Results for test3.csv:
测试集 AUC = 0.520
测试集 Accuracy = 0.500
测试集 Precision = 0.500
测试集 Sensitivity (Recall) = 0.600
测试集 Specificity = 0.400
测试集 F1 = 0.545

Results for test4.csv:
测试集 AUC = 0.580
测试集 Accuracy = 0.556
测试集 Precision = 0.538
测试集 Sensitivity (Recall) = 0.778
测试集 Specificity = 0.333
测试集 F1 = 0.636

Results for test5.csv:
测试集 AUC = 0.856
测试集 Accuracy = 0.842
测试集 Precision = 0.818
测试集 Sensitivity (Recall) = 0.900
测试集 Specificity = 0.778
测试集 F1 = 0.857

Results for test6.csv:
测试集 AUC = 0.790
测试集 Accuracy = 0.682
测试集 Precision = 0.500
测试集 Sensitivity (Recall) = 0.714
测试集 Specificity = 0.667
测试集 F1 = 0.588

Results for test7.csv:
测试集 AUC = 0.906
测试集 Accuracy = 0.750
测试集 Preci