In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.exceptions import ConvergenceWarning

In [2]:
'''读取数据'''
data_path = './concatData.xlsx'
CN_AD_Data = pd.read_excel(data_path,sheet_name="CN_AD")

### 1、串联拼接

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.metrics import roc_auc_score, make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
import warnings
from sklearn.exceptions import ConvergenceWarning

# 忽略所有的 ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# 定义模型
models = {
    'KNN': KNeighborsClassifier(n_neighbors=20),#n_neighbors=20
    'Ridge': LogisticRegression(penalty='l2', solver='liblinear'),#, solver='liblinear'
    'Bayes': BernoulliNB(),
    'SVM': SVC(probability=True),#,C=8
    'RF': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(),
    'GDBT': GradientBoostingClassifier()
}

# 定义评分标准
scoring = {
    'MCC': make_scorer(matthews_corrcoef),
    'Accuracy': make_scorer(accuracy_score),
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
    'F1': make_scorer(f1_score)
}

# 交叉验证
cv = StratifiedKFold(n_splits=5, shuffle=True,random_state=23)

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [6]:
from sklearn.neighbors import NearestNeighbors

def find_and_remove_danger_samples(X, y, k=5): 
    X = np.array(X)
    y = np.array(y)
    majority_class = max(set(y), key=list(y).count)
    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(X)
    danger_indices = []
    for idx, x in enumerate(X):
        if y[idx] == majority_class:
            neighbors = neigh.kneighbors([x], return_distance=False)
            neighbor_labels = y[neighbors[0]]
            minority_count = np.sum(neighbor_labels != majority_class)
            if minority_count > k / 2:
                danger_indices.append(idx)
    X_cleaned = np.delete(X, danger_indices, axis=0)
    y_cleaned = np.delete(y, danger_indices, axis=0)
    
    return X_cleaned, y_cleaned

#### （0）临床数据

In [7]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import BorderlineSMOTE

# 标准化数据
clinic_X = CN_AD_Data[['PTGENDER', 'PTMARRY', 'AGE','PTEDUCAT']]
clinic_y = CN_AD_Data['status']
print('Original dataset shape %s' % Counter(clinic_y))


'''过采样'''
sm0 = BorderlineSMOTE(
    random_state=42,
    kind="borderline-1",
    sampling_strategy={0: 209, 1: 105 },#179 20  IR=6.92
    k_neighbors=5, #确定邻居点的数量
    m_neighbors=10) #指定在合成样本生成过程中从近邻点中选择多少个样本作为参考
X_bdsmote0, y_bdsmote0 = sm0.fit_resample(clinic_X, clinic_y)
print('过采样 dataset shape %s' % Counter(y_bdsmote0))
# 合并为一个新的dataframe
y_bdsmote0 = pd.Series(y_bdsmote0,name="status")
data_bdsmote0 = pd.concat([X_bdsmote0, y_bdsmote0], axis=1)


'''欠采样'''
X_cleaned0, y_cleaned0 = find_and_remove_danger_samples(X_bdsmote0.values, data_bdsmote0["status"], k=100)
print(f"欠采样 dataset shape: {Counter(y_cleaned0)}")
# 合并为一个新的dataframe
X_bdknn0 = pd.DataFrame(X_cleaned0,columns=X_bdsmote0.columns)
y_bdknn0 = pd.Series(y_cleaned0,name="status")
data_bdknn0 = pd.concat([X_bdknn0, y_bdknn0], axis=1)

Original dataset shape Counter({0: 209, 1: 35})
过采样 dataset shape Counter({0: 209, 1: 105})
欠采样 dataset shape: Counter({0: 206, 1: 105})


In [8]:
clinic_ret = {}

# 标准化
standard_scaler0 = StandardScaler()
standard_scaler0.fit(X_bdknn0)
X_bdknn0 = standard_scaler0.transform(X_bdknn0)
X1 = pd.DataFrame(X_bdknn0,columns=[['PTGENDER', 'PTMARRY', 'AGE','PTEDUCAT']])


for name, model in models.items():    
    scores = cross_validate(model , X1, y_bdknn0, cv=cv, scoring=scoring)
    clinic_ret[name] = {
        'MCC': np.mean(scores['test_MCC']),
        'Accuracy': np.mean(scores['test_Accuracy']),
        'Precision': np.mean(scores['test_Precision']),
        'Recall': np.mean(scores['test_Recall']),
        'F1': np.mean(scores['test_F1'])
    }

# 输出结果
df_clinic = pd.DataFrame(clinic_ret).T
# print(df_clinic)

               MCC  Accuracy  Precision    Recall        F1
KNN       0.437257  0.761802   0.717619  0.485714  0.578297
Ridge     0.332372  0.723349   0.665844  0.371429  0.475401
Bayes     0.373516  0.739427   0.735027  0.361905  0.482872
SVM       0.477266  0.774859   0.713340  0.561905  0.626301
RF        0.643064  0.836047   0.762971  0.771429  0.760677
AdaBoost  0.507522  0.781464   0.701207  0.638095  0.665230
XGBoost   0.603891  0.823041   0.748681  0.723810  0.733133
GDBT      0.584509  0.816590   0.762743  0.676190  0.714207


#### （1）临床数据+PRS

In [9]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import BorderlineSMOTE

# 标准化数据
prs_X = CN_AD_Data[['PTGENDER', 'PTMARRY', 'AGE','PTEDUCAT', 
                  'cov_prs']]
prs_y = CN_AD_Data['status']
print('Original dataset shape %s' % Counter(prs_y))


'''过采样'''
sm1 = BorderlineSMOTE(
    random_state=42,
    kind="borderline-1",
    sampling_strategy={0: 209, 1: 105 },#179 20  IR=6.92
    k_neighbors=5, #确定邻居点的数量
    m_neighbors=10) #指定在合成样本生成过程中从近邻点中选择多少个样本作为参考
X_bdsmote1, y_bdsmote1 = sm1.fit_resample(prs_X, prs_y)
print('过采样 dataset shape %s' % Counter(y_bdsmote1))
# 合并为一个新的dataframe
y_bdsmote1 = pd.Series(y_bdsmote1,name="status")
data_bdsmote1 = pd.concat([X_bdsmote1, y_bdsmote1], axis=1)


'''欠采样'''
X_cleaned1, y_cleaned1 = find_and_remove_danger_samples(X_bdsmote1.values, data_bdsmote1["status"], k=100)
print(f"欠采样 dataset shape: {Counter(y_cleaned1)}")
# 合并为一个新的dataframe
X_bdknn1 = pd.DataFrame(X_cleaned1,columns=X_bdsmote1.columns)
y_bdknn1 = pd.Series(y_cleaned1,name="status")
data_bdknn1 = pd.concat([X_bdknn1, y_bdknn1], axis=1)

Original dataset shape Counter({0: 209, 1: 35})
过采样 dataset shape Counter({0: 209, 1: 105})
欠采样 dataset shape: Counter({0: 206, 1: 105})


In [10]:
clinic_prs_ret = {}

# 标准化
standard_scaler1 = StandardScaler()
standard_scaler1.fit(X_bdknn1)
X_bdknn1 = standard_scaler1.transform(X_bdknn1)
X2 = pd.DataFrame(X_bdknn1,columns=[['PTGENDER', 'PTMARRY', 'AGE','PTEDUCAT', 
                  'cov_prs']])


for name, model in models.items():    
    scores = cross_validate(model , X2, y_bdknn1, cv=cv, scoring=scoring)
    clinic_prs_ret[name] = {
        'MCC': np.mean(scores['test_MCC']),
        'Accuracy': np.mean(scores['test_Accuracy']),
        'Precision': np.mean(scores['test_Precision']),
        'Recall': np.mean(scores['test_Recall']),
        'F1': np.mean(scores['test_F1'])
    }

# 输出结果
df_clinic_prs = pd.DataFrame(clinic_prs_ret).T
print(df_clinic_prs)

               MCC  Accuracy  Precision    Recall        F1
KNN       0.580536  0.813415   0.769495  0.666667  0.710033
Ridge     0.440059  0.755658   0.673333  0.561905  0.604937
Bayes     0.551660  0.794316   0.692977  0.723810  0.703229
SVM       0.606235  0.819969   0.725667  0.761905  0.741260
RF        0.681808  0.855248   0.781927  0.800000  0.786944
AdaBoost  0.687515  0.855351   0.792019  0.800000  0.786378
XGBoost   0.694408  0.858577   0.793234  0.809524  0.794346
GDBT      0.736550  0.880952   0.831624  0.819048  0.821495


#### （2）临床数据+PRS+甲基化

In [11]:
cg_X = CN_AD_Data[['PTGENDER', 'PTMARRY', 'AGE','PTEDUCAT', 
                  'cov_prs',
                  'cg26896946','cg06536614', 'cg07972135', 'cg27657429', 'cg18678645', 
                  'cg24199400','cg04481923', 'cg11921736', 'cg05652809', 'cg01755562']]
cg_y = CN_AD_Data['status']
'''过采样'''
sm2 = BorderlineSMOTE(
    random_state=42,
    kind="borderline-1",
    sampling_strategy={0: 209, 1: 105 },
    k_neighbors=5, #确定邻居点的数量
    m_neighbors=10) #指定在合成样本生成过程中从近邻点中选择多少个样本作为参考
X_bdsmote2, y_bdsmote2 = sm2.fit_resample(cg_X, cg_y)
print('过采样 dataset shape %s' % Counter(y_bdsmote2))
# 合并为一个新的dataframe
y_bdsmote2 = pd.Series(y_bdsmote2,name="status")
data_bdsmote2 = pd.concat([X_bdsmote2, y_bdsmote2], axis=1)


'''欠采样'''
X_cleaned2, y_cleaned2 = find_and_remove_danger_samples(X_bdsmote2.values, data_bdsmote2["status"], k=100)
print(f"欠采样 dataset shape: {Counter(y_cleaned2)}")
# 合并为一个新的dataframe
X_bdknn2 = pd.DataFrame(X_cleaned2,columns=X_bdsmote2.columns)
y_bdknn2 = pd.Series(y_cleaned2,name="status")
data_bdknn2 = pd.concat([X_bdknn2, y_bdknn2], axis=1)

过采样 dataset shape Counter({0: 209, 1: 105})
欠采样 dataset shape: Counter({0: 206, 1: 105})


In [12]:
clinic_prs_methy_ret = {}

# # 标准化
standard_scaler3 = StandardScaler()
standard_scaler3.fit(X_bdknn2)
cg_bdknn_X = standard_scaler3.transform(X_bdknn2)
X3 = pd.DataFrame(cg_bdknn_X,columns=['PTGENDER', 'PTMARRY', 'AGE','PTEDUCAT', 
                  'cov_prs',
                  'cg26896946','cg06536614', 'cg07972135', 'cg27657429', 'cg18678645', 
                  'cg24199400','cg04481923', 'cg11921736', 'cg05652809', 'cg01755562'])
y3 = y_bdknn2


for name, model in models.items():
    scores = cross_validate(model, X3, y3, cv=cv, scoring=scoring)
    clinic_prs_methy_ret[name] = {
        'MCC': np.mean(scores['test_MCC']),
        'Accuracy': np.mean(scores['test_Accuracy']),
        'Precision': np.mean(scores['test_Precision']),
        'Recall': np.mean(scores['test_Recall']),
        'F1': np.mean(scores['test_F1'])
    }

# 输出结果
df_clinic_prs_methy = pd.DataFrame(clinic_prs_methy_ret).T
print(df_clinic_prs_methy)

               MCC  Accuracy  Precision    Recall        F1
KNN       0.725096  0.868152   0.775984  0.876190  0.817764
Ridge     0.741006  0.877931   0.793658  0.876190  0.829881
Bayes     0.618575  0.816999   0.702089  0.819048  0.752090
SVM       0.780186  0.897235   0.839824  0.876190  0.851457
RF        0.804117  0.910087   0.867785  0.876190  0.868264
AdaBoost  0.770310  0.893856   0.823556  0.876190  0.844788
XGBoost   0.818260  0.916487   0.876281  0.885714  0.877061
GDBT      0.803149  0.909985   0.856892  0.885714  0.868994


#### （3）打印结果

In [13]:
print(df_clinic)
print(df_clinic_prs)
print(df_clinic_prs_methy)

               MCC  Accuracy  Precision    Recall        F1
KNN       0.437257  0.761802   0.717619  0.485714  0.578297
Ridge     0.332372  0.723349   0.665844  0.371429  0.475401
Bayes     0.373516  0.739427   0.735027  0.361905  0.482872
SVM       0.477266  0.774859   0.713340  0.561905  0.626301
RF        0.643064  0.836047   0.762971  0.771429  0.760677
AdaBoost  0.507522  0.781464   0.701207  0.638095  0.665230
XGBoost   0.603891  0.823041   0.748681  0.723810  0.733133
GDBT      0.584509  0.816590   0.762743  0.676190  0.714207
               MCC  Accuracy  Precision    Recall        F1
KNN       0.580536  0.813415   0.769495  0.666667  0.710033
Ridge     0.440059  0.755658   0.673333  0.561905  0.604937
Bayes     0.551660  0.794316   0.692977  0.723810  0.703229
SVM       0.606235  0.819969   0.725667  0.761905  0.741260
RF        0.681808  0.855248   0.781927  0.800000  0.786944
AdaBoost  0.687515  0.855351   0.792019  0.800000  0.786378
XGBoost   0.694408  0.858577   0.793234 

### 2、转化拼接

In [117]:
from sklearn.decomposition import PCA

models_trans = {
    'KNN': KNeighborsClassifier(n_neighbors=15),
    'Ridge': LogisticRegression(penalty='l2', solver='liblinear',C=32),
    'Bayes': BernoulliNB(),#  GaussianNB   BernoulliNB
    'SVM': SVC(probability=True,C=32),
    'RF': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(),
    'GDBT': GradientBoostingClassifier()
}

#### （0）clinic

In [118]:
trans_clinic_ret = {}

ft0_X = X1[['PTGENDER', 'PTMARRY', 'AGE','PTEDUCAT']]
ft0_y = data_bdknn0['status']

pca0 = PCA(n_components=3)   
pca0_X = pca0.fit_transform(ft0_X)

for name, model in models_trans.items():
    scores = cross_validate(model, pca0_X, ft0_y, cv=cv, scoring=scoring)
    trans_clinic_ret[name] = {
        'MCC': np.mean(scores['test_MCC']),
        'Accuracy': np.mean(scores['test_Accuracy']),
        'Precision': np.mean(scores['test_Precision']),
        'Recall': np.mean(scores['test_Recall']),
        'F1': np.mean(scores['test_F1'])
    }

# 输出结果
df_trans_clinic = pd.DataFrame(trans_clinic_ret).T
# print(df_trans_clinic)

#### （1）cov+prs_pca

In [119]:
trans_clinic_prs_ret = {}

ft1_X = X2[['PTGENDER', 'PTMARRY', 'AGE','PTEDUCAT','cov_prs']]
# ft12_X = X2['cov_prs']
ft1_y = data_bdknn1['status']

pca1 = PCA(n_components=4)
pca1_X = pca1.fit_transform(ft1_X)
pca1_X = pd.DataFrame(pca1_X)
# ,columns=[f'PCA1_{i+1}' for i in range(pca11_X.shape[1])]

# combined_pca1 =pd.concat([pca11_X, ft12_X],axis=1)



for name, model in models_trans.items():
    scores = cross_validate(model, pca1_X, ft1_y, cv=cv, scoring=scoring)
    trans_clinic_prs_ret[name] = {
        'MCC': np.mean(scores['test_MCC']),
        'Accuracy': np.mean(scores['test_Accuracy']),
        'Precision': np.mean(scores['test_Precision']),
        'Recall': np.mean(scores['test_Recall']),
        'F1': np.mean(scores['test_F1'])
    }

# 输出结果
df_trans_clinic_prs = pd.DataFrame(trans_clinic_prs_ret).T
# print(df_trans_clinic_prs)

#### （2）cov+prs+methy_pca

In [120]:
trans_clinic_prs_methy_ret = {}

ft21_X = X3[['PTGENDER', 'PTMARRY', 'AGE','PTEDUCAT','cov_prs']]
# ft22_X = X3['cov_prs']
ft22_X = X3[['cg26896946','cg06536614', 'cg07972135', 'cg27657429', 'cg18678645', 
                     'cg24199400','cg04481923', 'cg11921736', 'cg05652809', 'cg01755562']]
ft2_y = data_bdknn2['status']

pca21_X = pca1.fit_transform(ft21_X)
pca21_X = pd.DataFrame(pca21_X,columns=[f'PCA1_{i+1}' for i in range(pca21_X.shape[1])])

pca2 = PCA(n_components=5)  
pca22_X = pca2.fit_transform(ft22_X)
pca22_X = pd.DataFrame(pca22_X,columns=[f'PCA3_{i+1}' for i in range(pca22_X.shape[1])])

combined_pca2 =pd.concat([pca21_X, pca22_X],axis=1)


for name, model in models.items():
    scores = cross_validate(model, combined_pca2, ft2_y, cv=cv, scoring=scoring)
    trans_clinic_prs_methy_ret[name] = {
        'MCC': np.mean(scores['test_MCC']),
        'Accuracy': np.mean(scores['test_Accuracy']),
        'Precision': np.mean(scores['test_Precision']),
        'Recall': np.mean(scores['test_Recall']),
        'F1': np.mean(scores['test_F1'])
    }

# 输出结果
df_trans_clinic_prs_methy = pd.DataFrame(trans_clinic_prs_methy_ret).T
# print(df_trans_clinic_prs_methy)

#### （3）打印结果

In [121]:
print(df_trans_clinic)
print(df_trans_clinic_prs)
print(df_trans_clinic_prs_methy)

               MCC  Accuracy  Precision    Recall        F1
KNN       0.403546  0.745827   0.669365  0.495238  0.568265
Ridge     0.247118  0.691193   0.594320  0.314286  0.406522
Bayes     0.283673  0.687865   0.555885  0.466667  0.503479
SVM       0.482838  0.768152   0.666071  0.647619  0.652531
RF        0.535950  0.787762   0.674223  0.723810  0.697342
AdaBoost  0.437899  0.749258   0.632580  0.619048  0.622494
XGBoost   0.514968  0.784537   0.685342  0.666667  0.674784
GDBT      0.540373  0.794316   0.704090  0.685714  0.692774
               MCC  Accuracy  Precision    Recall        F1
KNN       0.545845  0.797389   0.717843  0.676190  0.692995
Ridge     0.446804  0.755709   0.680781  0.571429  0.613500
Bayes     0.470044  0.771787   0.729231  0.533333  0.613152
SVM       0.637023  0.832770   0.731879  0.800000  0.763860
RF        0.650409  0.839324   0.755104  0.790476  0.768837
AdaBoost  0.622761  0.829595   0.751263  0.752381  0.749927
XGBoost   0.582055  0.810445   0.717814 

### 3、模型拼接

In [130]:
'''集成学习：投票分类'''
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import log_loss, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import confusion_matrix

# 定义基学习器 DecisionTreeClassifier
classifiers = {
    'KNN': KNeighborsClassifier(),
    'LR':  LogisticRegression(),
    'NaiveBayes': GaussianNB(),
    'SVM': SVC(probability=True),
    'RF': RandomForestClassifier(),
}

def find_bestModel(X,Y):
    '''寻找最佳模型'''
    best_model = None
    best_score = -np.inf  # 负对数损失的最小值是负无穷大
    best_model_name = ''
    
    for name, clf in classifiers.items():
        # 使用五折交叉验证
        kf = StratifiedKFold(n_splits=5)
        fold_scores = []
        for train_index, test_index in kf.split(X,Y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = Y[train_index], Y[test_index]
            clf.fit(X_train, y_train)
            y_pred_proba = clf.predict_proba(X_test)
            fold_scores.append(-log_loss(y_test, y_pred_proba))  # 负对数损失
        
        mean_score = np.mean(fold_scores)

        if mean_score > best_score:
            best_score = mean_score
            best_model = clf
            best_model_name = name

    return best_model_name

In [165]:
# 找到最佳子模型
ft0_model_name = find_bestModel(ft0_X,ft0_y)
ft0_model = classifiers[ft0_model_name]
print("ft1 model:",ft0_model)
ft1_model_name = find_bestModel(ft1_X,ft1_y)
ft1_model = classifiers[ft1_model_name]
print("ft1 model:",ft1_model)

ft21_model_name = find_bestModel(ft21_X,ft2_y)
ft21_model = classifiers[ft21_model_name]
print("ft2 model:",ft21_model)
ft22_model_name = find_bestModel(ft22_X,ft2_y)
ft22_model = classifiers[ft22_model_name]
print("ft2 model:",ft22_model)

# 训练模型
ft0_model.fit(ft0_X, ft0_y)
ft1_model.fit(ft1_X, ft1_y)

ft21_model.fit(ft21_X, ft2_y)
ft22_model.fit(ft22_X, ft2_y)

ft1 model: SVC(probability=True)
ft1 model: SVC(probability=True)
ft2 model: SVC(probability=True)
ft2 model: RandomForestClassifier()


#### （0）仅计算 ft0 最佳子模型

In [136]:
'''第一：仅计算 ft0 最佳子模型'''

# 创建五折交叉验证器
ft0_skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 初始化用于保存各折结果的列表
ft0_confusion_matrices = []
ft0_mccs = []
ft0_accuracies = []
ft0_precisions = []
ft0_recalls = []
ft0_f1_scores = []


# 执行五折交叉验证
for fold, (train_index, test_index) in enumerate(ft0_skf.split(ft0_X, ft0_y)):
    # 分割数据集
    X_train, X_test = ft0_X.iloc[train_index], ft0_X.iloc[test_index]
    y_train, y_test = ft0_y[train_index], ft0_y[test_index]

    # 训练模型
    ft0_model.fit(X_train, y_train)
    
    # 预测结果
    y_pred = ft0_model.predict(X_test)
#     y_pred_proba = ft0_model.predict_proba(X_test)[:, 1]  # 预测概率，用于计算AUC

    # 计算混淆矩阵
    cm = confusion_matrix(y_test, y_pred)
    ft0_confusion_matrices.append(cm)
    
    # 计算各项指标
    ft0_mccs.append(matthews_corrcoef(y_test, y_pred))
    ft0_accuracies.append(accuracy_score(y_test, y_pred))
    ft0_precisions.append(precision_score(y_test, y_pred))
    ft0_recalls.append(recall_score(y_test, y_pred))
    ft0_f1_scores.append(f1_score(y_test, y_pred))
    
    
    # 输出当前折的混淆矩阵
    print(f"Fold {fold + 1} Confusion Matrix:")
    print(cm)
    print("-" * 40)

    
# 计算并输出所有折的平均值
print(f"Average MCC: {np.mean(ft0_mccs):.4f}")
print(f"Average Accuracy: {np.mean(ft0_accuracies):.4f}")
print(f"Average Precision: {np.mean(ft0_precisions):.4f}")
print(f"Average Recall: {np.mean(ft0_recalls):.4f}")
print(f"Average F1 Score: {np.mean(ft0_f1_scores):.4f}")

Fold 1 Confusion Matrix:
[[38  4]
 [10 11]]
----------------------------------------
Fold 2 Confusion Matrix:
[[39  2]
 [ 7 14]]
----------------------------------------
Fold 3 Confusion Matrix:
[[39  2]
 [13  8]]
----------------------------------------
Fold 4 Confusion Matrix:
[[35  6]
 [ 6 15]]
----------------------------------------
Fold 5 Confusion Matrix:
[[34  7]
 [ 8 13]]
----------------------------------------
Average MCC: 0.5184
Average Accuracy: 0.7910
Average Precision: 0.7545
Average Recall: 0.5810
Average F1 Score: 0.6465


#### （1）仅计算 ft1 最佳子模型

In [137]:
'''第二：仅计算 ft1 最佳子模型'''

# 创建五折交叉验证器
ft1_skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 初始化用于保存各折结果的列表
ft1_confusion_matrices = []
ft1_mccs = []
ft1_accuracies = []
ft1_precisions = []
ft1_recalls = []
ft1_f1_scores = []


# 执行五折交叉验证
for fold, (train_index, test_index) in enumerate(ft1_skf.split(ft1_X, ft1_y)):
    # 分割数据集
    X_train, X_test = ft1_X.iloc[train_index], ft1_X.iloc[test_index]
    y_train, y_test = ft1_y[train_index], ft1_y[test_index]

    # 训练模型
    ft1_model.fit(X_train, y_train)
    
    # 预测结果
    y_pred = ft1_model.predict(X_test)
    
    # 计算混淆矩阵
    cm = confusion_matrix(y_test, y_pred)
    ft1_confusion_matrices.append(cm)
    
    # 计算各项指标
    ft1_mccs.append(matthews_corrcoef(y_test, y_pred))
    ft1_accuracies.append(accuracy_score(y_test, y_pred))
    ft1_precisions.append(precision_score(y_test, y_pred))
    ft1_recalls.append(recall_score(y_test, y_pred))
    ft1_f1_scores.append(f1_score(y_test, y_pred))
    
    
    # 输出当前折的混淆矩阵
    print(f"Fold {fold + 1} Confusion Matrix:")
    print(cm)
    print("-" * 40)

    
# 计算并输出所有折的平均值
print(f"Average MCC: {np.mean(ft1_mccs):.4f}")
print(f"Average Accuracy: {np.mean(ft1_accuracies):.4f}")
print(f"Average Precision: {np.mean(ft1_precisions):.4f}")
print(f"Average Recall: {np.mean(ft1_recalls):.4f}")
print(f"Average F1 Score: {np.mean(ft1_f1_scores):.4f}")

Fold 1 Confusion Matrix:
[[37  5]
 [ 6 15]]
----------------------------------------
Fold 2 Confusion Matrix:
[[39  2]
 [ 2 19]]
----------------------------------------
Fold 3 Confusion Matrix:
[[37  4]
 [ 5 16]]
----------------------------------------
Fold 4 Confusion Matrix:
[[37  4]
 [ 3 18]]
----------------------------------------
Fold 5 Confusion Matrix:
[[36  5]
 [ 6 15]]
----------------------------------------
Average MCC: 0.6965
Average Accuracy: 0.8651
Average Precision: 0.8046
Average Recall: 0.7905
Average F1 Score: 0.7972


#### （2）集成最佳子模型

In [166]:
'''第三：集成两个子模型'''
# 创建集成模型（软投票）
ensemble_model = VotingClassifier(estimators=[
    ('model_1', ft21_model),
    ('model_2', ft21_model)
], voting='soft')

# 进行五折交叉验证并输出每一折的混淆矩阵
vt_skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold = 1

# 存储各折的评价指标
vt_mcc_scores = []
vt_accuracy_scores = []
vt_precision_scores = []
vt_recall_scores = []
vt_f1_scores = []
vt_confusion_matrix = []

for train_index, test_index in vt_skf.split(np.hstack((ft21_X, ft22_X)),ft2_y):
    X_train, X_test = np.hstack((ft21_X, ft22_X))[train_index], np.hstack((ft21_X, ft22_X))[test_index]
    y_train, y_test = ft2_y[train_index], ft2_y[test_index]
    
    # 训练和预测
    ensemble_model.fit(X_train, y_train)
    y_pred = ensemble_model.predict(X_test)
    
    # 计算混淆矩阵
    cm = confusion_matrix(y_test, y_pred)
    # 输出当前折的混淆矩阵
    print(f"Fold {fold} Confusion Matrix:")
    print(cm)
    print("-" * 40)
    
    # 计算并存储评价指标
    vt_confusion_matrix.append(cm)
    vt_mcc_scores.append(matthews_corrcoef(y_test, y_pred))
    vt_accuracy_scores.append(accuracy_score(y_test, y_pred))
    vt_precision_scores.append(precision_score(y_test, y_pred))
    vt_recall_scores.append(recall_score(y_test, y_pred))
    vt_f1_scores.append(f1_score(y_test, y_pred))
    
    fold += 1


# 计算并输出所有折的平均值
print(f"Average MCC: {np.mean(vt_mcc_scores):.4f}")
print(f"Average Accuracy: {np.mean(vt_accuracy_scores):.4f}")
print(f"Average Precision: {np.mean(vt_accuracy_scores):.4f}")
print(f"Average Recall: {np.mean(vt_recall_scores):.4f}")
print(f"Average F1 Score: {np.mean(vt_f1_scores):.4f}")

Fold 1 Confusion Matrix:
[[33  9]
 [ 5 16]]
----------------------------------------
Fold 2 Confusion Matrix:
[[39  2]
 [ 1 20]]
----------------------------------------
Fold 3 Confusion Matrix:
[[40  1]
 [ 4 17]]
----------------------------------------
Fold 4 Confusion Matrix:
[[39  2]
 [ 1 20]]
----------------------------------------
Fold 5 Confusion Matrix:
[[37  4]
 [ 2 19]]
----------------------------------------
Average MCC: 0.7849
Average Accuracy: 0.9007
Average Precision: 0.9007
Average Recall: 0.8762
Average F1 Score: 0.8583
