In [1]:
# Receiver operating characteristic (ROC) curves for ten ML models trained without SMOTE oversampling
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline


# Baseline experiment: train ten classifiers on the original (non-oversampled)
# data and compare their ROC curves on a held-out 20 % test set.
file_path = r'D:\2024_4_29.xlsx'
data = pd.read_excel(file_path)


features = ['性别', '糖尿病', '年龄', '白细胞计数结果标志', '单核细胞计数结果标志', '红细胞结果标志',
            '淋巴细胞计数结果标志', '血红蛋白结果标志', '血小板计数结果标志', '葡萄糖结果标志',
            '白蛋白结果标志', '总胆红素结果标志', "C反应蛋白结果标志", "RBC体积分布宽度结果标志",
            "红细胞压积结果标志", "平均RBC血红蛋白含量结果标志", "平均RBC血红蛋白浓度结果标志",
            "平均红细胞体积结果标志", 'Braden评分（压疮评分）', 'NRS2002评分（营养风险筛查）',
            'GlasGow评分（昏迷评分）', 'CPOT评分（疼痛评分表）',  '误吸/窒息评分', '肠内营养耐受性',
            'VTE评分（静脉血栓栓塞症风险）', '非计划拔管评估']
target = '院感'

X = data[features]
y = data[target]


# NOTE(review): the first two entries of `features` are listed in neither
# `categorical_features` nor `numeric_features`, so ColumnTransformer (default
# remainder='drop') discards them before modelling -- confirm this is intended.
categorical_features = ['白细胞计数结果标志', '单核细胞计数结果标志', '红细胞结果标志',
            '淋巴细胞计数结果标志', '血红蛋白结果标志', '血小板计数结果标志', '葡萄糖结果标志',
            '白蛋白结果标志', '总胆红素结果标志', "C反应蛋白结果标志", "RBC体积分布宽度结果标志",
            "红细胞压积结果标志", "平均RBC血红蛋白含量结果标志", "平均RBC血红蛋白浓度结果标志",
            "平均红细胞体积结果标志"]
# The encoder is fitted on training rows only, so a flag level that occurs
# only in held-out rows must be encoded as all zeros instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')


numeric_features = ['年龄', 'Braden评分（压疮评分）', 'NRS2002评分（营养风险筛查）', 'GlasGow评分（昏迷评分）',
                    'CPOT评分（疼痛评分表）',  '误吸/窒息评分', '肠内营养耐受性', 'VTE评分（静脉血栓栓塞症风险）', '非计划拔管评估']
numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


# Scaling / encoding followed by L1-penalised logistic-regression feature selection.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('feature_selection', SelectFromModel(LogisticRegression(penalty="l1", solver='liblinear')))])


# Hold out the test rows BEFORE fitting anything. The selector uses `y`, and the
# scaler/encoder learn statistics from the rows they see, so fitting them on the
# full data set would leak test information into the evaluation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = pipeline.fit_transform(X_train_raw, y_train)
X_test = pipeline.transform(X_test_raw)

# Full data set pushed through the train-fitted pipeline; kept because the
# following cross-validation cell reads `X_selected`.
X_selected = pipeline.transform(X)


# NOTE(review): "MLP" and "Neural Networks" are both a default MLPClassifier,
# so the comparison contains nine distinct model types, not ten.
classifiers = {
    "Random Forest": RandomForestClassifier(),
    "XGBoost": xgb.XGBClassifier(),
    "MLP": MLPClassifier(),
    "LightGBM": lgb.LGBMClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Decision tree": DecisionTreeClassifier(),
    "Naive bayes": GaussianNB(),
    "AdaBoost": AdaBoostClassifier(),
    "CatBoost": CatBoostClassifier(verbose=False),
    "Neural Networks": MLPClassifier()
}


# Fit every model and record its ROC curve and AUC on the held-out rows.
results = {}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_test)[:, 1]  # probability of the positive class
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    results[name] = {"fpr": fpr, "tpr": tpr, "auc": auc}


plt.figure(figsize=(10, 8))

# Plot best model first so the legend is ordered by descending AUC.
sorted_results = sorted(results.items(), key=lambda x: x[1]["auc"], reverse=True)
for name, result in sorted_results:
    plt.plot(result["fpr"], result["tpr"], label=f'{name} (AUC = {result["auc"]:.3f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guessing')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()



KeyboardInterrupt



In [None]:
# The ten-fold cross-validation results of the CatBoost model without SMOTE oversampling.
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import numpy as np


# Ten-fold stratified cross-validation of CatBoost on the original
# (non-oversampled) data. Relies on `X`, `y` and `pipeline` from the first cell.
from sklearn.base import clone  # local import: a fresh, unfitted pipeline is needed per fold

clf = CatBoostClassifier(verbose=False)


cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)


fold_results = []


for i, (train_index, test_index) in enumerate(cv.split(X, y)):
    # The split yields positions, so use .iloc; plain `y[train_index]` would
    # look the integers up as index labels and break on a non-default index.
    X_train_raw, X_test_raw = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Refit scaling / encoding / feature selection on this fold's training rows
    # only, so the fold's test rows never influence the selected features.
    fold_pipeline = clone(pipeline)
    # A category level seen only in the test fold is encoded as all zeros.
    fold_pipeline.set_params(preprocessor__cat__handle_unknown='ignore')
    X_train = fold_pipeline.fit_transform(X_train_raw, y_train)
    X_test = fold_pipeline.transform(X_test_raw)

    clf.fit(X_train, y_train)
    y_prob = clf.predict_proba(X_test)[:, 1]  # probability of the positive class
    fpr, tpr, _ = metrics.roc_curve(y_test, y_prob)
    roc_auc = metrics.auc(fpr, tpr)
    fold_results.append({"fpr": fpr, "tpr": tpr, "auc": roc_auc, "fold": i+1})

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)  # common FPR grid so fold curves can be averaged
plt.figure(figsize=(10, 8))

for result in fold_results:
    plt.plot(result["fpr"], result["tpr"], alpha=0.3, label=f'Fold {result["fold"]} (AUC = {result["auc"]:.3f})')
    interp_tpr = np.interp(mean_fpr, result["fpr"], result["tpr"])
    interp_tpr[0] = 0.0  # force every interpolated curve to start at the origin
    tprs.append(interp_tpr)
    aucs.append(result["auc"])

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # force the mean curve to end at (1, 1)
mean_auc = metrics.auc(mean_fpr, mean_tpr)

std_auc = np.std(aucs)

# Band of +/- 1.96 standard deviations of the fold TPRs, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tpr_upper = np.minimum(mean_tpr + 1.96 * std_tpr, 1)
tpr_lower = np.maximum(mean_tpr - 1.96 * std_tpr, 0)

# Backslash escaped explicitly: '\p' is an invalid escape sequence in a
# non-raw string. The resulting label text is unchanged.
plt.plot(mean_fpr, mean_tpr, color='b', label=f'Mean ROC (AUC = {mean_auc:.3f} $\\pm$ {std_auc:.3f})')
plt.fill_between(mean_fpr, tpr_lower, tpr_upper, color='grey', alpha=0.3)

plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guessing')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(True)

plt.show()

In [None]:
# Receiver operating characteristic (ROC) curves for ten ML models trained with SMOTE oversampling
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline


# SMOTE experiment: oversample the minority class in the TRAINING data only,
# train ten classifiers and compare their ROC curves on untouched held-out rows.
file_path = r'D:\2024_4_29.xlsx'
data = pd.read_excel(file_path)


features = ['性别', '糖尿病', '年龄', '白细胞计数结果标志', '单核细胞计数结果标志', '红细胞结果标志',
            '淋巴细胞计数结果标志', '血红蛋白结果标志', '血小板计数结果标志', '葡萄糖结果标志',
            '白蛋白结果标志', '总胆红素结果标志', "C反应蛋白结果标志", "RBC体积分布宽度结果标志",
            "红细胞压积结果标志", "平均RBC血红蛋白含量结果标志", "平均RBC血红蛋白浓度结果标志",
            "平均红细胞体积结果标志", 'Braden评分（压疮评分）', 'NRS2002评分（营养风险筛查）',
            'GlasGow评分（昏迷评分）', 'CPOT评分（疼痛评分表）',  '误吸/窒息评分', '肠内营养耐受性',
            'VTE评分（静脉血栓栓塞症风险）', '非计划拔管评估']
target = '院感'

X = data[features]
y = data[target]


# NOTE(review): the first two entries of `features` are listed in neither
# `categorical_features` nor `numeric_features`, so ColumnTransformer (default
# remainder='drop') discards them before modelling -- confirm this is intended.
categorical_features = ['白细胞计数结果标志', '单核细胞计数结果标志', '红细胞结果标志',
            '淋巴细胞计数结果标志', '血红蛋白结果标志', '血小板计数结果标志', '葡萄糖结果标志',
            '白蛋白结果标志', '总胆红素结果标志', "C反应蛋白结果标志", "RBC体积分布宽度结果标志",
            "红细胞压积结果标志", "平均RBC血红蛋白含量结果标志", "平均RBC血红蛋白浓度结果标志",
            "平均红细胞体积结果标志"]
# The encoder is fitted on training rows only, so a flag level that occurs
# only in held-out rows must be encoded as all zeros instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')


numeric_features = ['年龄', 'Braden评分（压疮评分）', 'NRS2002评分（营养风险筛查）', 'GlasGow评分（昏迷评分）',
                    'CPOT评分（疼痛评分表）',  '误吸/窒息评分', '肠内营养耐受性', 'VTE评分（静脉血栓栓塞症风险）', '非计划拔管评估']

numeric_transformer = StandardScaler()


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


# Hold out the test rows BEFORE preprocessing, resampling or feature selection.
# Resampling first would place synthetic points interpolated from training rows
# into the test set (and vice versa), inflating every AUC.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_preprocessed = preprocessor.fit_transform(X_train_raw)  # statistics learned from training rows only
X_test_preprocessed = preprocessor.transform(X_test_raw)


# Oversample the minority class on the training rows only.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_preprocessed, y_train_raw)


# L1-penalised logistic regression picks the features; fitted on the resampled training set.
selector = SelectFromModel(LogisticRegression(penalty="l1", solver='liblinear'))
X_selected = selector.fit_transform(X_resampled, y_resampled)

X_train, y_train = X_selected, y_resampled
X_test = selector.transform(X_test_preprocessed)  # real, non-synthetic rows only


# NOTE(review): "MLP" and "Neural Networks" are both a default MLPClassifier,
# so the comparison contains nine distinct model types, not ten.
classifiers = {
    "Random Forest": RandomForestClassifier(),
    "XGBoost": xgb.XGBClassifier(),
    "MLP": MLPClassifier(),
    "LightGBM": lgb.LGBMClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Decision tree": DecisionTreeClassifier(),
    "Naive bayes": GaussianNB(),
    "AdaBoost": AdaBoostClassifier(),
    "CatBoost": CatBoostClassifier(verbose=False),
    "Neural Networks": MLPClassifier()
}


# Fit every model and record its ROC curve and AUC on the held-out rows.
results = {}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_test)[:, 1]  # probability of the positive class
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    results[name] = {"fpr": fpr, "tpr": tpr, "auc": auc}


# Best model first so the legend is ordered by descending AUC.
sorted_results = sorted(results.items(), key=lambda x: x[1]["auc"], reverse=True)


plt.figure(figsize=(10, 8))
for name, result in sorted_results:
    plt.plot(result["fpr"], result["tpr"], label=f'{name} (AUC = {result["auc"]:.3f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guessing')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()


In [None]:
# The ten-fold cross-validation results of the CatBoost model with SMOTE oversampling
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline


# CatBoost ROC curve with SMOTE oversampling, using an extended feature set.
# NOTE(review): the cell heading mentions ten-fold cross-validation, but this
# code evaluates on a single 80/20 hold-out split and plots only the CatBoost
# curve -- confirm which of the two is intended.
file_path = r'D:\2024_4_29.xlsx'
data = pd.read_excel(file_path)

features = ['性别', '糖尿病', '年龄', '白细胞计数结果标志', '单核细胞计数结果标志', '红细胞结果标志',
            '淋巴细胞计数结果标志', '血红蛋白结果标志', '血小板计数结果标志', '葡萄糖结果标志',
            '白蛋白结果标志', '总胆红素结果标志', "C反应蛋白结果标志", "RBC体积分布宽度结果标志",
            "红细胞压积结果标志", "平均RBC血红蛋白含量结果标志", "平均RBC血红蛋白浓度结果标志",
            "平均红细胞体积结果标志", 'Braden评分（压疮评分）', 'NRS2002评分（营养风险筛查）',
            'GlasGow评分（昏迷评分）', 'CPOT评分（疼痛评分表）',  '误吸/窒息评分', '肠内营养耐受性',
            'VTE评分（静脉血栓栓塞症风险）', '非计划拔管评估',"嗜碱性粒细胞计数结果标志",
            "嗜酸性粒细胞计数结果标志","血小板分布宽度结果标志","血小板压积结果标志","中性粒细胞计数结果标志"]
target = '院感'

X = data[features]
y = data[target]


# NOTE(review): the first two entries of `features` are listed in neither
# `categorical_features` nor `numeric_features`, so ColumnTransformer (default
# remainder='drop') discards them before modelling -- confirm this is intended.
categorical_features = ['白细胞计数结果标志', '单核细胞计数结果标志', '红细胞结果标志',
            '淋巴细胞计数结果标志', '血红蛋白结果标志', '血小板计数结果标志', '葡萄糖结果标志',
            '白蛋白结果标志', '总胆红素结果标志', "C反应蛋白结果标志", "RBC体积分布宽度结果标志",
            "红细胞压积结果标志", "平均RBC血红蛋白含量结果标志", "平均RBC血红蛋白浓度结果标志",
            "平均红细胞体积结果标志","嗜碱性粒细胞计数结果标志","嗜酸性粒细胞计数结果标志",
            "血小板分布宽度结果标志","血小板压积结果标志","中性粒细胞计数结果标志"]
# The encoder is fitted on training rows only, so a flag level that occurs
# only in held-out rows must be encoded as all zeros instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')


numeric_features = ['年龄', 'Braden评分（压疮评分）', 'NRS2002评分（营养风险筛查）', 'GlasGow评分（昏迷评分）',
                    'CPOT评分（疼痛评分表）',  '误吸/窒息评分', '肠内营养耐受性', 'VTE评分（静脉血栓栓塞症风险）', '非计划拔管评估']
numeric_transformer = StandardScaler()


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


# Hold out the test rows BEFORE preprocessing, resampling or feature selection.
# Resampling first would place synthetic points interpolated from training rows
# into the test set (and vice versa), inflating the reported AUC.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_preprocessed = preprocessor.fit_transform(X_train_raw)  # statistics learned from training rows only
X_test_preprocessed = preprocessor.transform(X_test_raw)


# Oversample the minority class on the training rows only.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_preprocessed, y_train_raw)

# L1-penalised logistic regression picks the features; fitted on the resampled training set.
selector = SelectFromModel(LogisticRegression(penalty="l1", solver='liblinear'))
X_selected = selector.fit_transform(X_resampled, y_resampled)

X_train, y_train = X_selected, y_resampled
X_test = selector.transform(X_test_preprocessed)  # real, non-synthetic rows only


# All models are still fitted (as in the other cells) although only the
# CatBoost curve is drawn below.
classifiers = {
    "Random Forest": RandomForestClassifier(),
    "XGBoost": xgb.XGBClassifier(),
    "MLP": MLPClassifier(),
    "LightGBM": lgb.LGBMClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Decision tree": DecisionTreeClassifier(),
    "Naive bayes": GaussianNB(),
    "AdaBoost": AdaBoostClassifier(),
    "CatBoost": CatBoostClassifier(verbose=False),
    "Neural Networks": MLPClassifier()
}


# Fit every model and record its ROC curve and AUC on the held-out rows.
results = {}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_test)[:, 1]  # probability of the positive class
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    results[name] = {"fpr": fpr, "tpr": tpr, "auc": auc}


sorted_results = sorted(results.items(), key=lambda x: x[1]["auc"], reverse=True)


# Draw the CatBoost curve only.
plt.figure(figsize=(10, 8))
catboost_result = results["CatBoost"]
plt.plot(catboost_result["fpr"], catboost_result["tpr"], label=f'CatBoost (AUC = {catboost_result["auc"]:.3f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guessing')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for CatBoost')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

In [None]:
# SHapley Additive exPlanations
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.feature_selection import SelectFromModel


# Train the ten classifiers on the pre-encoded data set (English column names)
# with SMOTE applied to the training rows, and plot their ROC curves. The fitted
# CatBoost model, `selector` and `X_test` are reused for the SHAP analysis below.
file_path = r'D:\2024_5_23.xlsx'
data = pd.read_excel(file_path)


features = ['Age','Braden score','NRS 2002 score','GCS score','PAS','Enteral feeding tolerance',
'CRS','Female','Male','High WBC count','Low WBC count','Medium WBC count',
'High RBC','Low RBC','Medium RBC','High lymphocyte count','Low lymphocyte count','Medium lymphocyte count',
'High hemoglobin level','Low hemoglobin level','Medium hemoglobin level','High total bilirubin level','Low total bilirubin level',
'Medium total bilirubin level','High RBC distribution width','Low RBC distribution width','Medium RBC distribution width','High hematocrit',
'Low hematocrit','Medium hematocrit','High mean corpuscular hemoglobin concentration','Low mean corpuscular hemoglobin concentration',
'Medium mean corpuscular hemoglobin concentration','High mean corpuscular volume','Low mean corpuscular volume','Medium mean corpuscular volume',
'High eosinophil count','low eosinophil count','Medium eosinophil count']

target = 'HAI'

X = data[features]
y = data[target]


# Hold out the test rows BEFORE resampling or feature selection. Resampling
# first would place synthetic points interpolated from training rows into the
# test set (and vice versa), inflating every AUC.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Oversample the minority class on the training rows only.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)


# L1-penalised logistic regression picks the features; fitted on the resampled training set.
selector = SelectFromModel(LogisticRegression(penalty="l1", solver='liblinear'))
X_selected = selector.fit_transform(X_resampled, y_resampled)

X_train, y_train = X_selected, y_resampled
X_test = selector.transform(X_test_raw)  # real, non-synthetic rows only


classifiers = {
    "Random Forest": RandomForestClassifier(),
    "XGBoost": xgb.XGBClassifier(),
    "MLP": MLPClassifier(),
    "LightGBM": lgb.LGBMClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Decision tree": DecisionTreeClassifier(),
    "Naive bayes": GaussianNB(),
    "AdaBoost": AdaBoostClassifier(),
    "CatBoost": CatBoostClassifier(verbose=False),
    "Neural Networks": MLPClassifier()
}


# Fit every model and record its ROC curve and AUC on the held-out rows.
results = {}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_test)[:, 1]  # probability of the positive class
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    results[name] = {"fpr": fpr, "tpr": tpr, "auc": auc}


# Best model first so the legend is ordered by descending AUC.
sorted_results = sorted(results.items(), key=lambda x: x[1]["auc"], reverse=True)


plt.figure(figsize=(10, 8))
for name, result in sorted_results:
    plt.plot(result["fpr"], result["tpr"], label=f'{name} (AUC = {result["auc"]:.3f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guessing')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

import shap
import matplotlib.pyplot as plt


# SHAP summary (violin) plot for the fitted CatBoost model.
# `X_test` only contains the columns kept by `selector`, so the names passed to
# shap must be filtered with the same mask. Passing the full `features` list
# would mislabel the columns (or be rejected for having the wrong length).
feature_names = [name for name, kept in zip(features, selector.get_support()) if kept]


explainer = shap.TreeExplainer(classifiers['CatBoost'])
shap_values = explainer.shap_values(X_test)


# show=False keeps the figure open so it can be resized before display.
shap.summary_plot(shap_values, X_test, feature_names=feature_names, plot_type='violin', show=False)
plt.gcf().set_size_inches(20, 8)

plt.show()