In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

# ==== Load data ====
clinical_df = pd.read_csv("../testdata/dataset1/clinical1.csv")
# Keep only rows where both the patient ID and the outcome label are present.
df = clinical_df[["PatientID", "deadstatus.event"]].dropna()

# ==== Attach clinical features ====
# The label and "Survival.time" are removed from the feature table
# (presumably because survival time would leak the outcome -- confirm).
clinical_features = clinical_df.set_index("PatientID").drop(columns=["Survival.time", "deadstatus.event"])
df = df.merge(clinical_features, left_on="PatientID", right_index=True)

# ==== Feature matrix and target ====
# X has to exist before the dtype-based column selection below. It used to be
# assigned only further down the script, so a fresh top-to-bottom run failed
# with NameError (it only worked on leftover notebook state).
X = df.drop(columns=["PatientID", "deadstatus.event"])
y = df["deadstatus.event"]

# ==== Split numeric and categorical features ====
# X already excludes the target column, so no extra filtering of num_cols is needed.
num_cols = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# ==== Preprocessing pipelines ====
# Numeric columns: fill missing values with the median, then standardize.
num_steps = [SimpleImputer(strategy="median"), StandardScaler()]
num_pipeline = make_pipeline(*num_steps)

# Categorical columns: fill missing values with the mode, then one-hot encode.
# Categories unseen during fit are ignored at transform time.
cat_steps = [SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore")]
cat_pipeline = make_pipeline(*cat_steps)

# Route each column group to its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ]
)

# ==== Final model pipeline ====
# Preprocessing followed by a random forest with a fixed seed.
forest = RandomForestClassifier(random_state=42, n_estimators=100)
model = make_pipeline(preprocessor, forest)

# ==== Cross-validated evaluation ====
X = df.drop(columns=["PatientID", "deadstatus.event"])
y = df["deadstatus.event"]

# Fixed seed so the fold assignment is the same on every run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Score both metrics in one pass: each fold's model is fitted once and
# evaluated with F1 and ROC AUC, instead of refitting the whole forest a
# second time just to get the other metric.
cv_results = cross_validate(model, X, y, cv=skf, scoring=["f1", "roc_auc"])
f1_scores = cv_results["test_f1"]
auc_scores = cv_results["test_roc_auc"]

# ==== Report results ====
print("临床特征模型评估")
print(f"平均 F1: {f1_scores.mean():.3f}  每折: {f1_scores}")
print(f"平均 AUC: {auc_scores.mean():.3f}  每折: {auc_scores}")


临床特征模型评估
平均 F1: 0.921  每折: [0.91719745 0.93589744 0.90196078 0.92207792 0.92903226]
平均 AUC: 0.598  每折: [0.64       0.60933333 0.58592593 0.475      0.68040541]
