# In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# 1. Load the data.
df = pd.read_csv('../testdata/dataset2/clinical2.csv')

# 2. Drop the information-leakage columns plus the unique ID.
leak_cols = [
    "Time to Death (days)", "Date of Death", "Date of Last Known Alive",
    "Date of Recurrence", "Recurrence", "Recurrence Location", "CT Date", "PET Date"
]
df = df.drop(columns=[*leak_cols, "Case ID"])

# 3. Convert the numeric fields; values that cannot be parsed become NaN
#    (errors='coerce') instead of raising.
num_cols = [
    "Age at Histological Diagnosis",
    "Weight (lbs)", "Pack Years",
    "Quit Smoking Year", "Days between CT and surgery"
]
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 5. Split features and label.
X = df.drop(columns=["Survival Status"])
y = df["Survival Status"].map({"Alive": 0, "Dead": 1})

# Series.map leaves NaN for every value missing from the mapping. Fail here
# with an explicit message rather than letting a NaN label reach the
# cross-validation step.
unmapped_labels = df.loc[y.isna(), "Survival Status"]
if not unmapped_labels.empty:
    raise ValueError(
        "Unexpected or missing values in 'Survival Status' "
        f"(expected 'Alive' or 'Dead'): {unmapped_labels.unique().tolist()}"
    )

# 6. Separate numeric features from categorical ones by dtype.
#    Only 'number' and 'object' dtypes are selected, so a column of any other
#    dtype ends up in neither list.
numeric_feats = X.select_dtypes(include='number').columns.tolist()
categorical_feats = X.select_dtypes(include='object').columns.tolist()

# 7. Build the preprocessing pipelines:
#    - numeric columns: fill missing values with the median;
#    - categorical columns: fill missing values with the most frequent value,
#      then one-hot encode (categories unseen during fit are ignored).
num_pipeline = SimpleImputer(strategy='median')
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, numeric_feats),
    ('cat', cat_pipeline, categorical_feats),
])

# 8. Full model pipeline: preprocessing followed by a seeded random forest.
model = Pipeline(steps=[
    ('prep', preprocessor),
    ('rf', RandomForestClassifier(random_state=42)),
])

# 9. 5-fold stratified CV, computing F1 and AUC.
#    cross_validate scores both metrics on the same fitted model for each fold,
#    so every fold is trained once instead of once per metric. The splitter and
#    the random forest are both seeded, so the per-fold scores are the same as
#    those from two separate cross_val_score runs.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    model, X, y, cv=cv, scoring=['f1', 'roc_auc'], n_jobs=1
)
f1_scores = cv_results['test_f1']
auc_scores = cv_results['test_roc_auc']

# 10. Print the per-fold scores and their mean ± standard deviation.
print("每折 F1 scores:", f1_scores)
print(f"平均 F1-score: {f1_scores.mean():.3f} ± {f1_scores.std():.3f}")
print("每折 AUC scores:", auc_scores)
print(f"平均 AUC: {auc_scores.mean():.3f} ± {auc_scores.std():.3f}")
