In [20]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# RNA-seq preprocessing: load -> coerce to numeric -> drop sparse genes ->
# CPM + log1p -> keep the most variable genes -> per-gene z-score -> save CSV.

# 1. Load and transpose
fn_in  = "../testdata/dataset2/rnaseq.txt"
fn_out = "../result/rnaseq_processed.csv"
MAX_MISSING_FRAC = 0.5   # genes whose missing fraction reaches this value are dropped
N_TOP_GENES      = 150   # number of highest-variance genes kept
os.makedirs(os.path.dirname(fn_out), exist_ok=True)

df = pd.read_csv(fn_in, sep="\t", index_col=0)  # raw layout: genes x samples
df = df.T                                       # working layout: samples x genes

# 2. Force every column to numeric; values that cannot be parsed become NaN
df = df.apply(pd.to_numeric, errors="coerce")

# 3. Drop genes that are missing in 50% or more of the samples
#    (only genes with a missing fraction strictly below the threshold are kept)
missing_frac = df.isna().mean(axis=0)
keep_genes   = missing_frac[missing_frac < MAX_MISSING_FRAC].index
df = df[keep_genes]

# 4. Fill the remaining missing values with 0 (treated as "not detected")
df = df.fillna(0)

# 5. Library-size normalisation: Counts Per Million (CPM) followed by log1p
#    CPM = (counts / per-sample total) * 1e6
counts   = df
lib_size = counts.sum(axis=1)
# A sample whose total is 0 would turn every one of its values into NaN (0/0).
# Mask such totals before dividing and report the sample as all-zero CPM instead.
cpm     = counts.div(lib_size.where(lib_size != 0), axis=0).fillna(0) * 1e6
log_cpm = np.log1p(cpm)

# 6. Variance filter: keep the N_TOP_GENES genes with the highest variance
var = log_cpm.var(axis=0)
top_genes = var.sort_values(ascending=False).head(N_TOP_GENES).index
df_sel   = log_cpm[top_genes]

# 7. Per-gene z-score standardisation (each column is scaled independently)
scaler    = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_sel),
    index=df_sel.index,
    columns=df_sel.columns
)

# 8. Save the result
df_scaled.to_csv(fn_out)
print("处理完成，输出保存在：", fn_out)


处理完成，输出保存在： ../result/rnaseq_processed.csv


In [15]:
import os
import pandas as pd
import numpy as np

# Clinical preprocessing: load -> drop unused fields -> index by case ->
# coerce numeric columns -> impute -> report missing counts -> save CSV.

# 1. Load the raw clinical table
clin = pd.read_csv("../testdata/dataset2/clinical2.csv")

# 2. Drop the fields that are not used, then
# 3. use "Case ID" as the index
drop_cols = [
    "Time to Death (days)",
    "Date of Death",
    "Date of Last Known Alive",
    "Date of Recurrence",
    "Recurrence",
    "Recurrence Location",
    "CT Date",
    "PET Date",
]
clin = clin.drop(columns=drop_cols).set_index("Case ID")

# 4. Name the numeric columns explicitly and convert only those;
#    values that cannot be parsed become NaN
num_cols = [
    "Age at Histological Diagnosis",
    "Weight (lbs)",
    "Pack Years",
    "Quit Smoking Year",
    "Days between CT and surgery",
]
clin[num_cols] = clin[num_cols].apply(pd.to_numeric, errors="coerce")

# 5. Impute missing values
# 5.1 Numeric columns: fill with the column median
for col in num_cols:
    clin[col] = clin[col].fillna(clin[col].median())

# 5.2 Every other column except "Survival Status": fill with the most
#     frequent value; a column with no non-missing value is left untouched
cat_cols = clin.columns.difference(num_cols + ["Survival Status"])
for col in cat_cols:
    modes = clin[col].mode(dropna=True)
    if modes.empty:
        continue
    clin[col] = clin[col].fillna(modes.iloc[0])

# 6. Report the remaining missing-value count per column
#    ("Survival Status" is never imputed above)
print(clin.isna().sum())

# 7. Save
os.makedirs("../result", exist_ok=True)
clin.to_csv("../result/clinical2_processed.csv")

Patient affiliation                                  0
Age at Histological Diagnosis                        0
Weight (lbs)                                         0
Gender                                               0
Ethnicity                                            0
Smoking status                                       0
Pack Years                                           0
Quit Smoking Year                                    0
%GG                                                  0
Tumor Location (choice=RUL)                          0
Tumor Location (choice=RML)                          0
Tumor Location (choice=RLL)                          0
Tumor Location (choice=LUL)                          0
Tumor Location (choice=LLL)                          0
Tumor Location (choice=L Lingula)                    0
Tumor Location (choice=Unknown)                      0
Histology                                            0
Pathological T stage                                 0
Pathologic

In [19]:
import pandas as pd
from sklearn.ensemble        import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing   import OneHotEncoder
from sklearn.impute          import SimpleImputer
from sklearn.compose         import ColumnTransformer
from sklearn.pipeline        import Pipeline

# 1. Load the preprocessed tables
rna  = pd.read_csv("../result/rnaseq_processed.csv", index_col=0)
clin = pd.read_csv("../result/clinical2_processed.csv", index_col="Case ID")

# 2. Keep only the cases present in both tables, in the same row order
common = clin.index.intersection(rna.index)
clin, rna = clin.loc[common], rna.loc[common]
# Binary target: 1 where "Survival Status" equals "Dead", 0 for any other value
y = clin["Survival Status"].eq("Dead").astype(int)

# 3. Top-10 clinical features (numeric columns plus some one-hot dummy columns).
#    NOTE(review): this list mirrors the ranking printed by the feature-importance
#    cell, which fits its forest on every labelled case; the CV scores reported
#    for the clinical feature sets are therefore computed on features that were
#    chosen with knowledge of the held-out labels.
top10 = [
    "Days between CT and surgery",
    "Age at Histological Diagnosis",
    "Weight (lbs)",
    "Pack Years",
    "Quit Smoking Year",
    "%GG_0%",
    "Gender_Male",
    "Gender_Female",
    "Pathological T stage_T2b",
    "Pathological N stage_N2",
]

# 4. Impute + one-hot encode the whole clinical table once and wrap the
#    result in a DataFrame. The transformer is fit on all common cases,
#    i.e. before any cross-validation split is made.
num_cols = [
    "Days between CT and surgery",
    "Age at Histological Diagnosis",
    "Weight (lbs)",
    "Pack Years",
    "Quit Smoking Year",
]
# Everything that is neither numeric nor the target is treated as categorical
excluded = set(num_cols) | {"Survival Status"}
cat_cols = [col for col in clin.columns if col not in excluded]

cat_pipeline = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])
pre_full = ColumnTransformer(
    [
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("cat", cat_pipeline, cat_cols),
    ],
    remainder="drop",
)

# fit_transform returns a numpy array; rebuild the column names in the same
# order as the transformer output (numeric block first, then the dummies)
clin_matrix = pre_full.fit_transform(clin.drop(columns="Survival Status"))
ohe = pre_full.named_transformers_["cat"].named_steps["ohe"]
feat_names = num_cols + list(ohe.get_feature_names_out(cat_cols))
X_clin_ohe = pd.DataFrame(clin_matrix, index=clin.index, columns=feat_names)

# 5. Build the three feature sets
X_rna      = rna
X_clin_top = X_clin_ohe[top10]
X_comb_top = pd.concat([X_clin_top, X_rna], axis=1)

# 6. Evaluation configuration
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
clf = RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=3,
    random_state=42,
    n_jobs=-1,
)
# Display name -> scikit-learn scorer name; insertion order is the print order
scoring = {
    "F1": "f1",
    "AUC": "roc_auc",
    "Precision": "precision",
    "Recall": "recall",
}

def eval_model(X, name):
    """Cross-validate the shared classifier on ``X`` and print a metric summary.

    Reads the module-level ``clf``, ``y``, ``cv`` and ``scoring`` objects.

    Args:
        X: Feature matrix passed to ``cross_validate`` together with ``y``.
        name: Label printed in the section header of the report.

    Returns:
        None. One line per entry of ``scoring`` is printed, showing the mean
        and standard deviation of that metric's per-fold test scores.
    """
    cv_results = cross_validate(
        clf, X, y,
        cv=cv,
        scoring=scoring,
        return_train_score=False,
    )
    print(f"\n=== {name} ===")
    for metric in scoring:
        # cross_validate stores the per-fold test scores under "test_<metric>"
        fold_scores = cv_results["test_" + metric]
        mean, spread = fold_scores.mean(), fold_scores.std()
        print(f"{metric:9s}: {mean:.3f} ± {spread:.3f}")

# 7. Evaluate each of the three feature sets in turn
for feature_set, title in (
    (X_rna,      "RNA-Seq Only"),
    (X_clin_top, "Clinical Top10 Only"),
    (X_comb_top, "Combined (RNA + Clin Top10)"),
):
    eval_model(feature_set, title)



=== RNA-Seq Only ===
F1       : 0.253 ± 0.093
AUC      : 0.584 ± 0.123
Precision: 0.405 ± 0.179
Recall   : 0.200 ± 0.083

=== Clinical Top10 Only ===
F1       : 0.316 ± 0.134
AUC      : 0.617 ± 0.064
Precision: 0.660 ± 0.307
Recall   : 0.222 ± 0.099

=== Combined (RNA + Clin Top10) ===
F1       : 0.334 ± 0.121
AUC      : 0.578 ± 0.105
Precision: 0.540 ± 0.182
Recall   : 0.267 ± 0.151


In [12]:
import pandas as pd
from sklearn.ensemble        import RandomForestClassifier
from sklearn.preprocessing   import OneHotEncoder
from sklearn.impute          import SimpleImputer
from sklearn.compose         import ColumnTransformer
from sklearn.pipeline        import Pipeline

# Rank the clinical features by random-forest importance and print the top ten.

# 1. Load the processed clinical table and split it into target and features
clin = pd.read_csv("../result/clinical2_processed.csv", index_col="Case ID")
y = clin["Survival Status"].eq("Dead").astype(int)   # 1 where status equals "Dead"
X = clin.drop(columns=["Survival Status"])

# 2. Split the columns by dtype
num_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)
cat_cols = list(X.select_dtypes(include=["object"]).columns)

# 3. Build the pipeline: impute, then one-hot encode, then random forest
cat_steps = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])
pre = ColumnTransformer([
    ("num", SimpleImputer(strategy="median"), num_cols),
    ("cat", cat_steps, cat_cols),
])
forest = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
pipe = Pipeline([("prep", pre), ("rf", forest)])

# 4. Fit on every labelled case (no train/test split is made here)
pipe.fit(X, y)

# 5. Recover the feature names after one-hot encoding, in the same order as
#    the transformer output: numeric block first, then the dummy columns
fitted_ohe = pipe.named_steps["prep"].named_transformers_["cat"].named_steps["ohe"]
ohe_feats = fitted_ohe.get_feature_names_out(cat_cols).tolist()
all_feats = num_cols + ohe_feats

# 6. Rank by importance and report the ten largest
importances = pd.Series(pipe.named_steps["rf"].feature_importances_, index=all_feats)
top10 = importances.nlargest(10).index.tolist()

print("Top 10 临床特征：", top10)


Top 10 临床特征： ['Days between CT and surgery', 'Age at Histological Diagnosis', 'Weight (lbs)', 'Pack Years', 'Quit Smoking Year', '%GG_0%', 'Gender_Male', 'Pathological T stage_T2b', 'Pathological N stage_N2', 'Gender_Female']
