In [None]:
# Baseline: stratified 10-fold cross-validation of a LightGBM classifier on all
# descriptor columns of merged_all.xlsx, reporting accuracy, macro-F1 and OvR AUC.
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate

path = r"C:\**\**\深度学习验证\merged_all.xlsx"
df = pd.read_excel(path)

# Only the last expression of a cell is rendered, so intermediate summaries
# are printed explicitly instead of being silently discarded.
print(df.head())
df.info()
print(df["Level_OA"].value_counts())

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2025)

# `skf.split` yields *positional* row indices. Collect them in a NumPy array
# and assign the column once, so the result does not depend on the DataFrame
# index being a default RangeIndex (label-based `.loc` would).
fold_ids = np.full(len(df), -1, dtype=int)
for fold, (_, val_idx) in enumerate(skf.split(df, df["Level_OA"])):
    fold_ids[val_idx] = fold
df["fold_id"] = fold_ids

print(df["fold_id"].value_counts())

# Every column except identifiers, the target, the fold marker and P_OA is a feature.
feature_cols = [c for c in df.columns if c not in
                ["Name","Level_OA","fold_id","P_OA"]]
X = df[feature_cols].values
y = df["Level_OA"].values

model = LGBMClassifier(num_leaves=31, learning_rate=0.05,
                       n_estimators=300, random_state=2025)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2025)

# One cross-validation pass provides every metric; previously the same
# 10-fold fit was repeated just to report macro-F1 a second time.
cv_results = cross_validate(model, X, y, cv=cv,
                            scoring=["accuracy", "f1_macro", "roc_auc_ovr"],
                            return_train_score=False)

# Per-fold macro-F1 scores, kept under the name `scores` for later inspection.
scores = cv_results["test_f1_macro"]
print("Macro-F1 (10-fold):", scores.mean(), "±", scores.std())

print("Accuracy:", cv_results["test_accuracy"].mean())
print("Macro-F1:", cv_results["test_f1_macro"].mean())
print("AUC:", cv_results["test_roc_auc_ovr"].mean())


In [None]:
import os, random, numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from scipy.special import softmax   # ✅ 用 scipy 的 softmax
import torch, torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# Seed every RNG used below (Python, NumPy, PyTorch).
SEED = 2025
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

PATH = r"C:\**\**\深度学习验证\merged_all.xlsx"

# Read the workbook with openpyxl first; if that fails for any reason, retry
# with xlrd. When both fail, the xlrd error is raised with the openpyxl error
# attached as its explicit cause so neither failure is hidden.
read_ok = False
try:
    df = pd.read_excel(PATH, engine="openpyxl")
except Exception as openpyxl_error:
    try:
        df = pd.read_excel(PATH, engine="xlrd")
    except Exception as xlrd_error:
        raise xlrd_error from openpyxl_error
read_ok = True  # only reached when one of the two engines succeeded
print(f"数据行数: {len(df)}")

# Columns that the rest of the notebook relies on.
need_cols = ["Name","Level_OA","G","P_PI3K","P_PPAR","P_ROS","P_LPS"]
for c in need_cols:
    assert c in df.columns, f"缺列：{c}"
df["Level_OA"] = df["Level_OA"].astype(int)

# Create stratified fold ids only when the workbook does not already carry them.
if "fold_id" not in df.columns:
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
    fold_ids = np.full(len(df), -1, dtype=int)
    for fold, (_, val_idx) in enumerate(skf.split(df, df["Level_OA"])):
        fold_ids[val_idx] = fold
    df["fold_id"] = fold_ids
print("fold 分布：", df["fold_id"].value_counts().to_dict())

# Structural descriptors = every numeric column that is not an identifier,
# the target, the fold marker, or one of the G / P_* columns.
# `is_numeric_dtype` is used instead of `dtype != 'O'` so that datetime,
# categorical and pandas string-dtype columns are not mistaken for features.
exclude_cols = {"Name","Level_OA","fold_id","G","P_PI3K","P_PPAR","P_ROS","P_LPS","P_OA"}
X_cols = [c for c in df.columns
          if c not in exclude_cols and pd.api.types.is_numeric_dtype(df[c])]
print(f"结构描述符个数: {len(X_cols)}")

def feature_columns(mode):
    """Return the list of feature column names for an ablation mode.

    Supported modes and the columns appended to the module-level ``X_cols``:

    * ``"X-only"`` -- nothing (``X_cols`` itself is returned)
    * ``"X+Y"``    -- ``G``
    * ``"X+Z"``    -- ``P_PI3K``, ``P_PPAR``, ``P_ROS``, ``P_LPS``
    * ``"X+Y+Z"``  -- ``G`` followed by the four ``P_*`` columns

    Raises:
        ValueError: if ``mode`` is not one of the four names above; the
            offending value is the exception's only argument.
    """
    pathway_cols = ["P_PI3K","P_PPAR","P_ROS","P_LPS"]
    extras_by_mode = (
        ("X-only", None),
        ("X+Y", ["G"]),
        ("X+Z", pathway_cols),
        ("X+Y+Z", ["G"] + pathway_cols),
    )
    for known_mode, extras in extras_by_mode:
        if mode == known_mode:
            # "X-only" hands back the shared list object, not a copy.
            return X_cols if extras is None else X_cols + extras
    raise ValueError(mode)

# =========================
# 1) 轻量 Transformer 模型
# =========================
class TinyTabTransformer(nn.Module):
    """Small Transformer-encoder classifier for tabular feature vectors.

    Each input row is linearly projected to one ``d_model``-wide token, passed
    through a stack of ``n_layers`` Transformer encoder layers (GELU
    activation, feed-forward width ``2 * d_model``), layer-normalised, and
    classified by a two-layer MLP head.

    Args:
        in_dim: number of input features per row.
        n_classes: number of output logits.
        d_model: token / hidden width.
        n_heads: attention heads per encoder layer.
        n_layers: number of stacked encoder layers.
        dropout: dropout rate used in the encoder layers and in the head.
    """

    def __init__(self, in_dim, n_classes=3, d_model=64, n_heads=4, n_layers=2, dropout=0.35):
        super().__init__()
        head_width = d_model // 2

        # Feature vector -> single token embedding.
        self.proj = nn.Linear(in_dim, d_model)

        # Encoder stack built from one template layer.
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=n_heads,
                dim_feedforward=d_model*2,
                dropout=dropout,
                batch_first=True,
                activation="gelu",
            ),
            num_layers=n_layers,
        )

        self.norm = nn.LayerNorm(d_model)

        # Classification head: d_model -> d_model // 2 -> n_classes.
        self.head = nn.Sequential(
            nn.Linear(d_model, head_width),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(head_width, n_classes),
        )

    def forward(self, x):
        """Map a ``(B, in_dim)`` batch to ``(B, n_classes)`` logits."""
        token = self.proj(x).unsqueeze(1)       # (B, 1, d_model)
        encoded = self.encoder(token)           # (B, 1, d_model)
        pooled = self.norm(encoded.squeeze(1))  # (B, d_model)
        return self.head(pooled)                # (B, n_classes)

def train_one_fold(train_X, train_y, val_X, val_y,
                   epochs=500, batch_size=16, lr=1e-3, weight_decay=1e-4,
                   d_model=64, n_heads=4, n_layers=2, dropout=0.35, patience=50):
    """Train a TinyTabTransformer on one fold and score it on the validation split.

    The model is trained with Adam and a class-weighted cross-entropy loss
    (weight of a class = total training samples / samples of that class).
    After every epoch the macro-F1 on the validation split is computed; the
    weights with the best macro-F1 are kept, and training stops once
    ``patience`` consecutive epochs bring no improvement.

    NOTE(review): the checkpoint is selected on the same validation split that
    the returned metrics are computed on, so those metrics are optimistically
    biased. Confirm this is acceptable for the intended comparison.

    Args:
        train_X, val_X: 2-D float arrays of features (rows = samples).
        train_y, val_y: 1-D integer label arrays. Labels are mapped to
            ``0..K-1`` by their rank among the sorted training labels, which
            for labels ``1..K`` equals ``y - 1``.
        epochs: maximum number of training epochs.
        batch_size: mini-batch size for both loaders.
        lr, weight_decay: Adam hyper-parameters.
        d_model, n_heads, n_layers, dropout: forwarded to TinyTabTransformer.
        patience: early-stopping patience in epochs.

    Returns:
        ``(accuracy, macro_f1, macro_ovr_auc)`` on the validation split.
        The AUC is ``nan`` when scikit-learn cannot compute it (it raises
        ``ValueError``, e.g. when a class is absent from the validation labels).

    Raises:
        ValueError: if ``val_y`` contains a label that never occurs in ``train_y``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_y = np.asarray(train_y)
    val_y = np.asarray(val_y)

    classes, counts = np.unique(train_y, return_counts=True)
    n_classes = len(classes)

    unseen = np.setdiff1d(val_y, classes)
    if unseen.size:
        raise ValueError(
            f"validation labels {unseen.tolist()} do not occur in the training labels {classes.tolist()}"
        )

    # Contiguous 0-based targets; `classes` is sorted, so searchsorted gives the rank.
    train_y0 = np.searchsorted(classes, train_y)
    val_y0 = np.searchsorted(classes, val_y)

    model = TinyTabTransformer(in_dim=train_X.shape[1], n_classes=n_classes,
                               d_model=d_model, n_heads=n_heads,
                               n_layers=n_layers, dropout=dropout).to(device)

    # Inverse-frequency class weights, ordered like `classes`.
    weights = torch.tensor(counts.sum() / counts, dtype=torch.float32).to(device)
    criterion = nn.CrossEntropyLoss(weight=weights)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    train_ds = TensorDataset(torch.tensor(train_X, dtype=torch.float32),
                             torch.tensor(train_y0, dtype=torch.long))
    val_ds   = TensorDataset(torch.tensor(val_X, dtype=torch.float32),
                             torch.tensor(val_y0, dtype=torch.long))
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    best_f1, best_state, no_improve = -1.0, None, 0

    for epoch in range(epochs):
        # ---- one optimisation pass over the training split ----
        model.train()
        for bx, by in train_loader:
            bx, by = bx.to(device), by.to(device)
            logits = model(bx)
            loss = criterion(logits, by)
            optimizer.zero_grad(); loss.backward(); optimizer.step()

        # ---- macro-F1 on the validation split drives checkpointing ----
        model.eval()
        all_pred, all_true = [], []
        with torch.no_grad():
            for bx, by in val_loader:
                logits = model(bx.to(device))
                prob = torch.softmax(logits, dim=1).cpu().numpy()
                all_pred.append(prob.argmax(axis=1))
                all_true.append(by.numpy())

        f1 = f1_score(np.concatenate(all_true), np.concatenate(all_pred), average='macro')

        if f1 > best_f1:
            best_f1 = f1
            # Snapshot on CPU so later optimiser steps cannot alter it.
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= patience:
                break

    # With epochs=0 no snapshot exists; evaluate the untrained model instead of crashing.
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
    with torch.no_grad():
        val_logits = model(torch.tensor(val_X, dtype=torch.float32).to(device)).cpu().numpy()
    val_prob = softmax(val_logits, axis=1)
    val_pred = val_prob.argmax(axis=1)

    acc = accuracy_score(val_y0, val_pred)
    f1  = f1_score(val_y0, val_pred, average='macro')
    try:
        if n_classes == 2:
            # Binary case: score with the probability of the second class.
            auc = roc_auc_score(val_y0, val_prob[:, 1])
        else:
            auc = roc_auc_score(label_binarize(val_y0, classes=list(range(n_classes))),
                                val_prob, average='macro', multi_class='ovr')
    except ValueError:
        # Raised by scikit-learn when the AUC is undefined for this split.
        auc = np.nan
    return acc, f1, auc

# =========================
# 2) 10-fold 训练
# =========================
# Cross-validated comparison of the four feature sets. For every mode, each
# fold is standardised with statistics from its own training part only, a
# TinyTabTransformer is trained via `train_one_fold`, and the per-fold metrics
# are summarised as mean / standard deviation.
modes = ["X-only","X+Y","X+Z","X+Y+Z"]
results = {}

# Iterate over the fold ids that are actually present: `fold_id` may have come
# from the workbook rather than from the 0..9 assignment, and a hard-coded
# range(10) would then produce empty validation sets.
fold_values = sorted(df["fold_id"].dropna().unique())

for mode in modes:
    cols = feature_columns(mode)
    print(f"\n=== 模式: {mode} | 特征数: {len(cols)} ===")
    acc_list, f1_list, auc_list = [], [], []

    for fold in fold_values:
        train_idx = df["fold_id"] != fold
        val_idx   = df["fold_id"] == fold

        X_train = df.loc[train_idx, cols].values
        y_train = df.loc[train_idx, "Level_OA"].astype(int).values
        X_val   = df.loc[val_idx,   cols].values
        y_val   = df.loc[val_idx,   "Level_OA"].astype(int).values

        # Fit the scaler on the training part only to avoid leaking validation statistics.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val   = scaler.transform(X_val)

        acc, f1, auc = train_one_fold(X_train, y_train, X_val, y_val)
        acc_list.append(acc); f1_list.append(f1); auc_list.append(auc)

    # AUC may be nan for individual folds, hence the nan-aware reductions.
    summary = {
        "acc_mean": np.mean(acc_list), "acc_std": np.std(acc_list),
        "f1_mean":  np.mean(f1_list),  "f1_std":  np.std(f1_list),
        "auc_mean": np.nanmean(auc_list), "auc_std": np.nanstd(auc_list),
    }
    results[mode] = summary
    print(f"[{mode}] {len(fold_values)}-fold 结果：Acc={summary['acc_mean']:.3f}±{summary['acc_std']:.3f} | "
          f"F1={summary['f1_mean']:.3f}±{summary['f1_std']:.3f} | "
          f"AUC={summary['auc_mean']:.3f}±{summary['auc_std']:.3f}")

pd.DataFrame(results).T



In [None]:
# Randomised hyper-parameter search for a class-balanced LightGBM classifier on
# the full "X+Y+Z" feature set, scored by macro-F1 under stratified 10-fold CV.
#
# Imported here so the cell does not fail with a NameError on a fresh kernel:
# RandomizedSearchCV is not imported by any other visible cell.
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

X_all = df.loc[:, feature_columns("X+Y+Z")].values
y_all = df["Level_OA"].astype(int).values

# NOTE(review): `param_dist` is not defined in any visible cell. It must be a
# mapping of LGBMClassifier parameter names to distributions or lists, defined
# before this cell runs -- confirm where the intended search space lives.
rs = RandomizedSearchCV(
    LGBMClassifier(random_state=2025, class_weight="balanced"),
    param_distributions=param_dist, n_iter=100,
    scoring="f1_macro",
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=2025),
    n_jobs=-1, random_state=2025
)
rs.fit(X_all, y_all)
print(rs.best_params_, rs.best_score_)


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Fill missing / infinite numeric values in the large prediction workbook and
# write the completed table to a new file.
PATH_IN  = r"C:\**\**\大样本预测\7565.xlsx"
PATH_OUT = r"C:\**\**\大样本预测\7565_filled.xlsx"

df = pd.read_excel(PATH_IN)

num_cols = [c for c in df.columns if np.issubdtype(df[c].dtype, np.number)]

if num_cols:
    # Treat +/-inf as missing so they are imputed like NaN.
    df[num_cols] = df[num_cols].replace([np.inf, -np.inf], np.nan)

    # SimpleImputer discards columns that contain no observed value at all, so
    # assigning its output back to every numeric column would fail with a shape
    # mismatch. Impute only columns that have a median and zero-fill the rest.
    all_nan_cols = [c for c in num_cols if df[c].isna().all()]
    imputable_cols = [c for c in num_cols if c not in all_nan_cols]

    if imputable_cols:
        imputer = SimpleImputer(strategy="median")
        df[imputable_cols] = imputer.fit_transform(df[imputable_cols])

    if all_nan_cols:
        df[all_nan_cols] = 0.0

    # Safety net: nothing numeric may remain missing.
    df[num_cols] = df[num_cols].fillna(0.0)

# A bare file name has an empty dirname, which os.makedirs rejects.
out_dir = os.path.dirname(PATH_OUT)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df.to_excel(PATH_OUT, index=False)
