In [4]:
from pepbenchmark.single_peptide.singeltask_dataset import SingleTaskDatasetManager

from pepbenchmark.pep_utils.convert import Fasta2Smiles, Smiles2FP

# Build the dataset manager for the "AV_APML" dataset, requesting the official
# "fasta", "ecfp6" and "label" features. `force_download=False` is passed
# through unchanged (presumably it allows reuse of already-downloaded files —
# confirm against SingleTaskDatasetManager).
dataset = SingleTaskDatasetManager(
    force_download=False,
    official_feature_names=["fasta", "ecfp6", "label"],
    dataset_name="AV_APML",
)




[2025-07-08 20:21:03,619][INFO][pepbenchmark] Set official feature: fasta successfully


[2025-07-08 20:21:03,742][INFO][pepbenchmark] Set official feature: ecfp6 successfully
[2025-07-08 20:21:03,748][INFO][pepbenchmark] Set official feature: label successfully


In [None]:
from pepbenchmark.evaluator import evaluate_classification, get_recommended_metrics
from sklearn.ensemble import RandomForestClassifier
import numpy as np
# Official split to benchmark: either "random_split" or "mmseqs2_split".
split_type = "mmseqs2_split"
# split_type = "random_split"
select_metrics = {"roc-auc"}
all_metrics = {"Train": [], "Valid": [], "Test": []}

# One run per fold seed; change the range to try other fold seeds.
for fold_seed in range(5):
    dataset.set_official_split_indices(split_type=split_type, fold_seed=fold_seed)
    # To test a different partition instead:
    # dataset.set_user_split(split_indices={"train": [], "valid": [], "test": []})
    train_features, valid_features, test_features = dataset.get_train_val_test_features(format="dict")

    # Fit a fresh random forest on the training split's official_ecfp6 feature.
    model = RandomForestClassifier(random_state=42)
    model.fit(X=train_features["official_ecfp6"], y=train_features["official_label"])

    # Score the fitted model on every split, in Train / Valid / Test order.
    splits = {"Train": train_features, "Valid": valid_features, "Test": test_features}
    for name, features in splits.items():
        X = features["official_ecfp6"]
        y = features["official_label"]
        preds = model.predict(X)
        # Positive-class probability when the estimator exposes it, else None.
        if hasattr(model, "predict_proba"):
            probs = model.predict_proba(X)[:, 1]
        else:
            probs = None
        metrics = evaluate_classification(y_true=y, y_pred=preds, y_score=probs)

        # Keep only the metrics listed in select_metrics.
        filtered = {}
        for key, value in metrics.items():
            if key in select_metrics:
                filtered[key] = value
        print(f"{name} set metrics (fold {fold_seed}): {filtered}", flush=True)

        # Store the per-fold result for the summary below.
        all_metrics[name].append(filtered)

# Mean and (population) standard deviation of each metric across folds.
print("\n=== Metrics Summary ===")
for name, fold_results in all_metrics.items():
    print(f"\n{name} set:")
    for metric in select_metrics:
        values = np.array([m[metric] for m in fold_results])
        mean = values.mean()
        std = values.std()
        print(f"  {metric}: mean = {mean:.4f}, std = {std:.4f}")

[2025-07-08 20:21:29,573][INFO][pepbenchmark] Set official split ===mmseqs2_split=== with seed ====0=== successfully


Train set metrics (fold 0): {'roc-auc': 0.9999998567486293}
Valid set metrics (fold 0): {'roc-auc': 0.8098706361864257}
Test set metrics (fold 0): {'roc-auc': 0.8152258566978192}


[2025-07-08 20:21:31,013][INFO][pepbenchmark] Set official split ===mmseqs2_split=== with seed ====1=== successfully


Train set metrics (fold 1): {'roc-auc': 0.9999994269945175}
Valid set metrics (fold 1): {'roc-auc': 0.8726171097847712}
Test set metrics (fold 1): {'roc-auc': 0.864201680672269}


[2025-07-08 20:21:32,401][INFO][pepbenchmark] Set official split ===mmseqs2_split=== with seed ====2=== successfully


Train set metrics (fold 2): {'roc-auc': 0.9999998557882557}
Valid set metrics (fold 2): {'roc-auc': 0.8143725734378434}
Test set metrics (fold 2): {'roc-auc': 0.8144107495069034}


[2025-07-08 20:21:33,696][INFO][pepbenchmark] Set official split ===mmseqs2_split=== with seed ====3=== successfully


Train set metrics (fold 3): {'roc-auc': 0.9999994279822503}
Valid set metrics (fold 3): {'roc-auc': 0.7899108663357368}
Test set metrics (fold 3): {'roc-auc': 0.826866181156122}


[2025-07-08 20:21:35,005][INFO][pepbenchmark] Set official split ===mmseqs2_split=== with seed ====4=== successfully


Train set metrics (fold 4): {'roc-auc': 0.9999994231530226}
Valid set metrics (fold 4): {'roc-auc': 0.7709235209235209}
Test set metrics (fold 4): {'roc-auc': 0.8659411478599223}

=== Metrics Summary ===

Train set:
  roc-auc: mean = 1.0000, std = 0.0000

Valid set:
  roc-auc: mean = 0.8115, std = 0.0342

Test set:
  roc-auc: mean = 0.8373, std = 0.0231


: 

In [4]:
# Put the Hugging Face Hub client into offline mode for this kernel.
#
# The previous `!source ~/.bashrc` / `!export HF_HUB_OFFLINE=1` lines had no
# effect: every `!` command runs in its own subshell that is discarded as soon
# as the command finishes, so nothing they set ever reached the kernel process.
# Writing to os.environ changes the kernel's own environment instead.
#
# NOTE(review): anything ~/.bashrc was expected to export must be set here
# explicitly, or exported in the shell that launches Jupyter.
# NOTE(review): huggingface_hub presumably reads HF_HUB_OFFLINE when it is
# first imported, so this cell should run before any import that pulls it in —
# confirm.
import os

os.environ["HF_HUB_OFFLINE"] = "1"

In [5]:
import os
# Point the HTTP_PROXY and HTTPS_PROXY environment variables of this process
# at the proxy listening on 127.0.0.1:7890.
os.environ.update(
    {
        "HTTP_PROXY": "http://127.0.0.1:7890",
        "HTTPS_PROXY": "http://127.0.0.1:7890",
    }
)

In [6]:
from pepbenchmark.pep_utils.convert import Fasta2Embedding

# Read back the official "fasta" feature that the dataset manager already holds.
fasta = dataset.get_official_feature("fasta")

# Use the Fasta2Embedding converter from convert.py directly.
# The checkpoint id is stored in `embedding_model_name` rather than `model`,
# so this cell no longer rebinds the name that the training cells use for
# their classifier object to a plain string.
embedding_model_name = "airkingbd/dplm_150m"
fasta2emb = Fasta2Embedding(embedding_model_name)
emb = fasta2emb(fasta)

# Register the converter output on the dataset as the user feature "embedding".
dataset.set_user_feature("embedding", emb)


[2025-07-07 05:11:27,418][INFO][pepbenchmark] Feature fasta already loaded, skipping download
Some weights of EsmModel were not initialized from the model checkpoint at airkingbd/dplm_150m and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Generating embeddings:   0%|          | 0/4796 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Generating embeddings: 100%|██████████| 4796/4796 [01:50<00:00, 43.42it/s]
[2025-07-07 05:13:20,105][INFO][pepbenchmark] Set user feature: embedding successfully


In [None]:

# Request the current train/valid/test split in "pytorch_dataset" format and
# print the first training sample to inspect which feature keys it carries.
torch_splits = dataset.get_train_val_test_features(format="pytorch_dataset")
train_torch_dataset, valid_torch_dataset, test_pytorch_dataset = torch_splits
first_train_sample = train_torch_dataset[0]
print(first_train_sample)

{'official_fasta': 'ISTTFTTNLTEYPLS', 'official_label': 1, 'user_fingerprint': array([0, 1, 0, ..., 0, 0, 0]), 'user_embedding': array([-3.25971872e-01,  5.59503958e-02,  1.60561502e-01, -1.88873515e-01,
       -3.03648319e-03, -1.24160320e-01, -1.42113522e-01, -2.80184239e-01,
       -1.62246287e-01,  2.56338298e-01, -1.58257633e-02,  1.57343119e-01,
        1.76935956e-01,  2.48851757e-02,  8.26566070e-02, -4.12752256e-02,
       -8.82978886e-02, -6.31294847e-02, -1.48531273e-02,  4.96323667e-02,
       -1.05612129e-01,  1.05967335e-01,  8.16064328e-02,  2.00294182e-01,
        6.25323281e-02,  1.46937266e-01,  1.39937744e-01, -1.35676861e-01,
        6.25423864e-02, -3.22646857e-03, -4.01851721e-02, -1.98922008e-01,
        1.16619885e-01,  1.34749830e-01,  9.61783063e-03,  5.30069023e-02,
        4.97481041e-03,  2.54723672e-02, -1.15661748e-01,  1.09894685e-01,
        1.16682716e-01,  1.37592450e-01,  8.95697400e-02, -1.57308925e-04,
       -1.81789681e-01,  4.60873730e-03, -9.86

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
from pepbenchmark.evaluator import evaluate_classification

# =========================
# Configuration
# =========================
# Every tunable of the MLP experiment lives in this one dict.
config = dict(
    split_type="mmseqs2_split",   # or "random_split"
    fold_seeds=[0, 1, 2, 3, 4],
    batch_size=64,
    epochs=50,
    learning_rate=1e-4,
    hidden_dims=[512, 512, 256],  # widths of the MLP hidden layers
    dropout=0.3,
    metrics={"roc-auc"},          # several may be given, e.g. {"accuracy", "roc-auc", "f1"}
)

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# =========================
# Model definition
# =========================
class MLPClassifier(nn.Module):
    """Feed-forward binary classifier that outputs one probability per sample.

    The network is a stack of ``Linear -> ReLU -> Dropout`` groups, one per
    entry of ``hidden_dims``, followed by ``Linear(prev_dim, 1)`` and a
    ``Sigmoid``.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        hidden_dims: Iterable of hidden-layer widths, applied in order.
        dropout: Dropout probability used after every hidden layer.
    """

    def __init__(self, input_dim, hidden_dims, dropout):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([nn.Linear(prev_dim, h_dim), nn.ReLU(), nn.Dropout(dropout)])
            prev_dim = h_dim
        layers.append(nn.Linear(prev_dim, 1))
        layers.append(nn.Sigmoid())  # binary classification
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return sigmoid probabilities with the trailing size-1 axis removed.

        Only the last axis is squeezed. A bare ``squeeze()`` would also drop
        the batch axis when the batch holds a single sample, producing a 0-d
        tensor that no longer matches a ``(1,)`` label tensor in ``BCELoss``
        and cannot be iterated sample-by-sample.
        """
        return self.model(x).squeeze(-1)

# =========================
# Evaluation function
# =========================
def evaluate(model, loader, name, metrics_to_keep,
             feature_key="user_fingerprint", label_key="official_label"):
    """Score ``model`` on every batch of ``loader`` and print the kept metrics.

    Relies on the module-level ``device`` and ``evaluate_classification``.

    Args:
        model: Module returning one probability per sample.
        loader: Iterable of batches; each batch is a mapping that holds the
            input tensor under ``feature_key`` and the labels under
            ``label_key``.
        name: Label used in the printed line (e.g. "Train").
        metrics_to_keep: Container of metric names to keep from the result of
            ``evaluate_classification``.
        feature_key: Batch key of the model input. Defaults to the key the
            notebook has always used.
        label_key: Batch key of the ground-truth labels.

    Returns:
        dict: The entries of the ``evaluate_classification`` result whose key
        is in ``metrics_to_keep``.
    """
    model.eval()
    all_preds, all_probs, all_labels = [], [], []
    with torch.no_grad():
        for batch in loader:
            X = batch[feature_key].to(torch.float32).to(device)
            # Labels are only collected, never fed to the model, so they do
            # not need to be moved to the compute device.
            y = batch[label_key].to(torch.float32)
            # Flatten to 1-D so a batch holding a single sample (for which the
            # model may return a 0-d tensor) can still be extended element by
            # element below.
            probs = model(X).reshape(-1)
            preds = (probs > 0.5).float()
            all_preds.extend(preds.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(y.reshape(-1).cpu().numpy())
    metrics = evaluate_classification(y_true=all_labels, y_pred=all_preds, y_score=all_probs)
    filtered = {k: v for k, v in metrics.items() if k in metrics_to_keep}
    print(f"{name} set metrics: {filtered}", flush=True)
    return filtered

# =========================
# Main loop
# =========================
# For every fold seed: select the official split, train a fresh MLP on the
# training set, then score it on the train / valid / test sets.
#
# NOTE(review): the loop reads the "user_fingerprint" feature, but no cell
# visible in this notebook registers a user feature called "fingerprint" —
# confirm it is set (e.g. via dataset.set_user_feature) before this cell runs
# on a fresh kernel.
all_metrics = {"Train": [], "Valid": [], "Test": []}

for fold_seed in config["fold_seeds"]:
    # Seed the global torch RNG per fold so weight initialisation, dropout and
    # DataLoader shuffling are repeatable between runs. (GPU kernels may still
    # introduce nondeterminism.)
    torch.manual_seed(fold_seed)

    dataset.set_official_split_indices(split_type=config["split_type"], fold_seed=fold_seed)
    train_ds, valid_ds, test_ds = dataset.get_train_val_test_features(format="pytorch_dataset")

    # Input width is taken from the first training sample.
    input_dim = len(train_ds[0]["user_fingerprint"])
    model = MLPClassifier(
        input_dim=input_dim,
        hidden_dims=config["hidden_dims"],
        dropout=config["dropout"]
    ).to(device)

    train_loader = DataLoader(train_ds, batch_size=config["batch_size"], shuffle=True)
    valid_loader = DataLoader(valid_ds, batch_size=config["batch_size"])
    test_loader = DataLoader(test_ds, batch_size=config["batch_size"])

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config["learning_rate"])

    # === Training ===
    for epoch in range(config["epochs"]):
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            X = batch["user_fingerprint"].to(torch.float32).to(device)
            y = batch["official_label"].to(torch.float32).to(device)
            optimizer.zero_grad()
            # Flatten to 1-D: when the final batch holds a single sample the
            # model's squeeze() yields a 0-d tensor, which BCELoss rejects
            # against the (1,) label tensor.
            outputs = model(X).reshape(-1)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # Mean of the per-batch losses for this epoch.
        print(f"Fold {fold_seed} - Epoch {epoch+1}, Train Loss: {total_loss / len(train_loader):.4f}")

    # === Evaluation ===
    print(f"\n=== Fold {fold_seed} Evaluation ===")
    for name, loader in [("Train", train_loader), ("Valid", valid_loader), ("Test", test_loader)]:
        metrics = evaluate(model, loader, name, config["metrics"])
        all_metrics[name].append(metrics)

# =========================
# Mean and standard deviation across folds
# =========================
print("\n=== Metrics Summary ===")
for split_name in ("Train", "Valid", "Test"):
    print(f"\n{split_name} set:")
    fold_results = all_metrics[split_name]
    for metric in config["metrics"]:
        # One score per fold for this metric; std is the population std.
        scores = np.array([fold[metric] for fold in fold_results])
        print(f"  {metric}: mean = {scores.mean():.4f}, std = {scores.std():.4f}")

[2025-07-04 16:06:58,383][INFO][pepbenchmark] Set official split ===mmseqs2_split=== with seed ====0=== successfully


Fold 0 - Epoch 1, Train Loss: 0.6772
Fold 0 - Epoch 2, Train Loss: 0.6378
Fold 0 - Epoch 3, Train Loss: 0.5807
Fold 0 - Epoch 4, Train Loss: 0.5292
Fold 0 - Epoch 5, Train Loss: 0.4914
Fold 0 - Epoch 6, Train Loss: 0.4370
Fold 0 - Epoch 7, Train Loss: 0.3954
Fold 0 - Epoch 8, Train Loss: 0.3693
Fold 0 - Epoch 9, Train Loss: 0.3194
Fold 0 - Epoch 10, Train Loss: 0.2833
Fold 0 - Epoch 11, Train Loss: 0.2426
Fold 0 - Epoch 12, Train Loss: 0.2110
Fold 0 - Epoch 13, Train Loss: 0.1817
Fold 0 - Epoch 14, Train Loss: 0.1525
Fold 0 - Epoch 15, Train Loss: 0.1325
Fold 0 - Epoch 16, Train Loss: 0.1176
Fold 0 - Epoch 17, Train Loss: 0.0986
Fold 0 - Epoch 18, Train Loss: 0.0980
Fold 0 - Epoch 19, Train Loss: 0.0700
Fold 0 - Epoch 20, Train Loss: 0.0651
Fold 0 - Epoch 21, Train Loss: 0.0569
Fold 0 - Epoch 22, Train Loss: 0.0499
Fold 0 - Epoch 23, Train Loss: 0.0493
Fold 0 - Epoch 24, Train Loss: 0.0429
Fold 0 - Epoch 25, Train Loss: 0.0392
Fold 0 - Epoch 26, Train Loss: 0.0363
Fold 0 - Epoch 27, Tr

[2025-07-04 16:07:14,776][INFO][pepbenchmark] Set official split ===mmseqs2_split=== with seed ====1=== successfully


Fold 1 - Epoch 1, Train Loss: 0.6698
Fold 1 - Epoch 2, Train Loss: 0.6251
Fold 1 - Epoch 3, Train Loss: 0.5587
Fold 1 - Epoch 4, Train Loss: 0.5161
Fold 1 - Epoch 5, Train Loss: 0.4708
Fold 1 - Epoch 6, Train Loss: 0.4295
Fold 1 - Epoch 7, Train Loss: 0.3817
Fold 1 - Epoch 8, Train Loss: 0.3421
Fold 1 - Epoch 9, Train Loss: 0.2919
Fold 1 - Epoch 10, Train Loss: 0.2576
Fold 1 - Epoch 11, Train Loss: 0.2253
Fold 1 - Epoch 12, Train Loss: 0.1972
Fold 1 - Epoch 13, Train Loss: 0.1649
Fold 1 - Epoch 14, Train Loss: 0.1303
Fold 1 - Epoch 15, Train Loss: 0.1183
Fold 1 - Epoch 16, Train Loss: 0.0940
Fold 1 - Epoch 17, Train Loss: 0.0850
Fold 1 - Epoch 18, Train Loss: 0.0928
Fold 1 - Epoch 19, Train Loss: 0.0633
Fold 1 - Epoch 20, Train Loss: 0.0535
Fold 1 - Epoch 21, Train Loss: 0.0466
Fold 1 - Epoch 22, Train Loss: 0.0452
Fold 1 - Epoch 23, Train Loss: 0.0325
Fold 1 - Epoch 24, Train Loss: 0.0351
Fold 1 - Epoch 25, Train Loss: 0.0470
Fold 1 - Epoch 26, Train Loss: 0.0293
Fold 1 - Epoch 27, Tr

[2025-07-04 16:07:31,196][INFO][pepbenchmark] Set official split ===mmseqs2_split=== with seed ====2=== successfully


Fold 2 - Epoch 1, Train Loss: 0.6678
Fold 2 - Epoch 2, Train Loss: 0.6078
Fold 2 - Epoch 3, Train Loss: 0.5583
Fold 2 - Epoch 4, Train Loss: 0.5126
Fold 2 - Epoch 5, Train Loss: 0.4804
Fold 2 - Epoch 6, Train Loss: 0.4510
Fold 2 - Epoch 7, Train Loss: 0.4020
Fold 2 - Epoch 8, Train Loss: 0.3613
Fold 2 - Epoch 9, Train Loss: 0.3086
Fold 2 - Epoch 10, Train Loss: 0.2764
Fold 2 - Epoch 11, Train Loss: 0.2427
Fold 2 - Epoch 12, Train Loss: 0.2098
Fold 2 - Epoch 13, Train Loss: 0.1664
Fold 2 - Epoch 14, Train Loss: 0.1503
Fold 2 - Epoch 15, Train Loss: 0.1176
Fold 2 - Epoch 16, Train Loss: 0.1089
Fold 2 - Epoch 17, Train Loss: 0.0932
Fold 2 - Epoch 18, Train Loss: 0.0768
Fold 2 - Epoch 19, Train Loss: 0.0690
Fold 2 - Epoch 20, Train Loss: 0.0526
Fold 2 - Epoch 21, Train Loss: 0.0598
Fold 2 - Epoch 22, Train Loss: 0.0542
Fold 2 - Epoch 23, Train Loss: 0.0440
Fold 2 - Epoch 24, Train Loss: 0.0356
Fold 2 - Epoch 25, Train Loss: 0.0378
Fold 2 - Epoch 26, Train Loss: 0.0284
Fold 2 - Epoch 27, Tr

[2025-07-04 16:07:47,580][INFO][pepbenchmark] Set official split ===mmseqs2_split=== with seed ====3=== successfully


Fold 3 - Epoch 1, Train Loss: 0.6683
Fold 3 - Epoch 2, Train Loss: 0.6344
Fold 3 - Epoch 3, Train Loss: 0.5765
Fold 3 - Epoch 4, Train Loss: 0.5268
Fold 3 - Epoch 5, Train Loss: 0.4817
Fold 3 - Epoch 6, Train Loss: 0.4497
Fold 3 - Epoch 7, Train Loss: 0.3924
Fold 3 - Epoch 8, Train Loss: 0.3489
Fold 3 - Epoch 9, Train Loss: 0.3130
Fold 3 - Epoch 10, Train Loss: 0.2797
Fold 3 - Epoch 11, Train Loss: 0.2340
Fold 3 - Epoch 12, Train Loss: 0.1990
Fold 3 - Epoch 13, Train Loss: 0.1711
Fold 3 - Epoch 14, Train Loss: 0.1448
Fold 3 - Epoch 15, Train Loss: 0.1192
Fold 3 - Epoch 16, Train Loss: 0.1036
Fold 3 - Epoch 17, Train Loss: 0.0943
Fold 3 - Epoch 18, Train Loss: 0.0740
Fold 3 - Epoch 19, Train Loss: 0.0648
Fold 3 - Epoch 20, Train Loss: 0.0553
Fold 3 - Epoch 21, Train Loss: 0.0488
Fold 3 - Epoch 22, Train Loss: 0.0489
Fold 3 - Epoch 23, Train Loss: 0.0425
Fold 3 - Epoch 24, Train Loss: 0.0343
Fold 3 - Epoch 25, Train Loss: 0.0331
Fold 3 - Epoch 26, Train Loss: 0.0377
Fold 3 - Epoch 27, Tr

[2025-07-04 16:08:03,979][INFO][pepbenchmark] Set official split ===mmseqs2_split=== with seed ====4=== successfully


Fold 4 - Epoch 1, Train Loss: 0.6752
Fold 4 - Epoch 2, Train Loss: 0.6433
Fold 4 - Epoch 3, Train Loss: 0.5854
Fold 4 - Epoch 4, Train Loss: 0.5423
Fold 4 - Epoch 5, Train Loss: 0.5001
Fold 4 - Epoch 6, Train Loss: 0.4573
Fold 4 - Epoch 7, Train Loss: 0.4205
Fold 4 - Epoch 8, Train Loss: 0.3868
Fold 4 - Epoch 9, Train Loss: 0.3466
Fold 4 - Epoch 10, Train Loss: 0.3090
Fold 4 - Epoch 11, Train Loss: 0.2677
Fold 4 - Epoch 12, Train Loss: 0.2353
Fold 4 - Epoch 13, Train Loss: 0.2042
Fold 4 - Epoch 14, Train Loss: 0.1754
Fold 4 - Epoch 15, Train Loss: 0.1397
Fold 4 - Epoch 16, Train Loss: 0.1319
Fold 4 - Epoch 17, Train Loss: 0.1043
Fold 4 - Epoch 18, Train Loss: 0.0869
Fold 4 - Epoch 19, Train Loss: 0.0823
Fold 4 - Epoch 20, Train Loss: 0.0689
Fold 4 - Epoch 21, Train Loss: 0.0595
Fold 4 - Epoch 22, Train Loss: 0.0541
Fold 4 - Epoch 23, Train Loss: 0.0451
Fold 4 - Epoch 24, Train Loss: 0.0388
Fold 4 - Epoch 25, Train Loss: 0.0366
Fold 4 - Epoch 26, Train Loss: 0.0380
Fold 4 - Epoch 27, Tr

[2025-07-04 16:16:15,214][INFO][pepbenchmark] Feature fasta already loaded, skipping download


ConnectTimeout: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /api/models/facebook/esm2_t30_150M_UR50D/tree/main/additional_chat_templates?recursive=False&expand=False (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fe8684836a0>, 'Connection to huggingface.co timed out. (connect timeout=None)'))"), '(Request ID: 44a71ebf-d1ac-4c3a-990f-332a7e8e3459)')