In [3]:
import os
import torch
import torch.nn as nn
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

In [None]:
model_path = "model/protbert_model"

tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModel.from_pretrained(model_path, local_files_only=True)
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30, 1024, padding_idx=0)
    (position_embeddings): Embedding(40000, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-29): 30 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.0, i

In [5]:
def prepare_protbert_input(seq):
    # 将蛋白质序列转为 "M A C P A L ..." 的形式
    return " ".join(list(seq.strip()))

In [6]:
df = pd.read_csv("data/cancer_data_preprocessed.csv")
df = df[["mutated_protein", "Cancer Stage"]].dropna()

# 标签编码
le = LabelEncoder()
df["label"] = le.fit_transform(df["Cancer Stage"])

In [7]:
embeddings, labels = [], []

print("🧬 Encoding protein sequences...")
for _, row in tqdm(df.iterrows(), total=len(df)):
    try:
        prepared_seq = prepare_protbert_input(row["mutated_protein"])
        inputs = tokenizer(prepared_seq, return_tensors="pt", truncation=True, padding=True)

        with torch.no_grad():
            outputs = model(**inputs)
        emb = outputs.last_hidden_state.mean(dim=1).squeeze(0)

        embeddings.append(emb)
        labels.append(row["label"])
    except Exception as e:
        print("❌ Error:", e)

X = torch.stack(embeddings)
y = torch.tensor(labels)

🧬 Encoding protein sequences...


  0%|          | 0/38258 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  0%|          | 92/38258 [01:00<6:56:30,  1.53it/s] 


KeyboardInterrupt: 

In [None]:
X_np, y_np = X.numpy(), y.numpy()

X_train, X_temp, y_train, y_temp = train_test_split(
    X_np, y_np, test_size=0.3, stratify=y_np, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

X_train, y_train = torch.tensor(X_train), torch.tensor(y_train)
X_val, y_val = torch.tensor(X_val), torch.tensor(y_val)
X_test, y_test = torch.tensor(X_test), torch.tensor(y_test)

train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=64)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=64)

In [None]:
class ProteinClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes)
        )
    def forward(self, x):
        return self.fc(x)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_cls = ProteinClassifier(X.shape[1], len(le.classes_)).to(device)
optimizer = torch.optim.AdamW(model_cls.parameters(), lr=2e-4)
loss_fn = nn.CrossEntropyLoss()

EPOCHS = 10
for epoch in range(EPOCHS):
    model_cls.train()
    train_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        out = model_cls(xb)
        loss = loss_fn(out, yb)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # 验证
    model_cls.eval()
    val_loss = 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            out = model_cls(xb)
            loss = loss_fn(out, yb)
            val_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

In [None]:
model_cls.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        preds = model_cls(xb).argmax(dim=1).cpu()
        all_preds.extend(preds)
        all_labels.extend(yb)

acc = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, average="macro")
print(f"✅ Test Accuracy: {acc:.4f}, Macro F1: {f1:.4f}")

In [None]:
torch.save(model_cls.state_dict(), "outputs/protbert_classifier.pt")
print("✅ 模型已保存到 outputs/protbert_classifier.pt")