In [7]:
import pandas as pd
merged = pd.read_parquet("train_demo.parquet")
print("Merged shape:", merged.shape)

X = merged.drop(columns=["protein_id", "term"]).astype(float).values
Y_raw = merged["term"]

from sklearn.preprocessing import MultiLabelBinarizer
import joblib

mlb = MultiLabelBinarizer(sparse_output=True)
Y = mlb.fit_transform(Y_raw)
joblib.dump(mlb, "mlb.pkl")
print("X shape:", X.shape)
print("Y shape:", Y.shape)

from sklearn.model_selection import train_test_split

# Split into train (80%) and validation (20%)
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

print("Train shape:", X_train.shape, Y_train.shape)
print("Val shape:", X_val.shape, Y_val.shape)


Merged shape: (5000, 322)
X shape: (5000, 320)
Y shape: (5000, 8910)
Train shape: (4000, 320) (4000, 8910)
Val shape: (1000, 320) (1000, 8910)


In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import f1_score
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ----- Sparse Dataset -----
class SparseDataset(Dataset):
    def __init__(self, X, Y_sparse):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.Y = Y_sparse.tocsr()   # fast row access

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        y = torch.tensor(self.Y[idx].toarray(), dtype=torch.float32).squeeze(0)
        return self.X[idx], y

# ----- Model -----
class GOClassifier(nn.Module):
    def __init__(self, input_dim, output_dim, hidden=512, dropout=0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, hidden // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden // 2, output_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)


# ----- Setup -----
model = GOClassifier(input_dim=X.shape[1],
    output_dim=Y.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

train_dataset = SparseDataset(X_train, Y_train)
val_dataset   = SparseDataset(X_val,   Y_val)
train_loader  = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader    = DataLoader(val_dataset,   batch_size=256, shuffle=False)

# ----- Training + Validation -----
best_f1 = 0.0
for epoch in range(3):
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} Train loss: {total_loss/len(train_loader):.4f}")

    # --- Validation (batched) ---
    model.eval()
    val_loss, preds_list, true_list = 0, [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            out = model(xb)
            val_loss += criterion(out, yb).item()
            preds_list.append((out > 0.5).cpu().numpy())
            true_list.append(yb.cpu().numpy())

    preds = np.vstack(preds_list)
    trues = np.vstack(true_list)
    f1 = f1_score(trues, preds, average='micro')
    elementwise_acc = (preds == trues).mean()
    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), "best_go_classifier.pth")
        print(f"   ✅ New best model saved (F1={best_f1:.4f})")

    print(f"   Val loss: {val_loss/len(val_loader):.4f} | F1: {f1:.4f} | Elem acc: {elementwise_acc:.4f}")


Epoch 1 Train loss: 0.2025
   Val loss: 0.0114 | F1: 0.0000 | Elem acc: 0.9993
Epoch 2 Train loss: 0.0113
   Val loss: 0.0098 | F1: 0.0000 | Elem acc: 0.9993
Epoch 3 Train loss: 0.0086
   Val loss: 0.0076 | F1: 0.0000 | Elem acc: 0.9993


| Hyperparameter    | Why it matters                |
| ----------------- | ----------------------------- |
| **hidden_dim**    | controls model capacity       |
| **dropout**       | prevents overfitting          |
| **learning rate** | biggest effect on convergence |
| **batch size**    | stability of gradients        |
| **activation**    | ReLU vs GELU                  |


In [12]:
test_emb_df = pd.read_parquet("test_demo.parquet")

def extract_uniprot_id(pid):
    parts = pid.split('|')
    return parts[1] if len(parts) >= 2 else pid

test_emb_df["protein_id"] = test_emb_df["protein_id"].apply(extract_uniprot_id)
X_test = test_emb_df.drop(columns=["protein_id"]).astype(float).values
protein_ids = test_emb_df["protein_id"].values


In [13]:
import torch
import numpy as np
from tqdm import trange

# ======== CONFIG ========
batch_size = 256        # adjust based on GPU memory
threshold = 0.01        # only save scores above this (saves disk space)
outfile = "submission.tsv"
go_terms = mlb.classes_ # GO term order from training

# ======== Predict + Write ========
print(f"Generating predictions and saving to {outfile} ...")

with open(outfile, "w") as f_out:
    model.eval()
    with torch.no_grad():
        for i in trange(0, len(X_test), batch_size, desc="Predicting"):
            xb = torch.tensor(X_test[i:i+batch_size], dtype=torch.float32).to(device)
            probs = model(xb).cpu().numpy()          # shape: [batch, 26125]
            ids_batch = protein_ids[i:i+batch_size]  # UniProt IDs

            # Stream output to file
            for pid, row in zip(ids_batch, probs):
                high = row > threshold
                for go, p in zip(go_terms[high], row[high]):
                    f_out.write(f"{pid}\t{go}\t{p:.3f}\n")

print(f"\n✅ Submission saved to {outfile}")


Generating predictions and saving to submission.tsv ...


Predicting: 100%|██████████| 4/4 [00:06<00:00,  1.72s/it]


✅ Submission saved to submission.tsv



