In [1]:

# Install RDKit from the local wheel file under /kaggle/input.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl

Processing /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: rdkit
Successfully installed rdkit-2025.3.3


In [2]:
# Fingerprint + MLP baseline for polymer prediction (ready to run on Kaggle)
# Paste into one cell. Reads train.csv, test.csv and (optionally) sample_submission.csv from the Kaggle input paths set in the Config section below.
# Requires: rdkit, torch, numpy, pandas (usually present on Kaggle; rdkit is installed from a local wheel in the previous cell). No internet required.
import os, math, random, time
from typing import List
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from rdkit import Chem
from rdkit.Chem import AllChem

# ---------------- Config ----------------
# Input / output locations.
TRAIN_CSV = "/kaggle/input/neurips-open-polymer-prediction-2025/train.csv"
TEST_CSV = "/kaggle/input/neurips-open-polymer-prediction-2025/test.csv"
SAMPLE_SUB = "/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv"   # optional
OUT_SUB = "/kaggle/working/submission.csv"

# Column names in the CSV files.
SMILES_COL = "SMILES"
TARGET_COLS = ["Tg","FFV","Tc","Density","Rg"]   # adapt if needed

# Fingerprint settings (Morgan radius and bit-vector length).
FINGERPRINT_RADIUS = 2
FINGERPRINT_NBITS = 2048

# Optimisation settings.
SEED = 42
BATCH_SIZE = 256
EPOCHS = 20
LR = 1e-3

# Run on the GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Seed every random number generator used by this notebook.
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

# ---------------- Utilities ----------------
def mol_from_smiles(s):
    """Parse a SMILES string with RDKit without ever raising.

    Returns whatever ``Chem.MolFromSmiles(s)`` returns; if that call raises
    any ``Exception`` the error is swallowed and ``None`` is returned instead.
    """
    try:
        mol = Chem.MolFromSmiles(s)
    except Exception:
        mol = None
    return mol

def fp_from_smiles(smiles, nbits=FINGERPRINT_NBITS, radius=FINGERPRINT_RADIUS):
    """Return the Morgan bit fingerprint of ``smiles`` as an array of 0/1 values.

    Args:
        smiles: SMILES string to fingerprint.
        nbits: Length of the bit vector (passed to RDKit as ``nBits``).
        radius: Morgan radius passed to RDKit.

    Returns:
        A 1-D NumPy array with one 0/1 entry per bit, or ``None`` when
        ``mol_from_smiles`` could not produce a molecule.
    """
    mol = mol_from_smiles(smiles)
    if mol is None:
        return None
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nbits)
    # ToBitString() gives one ASCII '0'/'1' character per bit; viewing the
    # encoded bytes as uint8 and subtracting ord('0') turns them into 0/1.
    bstr = fp.ToBitString()
    return np.frombuffer(bstr.encode('ascii'), dtype=np.uint8) - ord('0')  # array of 0/1

# faster safer variant using explicit conversion to numpy via GetOnBits
def fp_bits_from_smiles(smiles, nbits=FINGERPRINT_NBITS, radius=FINGERPRINT_RADIUS):
    """Return the Morgan bit fingerprint of ``smiles`` as a float32 0/1 vector.

    Args:
        smiles: SMILES string to fingerprint.
        nbits: Length of the returned vector (passed to RDKit as ``nBits``).
        radius: Morgan radius passed to RDKit.

    Returns:
        A ``(nbits,)`` float32 array holding 1.0 at every set bit and 0.0
        elsewhere, or ``None`` when ``mol_from_smiles`` returned ``None``.
    """
    molecule = mol_from_smiles(smiles)
    if molecule is None:
        return None
    bitvect = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=nbits)
    vec = np.zeros(nbits, dtype=np.float32)
    # Scatter ones into the positions RDKit reports as set.
    set_positions = list(bitvect.GetOnBits())
    vec[set_positions] = 1.0
    return vec

# ---------------- Dataset ----------------
class FingerprintDataset(Dataset):
    """In-memory dataset of Morgan fingerprints with masked regression targets.

    Every row of ``df`` is fingerprinted once in ``__init__``. Targets that are
    missing (NaN) or non-finite are stored as 0.0 and flagged with a 0 in
    ``mask`` so that a masked loss can ignore them.

    Args:
        df: Frame holding the SMILES column and, optionally, the target columns.
        smiles_col: Name of the SMILES column.
        target_cols: Names of the target columns. If any of them is absent from
            ``df`` the dataset is treated as unlabeled (all-zero targets and mask).
        nbits: Fingerprint length.
    """

    def __init__(self, df: pd.DataFrame, smiles_col=SMILES_COL, target_cols=TARGET_COLS, nbits=FINGERPRINT_NBITS):
        self.ids = df.index.values
        self.smiles = df[smiles_col].astype(str).values
        self.n = len(df)
        self.targets = None
        self.mask = None
        if all(c in df.columns for c in target_cols):
            Y = df[target_cols].values.astype(np.float32)
            # A target is valid only if it is finite *after* the float32 cast.
            # nan_to_num below zeroes NaN and +/-inf alike, so the mask must
            # exclude both; otherwise an infinite label would be trained on
            # as a genuine 0.0.
            mask = np.isfinite(Y).astype(np.float32)
            Y = np.nan_to_num(Y, nan=0.0, posinf=0.0, neginf=0.0)
            self.targets = Y
            self.mask = mask
        else:
            # Unlabeled data (e.g. a test frame): placeholder targets, nothing valid.
            self.targets = np.zeros((self.n, len(target_cols)), dtype=np.float32)
            self.mask = np.zeros_like(self.targets, dtype=np.float32)
        self.nbits = nbits
        # Precompute fingerprints (this cell is OK for Kaggle sizes; adjust if huge)
        fps = []
        skipped = 0
        for s in self.smiles:
            arr = fp_bits_from_smiles(s, nbits=self.nbits)
            if arr is None:
                # fallback zero vector for invalid SMILES
                arr = np.zeros((self.nbits,), dtype=np.float32)
                skipped += 1
            fps.append(arr)
        if fps:
            self.fps = np.vstack(fps).astype(np.float32)
        else:
            # np.vstack([]) raises, so an empty frame gets an explicit (0, nbits) matrix.
            self.fps = np.zeros((0, self.nbits), dtype=np.float32)
        if skipped:
            print(f"Warning: {skipped} SMILES failed to parse -> replaced with zero fingerprint")

    def __len__(self):
        """Number of rows in the source frame."""
        return self.n

    def __getitem__(self, idx):
        """Return ``(fingerprint, targets, mask, idx)`` for row position ``idx``.

        The three arrays are returned as float32 tensors; ``idx`` is passed
        back unchanged so callers can map predictions to dataset positions.
        """
        x = self.fps[idx]               # float32 vector
        y = self.targets[idx]           # float32 vector (may be zeros)
        mask = self.mask[idx]           # float32 mask 0/1
        return torch.from_numpy(x), torch.from_numpy(y), torch.from_numpy(mask), idx

# ---------------- Model ----------------
class MLPRegressor(nn.Module):
    """Two-hidden-layer MLP mapping a feature vector to ``out_dim`` regression outputs.

    Each hidden stage is Linear -> ReLU -> BatchNorm1d -> Dropout; the second
    hidden layer is half as wide as the first (``hidden // 2``).

    Args:
        in_dim: Number of input features.
        hidden: Width of the first hidden layer.
        out_dim: Number of outputs.
        dropout: Dropout probability applied after each hidden stage.
    """

    def __init__(self, in_dim, hidden=1024, out_dim=5, dropout=0.2):
        super().__init__()
        narrow = hidden // 2
        # Layers are created in forward order so parameter initialisation
        # draws from the RNG in the same sequence as the stack is applied.
        stack = [
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.BatchNorm1d(hidden),
            nn.Dropout(dropout),
            nn.Linear(hidden, narrow),
            nn.ReLU(),
            nn.BatchNorm1d(narrow),
            nn.Dropout(dropout),
            nn.Linear(narrow, out_dim),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the raw outputs."""
        out = self.net(x)
        return out

# masked MSE loss
def masked_mse(pred, y, mask):
    """Mean squared error over the entries selected by ``mask``.

    Args:
        pred: Predictions, shape ``(B, out_dim)``.
        y: Targets, same shape as ``pred``.
        mask: Per-entry weights (0/1 in this notebook), same shape as ``pred``.

    Returns:
        Scalar tensor ``sum(mask * (pred - y)**2) / sum(mask)``. When the mask
        sums to zero the result is 0 but stays attached to ``pred``'s autograd
        graph, so calling ``backward()`` on it is valid and yields zero gradients.
    """
    sq = ((pred - y)**2) * mask
    total = sq.sum()
    denom = mask.sum()
    if denom.item() == 0:
        # Returning a fresh constant tensor here would have no grad_fn and make
        # loss.backward() raise on a batch with no valid targets; scaling the
        # (already zero) masked sum keeps the graph intact.
        return total * 0.0
    return total / denom

# ---------------- Train / Eval ----------------
def train_epoch(model, loader, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean masked loss.

    Each batch is moved to ``device``, scored with ``masked_mse``, and used for
    one optimizer step with the gradient norm clipped to 3.0. Batch losses are
    averaged weighted by their number of valid (mask) entries.

    Returns:
        The weighted mean loss as a float, or NaN if no valid entry was seen.
    """
    model.train()
    weighted_loss = 0.0
    n_valid_total = 0.0
    for feats, targets, valid, _ in loader:
        feats = feats.to(device)
        targets = targets.to(device)
        valid = valid.to(device)

        loss = masked_mse(model(feats), targets, valid)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 3.0)
        optimizer.step()

        # Weight each batch by how many target entries actually contributed.
        n_valid = valid.sum().item()
        weighted_loss += loss.item() * n_valid
        n_valid_total += n_valid
    if n_valid_total > 0:
        return weighted_loss / n_valid_total
    return float('nan')

def eval_epoch(model, loader, device):
    """Score ``model`` on ``loader`` without gradient tracking.

    Returns:
        A tuple ``(mse, rmse, preds, ids)`` where ``mse``/``rmse`` are computed
        over valid (mask) entries only and are NaN when there are none,
        ``preds`` stacks every batch's predictions as a NumPy array, and
        ``ids`` concatenates the index values yielded by the loader.
    """
    model.eval()
    sq_err_sum = 0.0
    n_valid = 0.0
    batch_preds, batch_ids = [], []
    with torch.no_grad():
        for feats, targets, valid, sample_idx in loader:
            feats = feats.to(device)
            targets = targets.to(device)
            valid = valid.to(device)
            out = model(feats)
            sq_err_sum += (((out - targets)**2) * valid).sum().item()
            n_valid += valid.sum().item()
            batch_preds.append(out.cpu().numpy())
            batch_ids.append(sample_idx.numpy())

    if n_valid > 0:
        mse = sq_err_sum / n_valid
        rmse = math.sqrt(mse)
    else:
        mse = float('nan')
        rmse = float('nan')

    # An empty loader still yields correctly shaped (zero-row) outputs.
    if batch_preds:
        preds = np.vstack(batch_preds)
    else:
        preds = np.zeros((0, len(TARGET_COLS)))
    if batch_ids:
        ids = np.concatenate(batch_ids)
    else:
        ids = np.array([])
    return mse, rmse, preds, ids

# ---------------- Main flow ----------------
print("Loading CSVs...")
df_train = pd.read_csv(TRAIN_CSV)
# A missing test file degrades to an empty frame that still has the SMILES column.
df_test = pd.read_csv(TEST_CSV) if os.path.exists(TEST_CSV) else pd.DataFrame({SMILES_COL: []})
print("Train rows:", len(df_train), "Test rows:", len(df_test))

# Fingerprint the whole training table once, then carve out a shuffled
# 80/20 train/validation split over dataset positions.
full = FingerprintDataset(df_train, smiles_col=SMILES_COL, target_cols=TARGET_COLS, nbits=FINGERPRINT_NBITS)
n = len(full)
idxs = list(range(n))
random.shuffle(idxs)
split = int(0.8*n)
train_idx = idxs[:split]
val_idx = idxs[split:]
train_ds, val_ds = (torch.utils.data.Subset(full, part) for part in (train_idx, val_idx))
# Only the training loader reshuffles; validation keeps a fixed order.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE, num_workers=0)
val_loader = DataLoader(val_ds, shuffle=False, batch_size=BATCH_SIZE, num_workers=0)

# model
in_dim = FINGERPRINT_NBITS
model = MLPRegressor(in_dim, hidden=1024, out_dim=len(TARGET_COLS)).to(DEVICE)
opt = torch.optim.Adam(model.parameters(), lr=LR)
# Track the best validation RMSE seen so far; inf means any finite RMSE counts as an improvement.
best_rmse = float("inf")
best_state = None

print("Training...")
for epoch in range(1, EPOCHS+1):
    t0 = time.time()
    tr_loss = train_epoch(model, train_loader, opt, DEVICE)
    val_loss, val_rmse, _, _ = eval_epoch(model, val_loader, DEVICE)
    t1 = time.time()
    print(f"Epoch {epoch}/{EPOCHS} — train_loss={tr_loss:.6f} val_loss={val_loss:.6f} val_rmse={val_rmse:.6f} time={t1-t0:.1f}s")
    if val_rmse < best_rmse:
        best_rmse = val_rmse
        # state_dict() hands back references to the live parameter/buffer
        # tensors, which later epochs keep updating in place. Clone them so
        # the snapshot really is the best epoch rather than the last one.
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

# save best model
if best_state is not None:
    torch.save(best_state, "fp_mlp_best.pth")
    print("Saved best model (rmse)", best_rmse)

# ---------------- Inference and submission ----------------
print("Compute test fingerprints and predict...")
# create test dataset quickly (no targets)
test_smiles = df_test[SMILES_COL].astype(str).values if len(df_test)>0 else np.array([])
test_fps = []
test_skipped = 0
for s in test_smiles:
    arr = fp_bits_from_smiles(s, nbits=FINGERPRINT_NBITS)
    if arr is None:
        # Same fallback as the training dataset: an all-zero fingerprint.
        arr = np.zeros((FINGERPRINT_NBITS,), dtype=np.float32)
        test_skipped += 1
    test_fps.append(arr)
if test_skipped:
    print(f"Warning: {test_skipped} test SMILES failed to parse -> replaced with zero fingerprint")
if len(test_fps)==0:
    print("No test rows found — skipping submission write.")
else:
    X_test = torch.tensor(np.vstack(test_fps), dtype=torch.float32)
    if os.path.exists("fp_mlp_best.pth"):
        model = MLPRegressor(in_dim, hidden=1024, out_dim=len(TARGET_COLS))
        model.load_state_dict(torch.load("fp_mlp_best.pth", map_location="cpu"))
    else:
        # No checkpoint is written when no epoch improved on the initial best
        # RMSE (e.g. NaN validation RMSE); keep the model already in memory
        # instead of crashing on a missing file.
        print("Warning: fp_mlp_best.pth not found -> predicting with the in-memory model")
    model.to(DEVICE)
    model.eval()
    preds = []
    with torch.no_grad():
        for i in range(0, len(X_test), BATCH_SIZE):
            xb = X_test[i:i+BATCH_SIZE].to(DEVICE)
            out = model(xb).cpu().numpy()
            preds.append(out)
    preds = np.vstack(preds)
    # The model emits its outputs in TARGET_COLS order, so label the
    # prediction columns from TARGET_COLS and never from the sample file.
    target_cols = TARGET_COLS[:preds.shape[1]]
    # load sample submission if present to obtain the id column name and column order
    if os.path.exists(SAMPLE_SUB):
        samp = pd.read_csv(SAMPLE_SUB)
        id_col = samp.columns[0]
    else:
        samp = None
        id_col = 'id'

    # build submission DataFrame
    ids = df_test[id_col].values if id_col in df_test.columns else df_test.index.values
    sub = pd.DataFrame(preds, columns=target_cols)
    sub.insert(0, id_col, ids)
    # reorder columns to match sample if available
    if samp is not None:
        missing = [c for c in samp.columns[1:] if c not in sub.columns]
        if missing:
            print("Warning: sample submission columns without predictions:", missing)
        cols = [samp.columns[0]] + [c for c in samp.columns[1:] if c in sub.columns]
        sub = sub[cols]
    sub.to_csv(OUT_SUB, index=False)
    print("Saved submission:", OUT_SUB, "shape:", sub.shape)


Loading CSVs...
Train rows: 7973 Test rows: 3




Training...
Epoch 1/20 — train_loss=1102.822558 val_loss=971.972990 val_rmse=31.176481 time=1.5s
Epoch 2/20 — train_loss=729.758042 val_loss=847.985372 val_rmse=29.120188 time=1.3s
Epoch 3/20 — train_loss=415.737043 val_loss=760.243097 val_rmse=27.572506 time=1.3s
Epoch 4/20 — train_loss=249.500603 val_loss=695.719342 val_rmse=26.376492 time=1.3s
Epoch 5/20 — train_loss=149.152913 val_loss=732.872192 val_rmse=27.071612 time=1.3s
Epoch 6/20 — train_loss=136.230502 val_loss=709.968903 val_rmse=26.645242 time=1.3s
Epoch 7/20 — train_loss=100.985167 val_loss=727.106386 val_rmse=26.964910 time=1.3s
Epoch 8/20 — train_loss=65.120121 val_loss=684.378830 val_rmse=26.160635 time=1.3s
Epoch 9/20 — train_loss=58.571418 val_loss=630.776383 val_rmse=25.115262 time=1.3s
Epoch 10/20 — train_loss=64.701030 val_loss=667.930063 val_rmse=25.844343 time=1.3s
Epoch 11/20 — train_loss=59.886712 val_loss=640.399575 val_rmse=25.306117 time=1.3s
Epoch 12/20 — train_loss=58.466759 val_loss=602.593745 val_rmse=2

