In [1]:
import numpy as np 
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import os, random
import numpy as np
import pandas as pd

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

import torch
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/catechol-benchmark-hackathon/drfps_catechol_lookup.csv
/kaggle/input/catechol-benchmark-hackathon/catechol_full_data_yields.csv
/kaggle/input/catechol-benchmark-hackathon/catechol_single_solvent_yields.csv
/kaggle/input/catechol-benchmark-hackathon/fragprints_lookup.csv
/kaggle/input/catechol-benchmark-hackathon/acs_pca_descriptors_lookup.csv
/kaggle/input/catechol-benchmark-hackathon/utils.py
/kaggle/input/catechol-benchmark-hackathon/spange_descriptors_lookup.csv
/kaggle/input/catechol-benchmark-hackathon/smiles_lookup.csv
/kaggle/input/catechol-benchmark-hackathon/drfps_catechol_lookup.csv
/kaggle/input/catechol-benchmark-hackathon/catechol_full_data_yields.csv
/kaggle/input/catechol-benchmark-hackathon/catechol_single_solvent_yields.csv
/kaggle/input/catechol-benchmark-hackathon/fragprints_lookup.csv
/kaggle/input/catechol-benchmark-hackathon/acs_pca_descriptors_lookup.csv
/kaggle/input/catechol-benchmark-hackathon/utils.py
/kaggle/input/catechol-benchmark-hackathon/s

In [2]:
import sys
sys.path.append('/kaggle/input/catechol-benchmark-hackathon/')

from utils import INPUT_LABELS_FULL_SOLVENT, INPUT_LABELS_SINGLE_SOLVENT, INPUT_LABELS_NUMERIC, INPUT_LABELS_SINGLE_FEATURES, INPUT_LABELS_FULL_FEATURES, load_data, load_features, generate_leave_one_out_splits, generate_leave_one_ramp_out_splits

In [3]:
from abc import ABC, abstractmethod

class SmilesFeaturizer(ABC):
    @abstractmethod
    def featurize(self, X):
        """Return torch tensor features from dataframe X."""
        ...

class BaseModel(ABC):
    @abstractmethod
    def train_model(self, X_train, y_train):
        ...

    @abstractmethod
    def predict(self, X):
        ...


In [4]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

torch.set_default_dtype(torch.float32)

# ----------------------------
# helpers
# ----------------------------
def _strip_cols(df):
    return df.rename(columns=lambda c: c.strip())

def _clean_names(s):
    return pd.Series(s).astype(str).str.strip()

def _find_single_solvent_col(X):
    Xc = _strip_cols(X)
    if "SOLVENT NAME" in Xc.columns:
        return "SOLVENT NAME"
    if "SOLVENT" in Xc.columns:
        return "SOLVENT"
    for c in Xc.columns:
        if "SOLVENT" in c.upper():
            return c
    raise KeyError("Cannot find solvent column in single task")

class _TorchScaler:
    def __init__(self):
        self.mean = None
        self.std = None

    def fit(self, X):
        self.mean = X.mean(dim=0, keepdim=True)
        self.std = X.std(dim=0, keepdim=True)
        self.std = torch.where(self.std < 1e-12, torch.ones_like(self.std), self.std)

    def transform(self, X):
        return (X - self.mean.to(X.device)) / self.std.to(X.device)

    def inverse(self, X):
        return X * self.std.to(X.device) + self.mean.to(X.device)

# ----------------------------
# numeric features (BASELINE)
# ----------------------------
def _num_single(rt, T):
    return np.concatenate(
        [rt, T, np.log1p(rt), T**2, rt*T],
        axis=1
    )  # 5

def _num_full(rt, T):
    Tk = T + 273.15
    invT = 1.0 / np.clip(Tk, 1e-6, None)
    base = np.concatenate([rt, T, np.log1p(rt), T**2, rt*T], axis=1)
    kin = np.concatenate([invT, rt*invT], axis=1)
    return np.concatenate([base, kin], axis=1)  # 7

# ----------------------------
# Featurizers
# ----------------------------
class FeaturizerSingle:
    def __init__(self, features=("spange_descriptors", "acs_pca_descriptors")):
        feats = [load_features(f) for f in features]
        self.feats = pd.concat(feats, axis=1)
        self.dim = 5 + self.feats.shape[1]

    def featurize(self, X):
        X = _strip_cols(X)
        sol_col = _find_single_solvent_col(X)

        rt = X["Residence Time"].to_numpy(np.float32).reshape(-1, 1)
        T  = X["Temperature"].to_numpy(np.float32).reshape(-1, 1)
        num = _num_single(rt, T)

        sol = _clean_names(X[sol_col]).values
        sol_feat = self.feats.loc[sol].to_numpy(np.float32)

        return torch.tensor(np.concatenate([num, sol_feat], axis=1))

class FeaturizerFull:
    def __init__(self, features=("spange_descriptors", "acs_pca_descriptors")):
        feats = [load_features(f) for f in features]
        self.feats = pd.concat(feats, axis=1)
        d = self.feats.shape[1]
        self.dim = (7 + 2) + 5 * d

    def featurize(self, X):
        X = _strip_cols(X)

        rt = X["Residence Time"].to_numpy(np.float32).reshape(-1, 1)
        T  = X["Temperature"].to_numpy(np.float32).reshape(-1, 1)
        frac = X["SolventB%"].to_numpy(np.float32).reshape(-1, 1)

        num = _num_full(rt, T)
        mix = np.concatenate([frac, frac*(1-frac)], axis=1)

        A = _clean_names(X["SOLVENT A NAME"]).values
        B = _clean_names(X["SOLVENT B NAME"]).values

        featA = self.feats.loc[A].to_numpy(np.float32)
        featB = self.feats.loc[B].to_numpy(np.float32)

        avg = featA*(1-frac) + featB*frac
        diff = featB - featA
        inter = diff * frac

        return torch.tensor(
            np.concatenate([num, mix, featA, featB, avg, diff, inter], axis=1)
        )

# ----------------------------
# Network
# ----------------------------
class _MLP(nn.Module):
    def __init__(self, in_dim, hidden=(256,256,128), dropout=0.10):
        super().__init__()
        layers = []
        prev = in_dim
        for h in hidden:
            layers += [
                nn.Linear(prev, h),
                nn.BatchNorm1d(h),
                nn.SiLU(),
                nn.Dropout(dropout),
            ]
            prev = h
        layers.append(nn.Linear(prev, 3))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

# ----------------------------
# Model
# ----------------------------
class MLPModel(nn.Module, BaseModel):
    def __init__(
        self,
        data="single",
        base_seed=41,
        lr=1.2e-3,
        weight_decay=1e-3,
        epochs=450,
        batch_size=256,
    ):
        super().__init__()
        self.data = data
        self.base_seed = base_seed
        self.lr = lr
        self.weight_decay = weight_decay
        self.epochs = epochs
        self.batch_size = batch_size

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.featurizer = FeaturizerSingle() if data=="single" else FeaturizerFull()
        self.net = _MLP(self.featurizer.dim).to(self.device)

        self.x_scaler = _TorchScaler()
        self.y_scaler = _TorchScaler()

    def _seed(self):
        np.random.seed(self.base_seed)
        torch.manual_seed(self.base_seed)

    def train_model(self, X, y):
        self._seed()

        Xt = self.featurizer.featurize(X)
        yt = torch.tensor(y.values, dtype=torch.float32)

        self.x_scaler.fit(Xt)
        self.y_scaler.fit(yt)

        Xt = self.x_scaler.transform(Xt)
        yt = self.y_scaler.transform(yt)

        ds = TensorDataset(Xt, yt)
        dl = DataLoader(ds, batch_size=self.batch_size, shuffle=True)

        opt = torch.optim.AdamW(self.net.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=self.epochs)
        loss_fn = nn.MSELoss()

        self.net.train()
        for _ in range(self.epochs):
            for xb, yb in dl:
                xb = xb.to(self.device)
                yb = yb.to(self.device)
                opt.zero_grad()
                loss = loss_fn(self.net(xb), yb)
                loss.backward()
                nn.utils.clip_grad_norm_(self.net.parameters(), 1.0)
                opt.step()
            sched.step()

    @torch.no_grad()
    def predict(self, X):
        self.net.eval()
        Xt = self.x_scaler.transform(self.featurizer.featurize(X)).to(self.device)
        y = self.y_scaler.inverse(self.net(Xt))
        return torch.clamp(y, 0.0, 1.0)


In [10]:

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = MLPModel(
    data="single",
    base_seed=41 + fold_idx
    )

    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  

    
    predictions_np = predictions.detach().cpu().numpy()


    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })


submission_single_solvent = pd.DataFrame(all_predictions)


24it [02:12,  5.53s/it]


In [11]:

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = MLPModel(
    data="full",
    base_seed=1041 + fold_idx
    )

 
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  

    
    predictions_np = predictions.detach().cpu().numpy()

    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })


submission_full_data = pd.DataFrame(all_predictions)



13it [01:57,  9.06s/it]


In [12]:

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)
