In [9]:
!python -m pip install --upgrade pip





In [10]:
!pip install pytdc rdkit-pypi lightgbm scikit-learn pandas numpy



In [11]:
import os, json, numpy as np, pandas as pd
from pathlib import Path

import lightgbm as lgb
from scipy.stats import spearmanr
from sklearn.model_selection import RandomizedSearchCV, PredefinedSplit
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    roc_auc_score, average_precision_score, f1_score
)

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

from tdc.single_pred import ADME, Tox

SEED = 42
np.random.seed(SEED)
ART = Path("artifacts"); ART.mkdir(exist_ok=True)
(ART / "cache_fp").mkdir(exist_ok=True)


In [12]:
def smiles_to_morgan(smiles_list, radius=2, n_bits=2048):
    """Convert SMILES -> Morgan bit vectors (np.array [n, n_bits])."""
    n = len(smiles_list)
    X = np.zeros((n, n_bits), dtype=np.uint8)
    for i, smi in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            # keep zeros row if invalid
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        arr = np.zeros((n_bits,), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(fp, arr)
        X[i] = arr
    return X

def fp_cache_key(dataset_name, split_name, radius=2, n_bits=2048):
    safe = dataset_name.replace("/", "_")
    return ART / "cache_fp" / f"{safe}__{split_name}__r{radius}_n{n_bits}.npz"

def get_fps_with_cache(df_split, dataset_name, split_name, radius=2, n_bits=2048):
    cache = fp_cache_key(dataset_name, split_name, radius, n_bits)
    if cache.exists():
        npz = np.load(cache)
        return npz["X"], npz["y"]
    # TDC single_pred splits use columns: Drug (SMILES), Y (labels)
    smiles = df_split["Drug"].astype(str).tolist()
    y = df_split["Y"].values
    # drop NaNs in Y if any
    if y.dtype.kind in "fc" and np.isnan(y).any():
        mask = ~np.isnan(y)
        df_split = df_split.loc[mask]
        smiles = df_split["Drug"].astype(str).tolist()
        y = df_split["Y"].values
    X = smiles_to_morgan(smiles, radius=radius, n_bits=n_bits)
    np.savez_compressed(cache, X=X, y=y)
    return X, y


In [13]:
from sklearn.metrics import get_scorer_names
SCORING_REG = "neg_root_mean_squared_error" if "neg_root_mean_squared_error" in get_scorer_names() else "neg_mean_squared_error"


In [14]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, roc_auc_score, average_precision_score, f1_score
from scipy.stats import spearmanr

def rmse(y_true, y_pred):
    # Works on all sklearn versions
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

def eval_regression(y_true, y_pred):
    return {
        "MAE": float(mean_absolute_error(y_true, y_pred)),
        "RMSE": rmse(y_true, y_pred),
        "R2": float(r2_score(y_true, y_pred)),
        "Spearman": float(spearmanr(y_true, y_pred)[0]),
    }

def eval_binary(y_true, y_score, thr=0.5):
    y_pred = (y_score >= thr).astype(int)
    return {
        "ROC_AUC": float(roc_auc_score(y_true, y_score)),
        "PR_AUC": float(average_precision_score(y_true, y_score)),
        f"F1@{thr}": float(f1_score(y_true, y_pred)),
    }


In [15]:
PARAM_DIST = {
    "learning_rate": [0.01, 0.02, 0.05, 0.1],
    "num_leaves": np.arange(15, 256, 16),
    "max_depth": [-1, 5, 10, 15],
    "min_child_samples": np.arange(5, 100, 10),
    "subsample": np.linspace(0.6, 1.0, 5),
    "colsample_bytree": np.linspace(0.6, 1.0, 5),
    "reg_alpha": [0, 0.1, 1, 5],
    "reg_lambda": [0, 0.1, 1, 5],
}

def make_regressor():
    return lgb.LGBMRegressor(
        objective="regression", n_estimators=5000,
        random_state=SEED, n_jobs=-1
    )

def make_classifier():
    return lgb.LGBMClassifier(
        objective="binary", n_estimators=5000,
        class_weight="balanced",
        random_state=SEED, n_jobs=-1
    )


In [16]:
# Which TDC loader to use (ADME vs Tox) and target type
# (We force explicit types to avoid surprises; adjust if TDC updates any task.)
DATASETS = [
    # --- ADME (regression) ---
    ("ADME", "Caco2_Wang",                 "regression"),
    ("ADME", "Lipophilicity_AstraZeneca",  "regression"),
    ("ADME", "Solubility_AqSolDB",         "regression"),
    ("ADME", "PPBR_AZ",                    "regression"),
    ("ADME", "VDss_Lombardo",              "regression"),
    ("ADME", "Half_Life_Obach",            "regression"),
    ("ADME", "Clearance_Hepatocyte_AZ",    "regression"),
    ("ADME", "Clearance_Microsome_AZ",     "regression"),

    # --- ADME (binary class) ---
    ("ADME", "Bioavailability_Ma",         "binary"),
    ("ADME", "HIA_Hou",                    "binary"),
    ("ADME", "Pgp_Broccatelli",            "binary"),
    ("ADME", "BBB_Martins",                "binary"),
    ("ADME", "CYP2C9_Veith",               "binary"),
    ("ADME", "CYP2D6_Veith",               "binary"),
    ("ADME", "CYP3A4_Veith",               "binary"),
    ("ADME", "CYP2C9_Substrate_CarbonMangels", "binary"),
    ("ADME", "CYP2D6_Substrate_CarbonMangels", "binary"),
    ("ADME", "CYP3A4_Substrate_CarbonMangels", "binary"),

    # --- TOX (binary / regression) ---
    ("Tox",  "hERG",                        "binary"),
    ("Tox",  "AMES",                        "binary"),
    ("Tox",  "DILI",                        "binary"),
    ("Tox",  "LD50_Zhu",                    "regression"),
    # (Tox21 is multi-label; you didn’t list it here. If you add it, we’ll handle separately.)
]


In [17]:
def load_tdc(task, name):
    loader = ADME if task == "ADME" else Tox
    data = loader(name=name)
    split = data.get_split()  # official TDC split
    return split

def run_one_dataset(task, name, target_type, n_iter=40, radius=2, n_bits=2048, thr=0.5):
    print("="*80)
    print(f"{task}.{name} | target={target_type} | r={radius}, n_bits={n_bits}")

    split = load_tdc(task, name)

    # Build Morgan features (with cache) for each split
    X_train, y_train = get_fps_with_cache(split["train"], f"{task}.{name}", "train", radius, n_bits)
    X_valid, y_valid = get_fps_with_cache(split["valid"], f"{task}.{name}", "valid", radius, n_bits)
    X_test,  y_test  = get_fps_with_cache(split["test"],  f"{task}.{name}", "test",  radius, n_bits)

    # ---------------- Baseline ----------------
    if target_type == "regression":
        baseline = make_regressor()
        baseline.fit(X_train, y_train,
                     eval_set=[(X_valid, y_valid)],
                     callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)])
        yhat = baseline.predict(X_test)
        base = eval_regression(y_test, yhat)
    elif target_type == "binary":
        baseline = make_classifier()
        baseline.fit(X_train, y_train,
                     eval_set=[(X_valid, y_valid)],
                     callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)])
        yscore = baseline.predict_proba(X_test)[:, 1]
        base = eval_binary(y_test, yscore, thr=thr)
    else:
        raise ValueError("Unsupported target_type")

    # ---------------- Tuning ----------------
    X_trva = np.vstack([X_train, X_valid])
    y_trva = np.concatenate([y_train, y_valid])
    test_fold = np.array([-1]*len(y_train) + [0]*len(y_valid))
    ps = PredefinedSplit(test_fold=test_fold)

    if target_type == "regression":
        est = make_regressor()
        scoring = SCORING_REG
    else:
        est = make_classifier()
        scoring = "roc_auc"

    search = RandomizedSearchCV(
        estimator=est,
        param_distributions=PARAM_DIST,
        n_iter=n_iter,
        scoring=scoring,
        cv=ps,
        random_state=SEED,
        n_jobs=-1,
        refit=True,
        verbose=0
    )
    fit_params = {"eval_set": [(X_valid, y_valid)],
                  "callbacks": [lgb.early_stopping(stopping_rounds=150, verbose=False)]}
    search.fit(X_trva, y_trva, **fit_params)
    best_model = search.best_estimator_

    if target_type == "regression":
        yhat_t = best_model.predict(X_test)
        tuned = eval_regression(y_test, yhat_t)
    else:
        yscore_t = best_model.predict_proba(X_test)[:, 1]
        tuned = eval_binary(y_test, yscore_t, thr=thr)

    # Collect tidy rows
    row_base = {"Dataset": f"{task}.{name}", "Model": "Baseline", **pretty(base)}
    row_tuned = {"Dataset": f"{task}.{name}", "Model": "Tuned", **pretty(tuned)}
    return row_base, row_tuned, search.best_params_


In [18]:
all_rows, best_params = [], {}

for task, name, ttype in DATASETS:
    try:
        rb, rt, bp = run_one_dataset(task, name, ttype, n_iter=40)
        all_rows.extend([rb, rt])
        best_params[f"{task}.{name}"] = bp
    except Exception as e:
        print(f"❌ Failed on {task}.{name}: {e}")

# Collect results into DataFrame
df = pd.DataFrame(all_rows)
display(df.sort_values(["Dataset", "Model"]))

# ---------- SAVE ARTIFACTS ----------
# Fix: convert numpy types to native Python for JSON
def clean_params(params):
    out = {}
    for k, v in params.items():
        if isinstance(v, (np.integer, np.int32, np.int64)):
            out[k] = int(v)
        elif isinstance(v, (np.floating, np.float32, np.float64)):
            out[k] = float(v)
        else:
            out[k] = v
    return out

cleaned_params = {k: clean_params(v) for k, v in best_params.items()}

df.to_csv("tdc_lgbm_morgan_results.csv", index=False)
with open("tdc_lgbm_best_params.json", "w") as f:
    json.dump(cleaned_params, f, indent=2)

print("✅ Saved results:")
print(" - tdc_lgbm_morgan_results.csv")
print(" - tdc_lgbm_best_params.json")


Found local copy...
Loading...
Done!


ADME.Caco2_Wang | target=regression | r=2, n_bits=2048
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010806 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 716
[LightGBM] [Info] Number of data points in the train set: 637, number of used features: 358
[LightGBM] [Info] Start training from score -5.251132
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 637, number of used features: 80
[LightGBM] [Info] Start training from score -5.251132
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023615 seconds.
You can set `force_row_wise=true

Found local copy...
Loading...
Done!


❌ Failed on ADME.Caco2_Wang: name 'pretty' is not defined
ADME.Lipophilicity_AstraZeneca | target=regression | r=2, n_bits=2048
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034669 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2446
[LightGBM] [Info] Number of data points in the train set: 2940, number of used features: 1223
[LightGBM] [Info] Start training from score 2.191929
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.120914 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1016
[LightGBM] [Info] Number of data points in the train set: 2940, number of used features: 508
[LightGBM] [Info] Start training from score 2.191929
[LightGBM] [Info] Auto-choosing row-wise multi-threading, t

KeyboardInterrupt: 

In [None]:
import os
import pandas as pd
import numpy as np
from tdc.single_pred import ADME, Tox
from tdc import Evaluator
from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV

os.makedirs("splits", exist_ok=True)

# -----------------------------
# Load or save splits
# -----------------------------
def load_tdc_or_saved(task, name):
    """Load split from CSV if saved, otherwise from TDC and save it."""
    files_exist = all(
        os.path.exists(f"splits/{task}_{name}_{part}.csv")
        for part in ["train", "valid", "test"]
    )

    if files_exist:
        print(f"⏩ Using saved split for {task}.{name}")
        return {
            "train": pd.read_csv(f"splits/{task}_{name}_train.csv"),
            "valid": pd.read_csv(f"splits/{task}_{name}_valid.csv"),
            "test":  pd.read_csv(f"splits/{task}_{name}_test.csv"),
        }
    else:
        print(f"⬇️ Downloading split from TDC for {task}.{name}")
        loader = ADME if task == "ADME" else Tox
        data = loader(name=name)
        split = data.get_split()

        # Save to CSV
        for part in ["train", "valid", "test"]:
            split[part].to_csv(f"splits/{task}_{name}_{part}.csv", index=False)

        return split

# -----------------------------
# Example run_one_dataset
# -----------------------------
def run_one_dataset(task, name, target_type, n_iter=40, radius=2, n_bits=2048, thr=0.5):
    print("="*80)
    print(f"{task}.{name} | target={target_type} | r={radius}, n_bits={n_bits}")

    # Load (from saved CSV if available)
    split = load_tdc_or_saved(task, name)

    # Build features
    X_train, y_train = get_fps_with_cache(split["train"], f"{task}.{name}", "train", radius, n_bits)
    X_valid, y_valid = get_fps_with_cache(split["valid"], f"{task}.{name}", "valid", radius, n_bits)
    X_test,  y_test  = get_fps_with_cache(split["test"],  f"{task}.{name}", "test",  radius, n_bits)

    # ---------------- Baseline ----------------
    if target_type == "regression":
        baseline = make_regressor()
        baseline.fit(X_train, y_train,
                     eval_set=[(X_valid, y_valid)],
                     callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)])
        yhat = baseline.predict(X_test)

        base = eval_regression(y_test, yhat)
        evaluator = Evaluator(name="Spearman")
        base["Spearman_TDC"] = evaluator(y_test, yhat)

    elif target_type == "binary":
        baseline = make_classifier()
        baseline.fit(X_train, y_train,
                     eval_set=[(X_valid, y_valid)],
                     callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)])
        yscore = baseline.predict_proba(X_test)[:, 1]

        base = eval_binary(y_test, yscore, thr=thr)
        evaluator = Evaluator(name="ROC-AUC")
        base["ROC_AUC_TDC"] = evaluator(y_test, yscore)

    else:
        raise ValueError("Unsupported target_type")

    # ---------------- Tuning ----------------
    X_trva = np.vstack([X_train, X_valid])
    y_trva = np.concatenate([y_train, y_valid])
    test_fold = np.array([-1]*len(y_train) + [0]*len(y_valid))
    ps = PredefinedSplit(test_fold=test_fold)

    if target_type == "regression":
        est = make_regressor()
        scoring = SCORING_REG
    else:
        est = make_classifier()
        scoring = "roc_auc"

    search = RandomizedSearchCV(
        estimator=est,
        param_distributions=PARAM_DIST,
        n_iter=n_iter,
        scoring=scoring,
        cv=ps,
        random_state=SEED,
        n_jobs=-1,
        refit=True,
        verbose=0
    )
    fit_params = {"eval_set": [(X_valid, y_valid)],
                  "callbacks": [lgb.early_stopping(stopping_rounds=150, verbose=False)]}
    search.fit(X_trva, y_trva, **fit_params)
    best_model = search.best_estimator_

    if target_type == "regression":
        yhat_t = best_model.predict(X_test)
        tuned = eval_regression(y_test, yhat_t)
        evaluator = Evaluator(name="Spearman")
        tuned["Spearman_TDC"] = evaluator(y_test, yhat_t)
    else:
        yscore_t = best_model.predict_proba(X_test)[:, 1]
        tuned = eval_binary(y_test, yscore_t, thr=thr)
        evaluator = Evaluator(name="ROC-AUC")
        tuned["ROC_AUC_TDC"] = evaluator(y_test, yscore_t)

    # ---------------- Collect tidy rows ----------------
    row_base = {"Dataset": f"{task}.{name}", "Model": "Baseline", **pretty(base)}
    row_tuned = {"Dataset": f"{task}.{name}", "Model": "Tuned", **pretty(tuned)}
    return row_base, row_tuned, search.best_params_


In [None]:
all_rows, best_params = [], {}

for task, name, ttype in DATASETS:
    try:
        rb, rt, bp = run_one_dataset(task, name, ttype, n_iter=40)
        all_rows.extend([rb, rt])
        best_params[f"{task}.{name}"] = bp
    except Exception as e:
        print(f"❌ Failed on {task}.{name}: {e}")

# Collect results into DataFrame
df = pd.DataFrame(all_rows)
df = df.sort_values(["Dataset", "Model"])
display(df)


In [None]:
# Convert numpy types for JSON
def clean_params(params):
    out = {}
    for k, v in params.items():
        if isinstance(v, (np.integer, np.int32, np.int64)):
            out[k] = int(v)
        elif isinstance(v, (np.floating, np.float32, np.float64)):
            out[k] = float(v)
        else:
            out[k] = v
    return out

cleaned_params = {k: clean_params(v) for k, v in best_params.items()}

# Save CSV with all metrics
df.to_csv("tdc_lgbm_morgan_all_metrics.csv", index=False)

# Save tuned hyperparameters
import json
with open("tdc_lgbm_morgan_best_params.json", "w") as f:
    json.dump(cleaned_params, f, indent=2)

print("✅ Saved results:")
print(" - tdc_lgbm_morgan_all_metrics.csv  (all metrics + TDC evaluator)")
print(" - tdc_lgbm_morgan_best_params.json (best hyperparameters)")


In [None]:
def load_saved_split(task, name):
    """Always load official splits from saved CSVs in ./splits/."""
    return {
        "train": pd.read_csv(f"splits/{task}_{name}_train.csv"),
        "valid": pd.read_csv(f"splits/{task}_{name}_valid.csv"),
        "test":  pd.read_csv(f"splits/{task}_{name}_test.csv"),
    }


In [None]:
###############
##################################
################################################# Molecular Descriptors #################################
###############




from rdkit.Chem import Descriptors

DESCRIPTOR_FUNCS = {
    "MolWt": Descriptors.MolWt,
    "MolLogP": Descriptors.MolLogP,
    "TPSA": Descriptors.TPSA,
    "NumHDonors": Descriptors.NumHDonors,
    "NumHAcceptors": Descriptors.NumHAcceptors,
    "NumRotatableBonds": Descriptors.NumRotatableBonds,
    "RingCount": Descriptors.RingCount,
    "HeavyAtomCount": Descriptors.HeavyAtomCount,
}

def smiles_to_descriptors(smiles_list):
    X = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol:
            values = [func(mol) for func in DESCRIPTOR_FUNCS.values()]
        else:
            values = [0.0] * len(DESCRIPTOR_FUNCS)
        X.append(values)
    return np.array(X), list(DESCRIPTOR_FUNCS.keys())






In [None]:
def run_one_dataset_descriptor(task, name, target_type, n_iter=40, thr=0.5):
    print("="*80)
    print(f"{task}.{name} | target={target_type} | features=Descriptors")

    # Always load saved split
    split = load_saved_split(task, name)

    # Build descriptor features
    X_train, desc_cols = smiles_to_descriptors(split["train"]["Drug"].tolist())
    y_train = split["train"]["Y"].values
    X_valid, _ = smiles_to_descriptors(split["valid"]["Drug"].tolist())
    y_valid = split["valid"]["Y"].values
    X_test, _ = smiles_to_descriptors(split["test"]["Drug"].tolist())
    y_test = split["test"]["Y"].values

    # ---------------- Baseline ----------------
    if target_type == "regression":
        baseline = make_regressor()
        baseline.fit(X_train, y_train,
                     eval_set=[(X_valid, y_valid)],
                     callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)])
        yhat = baseline.predict(X_test)

        base = eval_regression(y_test, yhat)
        evaluator = Evaluator(name="Spearman")
        base["Spearman_TDC"] = evaluator(y_test, yhat)

    elif target_type == "binary":
        baseline = make_classifier()
        baseline.fit(X_train, y_train,
                     eval_set=[(X_valid, y_valid)],
                     callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)])
        yscore = baseline.predict_proba(X_test)[:, 1]

        base = eval_binary(y_test, yscore, thr=thr)
        evaluator = Evaluator(name="ROC-AUC")
        base["ROC_AUC_TDC"] = evaluator(y_test, yscore)

    # ---------------- Tuning ----------------
    X_trva = np.vstack([X_train, X_valid])
    y_trva = np.concatenate([y_train, y_valid])
    test_fold = np.array([-1]*len(y_train) + [0]*len(y_valid))
    ps = PredefinedSplit(test_fold=test_fold)

    if target_type == "regression":
        est = make_regressor()
        scoring = SCORING_REG
    else:
        est = make_classifier()
        scoring = "roc_auc"

    search = RandomizedSearchCV(
        estimator=est,
        param_distributions=PARAM_DIST,
        n_iter=n_iter,
        scoring=scoring,
        cv=ps,
        random_state=SEED,
        n_jobs=-1,
        refit=True,
        verbose=0
    )
    fit_params = {"eval_set": [(X_valid, y_valid)],
                  "callbacks": [lgb.early_stopping(stopping_rounds=150, verbose=False)]}
    search.fit(X_trva, y_trva, **fit_params)
    best_model = search.best_estimator_

    if target_type == "regression":
        yhat_t = best_model.predict(X_test)
        tuned = eval_regression(y_test, yhat_t)
        evaluator = Evaluator(name="Spearman")
        tuned["Spearman_TDC"] = evaluator(y_test, yhat_t)
    else:
        yscore_t = best_model.predict_proba(X_test)[:, 1]
        tuned = eval_binary(y_test, yscore_t, thr=thr)
        evaluator = Evaluator(name="ROC-AUC")
        tuned["ROC_AUC_TDC"] = evaluator(y_test, yscore_t)

    # ---------------- Collect tidy rows ----------------
    row_base = {"Dataset": f"{task}.{name}", "Model": "Baseline", "Features": "Descriptors", **pretty(base)}
    row_tuned = {"Dataset": f"{task}.{name}", "Model": "Tuned", "Features": "Descriptors", **pretty(tuned)}
    return row_base, row_tuned, search.best_params_


In [None]:
all_rows_desc, best_params_desc = [], {}

for task, name, ttype in DATASETS:
    try:
        rb, rt, bp = run_one_dataset_descriptor(task, name, ttype, n_iter=40)
        all_rows_desc.extend([rb, rt])
        best_params_desc[f"{task}.{name}"] = bp
    except Exception as e:
        print(f"❌ Failed on {task}.{name} (descriptors): {e}")

df_desc = pd.DataFrame(all_rows_desc)
df_desc = df_desc.sort_values(["Dataset","Model"])
display(df_desc)

# Save results
df_desc.to_csv("tdc_lgbm_descriptor_all_metrics.csv", index=False)

# Save tuned hyperparameters
cleaned_params_desc = {k: {p: (int(v) if isinstance(v, (np.integer, np.int32, np.int64))
                               else float(v) if isinstance(v, (np.floating, np.float32, np.float64))
                               else v)
                           for p, v in params.items()}
                       for k, params in best_params_desc.items()}

with open("tdc_lgbm_descriptor_best_params.json", "w") as f:
    json.dump(cleaned_params_desc, f, indent=2)

print("✅ Saved:")
print(" - tdc_lgbm_descriptor_all_metrics.csv")
print(" - tdc_lgbm_descriptor_best_params.json")


In [20]:
# ---------------------------------------------------
# Define dataset type map (regression vs binary)
# ---------------------------------------------------
DATASET_TYPE = {
    "ADME.Caco2_Wang": "regression",
    "ADME.Lipophilicity_AstraZeneca": "regression",
    "ADME.Solubility_AqSolDB": "regression",
    "ADME.PPBR_AZ": "regression",
    "ADME.VDss_Lombardo": "regression",
    "ADME.Half_Life_Obach": "regression",
    "ADME.Clearance_Hepatocyte_AZ": "regression",
    "ADME.Clearance_Microsome_AZ": "regression",

    "ADME.Bioavailability_Ma": "binary",
    "ADME.HIA_Hou": "binary",
    "ADME.Pgp_Broccatelli": "binary",
    "ADME.BBB_Martins": "binary",
    "ADME.CYP2C9_Veith": "binary",
    "ADME.CYP2D6_Veith": "binary",
    "ADME.CYP3A4_Veith": "binary",
    "ADME.CYP2C9_Substrate_CarbonMangels": "binary",
    "ADME.CYP2D6_Substrate_CarbonMangels": "binary",
    "ADME.CYP3A4_Substrate_CarbonMangels": "binary",

    "Tox.hERG": "binary",
    "Tox.AMES": "binary",
    "Tox.DILI": "binary",
    "Tox.LD50_Zhu": "regression",
}

# ---------------------------------------------------
# Rebuild models with explicit target_type
# ---------------------------------------------------
results = []

for key, params in best_params.items():
    task, name = key.split(".", 1)
    target_type = DATASET_TYPE[key]   # ✅ force correct type

    print(f"⚡ Rebuilding model for {key} ({target_type})")

    split = load_saved_split(task, name)
    df_trainvalid = pd.concat([split["train"], split["valid"]])
    df_test = split["test"]

    # Features
    X_trva = smiles_to_morgan(df_trainvalid["Drug"].tolist())
    y_trva = df_trainvalid["Y"].values
    X_test = smiles_to_morgan(df_test["Drug"].tolist())
    y_test = df_test["Y"].values

    # Build model
    if target_type == "regression":
        model = lgb.LGBMRegressor(random_state=42, n_jobs=-1, **params)
        model.fit(X_trva, y_trva)
        yhat = model.predict(X_test)
        metrics = eval_regression(y_test, yhat)
    else:
        model = lgb.LGBMClassifier(random_state=42, n_jobs=-1, **params)
        model.fit(X_trva, y_trva)
        yscore = model.predict_proba(X_test)[:, 1]
        metrics = eval_binary(y_test, yscore)

    # Save model
    out_path = ART / f"{task}_{name}_best_model.pkl"
    joblib.dump(model, out_path)
    print(f"💾 Saved {out_path}")

    # Save metrics
    results.append({"Dataset": key, **metrics})

# Save all metrics together
pd.DataFrame(results).to_csv("tdc_lgbm_final_metrics.csv", index=False)
print("✅ Done. All models & metrics saved.")


⚡ Rebuilding model for ADME.Caco2_Wang (regression)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024509 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2678
[LightGBM] [Info] Number of data points in the train set: 728, number of used features: 1339
[LightGBM] [Info] Start training from score -5.238485
💾 Saved artifacts/models/ADME_Caco2_Wang_best_model.pkl
⚡ Rebuilding model for ADME.Lipophilicity_AstraZeneca (regression)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032925 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2304
[LightGBM] [Info] Number of data points in the train set: 3360, number of used features: 1152
[LightGBM] [Info] Start training from score 2.197039
💾 Saved art



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062035 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4090
[LightGBM] [Info] Number of data points in the train set: 7986, number of used features: 2045
[LightGBM] [Info] Start training from score -2.883958
💾 Saved artifacts/models/ADME_Solubility_AqSolDB_best_model.pkl
⚡ Rebuilding model for ADME.PPBR_AZ (regression)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009343 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 590
[LightGBM] [Info] Number of data points in the train set: 1291, number of used features: 295
[LightGBM] [Info] Start training from score 87.804531
💾 Saved artifacts/models/ADME_PPBR_AZ_best_model.pkl
⚡ Rebuilding model f



[LightGBM] [Info] Number of positive: 1248, number of negative: 376
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026414 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1910
[LightGBM] [Info] Number of data points in the train set: 1624, number of used features: 955
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.768473 -> initscore=1.199708
[LightGBM] [Info] Start training from score 1.199708
💾 Saved artifacts/models/ADME_BBB_Martins_best_model.pkl
⚡ Rebuilding model for ADME.CYP2C9_Veith (binary)
[LightGBM] [Info] Number of positive: 3226, number of negative: 6448
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051522 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3344
[LightGBM] 



[LightGBM] [Info] Number of positive: 353, number of negative: 171
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023417 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1990
[LightGBM] [Info] Number of data points in the train set: 524, number of used features: 995
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.673664 -> initscore=0.724805
[LightGBM] [Info] Start training from score 0.724805
💾 Saved artifacts/models/Tox_hERG_best_model.pkl
⚡ Rebuilding model for Tox.AMES (binary)
[LightGBM] [Info] Number of positive: 3176, number of negative: 2646
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063259 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4034
[LightGBM] [Info] Number of da

In [4]:
import pandas as pd
import os

SPLIT_DIR = "splits"

def load_split_from_csv(task, name):
    """
    Loads train/valid/test splits for a dataset from the splits/ folder.
    Files are expected to be named like:
    splits/{task}_{name}_train.csv
    splits/{task}_{name}_valid.csv
    splits/{task}_{name}_test.csv
    """
    base = f"{task}_{name}"
    train_path = os.path.join(SPLIT_DIR, f"{base}_train.csv")
    valid_path = os.path.join(SPLIT_DIR, f"{base}_valid.csv")
    test_path  = os.path.join(SPLIT_DIR, f"{base}_test.csv")

    split = {
        "train": pd.read_csv(train_path),
        "valid": pd.read_csv(valid_path),
        "test":  pd.read_csv(test_path)
    }
    return split


In [8]:
# ===================================================
# Imports
# ===================================================
import os, json, joblib
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors

import lightgbm as lgb
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    roc_auc_score, average_precision_score, f1_score
)
from scipy.stats import spearmanr
import math

# ===================================================
# Paths
# ===================================================
SPLIT_DIR = "splits"
PICKLE_DIR = "pickle_models"
os.makedirs(PICKLE_DIR, exist_ok=True)

# ===================================================
# Helpers
# ===================================================
def load_split_from_csv(task, name):
    """Load saved train/valid/test splits from CSV."""
    base = f"{task}_{name}"
    return {
        "train": pd.read_csv(os.path.join(SPLIT_DIR, f"{base}_train.csv")),
        "valid": pd.read_csv(os.path.join(SPLIT_DIR, f"{base}_valid.csv")),
        "test":  pd.read_csv(os.path.join(SPLIT_DIR, f"{base}_test.csv")),
    }

def smiles_to_morgan(smiles_list, radius=2, n_bits=2048):
    fps = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            fp = np.zeros((n_bits,))
        else:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
            fp = np.array(fp)
        fps.append(fp)
    return np.array(fps)

def eval_regression(y_true, y_pred):
    return {
        "mae": mean_absolute_error(y_true, y_pred),
        "rmse": math.sqrt(mean_squared_error(y_true, y_pred)),
        "r2": r2_score(y_true, y_pred),
        "spearman": spearmanr(y_true, y_pred).correlation
    }

def eval_binary(y_true, y_score, thr=0.5):
    y_pred = (np.array(y_score) >= thr).astype(int)
    return {
        "roc_auc": roc_auc_score(y_true, y_score) if len(np.unique(y_true)) > 1 else None,
        "pr_auc": average_precision_score(y_true, y_score),
        "f1_at_0.5": f1_score(y_true, y_pred)
    }

# ===================================================
# Dataset type map
# ===================================================
DATASET_TYPE = {
    "ADME.Caco2_Wang": "regression",
    "ADME.Lipophilicity_AstraZeneca": "regression",
    "ADME.Solubility_AqSolDB": "regression",
    "ADME.PPBR_AZ": "regression",
    "ADME.VDss_Lombardo": "regression",
    "ADME.Half_Life_Obach": "regression",
    "ADME.Clearance_Hepatocyte_AZ": "regression",
    "ADME.Clearance_Microsome_AZ": "regression",

    "ADME.Bioavailability_Ma": "binary",
    "ADME.HIA_Hou": "binary",
    "ADME.Pgp_Broccatelli": "binary",
    "ADME.BBB_Martins": "binary",
    "ADME.CYP2C9_Veith": "binary",
    "ADME.CYP2D6_Veith": "binary",
    "ADME.CYP3A4_Veith": "binary",
    "ADME.CYP2C9_Substrate_CarbonMangels": "binary",
    "ADME.CYP2D6_Substrate_CarbonMangels": "binary",
    "ADME.CYP3A4_Substrate_CarbonMangels": "binary",

    "Tox.hERG": "binary",
    "Tox.AMES": "binary",
    "Tox.DILI": "binary",
    "Tox.LD50_Zhu": "regression",
}

# ===================================================
# Load tuned parameters (from your JSON file)
# ===================================================
with open("tdc_lgbm_best_params.json", "r") as f:
    best_params = json.load(f)

print(f"Loaded {len(best_params)} tuned datasets")

# ===================================================
# Final Rebuild + Save (Baseline + Tuned)
# ===================================================
results = []

for key, params in best_params.items():
    task, name = key.split(".", 1)
    target_type = DATASET_TYPE.get(key, "regression")  # default to regression

    print(f"\n⚡ Training {key} ({target_type})")

    # Load split
    split = load_split_from_csv(task, name)
    df_trainvalid = pd.concat([split["train"], split["valid"]])
    df_test = split["test"]

    # Features = Morgan fingerprints
    X_trva = smiles_to_morgan(df_trainvalid["Drug"].tolist())
    y_trva = df_trainvalid["Y"].values
    X_test = smiles_to_morgan(df_test["Drug"].tolist())
    y_test = df_test["Y"].values

    # ================= Baseline =================
    if target_type == "regression":
        base_model = lgb.LGBMRegressor(random_state=42, n_jobs=-1)
        base_model.fit(X_trva, y_trva)
        yhat = base_model.predict(X_test)
        base_metrics = eval_regression(y_test, yhat)
    else:
        base_model = lgb.LGBMClassifier(random_state=42, n_jobs=-1)
        base_model.fit(X_trva, y_trva)
        yscore = base_model.predict_proba(X_test)[:, 1]
        base_metrics = eval_binary(y_test, yscore)

    # Save baseline model
    base_path = os.path.join(PICKLE_DIR, f"{task}_{name}_baseline.pkl")
    joblib.dump(base_model, base_path)
    print(f"💾 Saved baseline -> {base_path}")

    results.append({"Dataset": key, "Type": "Baseline", **base_metrics})

    # ================= Tuned =================
    if target_type == "regression":
        tuned_model = lgb.LGBMRegressor(random_state=42, n_jobs=-1, **params)
        tuned_model.fit(X_trva, y_trva)
        yhat = tuned_model.predict(X_test)
        tuned_metrics = eval_regression(y_test, yhat)
    else:
        tuned_model = lgb.LGBMClassifier(random_state=42, n_jobs=-1, **params)
        tuned_model.fit(X_trva, y_trva)
        yscore = tuned_model.predict_proba(X_test)[:, 1]
        tuned_metrics = eval_binary(y_test, yscore)

    # Save tuned model
    tuned_path = os.path.join(PICKLE_DIR, f"{task}_{name}_tuned.pkl")
    joblib.dump(tuned_model, tuned_path)
    print(f"💾 Saved tuned -> {tuned_path}")

    results.append({"Dataset": key, "Type": "Tuned", **tuned_metrics})

# Save metrics table
df_results = pd.DataFrame(results)
df_results.to_csv("tdc_lgbm_final_metrics.csv", index=False)
display(df_results)

print("✅ Done. Baseline + tuned models saved in pickle_models/, metrics in tdc_lgbm_final_metrics.csv")


Loaded 22 tuned datasets

⚡ Training ADME.Caco2_Wang (regression)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012574 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 834
[LightGBM] [Info] Number of data points in the train set: 728, number of used features: 417
[LightGBM] [Info] Start training from score -5.238485
💾 Saved baseline -> pickle_models/ADME_Caco2_Wang_baseline.pkl
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025142 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2678
[LightGBM] [Info] Number of data points in the train set: 728, number of used features: 1339
[LightGBM] [Info] Start training from score -5.238485








💾 Saved tuned -> pickle_models/ADME_Caco2_Wang_tuned.pkl

⚡ Training ADME.Lipophilicity_AstraZeneca (regression)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.038678 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2638
[LightGBM] [Info] Number of data points in the train set: 3360, number of used features: 1319
[LightGBM] [Info] Start training from score 2.197039




💾 Saved baseline -> pickle_models/ADME_Lipophilicity_AstraZeneca_baseline.pkl
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032886 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2304
[LightGBM] [Info] Number of data points in the train set: 3360, number of used features: 1152
[LightGBM] [Info] Start training from score 2.197039
💾 Saved tuned -> pickle_models/ADME_Lipophilicity_AstraZeneca_tuned.pkl

⚡ Training ADME.Solubility_AqSolDB (regression)




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050506 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3226
[LightGBM] [Info] Number of data points in the train set: 7986, number of used features: 1613
[LightGBM] [Info] Start training from score -2.883958




💾 Saved baseline -> pickle_models/ADME_Solubility_AqSolDB_baseline.pkl
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064998 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4090
[LightGBM] [Info] Number of data points in the train set: 7986, number of used features: 2045
[LightGBM] [Info] Start training from score -2.883958




💾 Saved tuned -> pickle_models/ADME_Solubility_AqSolDB_tuned.pkl

⚡ Training ADME.PPBR_AZ (regression)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022569 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 1291, number of used features: 695
[LightGBM] [Info] Start training from score 87.804531




💾 Saved baseline -> pickle_models/ADME_PPBR_AZ_baseline.pkl
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008707 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 590
[LightGBM] [Info] Number of data points in the train set: 1291, number of used features: 295
[LightGBM] [Info] Start training from score 87.804531
💾 Saved tuned -> pickle_models/ADME_PPBR_AZ_tuned.pkl

⚡ Training ADME.VDss_Lombardo (regression)




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017206 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1028
[LightGBM] [Info] Number of data points in the train set: 904, number of used features: 514
[LightGBM] [Info] Start training from score 4.290531




💾 Saved baseline -> pickle_models/ADME_VDss_Lombardo_baseline.pkl
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 778
[LightGBM] [Info] Number of data points in the train set: 904, number of used features: 389
[LightGBM] [Info] Start training from score 4.290531
💾 Saved tuned -> pickle_models/ADME_VDss_Lombardo_tuned.pkl

⚡ Training ADME.Half_Life_Obach (regression)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007436 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 484
[LightGBM] [Info] Number of data points in the train set: 534, number of used features: 242
[LightGBM] [Info] Start training from score 15.312566
💾 Saved baseline ->



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016466 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1042
[LightGBM] [Info] Number of data points in the train set: 970, number of used features: 521
[LightGBM] [Info] Start training from score 42.755485




💾 Saved baseline -> pickle_models/ADME_Clearance_Hepatocyte_AZ_baseline.pkl
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004725 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 312
[LightGBM] [Info] Number of data points in the train set: 970, number of used features: 156
[LightGBM] [Info] Start training from score 42.755485
💾 Saved tuned -> pickle_models/ADME_Clearance_Hepatocyte_AZ_tuned.pkl

⚡ Training ADME.Clearance_Microsome_AZ (regression)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014661 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 942
[LightGBM] [Info] Number of data points in the train set: 882, number of used features: 471
[LightGBM] [Info] Start training from score 3



💾 Saved baseline -> pickle_models/ADME_Clearance_Microsome_AZ_baseline.pkl
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002905 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 178
[LightGBM] [Info] Number of data points in the train set: 882, number of used features: 89
[LightGBM] [Info] Start training from score 34.562347
💾 Saved tuned -> pickle_models/ADME_Clearance_Microsome_AZ_tuned.pkl

⚡ Training ADME.Bioavailability_Ma (binary)
[LightGBM] [Info] Number of positive: 402, number of negative: 110
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006326 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 384
[LightGBM] [Info] Number of data points in the train set: 512, number of used fea



💾 Saved tuned -> pickle_models/ADME_Bioavailability_Ma_tuned.pkl

⚡ Training ADME.HIA_Hou (binary)




[LightGBM] [Info] Number of positive: 403, number of negative: 59
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005043 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 314
[LightGBM] [Info] Number of data points in the train set: 462, number of used features: 157
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.872294 -> initscore=1.921399
[LightGBM] [Info] Start training from score 1.921399
💾 Saved baseline -> pickle_models/ADME_HIA_Hou_baseline.pkl
[LightGBM] [Info] Number of positive: 403, number of negative: 59
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001245 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 82
[LightGBM] [Info] Number of data points in the train set: 462, numb



[LightGBM] [Info] Number of positive: 527, number of negative: 447
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012205 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 792
[LightGBM] [Info] Number of data points in the train set: 974, number of used features: 396
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.541068 -> initscore=0.164642
[LightGBM] [Info] Start training from score 0.164642




💾 Saved baseline -> pickle_models/ADME_Pgp_Broccatelli_baseline.pkl
[LightGBM] [Info] Number of positive: 527, number of negative: 447
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008151 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 648
[LightGBM] [Info] Number of data points in the train set: 974, number of used features: 324
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.541068 -> initscore=0.164642
[LightGBM] [Info] Start training from score 0.164642
💾 Saved tuned -> pickle_models/ADME_Pgp_Broccatelli_tuned.pkl

⚡ Training ADME.BBB_Martins (binary)




[LightGBM] [Info] Number of positive: 1248, number of negative: 376
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022615 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1442
[LightGBM] [Info] Number of data points in the train set: 1624, number of used features: 721
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.768473 -> initscore=1.199708
[LightGBM] [Info] Start training from score 1.199708




💾 Saved baseline -> pickle_models/ADME_BBB_Martins_baseline.pkl
[LightGBM] [Info] Number of positive: 1248, number of negative: 376
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034257 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1910
[LightGBM] [Info] Number of data points in the train set: 1624, number of used features: 955
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.768473 -> initscore=1.199708
[LightGBM] [Info] Start training from score 1.199708




💾 Saved tuned -> pickle_models/ADME_BBB_Martins_tuned.pkl

⚡ Training ADME.CYP2C9_Veith (binary)
[LightGBM] [Info] Number of positive: 3226, number of negative: 6448
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062757 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3906
[LightGBM] [Info] Number of data points in the train set: 9674, number of used features: 1953
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333471 -> initscore=-0.692527
[LightGBM] [Info] Start training from score -0.692527




💾 Saved baseline -> pickle_models/ADME_CYP2C9_Veith_baseline.pkl
[LightGBM] [Info] Number of positive: 3226, number of negative: 6448
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053778 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3344
[LightGBM] [Info] Number of data points in the train set: 9674, number of used features: 1672
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333471 -> initscore=-0.692527
[LightGBM] [Info] Start training from score -0.692527
💾 Saved tuned -> pickle_models/ADME_CYP2C9_Veith_tuned.pkl

⚡ Training ADME.CYP2D6_Veith (binary)




[LightGBM] [Info] Number of positive: 2026, number of negative: 8478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063347 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3954
[LightGBM] [Info] Number of data points in the train set: 10504, number of used features: 1977
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.192879 -> initscore=-1.431411
[LightGBM] [Info] Start training from score -1.431411




💾 Saved baseline -> pickle_models/ADME_CYP2D6_Veith_baseline.pkl
[LightGBM] [Info] Number of positive: 2026, number of negative: 8478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061928 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3828
[LightGBM] [Info] Number of data points in the train set: 10504, number of used features: 1914
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.192879 -> initscore=-1.431411
[LightGBM] [Info] Start training from score -1.431411
💾 Saved tuned -> pickle_models/ADME_CYP2D6_Veith_tuned.pkl

⚡ Training ADME.CYP3A4_Veith (binary)




[LightGBM] [Info] Number of positive: 4093, number of negative: 5769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061183 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3922
[LightGBM] [Info] Number of data points in the train set: 9862, number of used features: 1961
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.415027 -> initscore=-0.343221
[LightGBM] [Info] Start training from score -0.343221




💾 Saved baseline -> pickle_models/ADME_CYP3A4_Veith_baseline.pkl
[LightGBM] [Info] Number of positive: 4093, number of negative: 5769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040091 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2636
[LightGBM] [Info] Number of data points in the train set: 9862, number of used features: 1318
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.415027 -> initscore=-0.343221
[LightGBM] [Info] Start training from score -0.343221
💾 Saved tuned -> pickle_models/ADME_CYP3A4_Veith_tuned.pkl

⚡ Training ADME.CYP2C9_Substrate_CarbonMangels (binary)




[LightGBM] [Info] Number of positive: 114, number of negative: 421
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 422
[LightGBM] [Info] Number of data points in the train set: 535, number of used features: 211
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.213084 -> initscore=-1.306434
[LightGBM] [Info] Start training from score -1.306434
💾 Saved baseline -> pickle_models/ADME_CYP2C9_Substrate_CarbonMangels_baseline.pkl
[LightGBM] [Info] Number of positive: 114, number of negative: 421
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002479 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 140
[LightGBM] [Info] Number of data points



💾 Saved tuned -> pickle_models/ADME_CYP2C9_Substrate_CarbonMangels_tuned.pkl

⚡ Training ADME.CYP2D6_Substrate_CarbonMangels (binary)
[LightGBM] [Info] Number of positive: 160, number of negative: 374
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006787 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 418
[LightGBM] [Info] Number of data points in the train set: 534, number of used features: 209
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.299625 -> initscore=-0.849082
[LightGBM] [Info] Start training from score -0.849082
💾 Saved baseline -> pickle_models/ADME_CYP2D6_Substrate_CarbonMangels_baseline.pkl
[LightGBM] [Info] Number of positive: 160, number of negative: 374
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023062 seconds.
You can set `force_row_wise=true` to remove the overhead.
An



💾 Saved tuned -> pickle_models/ADME_CYP2D6_Substrate_CarbonMangels_tuned.pkl

⚡ Training ADME.CYP3A4_Substrate_CarbonMangels (binary)




[LightGBM] [Info] Number of positive: 288, number of negative: 248
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006956 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 424
[LightGBM] [Info] Number of data points in the train set: 536, number of used features: 212
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.537313 -> initscore=0.149532
[LightGBM] [Info] Start training from score 0.149532
💾 Saved baseline -> pickle_models/ADME_CYP3A4_Substrate_CarbonMangels_baseline.pkl
[LightGBM] [Info] Number of positive: 288, number of negative: 248
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006384 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 310
[LightGBM] [Info] Number of data points i



[LightGBM] [Info] Number of positive: 353, number of negative: 171
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007347 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 524, number of used features: 255
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.673664 -> initscore=0.724805
[LightGBM] [Info] Start training from score 0.724805
💾 Saved baseline -> pickle_models/Tox_hERG_baseline.pkl
[LightGBM] [Info] Number of positive: 353, number of negative: 171
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020797 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1990
[LightGBM] [Info] Number of data points in the train set: 524, numb



💾 Saved tuned -> pickle_models/Tox_hERG_tuned.pkl

⚡ Training Tox.AMES (binary)




[LightGBM] [Info] Number of positive: 3176, number of negative: 2646
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042210 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2768
[LightGBM] [Info] Number of data points in the train set: 5822, number of used features: 1384
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.545517 -> initscore=0.182573
[LightGBM] [Info] Start training from score 0.182573




💾 Saved baseline -> pickle_models/Tox_AMES_baseline.pkl
[LightGBM] [Info] Number of positive: 3176, number of negative: 2646
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.060154 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4034
[LightGBM] [Info] Number of data points in the train set: 5822, number of used features: 2017
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.545517 -> initscore=0.182573
[LightGBM] [Info] Start training from score 0.182573
💾 Saved tuned -> pickle_models/Tox_AMES_tuned.pkl

⚡ Training Tox.DILI (binary)




[LightGBM] [Info] Number of positive: 188, number of negative: 192
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003907 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 236
[LightGBM] [Info] Number of data points in the train set: 380, number of used features: 118
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494737 -> initscore=-0.021053
[LightGBM] [Info] Start training from score -0.021053
💾 Saved baseline -> pickle_models/Tox_DILI_baseline.pkl
[LightGBM] [Info] Number of positive: 188, number of negative: 192
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003285 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 346
[LightGBM] [Info] Number of data points in the train set: 380, num



💾 Saved tuned -> pickle_models/Tox_DILI_tuned.pkl

⚡ Training Tox.LD50_Zhu (regression)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.044300 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2904
[LightGBM] [Info] Number of data points in the train set: 5908, number of used features: 1452
[LightGBM] [Info] Start training from score 2.544500




💾 Saved baseline -> pickle_models/Tox_LD50_Zhu_baseline.pkl
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061423 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4064
[LightGBM] [Info] Number of data points in the train set: 5908, number of used features: 2032
[LightGBM] [Info] Start training from score 2.544500
💾 Saved tuned -> pickle_models/Tox_LD50_Zhu_tuned.pkl




Unnamed: 0,Dataset,Type,mae,rmse,r2,spearman,roc_auc,pr_auc,f1_at_0.5
0,ADME.Caco2_Wang,Baseline,0.361714,0.501975,0.60333,0.724463,,,
1,ADME.Caco2_Wang,Tuned,0.388012,0.518714,0.576434,0.758632,,,
2,ADME.Lipophilicity_AstraZeneca,Baseline,0.602564,0.780147,0.588061,0.746071,,,
3,ADME.Lipophilicity_AstraZeneca,Tuned,0.703581,0.900817,0.450771,0.68807,,,
4,ADME.Solubility_AqSolDB,Baseline,0.985724,1.320107,0.678868,0.814132,,,
5,ADME.Solubility_AqSolDB,Tuned,0.948533,1.284479,0.695968,0.826097,,,
6,ADME.PPBR_AZ,Baseline,9.244442,13.917036,0.200445,0.549274,,,
7,ADME.PPBR_AZ,Tuned,10.312062,14.456788,0.137223,0.487678,,,
8,ADME.VDss_Lombardo,Baseline,4.886325,10.033694,-1.156592,0.441098,,,
9,ADME.VDss_Lombardo,Tuned,3.987308,6.701157,0.038065,0.468042,,,


✅ Done. Baseline + tuned models saved in pickle_models/, metrics in tdc_lgbm_final_metrics.csv
