In [19]:
from abc import ABC, abstractmethod
from datetime import datetime
from typing import Callable, Iterable

import numpy as np
import optuna.trial
from sklearn.model_selection import ParameterGrid

from qsprpred.models.assessment.methods import ModelAssessor
from qsprpred.data import QSPRDataset
from qsprpred.logs import logger
from qsprpred.models.model import QSPRModel
from qsprpred.models.monitors import BaseMonitor, HyperparameterOptimizationMonitor
from qsprpred.models.hyperparam_optimization import HyperparameterOptimization

In [2]:
%load_ext autoreload
%autoreload 2

import os
import importlib
import pandas as pd 
import json
from qsprpred.data import QSPRDataset, RandomSplit
from qsprpred.models import QSPRModel
from qsprpred.data.descriptors.fingerprints import MorganFP
from qsprpred.data.descriptors.sets import SmilesDesc
from qsprpred.models import OptunaOptimization, TestSetAssessor, CrossValAssessor, SklearnModel
from qsprpred.data.sampling.splits import DataSplit
from qsprpred.data.processing.data_filters import RepeatsFilter, CategoryFilter
from qsprpred.models import EarlyStoppingMode
from sklearn.ensemble import GradientBoostingClassifier
from qsprpred.extra.gpu.models.chemprop import ChempropModel

from qsprpred.extra.gpu.models.dnn import DNNModel


import numpy as np
from typing import Iterable, List, Tuple

from typing import Literal
import sys
#sys.path.insert(0, '/home/ubuntu/implementation/QSPRpred')

#import qsprpred.extra.gpu.models.gdnn as gdnn_module
#from importlib import reload
#
#reload(gdnn_module)

modname = 'qsprpred.extra.gpu.models.gdnn'
# Evict any cached copy so the import below executes the module afresh
# (picks up local edits to gdnn.py without restarting the kernel).
sys.modules.pop(modname, None)

import qsprpred.extra.gpu.models.gdnn as gdnn_module
importlib.reload(gdnn_module)
# Bind GGNN only AFTER the reload: reloading re-executes the module and
# rebinds its classes, so a name imported before the reload would point at
# a stale class object that no longer matches gdnn_module.GGNN.
from qsprpred.extra.gpu.models.gdnn import GGNN

#from qsprpred.extra.gpu.models.gdnn import DNNModel, GGNN
#print(DNNModel.__init__.__code__.co_varnames)

Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality


<module 'qsprpred.extra.gpu.models.gdnn' from '/home/brokesm/anaconda3/lib/python3.12/site-packages/qsprpred/extra/gpu/models/gdnn.py'>

In [4]:
#create a folder structure

# Create every output directory the notebook writes to; existing ones are kept.
for _output_dir in (
    "./output/models",
    "./output/benchmarking/data",
    "./output/optimization/data",
):
    os.makedirs(_output_dir, exist_ok=True)

In [5]:
# Define a CustomSplit class (inherits from DataSplit).
# Input:  lists of QSPRDataset IDs that should form the test set(s).
# Output: (train, test) index splits.

class CustomSplit(DataSplit):
    """Split driven by explicit lists of test-set IDs.

    Every entry of ``test_ids`` describes one fold: rows whose index label is
    in that entry form the test part, all remaining rows form the train part.
    """

    def __init__(self, test_ids: list[list[str]]):
        """Store the per-fold lists of test IDs.

        Args:
            test_ids (list[list[str]]): one list of index labels per fold.
        """
        super().__init__()
        self.test_ids = test_ids

    def split(
        self,
        X: np.ndarray | pd.DataFrame,
        y: np.ndarray | pd.DataFrame | pd.Series
    ) -> Iterable[tuple[list[int], list[int]]]:
        """Build positional (train, test) index pairs, one per stored ID list.

        Membership is decided via ``X.index``, so ``X`` has to expose a
        pandas-style index. ``y`` is accepted for splitter compatibility
        but is not used.

        Returns:
            list of ``(train_positions, test_positions)`` tuples, in the same
            order as ``self.test_ids``.
        """
        # One boolean membership mask per fold; its complement is the train part.
        fold_masks = (X.index.isin(fold_ids) for fold_ids in self.test_ids)
        return [
            (np.where(~in_test)[0], np.where(in_test)[0])
            for in_test in fold_masks
        ]

In [6]:
def select_ids(dataset_name, keep_ids):
    """Turn raw molecule numbers into dataset-prefixed ID strings.

    Each ID is rendered as ``"<dataset_name>_<number>"`` with the number
    left-padded with zeros to at least four characters (longer numbers are
    kept as they are).

    Args:
        dataset_name: prefix placed before the underscore.
        keep_ids: iterable of numbers (or strings) to convert.

    Returns:
        list[str]: one formatted ID per element of ``keep_ids``, same order.
    """
    # rjust pads exactly like the manual "0" * (4 - len) construction, and the
    # single-quoted fill character keeps the f-string valid before Python 3.12.
    return [f"{dataset_name}_{str(mol_id).rjust(4, '0')}" for mol_id in keep_ids]

In [7]:
def data_loading(
        target:Literal["P00918","P03372","P04637","P08684","P14416","P22303","P42336","Q12809","Q16637","Q9Y468"],
        purpose:Literal["ForOptimization","ForBenchmarking"],
        model:QSPRModel | None = None,
        save: bool = True
        ) -> QSPRDataset:
    """Create the QSPRDataset for one target from its CSV file.

    The table is read from ``./papyrus_datasets/<target>.csv``. The dataset is
    named ``<purpose>_<target>`` and stored under ``./output/<purpose without
    the leading "For", lower-cased>/data``.

    Args:
        target: target identifier; selects the CSV file.
        purpose: "ForOptimization" or "ForBenchmarking"; selects name and
            storage directory.
        model: optional model used only to pick the descriptors. When it
            reports ``supportsEarlyStopping`` a ``SmilesDesc`` is added,
            otherwise a default ``MorganFP``. With ``None`` no descriptors
            are added.
        save: whether to call ``dataset.save()`` before returning.

    Returns:
        QSPRDataset: the created dataset.
    """
    dataset_name = f"{purpose}_{target}"
    # "ForOptimization" -> "optimization", "ForBenchmarking" -> "benchmarking"
    store_dir = f"./output/{purpose[3:].lower()}/data"

    dataset = QSPRDataset.fromTableFile(
        filename=f"./papyrus_datasets/{target}.csv",
        sep=",",
        store_dir=store_dir,
        name=dataset_name,
        target_props=[{"name": "Y", "task": "SINGLECLASS", "th":"precomputed"}],
        random_state=42
    )

    if model is not None:
        if model.supportsEarlyStopping:
            # In case of GNNs (both support early stopping) add SmilesDesc as descriptors
            dataset.addDescriptors([SmilesDesc()])
        else:
            # In case of XGB (doesn't support early stopping) add MorganFP with default parameters as descriptors
            dataset.addDescriptors([MorganFP()])

    if save:
        dataset.save()

    return dataset


In [22]:
# Scratch check: np.unique drops the duplicate and .tolist() yields a plain list.
np.unique(["x","y","x"]).tolist()



['x', 'y']

In [None]:
class OptunaOptimization(HyperparameterOptimization):
    """Class for hyperparameter optimization of QSPRModels using Optuna.

    Unlike a plain "return the best parameters" optimizer, ``optimize`` in this
    class returns a *narrowed search space* derived from the best trials, in
    the same format as ``param_grid``, so it can seed a follow-up round.
    The best parameters and score remain available as ``bestParams`` and
    ``bestScore``.

    Attributes:
        nTrials (int):
            number of trials for bayes optimization
        nJobs (int):
            number of jobs to run in parallel. At the moment only n_jobs=1 is supported.
        bestScore (float):
            best score found during optimization
        bestParams (dict):
            best parameters found during optimization

    Example of OptunaOptimization for scikit-learn's MLPClassifier:
        >>> model = SklearnModel(base_dir=".",
        >>>                     alg = MLPClassifier(), alg_name="MLP")
        >>> search_space = {
        >>>    "learning_rate_init": ["float", 1e-5, 1e-3,],
        >>>    "power_t" : ["discrete_uniform", 0.2, 0.8, 0.1],
        >>>    "momentum": ["float", 0.0, 1.0],
        >>> }
        >>> optimizer = OptunaOptimization(
        >>>     param_grid=search_space,
        >>>     model_assessor=CrossValAssessor(scoring="average_precision"),
        >>>     n_trials=10
        >>> )
        >>> narrowed_space = optimizer.optimize(model, dataset) # dataset is a QSPRDataset
        >>> best_params = optimizer.bestParams

    Available suggestion types:
        ["categorical", "discrete_uniform", "float", "int", "loguniform", "uniform"]
    """

    def __init__(
        self,
        param_grid: dict,
        model_assessor: ModelAssessor,
        score_aggregation: Callable[[Iterable], float] = np.mean,
        monitor: HyperparameterOptimizationMonitor | None = None,
        n_trials: int = 100,
        n_jobs: int = 1,
    ):
        """Initialize the class for hyperparameter optimization
        of QSPRModels using Optuna.

        Args:
            param_grid (dict):
                search space for bayesian optimization, keys are the parameter names,
                values are lists with first element the type of the parameter and the
                following elements the parameter bounds or values.
            model_assessor (ModelAssessor):
                assessment method to use for the optimization
            score_aggregation (Callable):
                function to aggregate the scores of different folds if the assessment
                method returns multiple predictions
            monitor (HyperparameterOptimizationMonitor):
                monitor for the optimization, if None, a BaseMonitor is used
            n_trials (int):
                number of trials for bayes optimization
            n_jobs (int):
                number of jobs to run in parallel.
                At the moment only n_jobs=1 is supported.

        Raises:
            ValueError: if any entry of ``param_grid`` is empty or starts with
                an unknown suggestion type.
        """
        super().__init__(param_grid, model_assessor, score_aggregation, monitor)
        if monitor is None:
            self.monitor = BaseMonitor()
        search_space_types = [
            "categorical",
            "discrete_uniform",
            "float",
            "int",
            "loguniform",
            "uniform",
        ]
        # len() guard: an empty spec is reported as an invalid search space
        # instead of surfacing as an unrelated IndexError.
        if not all(
            len(v) > 0 and v[0] in search_space_types for v in param_grid.values()
        ):
            logger.error(
                f"Search space {param_grid} is missing or has invalid search type(s), "
                "see OptunaOptimization docstring for example."
            )
            raise ValueError(
                "Search space for optuna optimization is missing or "
                "has invalid search type(s)."
            )

        self.nTrials = n_trials
        self.nJobs = n_jobs
        if self.nJobs > 1:
            logger.warning(
                "At the moment n_jobs>1 not available for bayes optimization, "
                "n_jobs set to 1."
            )
            self.nJobs = 1
        self.bestScore = -np.inf
        self.bestParams = None
        self.config.update(
            {
                "n_trials": n_trials,
                "n_jobs": n_jobs,
            }
        )

    def optimize(
        self,
        model: QSPRModel,
        ds: QSPRDataset,
        save_params: bool = True,
        refit_optimal: bool = False,
        **kwargs,
    ) -> dict:
        """Bayesian optimization of hyperparameters using optuna.

        Args:
            model (QSPRModel): the model to optimize
            ds (QSPRDataset): dataset to use for the optimization
            save_params (bool):
                whether to set and save the best parameters to the model
                after optimization
            refit_optimal (bool):
                Whether to refit the model with the optimal parameters on the
                entire training set after optimization. This implies 'save_params=True'.
            **kwargs: accepted for interface compatibility; NOTE: they are
                currently not forwarded to ``objective``.

        Returns:
            dict: search space narrowed to the range spanned by the best
                trials (same format as ``param_grid``). The best parameters
                themselves are stored in ``self.bestParams``.
        """
        import optuna

        self.monitor.onOptimizationStart(
            model, ds, self.config, self.__class__.__name__
        )

        logger.info(
            "Bayesian optimization can take a while "
            "for some hyperparameter combinations"
        )
        # create optuna study
        study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=model.randomState),
        )
        logger.info(
            "Bayesian optimization started: %s"
            % datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        )
        study.optimize(
            lambda t: self.objective(t, model, ds), self.nTrials, n_jobs=self.nJobs
        )
        logger.info(
            "Bayesian optimization ended: %s"
            % datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        )
        # save the best study
        trial = study.best_trial
        # derive the search space for a follow-up round from the best trials
        next_iter = self._narrowSearchSpace(study.get_trials())

        # log the best study
        logger.info("Bayesian optimization best params: %s" % trial.params)
        # save the best score and parameters
        self.bestScore = trial.value
        self.bestParams = trial.params

        self.monitor.onOptimizationEnd(self.bestScore, self.bestParams)
        # save the best parameters to the model if requested
        self.saveResults(model, ds, save_params, refit_optimal)
        return next_iter

    def _narrowSearchSpace(self, trials: list, top_n: int = 10) -> dict:
        """Shrink the search space to the region covered by the best trials.

        Args:
            trials (list): trials of a finished single-objective study
            top_n (int): number of best trials to take into account

        Returns:
            dict: for every parameter of ``self.paramGrid`` a spec of the same
                suggestion type; numeric bounds become the min/max observed
                among the best trials (extra elements such as the step of
                "discrete_uniform" are kept), categorical choices become the
                distinct observed values.
        """
        from optuna.trial import TrialState

        # Failed/pruned trials carry no comparable objective value.
        completed = [t for t in trials if t.state == TrialState.COMPLETE]
        # The study maximizes, so the best trials have the HIGHEST values.
        best_first = sorted(completed, key=lambda t: t.value, reverse=True)[:top_n]

        narrowed = {}
        for name, spec in self.paramGrid.items():
            observed = [t.params[name] for t in best_first if name in t.params]
            if not observed:
                # nothing to learn from: keep the original specification
                narrowed[name] = list(spec)
            elif spec[0] == "categorical":
                # dict.fromkeys de-duplicates while keeping native Python
                # types and tolerating None among the choices
                narrowed[name] = [spec[0], list(dict.fromkeys(observed))]
            else:
                # spec[3:] carries the step of "discrete_uniform" (if any)
                narrowed[name] = [spec[0], min(observed), max(observed), *spec[3:]]
        return narrowed

    def objective(
        self, trial: optuna.trial.Trial, model: QSPRModel, ds: QSPRDataset, **kwargs
    ) -> float:
        """Objective for bayesian optimization.

        Arguments:
            trial (optuna.trial.Trial): trial object for the optimization
            model (QSPRModel): the model to optimize
            ds (QSPRDataset): dataset to use for the optimization
            **kwargs: additional arguments for the assessment method

        Returns:
            float: score of the model with the current parameters
        """
        bayesian_params = {}
        # get the suggested parameters for the current trial
        for key, value in self.paramGrid.items():
            if value[0] == "categorical":
                bayesian_params[key] = trial.suggest_categorical(key, value[1])
            elif value[0] == "discrete_uniform":
                bayesian_params[key] = trial.suggest_float(
                    key, value[1], value[2], step=value[3]
                )
            elif value[0] == "float":
                bayesian_params[key] = trial.suggest_float(key, value[1], value[2])
            elif value[0] == "int":
                bayesian_params[key] = trial.suggest_int(key, value[1], value[2])
            elif value[0] == "loguniform":
                bayesian_params[key] = trial.suggest_float(
                    key, value[1], value[2], log=True
                )
            elif value[0] == "uniform":
                bayesian_params[key] = trial.suggest_float(key, value[1], value[2])
        self.monitor.onIterationStart(bayesian_params)
        # assess the model with the current parameters and return the score
        scores = self.runAssessment(
            model,
            ds=ds,
            save=False,
            parameters=bayesian_params,
            monitor=self.monitor,
            **kwargs,
        )
        score = self.scoreAggregation(scores)
        logger.info(bayesian_params)
        logger.info(f"Score: {score}, std: {np.std(scores)}")
        self.monitor.onIterationEnd(score, list(scores))
        return score

In [13]:
# Scratch experiment: run a tiny Optuna study to inspect the trial objects.
import optuna


def objective(trial):
    """Toy objective: sample ``x`` from [-1, 1] and return its square."""
    x = trial.suggest_float("x", -1, 1)
    return x**2


# No direction is passed here, so the study uses Optuna's default direction
# (documented as "minimize") -- unlike the maximizing study created in
# OptunaOptimization.optimize.
study = optuna.create_study()
study.optimize(objective, n_trials=10)

# Keep all trials for inspection in the next cell.
trials = study.get_trials()


[I 2025-12-07 09:20:27,245] A new study created in memory with name: no-name-a709f69c-0a9e-4dfe-b981-cc633e77e08e
[I 2025-12-07 09:20:27,249] Trial 0 finished with value: 0.6826095879557307 and parameters: {'x': -0.8262019050787348}. Best is trial 0 with value: 0.6826095879557307.
[I 2025-12-07 09:20:27,252] Trial 1 finished with value: 0.8902280868283088 and parameters: {'x': 0.9435189912388138}. Best is trial 0 with value: 0.6826095879557307.
[I 2025-12-07 09:20:27,255] Trial 2 finished with value: 1.1463660161770181e-06 and parameters: {'x': -0.0010706848351298426}. Best is trial 2 with value: 1.1463660161770181e-06.
[I 2025-12-07 09:20:27,257] Trial 3 finished with value: 0.3141936180281509 and parameters: {'x': -0.5605297655148662}. Best is trial 2 with value: 1.1463660161770181e-06.
[I 2025-12-07 09:20:27,258] Trial 4 finished with value: 0.23963530551946446 and parameters: {'x': -0.4895255923028585}. Best is trial 2 with value: 1.1463660161770181e-06.
[I 2025-12-07 09:20:27,259]

In [14]:
# Ascending sort on the objective values: the smallest values come first,
# which is best-first only for a minimizing study such as the one above.
sorted(trials,key=lambda x: x.values)[:2]

[FrozenTrial(number=2, state=<TrialState.COMPLETE: 1>, values=[1.1463660161770181e-06], datetime_start=datetime.datetime(2025, 12, 7, 9, 20, 27, 253112), datetime_complete=datetime.datetime(2025, 12, 7, 9, 20, 27, 253885), params={'x': -0.0010706848351298426}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'x': FloatDistribution(high=1.0, log=False, low=-1.0, step=None)}, trial_id=2, value=None),
 FrozenTrial(number=7, state=<TrialState.COMPLETE: 1>, values=[0.10486909076884882], datetime_start=datetime.datetime(2025, 12, 7, 9, 20, 27, 263159), datetime_complete=datetime.datetime(2025, 12, 7, 9, 20, 27, 263718), params={'x': 0.3238349745917646}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'x': FloatDistribution(high=1.0, log=False, low=-1.0, step=None)}, trial_id=7, value=None)]

In [None]:
def hyperparameter_optimization(
        model:QSPRModel,
        dataset:QSPRDataset,
        search_space:dict,
        scoring:str,
        val_ids:List,
        test_ids:List,
        n_trials:int = 100
        ):
    """Run an Optuna hyperparameter search for ``model`` on ``dataset``.

    Args:
        model: model whose hyperparameters are optimized.
        dataset: dataset to optimize on; it is (re)prepared here with a
            split built from ``val_ids``.
        search_space: Optuna search space in ``OptunaOptimization`` format.
        scoring: name of the metric passed to ``CrossValAssessor``.
        val_ids: dataset IDs used for the ``prepareDataset`` split.
        test_ids: dataset IDs used for the ``CrossValAssessor`` split.
        n_trials: number of Optuna trials (defaults to the previously
            hard-coded 100).

    Returns:
        Whatever ``OptunaOptimization.optimize`` returns.
    """
    # Author's note (translated): TODO fix -- the first split (CrossValAssessor)
    # divides the original dataset into train/test; the subsequent dataset
    # split is train/val (the val set is only for early stopping within the
    # cross-validation assessor). Probably done already.
    gridsearcher = OptunaOptimization(
        n_trials=n_trials,
        param_grid=search_space,
        model_assessor=CrossValAssessor(scoring=scoring, split=CustomSplit([test_ids])),
    )

    dataset.prepareDataset(
        split=CustomSplit([val_ids])
    )
    # Hand the optimizer's result back to the caller instead of dropping it.
    return gridsearcher.optimize(model, dataset)


In [None]:
# GGNN model with its starting hyperparameters.
model_ggnn = gdnn_module.DNNModel(
    base_dir="./output/models/GGNN",
    name="GGNNModel",
    parameters=dict(
        n_epochs=100,
        out_feats=74,  # 74-256 for example
        in_feats=74,
        steps=3,
        n_hidden_layers=2,
        dropout_rate=0.2,
        optim_lr=1e-4,
        batch_size=128,
    ),
    tol=0.01,
    random_state=42,
    patience=50,
)

# Optuna search space for the GGNN; in_feats is pinned to 74 (equal bounds).
search_space_ggnn = {
    "n_hidden_layers": ["int", 1, 6],
    "dropout_rate": ["float", 0.05, 0.5],
    "steps": ["int", 2, 5],
    "batch_size": ["categorical", [32, 64, 128, 256]],
    "out_feats": ["int", 74, 256],
    "in_feats": ["int", 74, 74],
}



In [None]:
# Chemprop model with its starting hyperparameters.
model_chemprop = ChempropModel(
    base_dir="./output/models/Chemprop",
    name="ChempropModel",
    parameters=dict(epochs=5, loss_function="binary_cross_entropy"),
    quiet_logger=False,
)

# Optuna search space for Chemprop: only the number of epochs is tuned.
search_space_chemprop = {"epochs": ["int", 1, 5]}

In [11]:
# Gradient-boosting baseline (scikit-learn GradientBoostingClassifier).
model_xgb = SklearnModel(
    name="XGBModel",
    alg=GradientBoostingClassifier,
    base_dir="./output/models/XGB",
    parameters=dict(max_depth=2, n_estimators=10),
)

# Optuna search space for the gradient-boosting baseline.
search_space_xgb = {
    "max_depth": ["int", 2, 10],
    "n_estimators": ["int", 5, 500],
}

In [12]:
def set_loader(
        target:Literal["P00918","P03372","P04637","P08684","P14416","P22303","P42336","Q12809","Q16637","Q9Y468"],
        split_type:Literal["random", "cluster", "aggregate_cluster"],
        seed,
        purpose:Literal["ForBenchmarking","ForOptimization"]
        ):
    """Load the train/valid/test ID lists for one target, split type and seed.

    The raw IDs are read from ``./papyrus_datasets/<split_type>_split.json``
    (nested as split type -> target -> seed -> subset) and converted with
    ``select_ids`` using the dataset name ``<purpose>_<target>``.

    Returns:
        tuple: ``(train_ids, val_ids, test_ids)``, each a list of ID strings.
    """
    with open(f"./papyrus_datasets/{split_type}_split.json") as file:
        all_splits = json.load(file)

    # The JSON keys are strings, so the seed is looked up by its string form.
    subsets = all_splits[split_type][target][str(seed)]
    dataset_name = f"{purpose}_{target}"

    train_ids, val_ids, test_ids = (
        select_ids(dataset_name, list(subsets[part]))
        for part in ("train", "valid", "test")
    )
    return train_ids, val_ids, test_ids

In [13]:
def optimize(
        target:Literal["P00918","P03372","P04637","P08684","P14416","P22303","P42336","Q12809","Q16637","Q9Y468"],
        split:Literal["random","cluster","aggregate_cluster"],
        model:QSPRModel,
        search_space:dict,
        seed = 0
        ):
    """Optimize the hyperparameters of ``model`` for one target and split type.

    Loads the "ForOptimization" dataset for ``target``, fetches the ID lists
    for the given split and seed, and runs ``hyperparameter_optimization``
    scored with "matthews_corrcoef". Nothing is returned.
    """
    purpose = "ForOptimization"
    dataset = data_loading(target, model=model, purpose=purpose)
    # The train IDs are not needed here; only the valid/test IDs are passed on.
    # (Restricting the dataset to train+valid IDs via a CategoryFilter is
    # currently disabled.)
    _, val_ids, test_ids = set_loader(target, split, seed, purpose=purpose)

    hyperparameter_optimization(
        model=model,
        dataset=dataset,
        search_space=search_space,
        scoring="matthews_corrcoef",
        val_ids=val_ids,
        test_ids=test_ids,
    )
        


In [3]:
# Show the parameters ChempropModel accepts, together with their descriptions.
ChempropModel.getAvailableParameters()

{'no_cuda': 'Turn off cuda (i.e., use CPU instead of GPU).',
 'gpu': 'Which GPU to use.',
 'num_workers': 'Number of workers for the parallel data loading (0 means sequential).',
 'batch_size': 'Batch size.',
 'no_cache_mol': 'Whether to not cache the RDKit molecule for each SMILES string to reduce memory usage (cached by default).',
 'empty_cache': 'Whether to empty all caches before training or predicting. This is necessary if multiple jobs are run within a single script and the atom or bond features change.',
 'loss_function': 'Choice of loss function. Loss functions are limited to compatible dataset types.',
 'metric': "Metric to use with the validation set for early stopping. Defaults to 'auc' for classification, 'rmse' for regression. Note. In Chemprop this metric is also used for test-set evaluation, but in QSPRpred this is determined by the scoring parameter in assessment.",
 'bias': 'Whether to add bias to linear layers.',
 'hidden_size': 'Dimensionality of hidden layers in MP

In [None]:
targets = ["P00918","P03372","P04637","P08684","P14416","P22303","P42336","Q12809","Q16637","Q9Y468"]
splits = ["random","cluster", "aggregate_cluster"]
models = [model_xgb, model_ggnn, model_chemprop]
search_spaces = [search_space_xgb, search_space_ggnn, search_space_chemprop]

# Optimize every model for every (target, split type) combination.
for target in targets:
    for split in splits:
        for model, search_space in zip(models, search_spaces):
            # Temporarily tag the model name with target and split so the
            # optimized model is stored under its own name.
            base_name = model.name
            model.name = f"{base_name}_{target}_{split}"
            try:
                optimize(target = target, split = split, model = model, search_space = search_space)
            finally:
                # Restore the exact original name even when optimize() raises,
                # so a failed run cannot leave a suffixed name behind.
                model.name = base_name

In [14]:
def get_model_params(
        target:Literal["P00918","P03372","P04637","P08684","P14416","P22303","P42336","Q12809","Q16637","Q9Y468"],
        split_type:Literal["random","cluster","aggregate_cluster"],
        model:Literal["XGB","GGNN","Chemprop"]
    ):
    """Read the stored parameters of a saved model from its meta file.

    The meta file is expected at
    ``./output/models/<model>/<model>Model_<target>_<split_type>/<model>Model_<target>_<split_type>_meta.json``.

    Returns:
        The value stored under ``["py/state"]["parameters"]`` in that JSON file.
    """
    model_name = f"{model}Model_{target}_{split_type}"
    meta_path = f"./output/models/{model}/{model_name}/{model_name}_meta.json"

    with open(meta_path) as f:
        meta = json.load(f)

    return meta["py/state"]["parameters"]

In [15]:
def prepare_for_benchmarking(dataset:QSPRDataset,descriptors, chemprop=False):
    """Attach descriptors to ``dataset`` and, for Chemprop, cast the targets.

    Args:
        dataset: dataset to modify in place.
        descriptors: a single descriptor set; it is wrapped in a list for
            ``addDescriptors``.
        chemprop: when true, the "Y" and "Y_original" properties are
            additionally transformed with ``np.float32``.
    """
    dataset.addDescriptors([descriptors])

    if not chemprop:
        return

    # binary cross entropy loss cannot deal with target variable being of type int
    dataset.transformProperties(["Y","Y_original"],transformer=np.float32)

In [20]:
def benchmark(
    target:Literal["P00918","P03372","P04637","P08684","P14416","P22303","P42336","Q12809","Q16637","Q9Y468"],
    split_type:Literal["random","cluster","aggregate_cluster"]
):
    """Benchmark the XGB, GGNN and Chemprop models on one target and split type.

    For every metric and every seed in 1..20, fresh copies of the saved
    "ForBenchmarking" dataset are loaded, the stored model parameters are
    applied, and each model is scored on the test IDs of that seed. The
    scores are written to
    ``./output/benchmarking/<target>/<split_type>/results.csv`` with the
    columns ``model``, ``metric``, ``score`` and ``seed``.

    Args:
        target: target identifier.
        split_type: name of the split definition to use.
    """
    os.makedirs(f"./output/benchmarking/{target}/{split_type}", exist_ok=True)

    # save the dataset corresponding to a given target
    data_loading(target,purpose="ForBenchmarking")
    dataset_path = f"./output/benchmarking/data/ForBenchmarking_{target}/ForBenchmarking_{target}_meta.json"

    results = {
        "model":[],
        "metric":[],
        "score":[],
        # appended last so the existing column order is unchanged
        "seed":[]
    }
    for metric in ["matthews_corrcoef","f1","recall","precision","roc_auc"]:
        # matthews_corrcoef is requested without probabilities, all others with
        proba = metric != "matthews_corrcoef"

        for seed in range(1,21):
            # get the ids for training, validation and test sets for a given combination of target + split + seed
            _, val_ids, test_ids = set_loader(target,split_type,seed=seed, purpose="ForBenchmarking")

            # every model gets its own fresh copy of the saved dataset
            dataset_xgb = QSPRDataset.fromFile(dataset_path)
            dataset_ggnn = QSPRDataset.fromFile(dataset_path)
            dataset_chemprop = QSPRDataset.fromFile(dataset_path)

            prepare_for_benchmarking(dataset_xgb,MorganFP())
            prepare_for_benchmarking(dataset_ggnn,SmilesDesc())
            prepare_for_benchmarking(dataset_chemprop,SmilesDesc(), chemprop=True)

            model_xgb.parameters = get_model_params(target,split_type,"XGB")
            model_ggnn.parameters = get_model_params(target,split_type,"GGNN")
            model_chemprop.parameters = get_model_params(target,split_type,"Chemprop")

            dataset_xgb.prepareDataset(split = CustomSplit([test_ids]))
            xgb_score = TestSetAssessor(scoring=metric, use_proba=proba)(model_xgb, dataset_xgb)
            results["model"].append("XGB")
            results["metric"].append(metric)
            results["score"].append(xgb_score.item())
            results["seed"].append(seed)

            # Author's note (translated): use a single CrossValAssessor here --
            # the best epoch is found on the validation set via early
            # stopping and the score is computed on the test set within the
            # same assessor call. (Already fixed.)
            # Both early-stopping models go through the same code path so the
            # validation IDs (never the test IDs) drive early stopping for each.
            for label, es_model, es_dataset in (
                ("GGNN", model_ggnn, dataset_ggnn),
                ("Chemprop", model_chemprop, dataset_chemprop),
            ):
                es_score = CrossValAssessor(
                    scoring=metric,
                    use_proba=proba,
                    mode=EarlyStoppingMode.RECORDING,
                    split=CustomSplit([test_ids]))(es_model, es_dataset,split=CustomSplit([val_ids]))
                print(f"Best epoch found for {label}: {es_model.earlyStopping.optimalEpochs}")
                results["model"].append(label)
                results["metric"].append(metric)
                results["score"].append(es_score.item())
                results["seed"].append(seed)

    pd.DataFrame(results).to_csv(f"./output/benchmarking/{target}/{split_type}/results.csv")

In [None]:
# import os

# from qsprpred.data import QSPRDataset, RandomSplit
# from qsprpred.data.descriptors.fingerprints import MorganFP
# from qsprpred.data.descriptors.sets import SmilesDesc

# # Create dataset
# dataset = QSPRDataset.fromTableFile(
#     filename="./papyrus_datasets/P00918.csv",
#     sep=",",
#     store_dir="./tutorial_output/data",
#     name="ChempropTutorialDataset",
#     target_props=[{"name": "Y", "task": "SINGLECLASS", "th":"precomputed"}],
#     random_state=42
# )

# # calculate compound features and split dataset into train and test
# feature_calculators = [SmilesDesc()]
# dataset.prepareDataset(
#     split=RandomSplit(test_fraction=0.2, dataset=dataset),
#     feature_calculators=feature_calculators)

# dataset.getDF().head()

Unnamed: 0_level_0,SMILES,Y,QSPRID,Y_original
QSPRID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ChempropTutorialDataset_0000,CC(=O)OCC1OC(NS(=O)(=O)NO)C=CC1OC(C)=O,1,ChempropTutorialDataset_0000,1
ChempropTutorialDataset_0001,CC(=O)OCC1OC(CC(=O)C=Cc2cccc(O)c2)C(OC(C)=O)C(...,0,ChempropTutorialDataset_0001,0
ChempropTutorialDataset_0002,COc1ccc(CNc2ccc(S(N)(=O)=O)cc2)cc1,1,ChempropTutorialDataset_0002,1
ChempropTutorialDataset_0003,CN(C)C=Nc1ncnc2c1ncn2CC(=O)Nc1ccc(S(N)(=O)=O)cc1,1,ChempropTutorialDataset_0003,1
ChempropTutorialDataset_0004,Cc1sc(-c2noc(N)c2S(N)(=O)=O)cc1S(N)(=O)=O,1,ChempropTutorialDataset_0004,1


In [21]:
# Run the benchmark for every combination of target and split type.
for target in targets:
    for split in splits:
        benchmark(target,split)



GGNN updated


KeyError: 'out_feats'

In [None]:
# 'C:\\Users\\marti\\AppData\\Roaming\\Python\\Python312\\site-packages'