In [1]:
import os
import sys
sys.path.append('../')
import numpy as np

from MolRep import MolRep
from MolRep.Utils.logger import Logger
from MolRep.Utils.config_from_dict import Config
from MolRep.Experiments.experiments import EndToEndExperiment

In [2]:
# --- Experiment arguments -------------------------------------------------
DATASET_NAME = 'BBBP'
MODEL_NAME = 'MorganFP'  # other names left by the author: 'MolecularFingerprint', 'CMPNN'
_FOLDS = 5

# --- Locations (must be set for your checkout) ----------------------------
# Directory holding the model configuration files, e.g. config_CMPNN.yml.
MODEL_CONFIG_DIR = '../MolRep/Configs'
# Directory holding the datasets downloaded from Google Drive.
DATASET_DIR = '../MolRep/Datasets'
OUTPUT_DIR = '../Outputs/'

# --- Names used when laying out the output folder -------------------------
_CONFIG_BASE = 'config_'
_CONFIG_FILENAME = 'config_results.json'

In [3]:
# Ask MolRep to assemble the dataset together with the model configurations
# for the chosen dataset/model pair. The call returns five objects that the
# following cells rely on, including the experiment output path `exp_path`.
(
    dataset_config,
    dataset,
    model_configurations,
    model_selector,
    exp_path,
) = MolRep.construct_dataset(
    dataset_name=DATASET_NAME,
    model_name=MODEL_NAME,
    inner_k=_FOLDS,
    config_dir=MODEL_CONFIG_DIR,
    datasets_dir=DATASET_DIR,
    output_dir=OUTPUT_DIR,
)

In [4]:
# Index into `model_configurations` of the hyper-parameter combination to run
# (the original author notes there are more than 100 combinations).
config_id = 0

# Output layout:
#   <exp_path>/<_FOLDS>_FOLD_MS/<_CONFIG_BASE><config_id + 1>/<_CONFIG_FILENAME>
KFOLD_FOLDER = os.path.join(exp_path, str(_FOLDS) + '_FOLD_MS')
exp_config_name = os.path.join(KFOLD_FOLDER, _CONFIG_BASE + str(config_id + 1))
config_filename = os.path.join(exp_config_name, _CONFIG_FILENAME)

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(exp_config_name, exist_ok=True)

In [5]:
# Pick the hyper-parameter combination selected by `config_id`.
config = model_configurations[config_id]

# Individual settings may be overridden here before training, for example:
#   config['device'] = 'cpu'
#   config['batch_size'] = 32

# Open the experiment log in write mode and record the configuration in use.
log_file = os.path.join(exp_config_name, 'experiment.log')
logger = Logger(str(log_file), mode='w')
logger.log(f"Configuration: {config!s}")

Configuration: {'model': 'MorganFP', 'device': 'cuda', 'batch_size': 32, 'learning_rate': 0.01, 'l2': 0.0, 'num_epochs': 50, 'num_lrs': 1, 'optimizer': 'Adam', 'scheduler': {'class': 'NoamLR', 'args': {'warmup_epochs': [2.0], 'step_size': 10, 'max_lr': [0.0001], 'init_lr': [1e-05], 'final_lr': [1e-05]}}, 'gradient_clipping': None, 'early_stopper': {'class': 'Patience', 'args': {'patience': 500, 'use_loss': True}}, 'shuffle': True, 'hidden_units': 128}


In [6]:
# Result record for the k-fold run: the configuration used, one (initially
# empty) dict per fold, and zero-initialised training (TR) / validation (VL)
# summary statistics that are filled in after training.
k_fold_dict = dict(
    config=config,
    folds=[dict() for _ in range(_FOLDS)],
    avg_TR_score=0.0,
    avg_VL_score=0.0,
    std_TR_score=0.0,
    std_VL_score=0.0,
)

In [7]:
# Run one training/validation experiment per fold, record the scores of each
# fold in `k_fold_dict`, then log the mean and standard deviation over folds.
dataset_getter = MolRep.construct_dataloader(dataset)
for k in range(_FOLDS):
    logger.log(f"Training in Fold: {k+1}")
    # Point the data getter at inner fold k before running the experiment.
    dataset_getter.set_inner_k(k)

    fold_exp_folder = os.path.join(exp_config_name, 'FOLD_' + str(k + 1))
    # Create the experiment object which will be responsible for running a specific experiment
    experiment = EndToEndExperiment(config, dataset_config, fold_exp_folder)

    # NOTE(review): the file name uses the 0-based fold index while the folder
    # and the log messages are 1-based; kept as-is so existing paths still match.
    model_path = os.path.join(fold_exp_folder, f"{MODEL_NAME}_{DATASET_NAME}_fold_{k}.pt")
    training_score, validation_score, validation_loss = experiment.run_valid(
        dataset_getter, logger, other={'model_path': model_path}
    )

    logger.log(f"{k+1} split, TR Score: {training_score!s} VL Score: {validation_score!s}")

    fold_record = k_fold_dict['folds'][k]
    fold_record['TR_score'] = training_score
    fold_record['VL_score'] = validation_score

# Collect the per-fold scores by iterating over the fold records directly
# instead of re-indexing with the loop variable.
tr_scores = np.array([fold['TR_score'] for fold in k_fold_dict['folds']])
vl_scores = np.array([fold['VL_score'] for fold in k_fold_dict['folds']])

k_fold_dict['avg_TR_score'] = tr_scores.mean()
k_fold_dict['std_TR_score'] = tr_scores.std()
k_fold_dict['avg_VL_score'] = vl_scores.mean()
k_fold_dict['std_VL_score'] = vl_scores.std()

# Real f-string with :.4f; the original put an `f` prefix on a %-formatted
# string that had no placeholders. The output text is unchanged.
log_str = (
    f"TR avg is {k_fold_dict['avg_TR_score']:.4f} std is {k_fold_dict['std_TR_score']:.4f}; "
    f"VL avg is {k_fold_dict['avg_VL_score']:.4f} std is {k_fold_dict['std_VL_score']:.4f}"
)
logger.log(log_str)

Training in Fold: 1
[TRAIN] Epoch: 1, train loss: 0.692826 train auc: 0.486420
[TRAIN] Metric:{'auc': 0.48642034177375115}
[VALID] Epoch: 1, valid loss: 0.573312 valid auc: 0.495658
[VALID] Metric:{'auc': 0.4956582474421414}
[TEST] Metric:None
- Elapsed time: 3.76s , Time estimation in a fold: 3.14min
[TRAIN] Epoch: 10, train loss: 0.573786 train auc: 0.769187
[TRAIN] Metric:{'auc': 0.7691865654305591}
[VALID] Epoch: 10, valid loss: 0.546169 valid auc: 0.736399
[VALID] Metric:{'auc': 0.7363989881828822}
[TEST] Metric:None
- Elapsed time: 0.98s , Time estimation in a fold: 0.82min
[TRAIN] Epoch: 20, train loss: 0.453149 train auc: 0.869016
[TRAIN] Metric:{'auc': 0.8690155970444274}
[VALID] Epoch: 20, valid loss: 0.519444 valid auc: 0.779590
[VALID] Metric:{'auc': 0.7795899875410579}
[TEST] Metric:None
- Elapsed time: 1.05s , Time estimation in a fold: 0.87min
[TRAIN] Epoch: 30, train loss: 0.377302 train auc: 0.928631
[TRAIN] Metric:{'auc': 0.928631397050133}
[VALID] Epoch: 30, valid lo

[TRAIN] Epoch: 50, train loss: 0.190212 train auc: 0.994269
[TRAIN] Metric:{'auc': 0.9942687892014819}
[VALID] Epoch: 50, valid loss: 0.490262 valid auc: 0.800091
[VALID] Metric:{'auc': 0.800090610488164}
[TEST] Metric:None
- Elapsed time: 0.95s , Time estimation in a fold: 0.79min
5 split, TR Score: 0.9941760709467858 VL Score: 0.800090610488164
TR avg is 0.8934 std is 0.1968; VL avg is 0.7450 std is 0.1227
