In [6]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt

def calculate_metrics(true_values, pred_values):
    """Score predictions against ground truth with three regression metrics.

    Args:
        true_values: Ground-truth targets, forwarded as the first argument
            to each scikit-learn metric function.
        pred_values: Predicted targets, forwarded as the second argument.

    Returns:
        dict: ``"mse"`` (mean squared error), ``"mae"`` (mean absolute error)
        and ``"r_score"`` (R^2), each rounded to 3 decimal places.
    """
    # (result key, metric function) pairs, evaluated in this order.
    scorers = (
        ("mse", mean_squared_error),
        ("mae", mean_absolute_error),
        ("r_score", r2_score),
    )
    return {
        label: round(scorer(true_values, pred_values), 3)
        for label, scorer in scorers
    }

In [7]:
import h2o
from h2o.automl import H2OAutoML

# Connect to an H2O cluster on this machine. h2o.init() first checks for an instance
# that is already running and attaches to it when one is found (the captured output
# below shows it connecting to http://localhost:54321).
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,3 hours 53 mins
H2O_cluster_timezone:,Europe/Kiev
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.3
H2O_cluster_version_age:,1 month and 9 days
H2O_cluster_name:,H2O_from_python_38066_gx568w
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.481 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [25]:
import os
import sys
# Put the parent directory of `ml_part` at the front of sys.path so that the
# `ml_part.*` imports in this cell resolve.
# The literal is a raw string: in a normal string `\w`, `\D`, `\m` and `\X` are invalid
# escape sequences (DeprecationWarning since Python 3.6, SyntaxWarning on 3.12+), and a
# path segment starting with e.g. `t` or `n` would be silently corrupted.
sys.path.insert(0, os.path.dirname(r'C:\work\DrugDiscovery\main_git\XAI_Chem\ml_part'))

import pandas as pd

from ml_part.random_forest.data_prep.preparation import DataPreparation
from ml_part.random_forest.train import RFTrain

# Absolute, machine-specific Windows paths to the feature table (CSV) and the two pickle
# files that are handed to DataPreparation below.
CSV_PATH = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\remained_features_25.01.csv'
smiles_to_subgroup_filepath = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\smiles_to_subgroup.pkl'
smiles_filepath = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\smiles_to_index.pkl'

# Project helper configured with the three paths above.
# NOTE(review): what it loads at construction time is not visible from this notebook.
dataPreparation = DataPreparation(path_to_data=CSV_PATH,
                                  smiles_to_subgroup_pickle_file=smiles_to_subgroup_filepath,
                                  smiles_to_index_pickle_file=smiles_filepath)

# Feature names used twice in this cell: passed as `outliers_features_to_skip` to the
# call below, and dropped from X (together with the 'angle' columns) right after it.
unimportant_features_to_drop = ['nFaRing', 'nHRing', 'f_atom_fraction', 'tpsa+f', 'nFaHRing',
       'nFAHRing', 'naRing', 'cis/trans', 'dipole_moment', 'nFARing',
       'distance_between_atoms_in_f_group_centers', 'nF', 'PBF', 'naHRing',
       'RPCS', 'GeomShapeIndex', 'nFHRing', 'mol_num_cycles']

# Build the feature matrix X and the target y for the "Carboxylic acid" subgroup.
# NOTE(review): is_pKa=False presumably selects logP as the target (the AutoML cell
# trains on "logP") — confirm against DataPreparation.
X, y = dataPreparation.prepare_data_for_RF(is_pKa=False,
                                           subgroup_type="Carboxylic acid",
                                           use_mandatory_features=True,
                                           is_remove_outliers=True,
                                           is_remove_nan=False,
                                           outliers_features_to_skip=unimportant_features_to_drop)

# Columns to discard: anything listed in unimportant_features_to_drop, plus every
# column whose name contains the substring 'angle'.
features_to_drop = [
    column
    for column in X.columns
    if column in unimportant_features_to_drop or 'angle' in column
]

X = X.drop(columns=features_to_drop)

# Build the project's RFTrain helper; only its train/test split attributes are used here.
rf_train = RFTrain(
    X=X,
    y=y,
    smiles_filepath=smiles_filepath,
    is_pKa=False,
    k_folds=2,
)

# Pull the split out of the helper.
y_train, X_train = rf_train.y_train, rf_train.X_train
y_test, X_test = rf_train.y_test, rf_train.X_test

# Put features and target side by side so each frame carries its response column.
train_df, test_df = (
    pd.concat([features, target], axis="columns")
    for features, target in ((X_train, y_train), (X_test, y_test))
)

print(len(train_df), len(test_df))

# Upload both pandas frames to the H2O cluster (train first, then test).
train, test = (h2o.H2OFrame(frame) for frame in (train_df, test_df))

True
55
['dipole_moment', 'FPSA3', 'nHRing', 'avg_atoms_in_cycle', 'angle_X1X2R2', 'PNSA5', 'nN', 'angle_R2X2R1', 'nF', 'nFRing', 'tpsa+f', 'f_freedom', 'f_atom_fraction', 'mol_num_cycles', 'nO', 'nFAHRing', 'angle_R1X1R2', 'distance_between_atoms_in_f_group_centers', 'nC', 'PBF', 'nAHRing', 'mol_volume', 'nFaRing', 'f_to_fg', 'GeomShapeIndex', 'cis/trans', 'PPSA5', 'nFHRing', 'angle_X2X1R1', 'nFaHRing', 'RPCS', 'naRing', 'nFARing', 'dihedral_angle', 'naHRing', 'chirality', 'nARing', 'TASA', 'pKa', 'logP']
FPSA3 [40]
PPSA5 [39, 40]
logP [48]
Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
       54],
      dtype='int64')
Remains rows:51, amount of features: 40
41 10
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████

In [17]:
# Response column, and the predictor list: every column of `train` except the response.
# NOTE(review): `y` rebinds the name that held the target returned by the data-prep cell.
y = "logP"
x = train.columns
x.remove(y)

# Training arguments. Folds come from the "fold_id" column of `train`.
# A variant that also passed leaderboard_frame=test was tried and is disabled; per the
# H2O AutoML docs the leaderboard is then scored with cross-validation metrics.
train_kwargs = dict(x=x, y=y, training_frame=train, fold_column="fold_id")

aml = H2OAutoML(max_runtime_secs=1800, seed=1)
aml.train(**train_kwargs)

# Print the complete AutoML leaderboard, not just the default first rows.
lb = aml.leaderboard
print(lb.head(rows=lb.nrows))

AutoML progress: |█
17:53:21.234: Fold column fold_id will be used for cross-validation. nfolds parameter will be ignored.
17:53:21.235: AutoML: XGBoost is not available; skipping it.
17:53:21.237: _train param, Dropping bad and constant columns: [nN, nAHRing, nO]
17:53:21.303: _train param, Dropping bad and constant columns: [nN, nAHRing, nO]
17:53:21.303: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 47.0.
17:53:21.306: _train param, Dropping bad and constant columns: [nN, nAHRing, nO]
17:53:21.418: _train param, Dropping bad and constant columns: [nN, nAHRing, nO]
17:53:21.547: _train param, Dropping bad and constant columns: [nN, nAHRing, nO]
17:53:21.639: _train param, Dropping bad and constant columns: [nN, nAHRing, nO]
17:53:21.735: _train param, Dropping unused columns: [nN, nAHRing, nO]
17:53:21.845: _train param, Dropping unused columns: [nN, nAHRing, nO]
17:53:21.954: _train param, Dropping

In [20]:
# Fetch one specific model by its H2O key and score it on the hold-out frame.
# NOTE(review): the key belongs to a particular AutoML run; presumably it is only
# available while that run's models are still in the cluster — confirm before re-running.
best_model = h2o.get_model('GBM_grid_1_AutoML_4_20240129_175321_model_276')
preds = best_model.predict(test)

# Convert both H2O frames to pandas, then pick the columns to compare.
test_pdf = test.as_data_frame()
preds_pdf = preds.as_data_frame()
true = test_pdf['logP']
predicted_values = preds_pdf['predict']

calculate_metrics(true, predicted_values)

gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%




{'mse': 0.057, 'mae': 0.184, 'r_score': 0.612}

In [21]:
# Score the AutoML run's best model on the hold-out frame.
best_model = aml.get_best_model()
preds = best_model.predict(test)

# Ground truth comes from the 'logP' column of `test`, predictions from the 'predict'
# column of the prediction frame; both are converted to pandas first.
true, predicted_values = (
    frame.as_data_frame()[column]
    for frame, column in ((test, 'logP'), (preds, 'predict'))
)

calculate_metrics(true, predicted_values)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%




{'mse': 0.055, 'mae': 0.162, 'r_score': 0.629}

In [22]:
# Display the summary of the current `best_model` as this cell's output.
best_model

key,value
Stacking strategy,cross_validation
Number of base models (used / total),3/5
# GBM base models (used / total),1/1
# DeepLearning base models (used / total),1/1
# GLM base models (used / total),1/1
# DRF base models (used / total),0/2
Metalearner algorithm,GLM
Metalearner fold assignment scheme,AUTO
Metalearner nfolds,0
Metalearner fold_column,fold_id

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid
mae,0.1500889,0.0134077,0.1406083,0.1595696
mean_residual_deviance,0.0372306,0.0081301,0.0314818,0.0429795
mse,0.0372306,0.0081301,0.0314818,0.0429795
null_deviance,4.9568005,1.5232502,3.8797,6.0339007
r2,0.8209826,0.0004743,0.8206472,0.821318
residual_deviance,0.8835428,0.2700352,0.6925991,1.0744866
rmse,0.192373,0.0211311,0.1774311,0.2073149
rmsle,0.0603545,0.0039163,0.0575853,0.0631238


In [16]:
# Root folder for saved logP models (absolute, machine-specific Windows path).
MODEL_SAVE_PATH = r'C:\work\DrugDiscovery\main_git\XAI_Chem\ml_part\h2o_model\25.01.24_features_3_subgroups\logP_models'

# Experiment-specific subfolder that receives the model file.
# NOTE(review): the folder is labelled "primary_amine" while the data-prep cell selects
# the "Carboxylic acid" subgroup — confirm the label matches the model being saved.
experiment_dir = os.path.join(
    MODEL_SAVE_PATH,
    'primary_amine_subgroup_all_molecules_top15_features',
)

# Persist `best_model`; force=True allows overwriting an existing saved model.
model_path = h2o.save_model(model=best_model, path=experiment_dir, force=True)

model_path

'C:\\work\\DrugDiscovery\\main_git\\XAI_Chem\\ml_part\\h2o_model\\25.01.24_features_3_subgroups\\logP_models\\primary_amine_subgroup_all_molecules_top15_features\\DeepLearning_grid_2_AutoML_3_20240129_172436_model_10'

In [28]:
# Load a model previously saved to disk and display its summary.
# NOTE(review): this reuses the name `model_path`, replacing the value returned by
# h2o.save_model() in the save cell; a distinct name would avoid cross-cell confusion.
# The path is an absolute, machine-specific Windows location.
model_path = r'C:\work\DrugDiscovery\main_git\XAI_Chem\ml_part\h2o_model\models\25.01.24_features\logP\Only_mol_with_angles_without_outliers(except_dipole)_with_angles(as features)\DeepLearning_grid_1_AutoML_2_20240127_15921_model_83'

test_model = h2o.load_model(model_path)

test_model

Unnamed: 0,layer,units,type,dropout,l1,l2,mean_rate,rate_rms,momentum,mean_weight,weight_rms,mean_bias,bias_rms
,1,34,Input,0.0,,,,,,,,,
,2,20,RectifierDropout,40.0,0.0,0.0,0.0005348,0.0004301,0.0,0.005371,0.2091236,-0.0183898,0.153103
,3,1,Linear,,0.0,0.0,0.0001089,5.46e-05,0.0,0.0459793,0.3017079,0.0032812,0.0

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid
mae,0.254001,0.0161379,0.2425898,0.2654122
mean_residual_deviance,0.1205594,0.0324599,0.0976068,0.143512
mse,0.1205594,0.0324599,0.0976068,0.143512
r2,0.6144856,0.0718267,0.6652748,0.5636964
residual_deviance,0.1205594,0.0324599,0.0976068,0.143512
rmse,0.3456253,0.0469582,0.3124209,0.3788298
rmsle,0.1257445,0.0191793,0.1121827,0.1393063

Unnamed: 0,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_deviance,training_mae,training_r2
,2024-01-27 02:09:31,0.000 sec,,0.0,0,0.0,,,,
,2024-01-27 02:09:31,10 min 2.493 sec,445000 obs/sec,10.0,1,890.0,0.6109181,0.3732209,0.4471065,-0.1884841
,2024-01-27 02:09:33,10 min 3.964 sec,605224 obs/sec,10010.0,1001,890890.0,0.0908388,0.0082517,0.0622105,0.9737233

variable,relative_importance,scaled_importance,percentage
nN,1.0,1.0,0.0479725
mol_volume,0.9709323,0.9709323,0.0465781
PPSA5,0.8813123,0.8813123,0.0422788
cis/trans,0.8379441,0.8379441,0.0401983
PNSA5,0.7623063,0.7623063,0.0365698
tpsa+f,0.7456973,0.7456973,0.0357730
angle_R1X1R2,0.7324474,0.7324474,0.0351374
FPSA3,0.7319981,0.7319981,0.0351158
nF,0.7216697,0.7216697,0.0346203
nAHRing,0.7002086,0.7002086,0.0335908
