In [1]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt

def calculate_metrics(true_values, pred_values):
    mse = round(mean_squared_error(true_values, pred_values),3)
    mae = round(mean_absolute_error(true_values, pred_values),3)
    r_score = round(r2_score(true_values, pred_values),3)

    return {"mse": mse,
            "mae": mae,
            "r^2": r_score,}

In [2]:
import h2o
from h2o.automl import H2OAutoML

# Start the H2O cluster (locally)
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 21.0.1+12-LTS-29, mixed mode, sharing)
  Starting server from C:\work\DrugDiscovery\drug-discovery-venv\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\38066\AppData\Local\Temp\tmppe9sd56n
  JVM stdout: C:\Users\38066\AppData\Local\Temp\tmppe9sd56n\h2o_38066_started_from_python.out
  JVM stderr: C:\Users\38066\AppData\Local\Temp\tmppe9sd56n\h2o_38066_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/Kiev
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.3
H2O_cluster_version_age:,3 months and 26 days
H2O_cluster_name:,H2O_from_python_38066_9sfjj2
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.952 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [3]:
import os
import sys
sys.path.insert(0, os.path.dirname('C:\work\DrugDiscovery\main_git\XAI_Chem\ml_part'))

import pandas as pd

from ml_part.random_forest.data_prep.preparation import DataPreparation
from ml_part.random_forest.train import RFTrain

CSV_PATH = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\remained_features_logP_08.02_v4_fixed_distances_chirality.csv'
smiles_filepath = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\smiles_to_index.pkl'

dataPreparation = DataPreparation(CSV_PATH)

unimportant_features_to_drop = ['dipole_moment']
X, y = dataPreparation.prepare_data_for_RF(is_pKa=False,
                                           use_mandatory_features=True,
                                           is_remove_outliers=True,
                                           is_remove_nan=False,
                                           outliers_features_to_skip=unimportant_features_to_drop)

LOGP_FEATURES = ['f_freedom', 'PPSA5', 'mol_num_cycles', 'nFRing', 'nF', 'identificator', 'f_atom_fraction',
                 'mol_weight', 'dipole_moment', 'nHRing', 'nO', 'PBF', 'nC', 'nARing',
                 'cis/trans', 'PNSA5', 'FPSA3', 'mol_volume', 'RPCS', 'GeomShapeIndex',
                 'WPSA5', 'TASA', 'f_to_fg', 'avg_atoms_in_cycle', 'nFHRing',
                 'chirality']

features_to_drop = []
for feature_name in X.columns:
    if feature_name not in LOGP_FEATURES:
        features_to_drop.append(feature_name)


X = X.drop(features_to_drop, axis=1)

rf_train = RFTrain(X=X, 
                   y=y,
                   smiles_filepath=smiles_filepath,
                   is_pKa=False,
                   k_folds=2)

y_train = rf_train.y_train
X_train = rf_train.X_train

y_test = rf_train.y_test
X_test = rf_train.X_test

train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

print(len(train_df), len(test_df))

train = h2o.H2OFrame(train_df)
test = h2o.H2OFrame(test_df)

True
183
['f_freedom', 'distance_between_atoms_in_cycle_and_f_group', 'PPSA5', 'mol_num_cycles', 'nFRing', 'nAHRing', 'angle_R1X1R2', 'nF', 'identificator', 'mol_weight', 'dipole_moment', 'nHRing', 'nO', 'PBF', 'nC', 'angle_X2X1R1', 'nARing', 'angle_R2X2R1', 'cis/trans', 'PNSA5', 'FPSA3', 'naRing', 'tpsa+f', 'mol_volume', 'RPCS', 'f_atom_fraction', 'GeomShapeIndex', 'WPSA5', 'TASA', 'f_to_fg', 'dihedral_angle', 'nFARing', 'distance_between_atoms_in_f_group_centers', 'avg_atoms_in_cycle', 'angle_X1X2R2', 'nFHRing', 'nFAHRing', 'chirality', 'pKa', 'logP']
mol_volume outliers indexes: [127]
f_atom_fraction outliers indexes: [124]
distance_between_atoms_in_f_group_centers outliers indexes: [ 35 167]
logP outliers indexes: [82, 83]
Remains rows:172, amount of features: 40
147 25
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [5]:
test_df.keys()

Index(['f_freedom', 'PPSA5', 'mol_num_cycles', 'nFRing', 'nF', 'identificator',
       'mol_weight', 'dipole_moment', 'nHRing', 'nO', 'PBF', 'nC', 'nARing',
       'cis/trans', 'PNSA5', 'FPSA3', 'mol_volume', 'RPCS', 'f_atom_fraction',
       'GeomShapeIndex', 'WPSA5', 'TASA', 'f_to_fg', 'avg_atoms_in_cycle',
       'nFHRing', 'chirality', 'logP'],
      dtype='object')

In [17]:
import numpy as np
import pandas as pd
import scipy.stats as ss

def cramers_v(confusion_matrix):
    """ calculate Cramers V statistic for categorial-categorial association.
        uses correction from Bergsma and Wicher,
        Journal of the Korean Statistical Society 42 (2013): 323-328
    """
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))


ring_features = [feature_name for feature_name in X.columns if 'ring' in feature_name.lower()]
ring_features_to_remain = ['nFRing', 'nHRing', 'nARing', 'nFHRing']

for i in range(len(ring_features)):
    for j in range(len(ring_features)):
        first_feature = X[ring_features[i]]
        second_feature = X[ring_features[j]]

        if i == j:
            continue

        confusion_matrix = pd.crosstab(first_feature, second_feature)
        cramers_v_value = cramers_v(confusion_matrix.values)
        print(f"{ring_features[i]}, {ring_features[j]}, corr: {cramers_v_value}")

nHRing, nARing, corr: 0.6568999956701997
nHRing, nFRing, corr: 0.5799472743597861
nHRing, nFHRing, corr: 0.7298992678284247
nARing, nHRing, corr: 0.6568999956701996
nARing, nFRing, corr: 0.8557878388867258
nARing, nFHRing, corr: 0.7758976171502688
nFRing, nHRing, corr: 0.5799472743597861
nFRing, nARing, corr: 0.8557878388867258
nFRing, nFHRing, corr: 0.8090152479786314
nFHRing, nHRing, corr: 0.7298992678284247
nFHRing, nARing, corr: 0.775897617150269
nFHRing, nFRing, corr: 0.8090152479786314


In [34]:
x = train.columns
y = "logP"
x.remove(y)

aml = H2OAutoML(
    seed=1, 
    max_runtime_secs_per_model=300,
    keep_cross_validation_predictions=True,
    keep_cross_validation_fold_assignment=True,
    keep_cross_validation_models=True
)
aml.train(x=x, y=y, training_frame=train, fold_column="fold_id")

lb = aml.leaderboard
print(lb.head(rows=lb.nrows))

AutoML progress: |
13:02:02.590: Fold column fold_id will be used for cross-validation. nfolds parameter will be ignored.
13:02:02.590: AutoML: XGBoost is not available; skipping it.
13:02:02.642: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 147.0.



███████████████████████████████████████████████████████████████| (done) 100%
model_id                                                     rmse        mse       mae      rmsle    mean_residual_deviance
StackedEnsemble_BestOfFamily_5_AutoML_2_20240416_130202  0.242794  0.058949   0.188629  0.0859053                 0.058949
DeepLearning_grid_1_AutoML_2_20240416_130202_model_31    0.244786  0.0599202  0.190748  0.0876398                 0.0599202
StackedEnsemble_BestOfFamily_1_AutoML_2_20240416_130202  0.245358  0.0602006  0.187299  0.0858583                 0.0602006
GLM_1_AutoML_2_20240416_130202                           0.245511  0.0602757  0.189919  0.086001                  0.0602757
StackedEnsemble_AllModels_1_AutoML_2_20240416_130202     0.246037  0.0605341  0.187939  0.0860266                 0.0605341
StackedEnsemble_BestOfFamily_2_AutoML_2_20240416_130202  0.246414  0.0607199  0.188506  0.0862328                 0.0607199
StackedEnsemble_AllModels_2_AutoML_2_20240416_130202    

In [51]:
best_model = h2o.get_model('StackedEnsemble_BestOfFamily_5_AutoML_2_20240416_130202')

In [53]:
base_models = best_model.base_models
base_models_cv = []

for base_model in base_models:
    base_models_cv.append(base_model + "_cv_1")
    base_models_cv.append(base_model + "_cv_2")

base_models_cv

['DeepLearning_grid_1_AutoML_2_20240416_130202_model_31_cv_1',
 'DeepLearning_grid_1_AutoML_2_20240416_130202_model_31_cv_2',
 'GLM_1_AutoML_2_20240416_130202_cv_1',
 'GLM_1_AutoML_2_20240416_130202_cv_2',
 'GBM_grid_1_AutoML_2_20240416_130202_model_1_cv_1',
 'GBM_grid_1_AutoML_2_20240416_130202_model_1_cv_2',
 'XRT_1_AutoML_2_20240416_130202_cv_1',
 'XRT_1_AutoML_2_20240416_130202_cv_2',
 'DRF_1_AutoML_2_20240416_130202_cv_1',
 'DRF_1_AutoML_2_20240416_130202_cv_2']

In [76]:
cv_indices_dict = {0: [], 1: []}
index = 0
for _, row in train.as_data_frame().iterrows():
    cv_indices_dict[row['fold_id']].append(index)
    index += 1
cv_indices = [[cv_indices_dict[1], cv_indices_dict[0]], [cv_indices_dict[0], cv_indices_dict[1]]]

for base_model in base_models:
    print("-"*30)
    print("-"*30)
    print("-"*30)
    print(base_model)
    for cv_idx in range(len(cv_indices)):
        base_model_cv = base_model + f"_cv_{cv_idx + 1}"
        cv_indexes = cv_indices[cv_idx]
        
        train_idx = cv_indexes[0]
        test_idx = cv_indexes[1]

        train_h2o_cv = h2o.H2OFrame(train_df.iloc[train_idx])
        test_h2o_cv = h2o.H2OFrame(train_df.iloc[test_idx])

        model_cv = h2o.get_model(base_model_cv)
        preds = model_cv.predict(train_h2o_cv)

        true = train_h2o_cv.as_data_frame()['logP']
        predicted_values = preds.as_data_frame()['predict']

        print(calculate_metrics(true, predicted_values))

------------------------------
------------------------------
------------------------------
DeepLearning_grid_1_AutoML_2_20240416_130202_model_31
Parse progress: |



████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%
{'mse': 0.011, 'mae': 0.084, 'r^2': 0.963}
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%
{'mse': 0.022, 'mae': 0.107, 'r^2': 0.935}
------------------------------
------------------------------
------------------------------
GLM_1_AutoML_2_20240416_130202
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |█████████████████████



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
{'mse': 0.051, 'mae': 0.171, 'r^2': 0.848}
------------------------------
------------------------------
------------------------------
GBM_grid_1_AutoML_2_20240416_130202_model_1
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
{'mse': 0.023, 'mae': 0.119, 'r^2': 0.922}
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction pr



████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
{'mse': 0.016, 'mae': 0.096, 'r^2': 0.945}
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |



████████████████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
{'mse': 0.02, 'mae': 0.099, 'r^2': 0.94}
------------------------------
------------------------------
------------------------------
DRF_1_AutoML_2_20240416_130202
Parse progress: |



████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
{'mse': 0.009, 'mae': 0.075, 'r^2': 0.968}
Parse progress: |



████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
{'mse': 0.021, 'mae': 0.098, 'r^2': 0.937}




TRAIN

In [77]:
best_model = aml.get_best_model()
preds = best_model.predict(train)
true = train.as_data_frame()['logP']
predicted_values = preds.as_data_frame()['predict']

calculate_metrics(true, predicted_values)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%




{'mse': 0.024, 'mae': 0.115, 'r^2': 0.925}

OOS

In [78]:
best_model = aml.get_best_model()
preds = best_model.predict(test)
true = test.as_data_frame()['logP']
predicted_values = preds.as_data_frame()['predict']

calculate_metrics(true, predicted_values)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%




{'mse': 0.018, 'mae': 0.117, 'r^2': 0.933}

In [79]:
best_model

key,value
Stacking strategy,cross_validation
Number of base models (used / total),5/5
# GBM base models (used / total),1/1
# DeepLearning base models (used / total),1/1
# GLM base models (used / total),1/1
# DRF base models (used / total),2/2
Metalearner algorithm,GLM
Metalearner fold assignment scheme,AUTO
Metalearner nfolds,0
Metalearner fold_column,fold_id

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid
mae,0.1876857,0.0276561,0.2072415,0.1681298
mean_residual_deviance,0.0585603,0.02406,0.0755732,0.0415473
mse,0.0585603,0.02406,0.0755732,0.0415473
null_deviance,23.386139,2.5729854,25.205513,21.566763
r2,0.8181612,0.0609421,0.7750686,0.8612538
residual_deviance,4.3297005,1.892632,5.6679935,2.9914076
rmse,0.2393687,0.0502571,0.2749059,0.2038316
rmsle,0.0839708,0.0222748,0.0997214,0.0682201


In [80]:
model_path = h2o.save_model(model=best_model, path=r"C:\work\DrugDiscovery\main_git\XAI_Chem\ml_part\h2o_model\models\16.04.24_proper_train_crossval", force=True)