In [1]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt

def calculate_metrics(true_values, pred_values):
    """Score predictions against ground truth with three regression metrics.

    Args:
        true_values: Observed target values.
        pred_values: Predicted target values, aligned with ``true_values``.

    Returns:
        dict: keys ``"mse"``, ``"mae"`` and ``"r^2"`` (in that order), each
        holding the corresponding scikit-learn metric rounded to 3 decimals.
    """
    # Label -> scorer; insertion order fixes both call order and key order.
    scorers = {
        "mse": mean_squared_error,
        "mae": mean_absolute_error,
        "r^2": r2_score,
    }
    return {
        label: round(scorer(true_values, pred_values), 3)
        for label, scorer in scorers.items()
    }

In [2]:
import h2o
from h2o.automl import H2OAutoML

# Start the H2O cluster (locally).
# h2o.init() first checks for an H2O instance at the default local address and,
# when none is found, launches a local server and connects to it (see the
# startup log printed under this cell).
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 21.0.1+12-LTS-29, mixed mode, sharing)
  Starting server from C:\work\DrugDiscovery\drug-discovery-venv\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\38066\AppData\Local\Temp\tmppcxxzkzr
  JVM stdout: C:\Users\38066\AppData\Local\Temp\tmppcxxzkzr\h2o_38066_started_from_python.out
  JVM stderr: C:\Users\38066\AppData\Local\Temp\tmppcxxzkzr\h2o_38066_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/Kiev
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.3
H2O_cluster_version_age:,3 months and 27 days
H2O_cluster_name:,H2O_from_python_38066_rb9v1p
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.952 Gb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


In [3]:
import os
import sys
# Put the repository root (the parent of `ml_part`) first on the import path so
# the `ml_part.*` imports below resolve. The path is a raw string: in a plain
# literal, sequences such as "\w", "\D" or "\X" are invalid escapes, which
# recent Python versions report with a SyntaxWarning.
sys.path.insert(0, os.path.dirname(r'C:\work\DrugDiscovery\main_git\XAI_Chem\ml_part'))

import pandas as pd

from ml_part.random_forest.data_prep.preparation import DataPreparation
from ml_part.random_forest.train import RFTrain

# Input locations. Both are absolute, machine-specific Windows paths.
CSV_PATH = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\remained_features_pKa_08.02_v4_fixed_distances_chirality.csv'
smiles_filepath = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\smiles_to_index.pkl'

# Project helper that is given the CSV path; what it does with it is defined in
# ml_part.random_forest.data_prep.preparation.
dataPreparation = DataPreparation(CSV_PATH)

# NOTE(review): despite the name, this list is only passed as
# `outliers_features_to_skip` below; nothing in this cell drops 'dipole_moment'.
unimportant_features_to_drop = ['dipole_moment']
# Build the feature matrix X and the target y.
# NOTE(review): the flag names suggest pKa is the target, outlier rows are
# removed and NaN rows are kept -- the exact semantics live in DataPreparation;
# confirm there.
X, y = dataPreparation.prepare_data_for_RF(is_pKa=True,
                                           use_mandatory_features=True,
                                           is_remove_outliers=True,
                                           is_remove_nan=False,
                                           outliers_features_to_skip=unimportant_features_to_drop)

# Whitelist of columns kept as model inputs.
# NOTE(review): the constant is named after logP although this notebook runs
# with is_pKa=True -- confirm the list is the one intended for this target.
LOGP_FEATURES = [
    'f_freedom', 'PPSA5', 'mol_num_cycles', 'nFRing', 'nF', 'identificator',
    'f_atom_fraction', 'mol_weight', 'dipole_moment', 'nHRing', 'nO', 'PBF',
    'nC', 'nARing', 'cis/trans', 'PNSA5', 'FPSA3', 'mol_volume', 'RPCS',
    'GeomShapeIndex', 'WPSA5', 'TASA', 'f_to_fg', 'avg_atoms_in_cycle',
    'nFHRing', 'chirality',
]

# Every column of X that is absent from the whitelist is removed.
features_to_drop = [column for column in X.columns if column not in LOGP_FEATURES]

X = X.drop(columns=features_to_drop)

# Hand the filtered data to the project's RFTrain helper; the train/test
# partitions are then read back from its attributes.
rf_train = RFTrain(
    X=X,
    y=y,
    smiles_filepath=smiles_filepath,
    is_pKa=True,
    k_folds=2,
)

y_train, X_train = rf_train.y_train, rf_train.X_train
y_test, X_test = rf_train.y_test, rf_train.X_test

# Features and target side by side: one pandas frame per partition.
train_df = pd.concat((X_train, y_train), axis="columns")
test_df = pd.concat((X_test, y_test), axis="columns")

print(len(train_df), len(test_df))

# Upload both partitions to the H2O cluster.
train = h2o.H2OFrame(train_df)
test = h2o.H2OFrame(test_df)

True
183
['RPCS', 'PBF', 'mol_weight', 'dipole_moment', 'PPSA5', 'avg_atoms_in_cycle', 'nHRing', 'cis/trans', 'f_atom_fraction', 'dihedral_angle', 'FPSA3', 'distance_between_atoms_in_cycle_and_f_group', 'angle_X1X2R2', 'nF', 'angle_R1X1R2', 'nFAHRing', 'nAHRing', 'chirality', 'sasa', 'PNSA5', 'GeomShapeIndex', 'TASA', 'angle_R2X2R1', 'mol_num_cycles', 'naRing', 'nN', 'f_freedom', 'tpsa+f', 'nFRing', 'identificator', 'nO', 'distance_between_atoms_in_f_group_centers', 'angle_X2X1R1', 'nARing', 'nFARing', 'nC', 'nFHRing', 'f_to_fg', 'pKa', 'logP']
PBF outliers indexes: [40, 71, 127]
f_atom_fraction outliers indexes: [124]
FPSA3 outliers indexes: [40]
sasa outliers indexes: [127]
PNSA5 outliers indexes: [37, 38]
distance_between_atoms_in_f_group_centers outliers indexes: [35]
logP outliers indexes: [82, 83]
Remains rows:169, amount of features: 40
144 25
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |█████████████████████████

In [17]:
import numpy as np
import pandas as pd
import scipy.stats as ss

def cramers_v(confusion_matrix):
    """Bias-corrected Cramér's V for categorical-categorical association.

    Uses the correction from Bergsma, Wicher (2013), "A bias-correction for
    Cramér's V and Tschuprow's T", Journal of the Korean Statistical Society
    42(3): 323-328.

    Args:
        confusion_matrix: 2-D contingency table of observed counts. Any
            array-like is accepted, e.g. a NumPy array or the DataFrame
            returned by ``pd.crosstab``.

    Returns:
        The corrected statistic: 0 for no association, 1 for perfect
        association. ``0.0`` is returned when the statistic is undefined
        because the correction's denominator vanishes (a table with a single
        row or column, or with at most one observation).

    Raises:
        ValueError: if the table is not 2-D. Errors raised by
            ``scipy.stats.chi2_contingency`` propagate unchanged.

    Note:
        ``chi2_contingency`` is called with its defaults, so SciPy's
        continuity correction for 2x2 tables is in effect.
    """
    # Coerce first: on a DataFrame, ``.sum()`` yields per-column sums rather
    # than the grand total.
    observed = np.asarray(confusion_matrix)
    if observed.ndim != 2:
        raise ValueError(
            f"confusion_matrix must be 2-D, got {observed.ndim} dimension(s)"
        )

    chi2 = ss.chi2_contingency(observed)[0]
    n = observed.sum()
    if n <= 1:
        # (n - 1) is the divisor of every correction term below.
        return 0.0

    r, k = observed.shape
    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator <= 0:
        # Happens e.g. for a constant variable (r == 1 or k == 1); dividing
        # would give NaN/inf plus a RuntimeWarning.
        return 0.0
    return np.sqrt(phi2corr / denominator)


# Columns of X whose name mentions "ring" (case-insensitive).
ring_features = [column for column in X.columns if 'ring' in column.lower()]
ring_features_to_remain = ['nFRing', 'nHRing', 'nARing', 'nFHRing']

# Report Cramér's V for every ordered pair of distinct ring features.
for row_pos, row_name in enumerate(ring_features):
    for col_pos, col_name in enumerate(ring_features):
        if row_pos == col_pos:
            continue

        contingency = pd.crosstab(X[row_name], X[col_name])
        association = cramers_v(contingency.values)
        print(f"{row_name}, {col_name}, corr: {association}")

nHRing, nARing, corr: 0.6568999956701997
nHRing, nFRing, corr: 0.5799472743597861
nHRing, nFHRing, corr: 0.7298992678284247
nARing, nHRing, corr: 0.6568999956701996
nARing, nFRing, corr: 0.8557878388867258
nARing, nFHRing, corr: 0.7758976171502688
nFRing, nHRing, corr: 0.5799472743597861
nFRing, nARing, corr: 0.8557878388867258
nFRing, nFHRing, corr: 0.8090152479786314
nFHRing, nHRing, corr: 0.7298992678284247
nFHRing, nARing, corr: 0.775897617150269
nFHRing, nFRing, corr: 0.8090152479786314


In [7]:
# Predictors: every column of the training frame except the target.
# NOTE(review): .remove() mutates the list in place; this presumes
# `train.columns` hands back a fresh list -- confirm.
x = train.columns
# NOTE: `y` is rebound here from the target values returned by
# prepare_data_for_RF to the target column *name*. Re-running the RFTrain cell
# after this one would pass the string "pKa" as y.
y = "pKa"
x.remove(y)

# Cross-validation predictions, fold assignment and per-fold models are kept;
# the per-fold models are fetched by id ("<model id>_cv_<k>") further down.
aml = H2OAutoML(
    seed=1, 
    max_runtime_secs_per_model=300,
    keep_cross_validation_predictions=True,
    keep_cross_validation_fold_assignment=True,
    keep_cross_validation_models=True
)
# The folds come from the "fold_id" column (the AutoML log notes that nfolds is
# then ignored). "fold_id" is also still present in `x`.
# NOTE(review): presumably H2O leaves the fold column out of the predictors -- confirm.
aml.train(x=x, y=y, training_frame=train, fold_column="fold_id")

# Print the complete leaderboard (all rows, not only the default head).
lb = aml.leaderboard
print(lb.head(rows=lb.nrows))

AutoML progress: |
19:19:18.746: Fold column fold_id will be used for cross-validation. nfolds parameter will be ignored.
19:19:18.759: AutoML: XGBoost is not available; skipping it.


19:19:19.212: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 144.0.

███████████████████████████████████████████████████████████████| (done) 100%
model_id                                                     rmse       mse       mae        rmsle    mean_residual_deviance
StackedEnsemble_BestOfFamily_3_AutoML_1_20240416_191918  0.623578  0.38885   0.443963    0.0780921                  0.38885
StackedEnsemble_BestOfFamily_5_AutoML_1_20240416_191918  0.623916  0.389271  0.44615     0.0780885                  0.389271
DeepLearning_grid_1_AutoML_1_20240416_191918_model_39    0.62896   0.395591  0.444926    0.0791568                  0.395591
StackedEnsemble_BestOfFamily_6_AutoML_1_20240416_191918  0.631261  0.39849   0.448006

In [8]:
# Take the best stacked ensemble from the AutoML leaderboard rather than a
# literal model id: ids embed the run timestamp (e.g. "..._20240416_191918"),
# so a hard-coded id no longer resolves once AutoML is re-run. Restricting the
# lookup to stacked ensembles guarantees the `base_models` attribute read in
# the next cell. In the recorded run this is the leaderboard's top entry.
best_model = aml.get_best_model(algorithm="stackedensemble")

In [9]:
# Ids of the per-fold models of every base model: "<base model id>_cv_<fold>",
# for folds 1 and 2.
base_models = best_model.base_models
base_models_cv = [
    model_id + fold_suffix
    for model_id in base_models
    for fold_suffix in ("_cv_1", "_cv_2")
]

base_models_cv

['DeepLearning_grid_1_AutoML_1_20240416_191918_model_39_cv_1',
 'DeepLearning_grid_1_AutoML_1_20240416_191918_model_39_cv_2',
 'GBM_grid_1_AutoML_1_20240416_191918_model_58_cv_1',
 'GBM_grid_1_AutoML_1_20240416_191918_model_58_cv_2',
 'DRF_1_AutoML_1_20240416_191918_cv_1',
 'DRF_1_AutoML_1_20240416_191918_cv_2',
 'XRT_1_AutoML_1_20240416_191918_cv_1',
 'XRT_1_AutoML_1_20240416_191918_cv_2',
 'GLM_1_AutoML_1_20240416_191918_cv_1',
 'GLM_1_AutoML_1_20240416_191918_cv_2']

In [15]:
# Row positions of the training frame, grouped by fold label (0 or 1).
cv_indices_dict = {0: [], 1: []}
for index, fold_id in enumerate(train.as_data_frame()['fold_id']):
    cv_indices_dict[fold_id].append(index)
# One [fit rows, held-out rows] pair per fold model. The pairing treats
# "_cv_1" as fit on fold 1 (fold 0 held out) and "_cv_2" as the reverse.
# NOTE(review): confirm this matches H2O's numbering of the cv models.
cv_indices = [[cv_indices_dict[1], cv_indices_dict[0]], [cv_indices_dict[0], cv_indices_dict[1]]]

# Running sums over every (base model, fold) evaluation; they are averaged in
# the next cell. calculate_metrics rounds to 3 decimals, so the sums add
# already-rounded values.
r_2, mse, mae = 0, 0 ,0
separator = "-"*30
for base_model in base_models:
    print(separator)
    print(separator)
    print(separator)
    print(base_model)
    for cv_idx, (train_idx, _held_out_idx) in enumerate(cv_indices):
        base_model_cv = base_model + f"_cv_{cv_idx + 1}"

        # The fold model is scored in-sample, i.e. on the rows it was fit on.
        # The held-out rows are not scored here, so no H2O frame is uploaded
        # for them (the previous version built one and never read it).
        train_h2o_cv = h2o.H2OFrame(train_df.iloc[train_idx])

        model_cv = h2o.get_model(base_model_cv)
        preds = model_cv.predict(train_h2o_cv)

        true = train_h2o_cv.as_data_frame()['pKa']
        predicted_values = preds.as_data_frame()['predict']

        train_cv_metrics = calculate_metrics(true, predicted_values)
        r_2 += train_cv_metrics['r^2']
        mae += train_cv_metrics['mae']
        mse += train_cv_metrics['mse']

------------------------------
------------------------------
------------------------------
DeepLearning_grid_1_AutoML_1_20240416_191918_model_39
Parse progress: |



████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%
------------------------------
------------------------------
------------------------------
GBM_grid_1_AutoML_1_20240416_191918_model_58
Parse progress: |



████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
------------------------------
------------------------------
------------------------------
DRF_1_AutoML_1_20240416_191918
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%




Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
------------------------------
------------------------------
------------------------------
XRT_1_AutoML_1_20240416_191918
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
Parse progress: |



████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
------------------------------
------------------------------
------------------------------
GLM_1_AutoML_1_20240416_191918
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%




In [17]:
# Mean of the accumulated train-fold metrics (r^2, mse, mae). The divisor is
# the number of (base model, fold) evaluations, derived from the data instead
# of the literal 10 so it stays right if the ensemble or fold count changes.
n_cv_evaluations = len(base_models) * len(cv_indices)
print(r_2 / n_cv_evaluations, mse / n_cv_evaluations, mae / n_cv_evaluations)

0.9795999999999999 0.1323 0.22000000000000003


TRAIN (in-sample metrics on the training frame)

In [13]:
# In-sample check: score AutoML's best model on `train`, the same frame that
# was passed to aml.train as the training frame.
best_model = aml.get_best_model()
preds = best_model.predict(train)
# Bring ground truth and predictions back into pandas for the sklearn-based metrics.
true = train.as_data_frame()['pKa']
predicted_values = preds.as_data_frame()['predict']

# Last expression of the cell, so the metrics dict is displayed as its output.
calculate_metrics(true, predicted_values)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%




{'mse': 0.009, 'mae': 0.086, 'r^2': 0.999}

OOS (out-of-sample metrics on the held-out test frame)

In [14]:
# Out-of-sample check: score AutoML's best model on `test`, the frame that was
# not passed to aml.train.
best_model = aml.get_best_model()
preds = best_model.predict(test)
# Bring ground truth and predictions back into pandas for the sklearn-based metrics.
true = test.as_data_frame()['pKa']
predicted_values = preds.as_data_frame()['predict']

# Last expression of the cell, so the metrics dict is displayed as its output.
calculate_metrics(true, predicted_values)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%




{'mse': 0.535, 'mae': 0.586, 'r^2': 0.911}

In [18]:
# Display the selected model's summary (rendered as the tables under this cell).
best_model

key,value
Stacking strategy,cross_validation
Number of base models (used / total),2/5
# GBM base models (used / total),1/1
# DeepLearning base models (used / total),1/1
# DRF base models (used / total),0/2
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,AUTO
Metalearner nfolds,0
Metalearner fold_column,fold_id

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid
mae,0.444038,0.0038407,0.4413222,0.4467538
mean_residual_deviance,0.3884243,0.0216534,0.4037356,0.3731131
mse,0.3884243,0.0216534,0.4037356,0.3731131
null_deviance,467.36746,24.869736,484.953,449.7819
r2,0.9401372,0.0025031,0.9383672,0.9419072
residual_deviance,27.997175,2.6576736,29.876434,26.117916
rmse,0.6231159,0.0173751,0.6354019,0.6108298
rmsle,0.0780879,0.0002061,0.0782337,0.0779422


In [19]:
# Persist the selected model to disk and keep the value returned by
# h2o.save_model in `model_path`. force=True asks H2O to overwrite an existing
# model at the destination. NOTE: the destination is an absolute,
# machine-specific Windows path.
model_path = h2o.save_model(model=best_model, path=r"C:\work\DrugDiscovery\main_git\XAI_Chem\ml_part\h2o_model\models\16.04.24_proper_train_crossval\pKa", force=True)