In [1]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt

def calculate_metrics(true_values, pred_values):
    mse = round(mean_squared_error(true_values, pred_values),3)
    mae = round(mean_absolute_error(true_values, pred_values),3)
    r_score = round(r2_score(true_values, pred_values),3)

    return {"mse": mse,
            "mae": mae,
            "r^2": r_score,}

In [2]:
import h2o

h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 21.0.1+12-LTS-29, mixed mode, sharing)
  Starting server from C:\work\DrugDiscovery\drug-discovery-venv\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\38066\AppData\Local\Temp\tmp2f2mrvg3
  JVM stdout: C:\Users\38066\AppData\Local\Temp\tmp2f2mrvg3\h2o_38066_started_from_python.out
  JVM stderr: C:\Users\38066\AppData\Local\Temp\tmp2f2mrvg3\h2o_38066_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/Kiev
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.3
H2O_cluster_version_age:,1 month and 16 days
H2O_cluster_name:,H2O_from_python_38066_68sknj
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.950 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [17]:
import pandas as pd
import numpy as np

import h2o

import sys
import os
sys.path.insert(0, os.path.dirname('C:\work\DrugDiscovery\main_git\XAI_Chem\ml_part'))

from ml_part.random_forest.data_prep.preparation import DataPreparation
from ml_part.random_forest.train import RFTrain
from ml_part.molecule_features.constants import identificator_to_molecule_type, cis_trans_id_to_str

CSV_PATH = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\remained_features_pKa_01.02_v2.csv'
smiles_filepath = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\smiles_to_index.pkl'

dataPreparation = DataPreparation(CSV_PATH)

unimportant_features_to_drop = []
X, y = dataPreparation.prepare_data_for_RF(is_pKa=True,
                                           use_mandatory_features=True,
                                           is_remove_outliers=True,
                                           is_remove_nan=False,
                                           outliers_features_to_skip=unimportant_features_to_drop)

correlated_features = ['f_atom_fraction', 'naHRing', 'nFaRing', 'nFaHRing', 'tpsa+f']
features_to_drop = []
for feature_name in X.columns:
    if feature_name in correlated_features:
        features_to_drop.append(feature_name)

X = X.drop(features_to_drop, axis=1)

rf_train = RFTrain(X=X, 
                   y=y,
                   smiles_filepath=smiles_filepath,
                   is_pKa=True,
                   k_folds=2)
smiles_to_index_dict = rf_train.smiles_to_index

y_train = rf_train.y_train
X_train = rf_train.X_train

y_test = rf_train.y_test
X_test = rf_train.X_test

train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

train = h2o.H2OFrame(train_df)
test = h2o.H2OFrame(test_df)


True
183
['avg_atoms_in_cycle', 'nFaHRing', 'chirality', 'PPSA5', 'tpsa+f', 'RPCS', 'mol_num_cycles', 'GeomShapeIndex', 'angle_R2X2R1', 'nN', 'distance_between_atoms_in_f_group_centers', 'nC', 'nFARing', 'angle_R1X1R2', 'f_freedom', 'naHRing', 'nFAHRing', 'cis/trans', 'dipole_moment', 'f_to_fg', 'identificator', 'f_atom_fraction', 'nFRing', 'nFaRing', 'naRing', 'nFHRing', 'PBF', 'nARing', 'nF', 'dihedral_angle', 'nAHRing', 'nO', 'TASA', 'angle_X2X1R1', 'mol_volume', 'FPSA3', 'PNSA5', 'angle_X1X2R2', 'nHRing', 'pKa', 'logP']
distance_between_atoms_in_f_group_centers outliers indexes: [35]
dipole_moment outliers indexes: [82]
f_atom_fraction outliers indexes: [124]
PBF outliers indexes: [40, 71, 127]
mol_volume outliers indexes: [127]
FPSA3 outliers indexes: [40]
PNSA5 outliers indexes: [37, 38]
logP outliers indexes: [82, 83]
Remains rows:169, amount of features: 41
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |██████████

In [18]:
pKa_model = h2o.load_model(r'C:\work\DrugDiscovery\main_git\XAI_Chem\ml_part\h2o_model\models\01.02.24_features_2.1\pKa\all_molecules(without_angle_feature)_without_outliers\DeepLearning_grid_1_AutoML_3_20240201_153554_model_72')

In [19]:
preds = pKa_model.predict(test)
true = test.as_data_frame()['pKa']
predicted_values = preds.as_data_frame()['predict']

calculate_metrics(true, predicted_values)

predicted_values = [round(pred, 2) for pred in list(preds.as_data_frame()['predict'])]

test_df['predicted_pKa'] = predicted_values

deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%




In [20]:
preds = pKa_model.predict(train)
true = train.as_data_frame()['pKa']
predicted_values = preds.as_data_frame()['predict']

calculate_metrics(true, predicted_values)

predicted_values = [round(pred, 2) for pred in list(preds.as_data_frame()['predict'])]

train_df['predicted_pKa'] = predicted_values

deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%




In [21]:
train_df['used_for'] = ['Train'] * len(train_df)
test_df['used_for'] = ['Out of sample'] * len(test_df)

df = pd.concat([train_df, test_df])
df['cis/trans'] = [cis_trans_id_to_str[param] for param in df['cis/trans']]
df['identificator'] = [identificator_to_molecule_type[param] for param in df['identificator']]
df.rename(columns = {'pKa': 'true_pKa'}, inplace=True)

In [22]:
for feature_name in df.columns:
    if "angle" in feature_name or "distance" in feature_name:
        df[feature_name] = [np.NaN if value == 0.000 else value for value in df[feature_name]]

In [23]:
df['identificator']

0      Carboxylic acid
1      Carboxylic acid
2      Carboxylic acid
3      Carboxylic acid
4      Carboxylic acid
            ...       
129    Secondary amine
131    Secondary amine
96     Secondary amine
63     Secondary amine
125      Primary amine
Name: identificator, Length: 169, dtype: object

In [24]:
import pandas as pd
import numpy as np

import h2o

import sys
import os
sys.path.insert(0, os.path.dirname('C:\work\DrugDiscovery\main_git\XAI_Chem\ml_part'))

from ml_part.random_forest.data_prep.preparation import DataPreparation
from ml_part.random_forest.train import RFTrain
from ml_part.molecule_features.constants import identificator_to_molecule_type, cis_trans_id_to_str

CSV_PATH = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\remained_features_pKa_01.02_v2.csv'
smiles_filepath = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\smiles_to_index.pkl'

dataPreparation = DataPreparation(CSV_PATH)

unimportant_features_to_drop = []
X, y = dataPreparation.prepare_data_for_RF(is_pKa=True,
                                           use_mandatory_features=True,
                                           is_remove_outliers=False,
                                           is_remove_nan=False,
                                           outliers_features_to_skip=unimportant_features_to_drop)

correlated_features = ['f_atom_fraction', 'naHRing', 'nFaRing', 'nFaHRing', 'tpsa+f']
features_to_drop = []
for feature_name in X.columns:
    if feature_name in correlated_features:
        features_to_drop.append(feature_name)

X = X.drop(features_to_drop, axis=1)

rf_train = RFTrain(X=X, 
                   y=y,
                   smiles_filepath=smiles_filepath,
                   is_pKa=True,
                   k_folds=2)
smiles_to_index_dict = rf_train.smiles_to_index

y_train_all = rf_train.y_train
X_train_all = rf_train.X_train

y_test_all = rf_train.y_test
X_test_all = rf_train.X_test

train_df_all = pd.concat([X_train_all, y_train_all], axis=1)
test_df_all = pd.concat([X_test_all, y_test_all], axis=1)

df_all = pd.concat([train_df_all, test_df_all], axis=0)


True
183
['avg_atoms_in_cycle', 'nFaHRing', 'chirality', 'PPSA5', 'tpsa+f', 'RPCS', 'mol_num_cycles', 'GeomShapeIndex', 'angle_R2X2R1', 'nN', 'distance_between_atoms_in_f_group_centers', 'nC', 'nFARing', 'angle_R1X1R2', 'f_freedom', 'naHRing', 'nFAHRing', 'cis/trans', 'dipole_moment', 'f_to_fg', 'identificator', 'f_atom_fraction', 'nFRing', 'nFaRing', 'naRing', 'nFHRing', 'PBF', 'nARing', 'nF', 'dihedral_angle', 'nAHRing', 'nO', 'TASA', 'angle_X2X1R1', 'mol_volume', 'FPSA3', 'PNSA5', 'angle_X1X2R2', 'nHRing', 'pKa', 'logP']
Remains rows:178, amount of features: 41


In [25]:
indexes_to_drop = []
for index, _ in df_all.iterrows():
    if index in df.index:
        indexes_to_drop.append(index)

df_outliers = df_all.drop(index=indexes_to_drop)

df_all_h2o = h2o.H2OFrame(df_outliers)

Parse progress: |

████████████████████████████████████████████████████████████████| (done) 100%


In [26]:
preds = pKa_model.predict(df_all_h2o)
true = df_all_h2o.as_data_frame()['pKa']
predicted_values = preds.as_data_frame()['predict']

calculate_metrics(true, predicted_values)

predicted_values = [round(pred, 2) for pred in list(preds.as_data_frame()['predict'])]

df_outliers['predicted_pKa'] = predicted_values

deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%




In [27]:
df_outliers['used_for'] = ['outliers'] * len(df_outliers)

df_outliers['cis/trans'] = [cis_trans_id_to_str[param] for param in df_outliers['cis/trans']]
df_outliers['identificator'] = [identificator_to_molecule_type[param] for param in df_outliers['identificator']]
df_outliers.rename(columns = {'pKa': 'true_pKa'}, inplace=True)

In [33]:
df_combined = pd.concat([df, df_outliers])

zero_despertion_features = []
for feature_name in df_combined.columns:
    if len(df_combined[feature_name].unique()) == 1:       
        zero_despertion_features.append(feature_name)

df_combined.drop(zero_despertion_features, axis=1, inplace=True)

smiles_values = []
for index, row in df_combined.iterrows():
    SMILES = list(smiles_to_index_dict.keys())[list(smiles_to_index_dict.values()).index(index)]
    smiles_values.append(SMILES)

df_combined['smiles'] = smiles_values

In [34]:
df_combined['smiles']

0                    FC1(F)CCC(C(O)=O)CC1
1                          OC(C1CCCCC1)=O
2           OC([C@@H]1C[C@H]2C[C@H]2C1)=O
3            OC([C@H]1C[C@H]2C[C@H]2C1)=O
4      OC([C@H]1C[C@H]2C(F)(F)[C@H]2C1)=O
                      ...                
83              FC1(F)C(C2)CNCC1CS2(=O)=O
127                                    NC
35                 O=C(O)C(C1)CC21CC(F)C2
37           OC(C12OCC(C(F)(F)F)(C2)C1)=O
124                            NCC(F)(F)F
Name: smiles, Length: 178, dtype: object

In [36]:
df_combined.sort_index(inplace=True)

In [35]:
df_combined

Unnamed: 0,avg_atoms_in_cycle,chirality,PPSA5,RPCS,mol_num_cycles,GeomShapeIndex,angle_R2X2R1,nN,distance_between_atoms_in_f_group_centers,nC,...,mol_volume,FPSA3,PNSA5,angle_X1X2R2,nHRing,fold_id,true_pKa,predicted_pKa,used_for,smiles
0,6.0,0,14.226840,20.923542,1,0.948011,176.207389,0,1.641853,7,...,139.38,0.043079,-28.942683,154.504909,0,0.0,4.32,4.26,Train,FC1(F)CCC(C(O)=O)CC1
1,6.0,0,13.888098,16.720730,1,0.847867,,0,,7,...,128.86,0.046453,-13.676509,,0,1.0,4.85,5.00,Train,OC(C1CCCCC1)=O
2,3.0,3,14.422664,15.197764,2,0.712414,,0,,7,...,122.04,0.047023,-13.789063,,0,1.0,4.35,4.32,Train,OC([C@@H]1C[C@H]2C[C@H]2C1)=O
3,3.0,3,14.472009,17.778516,2,0.829473,,0,,7,...,122.34,0.047613,-14.420450,,0,0.0,4.24,4.34,Train,OC([C@H]1C[C@H]2C[C@H]2C1)=O
4,3.0,3,14.920075,20.595646,2,0.800485,131.710055,0,1.858448,7,...,132.52,0.045186,-29.793931,146.991002,0,0.0,3.80,3.96,Train,OC([C@H]1C[C@H]2C(F)(F)[C@H]2C1)=O
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,4.5,0,11.942380,0.000000,2,0.560239,159.486596,1,4.509685,7,...,162.49,0.028894,-33.237148,154.221803,2,1.0,5.27,6.56,outliers,FC1(F)C(C2)CNCC1CS2(=O)=O
127,0.0,0,8.219843,73.475540,0,0.394463,0.000000,1,0.000000,1,...,41.01,0.051130,-8.385550,0.000000,0,1.0,10.78,10.12,outliers,NC
35,3.5,0,14.937986,20.344650,2,0.784297,150.235186,0,10.386362,8,...,143.84,0.046701,-19.182118,147.449926,0,,4.39,5.33,outliers,O=C(O)C(C1)CC21CC(F)C2
37,2.0,0,15.407972,0.000000,3,0.841568,159.320247,0,1.472000,7,...,145.41,0.040555,-52.090400,166.824213,2,,2.88,2.33,outliers,OC(C12OCC(C(F)(F)F)(C2)C1)=O


In [37]:
df_combined

Unnamed: 0,avg_atoms_in_cycle,chirality,PPSA5,RPCS,mol_num_cycles,GeomShapeIndex,angle_R2X2R1,nN,distance_between_atoms_in_f_group_centers,nC,...,mol_volume,FPSA3,PNSA5,angle_X1X2R2,nHRing,fold_id,true_pKa,predicted_pKa,used_for,smiles
0,6.0,0,14.226840,20.923542,1,0.948011,176.207389,0,1.641853,7,...,139.38,0.043079,-28.942683,154.504909,0,0.0,4.32,4.26,Train,FC1(F)CCC(C(O)=O)CC1
1,6.0,0,13.888098,16.720730,1,0.847867,,0,,7,...,128.86,0.046453,-13.676509,,0,1.0,4.85,5.00,Train,OC(C1CCCCC1)=O
2,3.0,3,14.422664,15.197764,2,0.712414,,0,,7,...,122.04,0.047023,-13.789063,,0,1.0,4.35,4.32,Train,OC([C@@H]1C[C@H]2C[C@H]2C1)=O
3,3.0,3,14.472009,17.778516,2,0.829473,,0,,7,...,122.34,0.047613,-14.420450,,0,0.0,4.24,4.34,Train,OC([C@H]1C[C@H]2C[C@H]2C1)=O
4,3.0,3,14.920075,20.595646,2,0.800485,131.710055,0,1.858448,7,...,132.52,0.045186,-29.793931,146.991002,0,0.0,3.80,3.96,Train,OC([C@H]1C[C@H]2C(F)(F)[C@H]2C1)=O
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,3.0,2,11.743145,4.833689,1,0.569427,93.412014,1,3.082645,4,...,94.70,0.042360,-20.444742,122.453227,0,1.0,7.16,7.23,Train,N[C@@H]1[C@H](C(F)([H])F)C1
176,3.0,2,8.670753,98.958912,1,0.620640,,1,,4,...,83.71,0.041147,-5.706004,,0,1.0,9.44,9.42,Train,C[C@@H]1C[C@H]1N
177,3.0,2,7.931815,78.177540,1,0.492211,,1,,4,...,83.67,0.035464,-6.524603,,0,1.0,9.15,9.08,Train,C[C@@H]1C[C@@H]1N
180,2.5,0,8.165825,71.616824,2,0.806713,,1,,5,...,92.34,0.033184,-6.476025,,0,0.0,9.41,9.30,Train,NC1C2(CC2)C1


In [40]:
df_combined.to_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\csv_for_analyse\pKa_analyse.csv')