In [1]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt

def calculate_metrics(true_values, pred_values):
    """Score predictions against ground truth with three regression metrics.

    Args:
        true_values: observed target values.
        pred_values: predicted values; forwarded together with ``true_values``
            to the scikit-learn metric functions.

    Returns:
        dict with the keys ``"mse"``, ``"mae"`` and ``"r^2"``, each value
        rounded to three decimal places.
    """
    # Evaluated in this order so the resulting dict keeps the mse / mae / r^2 layout.
    scorers = (
        ("mse", mean_squared_error),
        ("mae", mean_absolute_error),
        ("r^2", r2_score),
    )
    return {label: round(scorer(true_values, pred_values), 3) for label, scorer in scorers}

In [2]:
import h2o

# Connect to a local H2O server, starting one if none is already running.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 21.0.1+12-LTS-29, mixed mode, sharing)
  Starting server from C:\work\DrugDiscovery\drug-discovery-venv\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\38066\AppData\Local\Temp\tmppl5q0n6x
  JVM stdout: C:\Users\38066\AppData\Local\Temp\tmppl5q0n6x\h2o_38066_started_from_python.out
  JVM stderr: C:\Users\38066\AppData\Local\Temp\tmppl5q0n6x\h2o_38066_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/Kiev
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.3
H2O_cluster_version_age:,1 month and 16 days
H2O_cluster_name:,H2O_from_python_38066_ypozg7
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.950 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [4]:
import pandas as pd
import numpy as np

import h2o

import sys
import os
# Put the parent directory of `ml_part` on the import path so the
# `from ml_part...` imports below resolve.
# The path is a raw string: in a normal literal `\w`, `\D`, `\m` and `\X` are
# invalid escape sequences (SyntaxWarning on Python 3.12+). The resulting
# string value is unchanged.
sys.path.insert(0, os.path.dirname(r'C:\work\DrugDiscovery\main_git\XAI_Chem\ml_part'))

from ml_part.random_forest.data_prep.preparation import DataPreparation
from ml_part.random_forest.train import RFTrain
from ml_part.molecule_features.constants import identificator_to_molecule_type, cis_trans_id_to_str

# Input files: the logP feature CSV and the SMILES-to-index pickle.
CSV_PATH = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\remained_features_logP_01.02_v3.csv'
smiles_filepath = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\smiles_to_index.pkl'

# Build features / target for logP (is_pKa=False) with outlier removal enabled.
dataPreparation = DataPreparation(CSV_PATH)
unimportant_features_to_drop = []
X, y = dataPreparation.prepare_data_for_RF(
    is_pKa=False,
    use_mandatory_features=True,
    is_remove_outliers=True,
    is_remove_nan=False,
    outliers_features_to_skip=unimportant_features_to_drop,
)

# Drop whichever of the listed correlated features are actually present in X.
correlated_features = ['f_atom_fraction', 'naHRing', 'nFaRing', 'nFaHRing', 'tpsa+f']
features_to_drop = [column for column in X.columns if column in correlated_features]
X = X.drop(features_to_drop, axis=1)

# In this cell RFTrain is only read for its train / test split and its
# SMILES-to-index mapping.
rf_train = RFTrain(
    X=X,
    y=y,
    smiles_filepath=smiles_filepath,
    is_pKa=False,
    k_folds=2,
)
smiles_to_index_dict = rf_train.smiles_to_index

y_train, X_train = rf_train.y_train, rf_train.X_train
y_test, X_test = rf_train.y_test, rf_train.X_test

# Features and target side by side, one frame per split.
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Convert both splits to H2OFrames for the model's predict() calls further down.
train = h2o.H2OFrame(train_df)
test = h2o.H2OFrame(test_df)


True
183
['nC', 'FPSA3', 'nFARing', 'RPCS', 'dipole_moment', 'angle_R1X1R2', 'nFAHRing', 'angle_X2X1R1', 'nFHRing', 'naHRing', 'chirality', 'cis/trans', 'nHRing', 'TASA', 'angle_X1X2R2', 'nO', 'nF', 'PNSA5', 'dihedral_angle', 'f_atom_fraction', 'f_to_fg', 'nARing', 'angle_R2X2R1', 'identificator', 'naRing', 'nFaRing', 'nFaHRing', 'mol_volume', 'tpsa+f', 'mol_weight', 'avg_atoms_in_cycle', 'mol_num_cycles', 'nN', 'nAHRing', 'PBF', 'nFRing', 'WPSA5', 'f_freedom', 'distance_between_atoms_in_f_group_centers', 'PPSA5', 'GeomShapeIndex', 'pKa', 'logP']
dipole_moment outliers indexes: [19, 21, 23, 27, 61, 66, 67, 73, 82, 136, 152, 182]
f_atom_fraction outliers indexes: [124]
mol_volume outliers indexes: [127]
distance_between_atoms_in_f_group_centers outliers indexes: [168]
logP outliers indexes: [82, 83]
Remains rows:162, amount of features: 43
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |█████████████████████████████████████

In [5]:
# Load the previously saved stacked-ensemble AutoML model used for logP prediction.
# NOTE(review): this is a raw string, so every `\\` is two literal backslashes in
# the path. Presumably this only works because repeated separators are tolerated
# on Windows — confirm before normalising the path.
logP_model = h2o.load_model(r'C:\\work\\DrugDiscovery\\main_git\\XAI_Chem\\ml_part\\h2o_model\\models\\01.02.24_features_2.1\\logP\\all_molecules(without_angle_feature)_identificator_without_outliers_sorted_by_cv\\StackedEnsemble_BestOfFamily_3_AutoML_2_20240205_132350')

In [55]:
# Score the held-out split with the loaded logP model.
preds = logP_model.predict(test)
true = test.as_data_frame()['logP']

# Convert the prediction H2OFrame to pandas once and reuse it for both the
# metrics and the rounded column (the original converted it twice).
raw_predictions = preds.as_data_frame()['predict']

print(calculate_metrics(true, raw_predictions))

# Metrics use the unrounded predictions; the stored column is rounded to 2 decimals.
predicted_values = [round(pred, 2) for pred in raw_predictions]

# NOTE: despite its name, 'predicted_pKa' holds logP predictions here. The name is
# kept because the other prediction cells write the same column and the frames
# are concatenated later.
test_df['predicted_pKa'] = predicted_values

stackedensemble prediction progress: |

███████████████████████████████████████████| (done) 100%
{'mse': 0.044, 'mae': 0.168, 'r^2': 0.83}




In [56]:
# Score the training split with the loaded logP model.
preds = logP_model.predict(train)
true = train.as_data_frame()['logP']

# Convert the prediction H2OFrame to pandas once and reuse it for both the
# metrics and the rounded column (the original converted it twice).
raw_predictions = preds.as_data_frame()['predict']

print(calculate_metrics(true, raw_predictions))

# Metrics use the unrounded predictions; the stored column is rounded to 2 decimals.
predicted_values = [round(pred, 2) for pred in raw_predictions]

# NOTE: despite its name, 'predicted_pKa' holds logP predictions here. The name is
# kept because the other prediction cells write the same column and the frames
# are concatenated later.
train_df['predicted_pKa'] = predicted_values

stackedensemble prediction progress: |

███████████████████████████████████████████| (done) 100%
{'mse': 0.016, 'mae': 0.084, 'r^2': 0.958}




In [9]:
# Tag every row with the split it belongs to, then stack the two splits.
train_df['used_for'] = 'Train'
test_df['used_for'] = 'Out of sample'

df = pd.concat([train_df, test_df])

# Translate the encoded ids into their labels through the lookup tables imported
# from ml_part; an id missing from a table raises KeyError.
for column, lookup in (('cis/trans', cis_trans_id_to_str),
                       ('identificator', identificator_to_molecule_type)):
    df[column] = [lookup[code] for code in df[column]]

df = df.rename(columns={'pKa': 'true_pKa'})

In [10]:
# Replace exact zeros with NaN in every column whose name contains "angle" or
# "distance".
# NOTE(review): presumably 0 marks a geometry value that does not exist for the
# molecule — confirm with the feature-generation code.
for feature_name in df.columns:
    if "angle" in feature_name or "distance" in feature_name:
        # `np.nan` instead of `np.NaN`: the capitalised alias was removed in NumPy 2.0.
        df[feature_name] = [np.nan if value == 0.000 else value for value in df[feature_name]]

In [11]:
# Display the 'identificator' column to check that the ids were replaced by labels.
df['identificator']

1      Carboxylic acid
2      Carboxylic acid
3      Carboxylic acid
4      Carboxylic acid
5      Carboxylic acid
            ...       
55       Primary amine
126      Primary amine
103      Primary amine
49     Carboxylic acid
22     Carboxylic acid
Name: identificator, Length: 162, dtype: object

In [15]:
import pandas as pd
import numpy as np

import h2o

import sys
import os
# Put the parent directory of `ml_part` on the import path so the
# `from ml_part...` imports below resolve.
# The path is a raw string: in a normal literal `\w`, `\D`, `\m` and `\X` are
# invalid escape sequences (SyntaxWarning on Python 3.12+). The resulting
# string value is unchanged.
sys.path.insert(0, os.path.dirname(r'C:\work\DrugDiscovery\main_git\XAI_Chem\ml_part'))

from ml_part.random_forest.data_prep.preparation import DataPreparation
from ml_part.random_forest.train import RFTrain
from ml_part.molecule_features.constants import identificator_to_molecule_type, cis_trans_id_to_str

# Input files: the logP feature CSV and the SMILES-to-index pickle.
CSV_PATH = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\remained_features_logP_01.02_v3.csv'
smiles_filepath = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\smiles_to_index.pkl'

# Same preparation as for the filtered data, but with outlier removal disabled,
# so the outlier rows stay in the result.
dataPreparation = DataPreparation(CSV_PATH)
unimportant_features_to_drop = []
X, y = dataPreparation.prepare_data_for_RF(
    is_pKa=False,
    use_mandatory_features=True,
    is_remove_outliers=False,
    is_remove_nan=False,
    outliers_features_to_skip=unimportant_features_to_drop,
)

# Drop whichever of the listed correlated features are actually present in X.
correlated_features = ['f_atom_fraction', 'naHRing', 'nFaRing', 'nFaHRing', 'tpsa+f']
features_to_drop = [column for column in X.columns if column in correlated_features]
X = X.drop(features_to_drop, axis=1)

# In this cell RFTrain is only read for its train / test split and its
# SMILES-to-index mapping.
rf_train = RFTrain(
    X=X,
    y=y,
    smiles_filepath=smiles_filepath,
    is_pKa=False,
    k_folds=2,
)
smiles_to_index_dict = rf_train.smiles_to_index

y_train_all, X_train_all = rf_train.y_train, rf_train.X_train
y_test_all, X_test_all = rf_train.y_test, rf_train.X_test

# Features and target side by side, one frame per split.
train_df_all = pd.concat([X_train_all, y_train_all], axis=1)
test_df_all = pd.concat([X_test_all, y_test_all], axis=1)

# Stack both splits row-wise into a single frame.
df_all = pd.concat([train_df_all, test_df_all], axis=0)


True
183
['nC', 'FPSA3', 'nFARing', 'RPCS', 'dipole_moment', 'angle_R1X1R2', 'nFAHRing', 'angle_X2X1R1', 'nFHRing', 'naHRing', 'chirality', 'cis/trans', 'nHRing', 'TASA', 'angle_X1X2R2', 'nO', 'nF', 'PNSA5', 'dihedral_angle', 'f_atom_fraction', 'f_to_fg', 'nARing', 'angle_R2X2R1', 'identificator', 'naRing', 'nFaRing', 'nFaHRing', 'mol_volume', 'tpsa+f', 'mol_weight', 'avg_atoms_in_cycle', 'mol_num_cycles', 'nN', 'nAHRing', 'PBF', 'nFRing', 'WPSA5', 'f_freedom', 'distance_between_atoms_in_f_group_centers', 'PPSA5', 'GeomShapeIndex', 'pKa', 'logP']
Remains rows:178, amount of features: 43


In [16]:
# Keep only the rows of the unfiltered frame whose index label is absent from
# `df`, i.e. the rows that the outlier filter removed.
# A vectorised membership test replaces the iterrows() loop, which built a
# Series for every row only to read its index label.
indexes_to_drop = df_all.index[df_all.index.isin(df.index)].tolist()

df_outliers = df_all.drop(index=indexes_to_drop)

# Convert the outlier rows to an H2OFrame so the H2O model can score them.
df_all_h2o = h2o.H2OFrame(df_outliers)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [54]:
# Score the outlier rows with the loaded logP model.
preds = logP_model.predict(df_all_h2o)
true = df_all_h2o.as_data_frame()['logP']

# Convert the prediction H2OFrame to pandas once and reuse it for both the
# metrics and the rounded column (the original converted it twice).
raw_predictions = preds.as_data_frame()['predict']

print(calculate_metrics(true, raw_predictions))

# Metrics use the unrounded predictions; the stored column is rounded to 2 decimals.
predicted_values = [round(pred, 2) for pred in raw_predictions]

# NOTE: despite its name, 'predicted_pKa' holds logP predictions here. The name is
# kept because the other prediction cells write the same column and the frames
# are concatenated later.
df_outliers['predicted_pKa'] = predicted_values

stackedensemble prediction progress: |

███████████████████████████████████████████| (done) 100%
{'mse': 0.067, 'mae': 0.184, 'r^2': 0.914}




In [19]:
# Tag every outlier row so it can be told apart after concatenation.
df_outliers['used_for'] = 'outliers'

# Translate the encoded ids into their labels through the lookup tables imported
# from ml_part; an id missing from a table raises KeyError.
for column, lookup in (('cis/trans', cis_trans_id_to_str),
                       ('identificator', identificator_to_molecule_type)):
    df_outliers[column] = [lookup[code] for code in df_outliers[column]]

df_outliers = df_outliers.rename(columns={'pKa': 'true_pKa'})

In [48]:
MAIN_CSV_PATH = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\init_data\pKa_Prediction_Starting data_2024.01.25.csv'

# The first CSV column becomes the frame index.
df_for_logP_smiles = pd.read_csv(MAIN_CSV_PATH, index_col=0)

# Map (index label - 1) to the 'Amides for LogP' SMILES string, skipping rows
# where that column is empty.
index_to_logP_smiles = {
    row_label - 1: amide_smiles
    for row_label, amide_smiles in zip(df_for_logP_smiles.index,
                                       df_for_logP_smiles['Amides for LogP'])
    if not pd.isnull(amide_smiles)
}
index_to_logP_smiles

{0: 'FC1(F)CCC(CC1)C(=O)NC1=CC=CC=C1',
 1: 'O=C(NC1=CC=CC=C1)C1CCCCC1',
 2: 'O=C(NC1=CC=CC=C1)[C@H]1C[C@@H]2C[C@@H]2C1',
 3: 'O=C(NC1=CC=CC=C1)[C@@H]1C[C@@H]2C[C@@H]2C1',
 4: 'FC1(F)[C@H]2C[C@H](C[C@@H]12)C(=O)NC1=CC=CC=C1',
 5: 'FC1(F)[C@H]2C[C@@H](C[C@@H]12)C(=O)NC1=CC=CC=C1',
 6: 'O=C(NC1=CC=CC=C1)C1CCC1',
 7: 'FCC1(CCC1)C(=O)NC1=CC=CC=C1',
 8: 'FC(F)C1(CCC1)C(=O)NC1=CC=CC=C1',
 9: 'FC(F)(F)C1(CCC1)C(=O)NC1=CC=CC=C1',
 10: 'FC(F)(F)[C@H]1C[C@@H](C1)C(=O)NC1=CC=CC=C1',
 11: 'FC(F)(F)[C@@H]1C[C@@H](C1)C(=O)NC1=CC=CC=C1',
 12: '[H]C(F)(F)[C@H]1C[C@@H](C1)C(=O)NC1=CC=CC=C1',
 13: '[H]C(F)(F)[C@@H]1C[C@@H](C1)C(=O)NC1=CC=CC=C1',
 14: '[H]C([H])(F)[C@H]1C[C@@H](C1)C(=O)NC1=CC=CC=C1',
 15: '[H]C([H])(F)[C@@H]1C[C@@H](C1)C(=O)NC1=CC=CC=C1',
 16: 'FC1(F)CC1C(=O)NC1=CC=CC=C1',
 17: 'FC1(F)CCC1C(=O)NC1=CC=CC=C1',
 18: 'FC1(F)CC(C1)C(=O)NC1=CC=CC=C1',
 19: 'FC1(F)CCCC1C(=O)NC1=CC=CC=C1',
 20: 'FC1(F)CCC(C1)C(=O)NC1=CC=CC=C1',
 21: 'FC1(F)CCCCC1C(=O)NC1=CC=CC=C1',
 22: 'FC1(F)CCCC(C1)C(=O)NC1=CC

In [50]:
# Stack the filtered rows and the outlier rows into one frame.
df_combined = pd.concat([df, df_outliers])

# Columns holding a single distinct value carry no information; remove them.
zero_despertion_features = [
    column
    for column in df_combined.columns
    if len(df_combined[column].unique()) == 1
]
df_combined = df_combined.drop(zero_despertion_features, axis=1)

# Look up the SMILES string for every row by its index label, printing each
# match (and the full row for label 0) as a visual check.
smiles_values = []
for row_label, record in df_combined.iterrows():
    if row_label == 0:
        print(record)
    matched_smiles = index_to_logP_smiles[row_label]
    print(row_label, matched_smiles, record['logP'])
    smiles_values.append(matched_smiles)

df_combined['smiles'] = smiles_values

1 O=C(NC1=CC=CC=C1)C1CCCCC1 2.88
2 O=C(NC1=CC=CC=C1)[C@H]1C[C@@H]2C[C@@H]2C1 2.87
3 O=C(NC1=CC=CC=C1)[C@@H]1C[C@@H]2C[C@@H]2C1 2.92
4 FC1(F)[C@H]2C[C@H](C[C@@H]12)C(=O)NC1=CC=CC=C1 2.37
5 FC1(F)[C@H]2C[C@@H](C[C@@H]12)C(=O)NC1=CC=CC=C1 2.88
6 O=C(NC1=CC=CC=C1)C1CCC1 2.26
7 FCC1(CCC1)C(=O)NC1=CC=CC=C1 1.88
8 FC(F)C1(CCC1)C(=O)NC1=CC=CC=C1 2.19
9 FC(F)(F)C1(CCC1)C(=O)NC1=CC=CC=C1 2.51
10 FC(F)(F)[C@H]1C[C@@H](C1)C(=O)NC1=CC=CC=C1 3.02
11 FC(F)(F)[C@@H]1C[C@@H](C1)C(=O)NC1=CC=CC=C1 2.72
12 [H]C(F)(F)[C@H]1C[C@@H](C1)C(=O)NC1=CC=CC=C1 2.39
13 [H]C(F)(F)[C@@H]1C[C@@H](C1)C(=O)NC1=CC=CC=C1 2.23
14 [H]C([H])(F)[C@H]1C[C@@H](C1)C(=O)NC1=CC=CC=C1 2.14
15 [H]C([H])(F)[C@@H]1C[C@@H](C1)C(=O)NC1=CC=CC=C1 1.96
18 FC1(F)CC(C1)C(=O)NC1=CC=CC=C1 2.26
20 FC1(F)CCC(C1)C(=O)NC1=CC=CC=C1 2.5
24 FC1(F)CCCCC(C1)C(=O)NC1=CC=CC=C1 3.03
25 FC1(F)CCCC(CC1)C(=O)NC1=CC=CC=C1 2.95
26 CCC(F)(F)CC(=O)NC1=CC=CC=C1 2.26
28 FC(F)CCCC(=O)NC1=CC=CC=C1 2.06
29 O=C(NC1=CC=CC=C1)C1CC1 1.92
30 O=C(NC1=CC=CC=C1)C1CCC1 2.26
31

In [52]:
# Order the rows by index label.
df_combined = df_combined.sort_index()

In [58]:
# Display the combined, sorted frame for a visual check before exporting it.
df_combined

Unnamed: 0,nC,FPSA3,nFARing,RPCS,dipole_moment,angle_R1X1R2,nFAHRing,angle_X2X1R1,nFHRing,chirality,...,WPSA5,f_freedom,distance_between_atoms_in_f_group_centers,PPSA5,GeomShapeIndex,fold_id,logP,predicted_pKa,used_for,smiles
0,13,0.029613,0,0.000000,0.707067,116.758869,0,111.411909,0,0,...,7.003700,0,4.173806,16.161259,0.875790,,2.63,2.63,Out of sample,FC1(F)CCC(CC1)C(=O)NC1=CC=CC=C1
1,13,0.033183,0,27.282695,0.507854,,0,,0,0,...,6.805737,1,,16.178954,0.860000,1.0,2.88,2.92,Train,O=C(NC1=CC=CC=C1)C1CCCCC1
2,13,0.029652,1,16.638718,0.501031,,0,,0,3,...,6.398578,1,,15.488153,0.958274,0.0,2.87,3.09,Train,O=C(NC1=CC=CC=C1)[C@H]1C[C@@H]2C[C@@H]2C1
3,13,0.032625,1,20.164804,0.505852,,0,,0,3,...,6.565329,1,,15.945793,0.754336,1.0,2.92,3.06,Train,O=C(NC1=CC=CC=C1)[C@@H]1C[C@@H]2C[C@@H]2C1
4,13,0.030749,1,11.184998,0.493885,103.610503,0,111.876204,0,3,...,7.112882,0,4.245568,16.472886,0.891772,1.0,2.37,2.58,Train,FC1(F)[C@H]2C[C@H](C[C@@H]12)C(=O)NC1=CC=CC=C1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,11,0.034830,0,44.792851,0.473569,93.627775,0,122.429092,0,2,...,6.974581,1,2.671617,17.390764,0.761236,1.0,1.36,1.36,Train,[H]C(F)(F)[C@@H]1C[C@@H]1NC(=O)C1=CC=CC=C1
176,11,0.033483,0,34.695093,0.449349,,0,,0,2,...,5.894519,1,,15.156861,0.887202,0.0,1.83,1.83,Train,C[C@@H]1C[C@H]1NC(=O)C1=CC=CC=C1
177,11,0.032038,0,36.029520,0.454426,,0,,0,2,...,6.056597,1,,15.457994,0.782160,1.0,1.64,1.65,Train,C[C@@H]1C[C@@H]1NC(=O)C1=CC=CC=C1
180,12,0.033000,0,35.464215,0.457185,,0,,0,0,...,6.269730,1,,15.517112,0.797100,1.0,2.08,2.10,Train,O=C(NC1CC11CC1)C1=CC=CC=C1


In [59]:
# Write the combined frame (features, true logP, predictions, split label, SMILES) to CSV.
df_combined.to_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\csv_for_analyse\logP_analyse.csv')