In [13]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt

def calculate_metrics(true_values, pred_values):
    """Score predictions against reference values with three regression metrics.

    Args:
        true_values: Reference values; passed as the first positional argument
            to each metric function.
        pred_values: Predicted values; passed as the second positional argument
            to each metric function.

    Returns:
        dict: Keys ``"mse"``, ``"mae"`` and ``"r^2"`` (in that order), each
        holding the corresponding metric rounded to 3 decimal places.
    """
    # (label, scorer) pairs; the order fixes both evaluation and key order.
    scorers = (
        ("mse", mean_squared_error),
        ("mae", mean_absolute_error),
        ("r^2", r2_score),
    )
    return {label: round(scorer(true_values, pred_values), 3)
            for label, scorer in scorers}

In [5]:
import os

import pandas as pd

# Directory holding the train/test CSV split. It defaults to the original local
# checkout location; set the XAI_CHEM_DATA_DIR environment variable to point the
# notebook at the data on another machine without editing this cell.
DATA_DIR = os.environ.get(
    'XAI_CHEM_DATA_DIR',
    r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\logP_lipophilicity_data\gnn_cv',
)

train_csv_path = os.path.join(DATA_DIR, 'train.csv')
test_csv_path = os.path.join(DATA_DIR, 'test.csv')

# The first CSV column is used as the DataFrame index.
df_train = pd.read_csv(train_csv_path, index_col=0)
df_test = pd.read_csv(test_csv_path, index_col=0)

WITHOUT PREPROCESSING

In [36]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Crippen import MolLogP

# Baseline evaluation: predict logP with MolLogP directly from each parsed
# SMILES string and compare against the 'logP' column of df_test.
true_values = []
pred_values = []

# Iterate over the two columns that are actually needed instead of using
# DataFrame.iterrows(), which materialises a Series object for every row.
for SMILES, true_logP in zip(df_test['Smiles'], df_test['logP']):
    mol = AllChem.MolFromSmiles(SMILES)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of an opaque error inside MolLogP.
        raise ValueError(f"Could not parse SMILES string: {SMILES!r}")

    pred_logP = MolLogP(mol=mol,
                        includeHs=True)

    true_values.append(true_logP)
    pred_values.append(pred_logP)

# Last expression of the cell: the metrics dict is shown as the cell output.
calculate_metrics(true_values=true_values,
                  pred_values=pred_values)

{'mse': 0.351, 'mae': 0.502, 'r^2': -0.319}

SMILES PREPARATION

In [24]:
from rdkit.Chem import rdForceFieldHelpers

def find_conf_with_min_energy(mol):
    """Run MMFF optimization on all conformers of ``mol`` and pick the best one.

    Args:
        mol: Molecule handed to ``rdForceFieldHelpers.MMFFOptimizeMoleculeConfs``.
            It is expected to already carry the conformers to optimize.

    Returns:
        tuple: ``(min_energy_conf_index, min_energy, mol)``.
        ``min_energy_conf_index`` is the position, within the optimizer's
        result list, of the lowest-energy entry whose status is 0, or ``None``
        if there is no such entry. ``min_energy`` is that entry's energy, or
        ``float("inf")`` if there is no such entry. ``mol`` is the same object
        that was passed in.
    """
    optimization_result = rdForceFieldHelpers.MMFFOptimizeMoleculeConfs(mol)

    # Start from +inf rather than a finite cap, so a conformer with status 0 is
    # never rejected merely because its energy is large, and so the "nothing
    # found" case cannot be mistaken for a real energy value.
    min_energy, min_energy_conf_index = float("inf"), None
    for index, (status, energy) in enumerate(optimization_result):
        # Only entries with status 0 are considered (per the RDKit docs this
        # flag is 0 when the optimization converged). The strict "<" keeps the
        # first entry when several share the same energy.
        if status == 0 and energy < min_energy:
            min_energy_conf_index, min_energy = index, energy

    return min_energy_conf_index, min_energy, mol

In [26]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from rdkit.Chem.Crippen import MolLogP

# Evaluation with preprocessing: add explicit hydrogens, embed multiple
# conformers, MMFF-optimize them, then predict logP with MolLogP.
true_values = []
pred_values = []

# Iterate over the two columns that are actually needed instead of using
# DataFrame.iterrows(), which materialises a Series object for every row.
for SMILES, true_logP in zip(df_test['Smiles'], df_test['logP']):
    mol = AllChem.MolFromSmiles(SMILES)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of an opaque error further down.
        raise ValueError(f"Could not parse SMILES string: {SMILES!r}")
    mol = Chem.AddHs(mol)
    rdForceFieldHelpers.MMFFSanitizeMolecule(mol)

    # The number of requested conformers is 3 ** (rotatable bonds + 3), i.e. it
    # triples with every additional rotatable bond (27 for a rigid molecule,
    # 6561 for five rotatable bonds).
    # NOTE(review): this grows very quickly for flexible molecules — consider a cap.
    num_rotatable_bonds = Descriptors.NumRotatableBonds(mol)
    amount_of_confs = pow(3, num_rotatable_bonds + 3)
    AllChem.EmbedMultipleConfs(mol, numConfs=amount_of_confs, randomSeed=3407)

    min_energy_conf_index, min_energy, mol = find_conf_with_min_energy(mol)

    # NOTE(review): min_energy_conf_index and min_energy are not passed on to
    # MolLogP, and the metrics recorded for this cell equal those of the run
    # without preprocessing — presumably MolLogP does not depend on the
    # embedded conformers; confirm whether the conformer search is needed.
    pred_logP = MolLogP(mol=mol,
                        includeHs=True)

    print(f"SMILES: {SMILES}, pred: {pred_logP}, true: {true_logP}")

    true_values.append(true_logP)
    pred_values.append(pred_logP)

# Last expression of the cell: the metrics dict is shown as the cell output.
calculate_metrics(true_values=true_values,
                  pred_values=pred_values)

SMILES: O=C(NC1CCC1)C1=CC=CC=C1, pred: 1.9688999999999999, true: 1.95
SMILES: FC1(F)CC1C(=O)NC1=CC=CC=C1, pred: 2.2803000000000004, true: 2.07
SMILES: FC1(F)CC2(CC(C2)C(=O)NC2=CC=CC=C2)C1, pred: 3.4506000000000014, true: 2.93
SMILES: FC1CCN(CC1)C(=O)C1=CC=CC=C1, pred: 2.2607, true: 1.31
SMILES: FC1(F)CCC(CC1)C(=O)NC1=CC=CC=C1, pred: 3.4506000000000014, true: 2.63
SMILES: [H]C(F)(F)[C@@H]1CC[C@H]1NC(=O)C1=CC=CC=C1, pred: 2.4601000000000006, true: 1.94
SMILES: O=C(NC1=CC=CC=C1)C1CC2(CCC2)C1, pred: 3.205400000000002, true: 3.27
SMILES: O=C(N1CC2CCCC(C2)C1)C1=CC=CC=C1, pred: 2.948800000000001, true: 2.36
SMILES: FC1(F)CCN(CC1)C(=O)C1=CC=CC=C1, pred: 2.557900000000001, true: 1.51
SMILES: [H]C(F)(F)C1CCN(CC1)C(=O)C1=CC=CC=C1, pred: 2.8039000000000014, true: 1.67
SMILES: C[C@@H]1C[C@@H]1C(=O)NC1=CC=CC=C1, pred: 2.2811000000000003, true: 2.31
SMILES: FC(F)(F)CCNC(=O)C1=CC=CC=C1, pred: 2.3688000000000002, true: 1.93
SMILES: [H]C(F)(F)C12CC(C1)(OC2)C(=O)NC1=CC=CC=C1, pred: 2.4394, true: 1.93
SMI

{'mse': 0.351, 'mae': 0.502, 'r^2': -0.319}