In [1]:
import os
import sys
sys.path.append('modules')

import pandas as pd
import numpy as np
import gzip

import warnings
from tqdm import tqdm

import pickle
import gzip
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import SDWriter
from rdkit.Chem import ForwardSDMolSupplier

from itertools import islice
import tensorflow as tf
from nfp.preprocessing import MolAPreprocessor, GraphSequence

from model import make_model
import tensorflow as tf
import tensorflow_probability as tfp
import warnings
# Suppress every Python warning for the rest of the session. This also hides
# deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")

# Short aliases for the TensorFlow Probability sub-packages.
tfd = tfp.distributions
tfpl = tfp.layers
# Restrict TensorFlow's Python logger to messages of level ERROR and above.
tf.get_logger().setLevel('ERROR')
# Make float64 the default Keras floating-point type.
tf.keras.backend.set_floatx('float64')

# Hide all CUDA devices from this process (the value "-1" matches no device).
# NOTE(review): this is set after `import tensorflow`; confirm it still takes
# effect here -- the variable is conventionally set before the import.
os.environ["CUDA_VISIBLE_DEVICES"]="-1"

2025-05-15 10:39:48.580568: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-05-15 10:39:49.064072: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/abhijeet/anaconda3/envs/dl_nmr2/lib/:/home/abhijeet/.local/lib/python3.10/site-packages/nvidia/cudnn/lib
2025-05-15 10:39:49.064149: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/abhijeet/an

In [2]:
def getIndices(mol):
    """Return the indices of all carbon atoms of ``mol`` as an integer array.

    Parameters
    ----------
    mol : object exposing ``GetAtoms()``
        Each atom must provide ``GetAtomicNum()`` and ``GetIdx()``.

    Returns
    -------
    numpy.ndarray
        1-D integer array with the index of every atom whose atomic number
        is 6, in the order the atoms are iterated.
    """
    carbon_ids = []
    for atom in mol.GetAtoms():
        # Atomic number 6 is carbon.
        if atom.GetAtomicNum() == 6:
            carbon_ids.append(atom.GetIdx())
    return np.array(carbon_ids).astype(int)

#Embed SMILES to Mols
smiles_list = ['CCO', 'CC(=O)O', 'c1ccccc1']
# Fixed seed so the generated 3D conformers (and hence the model inputs) are
# reproducible across kernel restarts.
EMBED_SEED = 42
mols = []
for i, smi in enumerate(smiles_list):
    mol = Chem.MolFromSmiles(smi)
    # MolFromSmiles returns None for an unparsable SMILES; fail early with a
    # message that names the offending input.
    if mol is None:
        raise ValueError(f"Could not parse SMILES at position {i}: {smi!r}")
    mol = Chem.AddHs(mol)

    # EmbedMolecule returns the new conformer id, or -1 when no 3D
    # coordinates could be generated.
    if AllChem.EmbedMolecule(mol, randomSeed=EMBED_SEED) < 0:
        raise ValueError(f"3D embedding failed for SMILES at position {i}: {smi!r}")
    AllChem.MMFFOptimizeMolecule(mol)

    mol.SetProp("_Name", f"Molecule_{i+1}")
    mols.append(mol)

#Get Prediction Targets
# `atom_indices` holds one array of carbon indices per molecule, while
# `smiles` is flattened: the source SMILES is repeated once per carbon so it
# lines up row-by-row with the concatenated indices.
smiles = []
atom_indices = []
for smi, mol in zip(smiles_list, mols):
    C_indices = getIndices(mol)
    atom_indices.append(C_indices)
    smiles.extend([smi] * len(C_indices))

In [3]:
# One row per molecule: the molecule object and the array of atom indices
# collected for it.
input_columns = {'Mol': mols, 'atom_index': atom_indices}
inp_df = pd.DataFrame(data=input_columns)
inp_df

Unnamed: 0,Mol,atom_index
0,<rdkit.Chem.rdchem.Mol object at 0x7d7bf3c954d0>,"[0, 1]"
1,<rdkit.Chem.rdchem.Mol object at 0x7d7d9d65a0a0>,"[0, 1]"
2,<rdkit.Chem.rdchem.Mol object at 0x7d7d9d659fc0>,"[0, 1, 2, 3, 4, 5]"


In [4]:
#Preprocess Inputs

def _compute_stacked_offsets(sizes, repeats):
    """Return per-element start offsets for consecutively stacked groups.

    The start offset of group ``k`` is the sum of ``sizes[:k]`` (so the first
    group starts at 0). Each group's offset is then repeated ``repeats[k]``
    times.

    Parameters
    ----------
    sizes : sequence of int
        Size of each group; the last entry does not affect the result.
    repeats : int or sequence of int
        How many times each group's offset is emitted.

    Returns
    -------
    numpy.ndarray
        1-D array of offsets.
    """
    # Shift the sizes right by one so the running sum yields start positions.
    shifted_sizes = np.hstack([0, sizes[:-1]])
    group_starts = np.cumsum(shifted_sizes)
    return np.repeat(group_starts, repeats)

def ragged_const(inp_arr):
    """Wrap ``inp_arr`` in a leading axis of length 1 and return it as a
    ``tf.ragged.constant`` built with ``ragged_rank=1``.
    """
    with_leading_axis = np.expand_dims(inp_arr, axis=0)
    return tf.ragged.constant(with_leading_axis, ragged_rank=1)

def atomic_number_tokenizer(atom):
    """Return the token for ``atom``, which is simply its atomic number as
    reported by ``atom.GetAtomicNum()``.
    """
    atomic_number = atom.GetAtomicNum()
    return atomic_number

def Mol_iter(df):
    """Lazily yield a ``(Mol, atom_index)`` tuple for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Mol'`` and ``'atom_index'``.

    Yields
    ------
    tuple
        The ``'Mol'`` and ``'atom_index'`` values of each row, in row order.
    """
    yield from ((row['Mol'], row['atom_index']) for _, row in df.iterrows())

class RBFSequence(GraphSequence):
    """``GraphSequence`` variant whose batches are re-indexed and converted
    to ragged tensors in ``process_data``.
    """

    def process_data(self, batch_data):
        """Shift atom indices, convert selected features to ragged tensors
        and drop the entries that are not passed on.

        ``batch_data`` is modified in place and also returned.
        NOTE(review): presumably called by ``GraphSequence`` once per batch
        with a dict of stacked numpy arrays -- confirm against nfp.
        """
        stacked_offsets = _compute_stacked_offsets(
            batch_data['n_pro'], batch_data['n_atom'])

        # Entries with a negative atom_index are left unshifted (offset 0);
        # all others are shifted by their stacked offset.
        index_shift = np.where(batch_data['atom_index'] >= 0, stacked_offsets, 0)
        batch_data['atom_index'] += index_shift

        # Features kept in the batch, each wrapped as a ragged constant.
        kept_keys = ('node_attributes', 'node_coordinates', 'edge_indices',
                     'atom_index', 'n_pro')
        for key in kept_keys:
            batch_data[key] = ragged_const(batch_data[key])

        # Entries removed from the batch before it is returned.
        dropped_keys = ('n_atom', 'n_bond', 'distance', 'bond',
                        'node_graph_indices')
        for key in dropped_keys:
            del batch_data[key]

        return batch_data

# Pickled bundle holding the fitted preprocessor, and the inference batch size.
PREPROCESSOR_PATH = 'preprocessor_orig.p'
BATCH_SIZE = 32

# NOTE(review): pickle.load can execute arbitrary code while loading; only
# point this at a file from a trusted source.
with open(PREPROCESSOR_PATH, 'rb') as f:
    input_data = pickle.load(f)

preprocessor = input_data['preprocessor']

# Run the stored preprocessor over the (Mol, atom_index) pairs of inp_df.
inputs_test = preprocessor.predict(Mol_iter(inp_df))

# Batch the preprocessed inputs for the model.
test_sequence = RBFSequence(inputs_test, batch_size=BATCH_SIZE)

3it [00:00, 791.88it/s]


In [5]:
#Load Model and Make Predictions

model = make_model()
model.load_weights('best_model_val_mae.h5')

# `tqdm` is already imported in the first cell, so it is not re-imported here.
# Flat lists with one entry per predicted atom, filled batch by batch.
predictions = []
uncertainty = []
for x in tqdm(test_sequence):
    # Evaluate the network once per batch and read both statistics from the
    # same output distribution. Calling model(x) separately for the mean and
    # the stddev doubles the cost and takes them from different forward passes.
    pred_dist = model(x)
    predictions.extend(pred_dist.mean().numpy().flatten())
    uncertainty.extend(pred_dist.stddev().numpy().flatten())

INFO:kgcnn.model.utils:Updated model kwargs:
INFO:kgcnn.model.utils:{'name': 'PAiNN', 'inputs': [{'shape': (None,), 'name': 'node_attributes', 'dtype': 'float32', 'ragged': True}, {'shape': (None, 3), 'name': 'node_coordinates', 'dtype': 'float32', 'ragged': True}, {'shape': (None, 2), 'name': 'edge_indices', 'dtype': 'int64', 'ragged': True}, {'shape': (None,), 'name': 'atom_index', 'dtype': 'int32', 'ragged': True}, {'shape': (None, 1), 'name': 'n_pro', 'dtype': 'int64', 'ragged': True}], 'input_embedding': {'node': {'input_dim': 256, 'output_dim': 256}}, 'equiv_initialize_kwargs': {'dim': 3, 'method': 'eps'}, 'bessel_basis': {'num_radial': 20, 'cutoff': 5.0, 'envelope_exponent': 5}, 'pooling_args': {'pooling_method': 'mean'}, 'conv_args': {'units': 256, 'cutoff': None}, 'update_args': {'units': 256}, 'equiv_normalization': False, 'node_normalization': False, 'depth': 6, 'verbose': 10, 'output_embedding': 'graph', 'output_to_tensor': True, 'output_mlp': {'use_bias': [True, True], 'un

In [6]:
def confidence(x, low_threshold=1.485774, moderate_threshold=1.419831):
    """Map an uncertainty width to a qualitative confidence label.

    Parameters
    ----------
    x : float
        Uncertainty width to classify; larger values mean less confidence.
    low_threshold : float, optional
        Widths greater than or equal to this value are labelled ``'Low'``.
    moderate_threshold : float, optional
        Widths greater than or equal to this value (but below
        ``low_threshold``) are labelled ``'Moderate'``.

    Returns
    -------
    str
        ``'Low'``, ``'Moderate'`` or ``'High'``.

    Notes
    -----
    The defaults are the cut-offs originally hard-coded in this notebook.
    NOTE(review): presumably calibrated for this specific model -- pass
    explicit thresholds when using a different one.
    A NaN width compares false against both thresholds and is therefore
    labelled ``'High'``.
    """
    if x >= low_threshold:
        return 'Low'
    if x >= moderate_threshold:
        return 'Moderate'
    return 'High'

In [7]:
# Make Prediction df
# Constants that map the model's normalised outputs back to the reported scale.
# NOTE(review): TARGET_SCALE / TARGET_SHIFT are presumably the std and mean of
# the training targets -- confirm against the training code.
Z_95 = 1.96  # multiplier applied to the predicted stddev (two-sided 95 % normal quantile)
TARGET_SCALE = 50.484337
TARGET_SHIFT = 99.798111

# Half-width of the interval in output units, taken directly from the predicted
# stddev. `uncertainty` itself is left untouched so that re-running this cell
# gives the same result (the previous version rescaled it in place each run).
# Working from the unrounded values also keeps the 2-decimal rounding of the
# prediction from leaking into the width and flipping a label near a threshold.
ci_half_width = Z_95 * TARGET_SCALE * np.asarray(uncertainty, dtype=float)
predictions_rescaled = np.asarray(predictions, dtype=float) * TARGET_SCALE + TARGET_SHIFT

df = pd.DataFrame({
    'SMILES': smiles,
    'Atom_ID': np.concatenate(atom_indices),
    'Predictions': predictions_rescaled.round(2),
    'Confidence': [confidence(width) for width in ci_half_width],
})
df

Unnamed: 0,SMILES,Atom_ID,Predictions,Confidence
0,CCO,0,18.73,Moderate
1,CCO,1,58.63,High
2,CC(=O)O,0,20.96,Moderate
3,CC(=O)O,1,174.26,High
4,c1ccccc1,0,128.51,High
5,c1ccccc1,1,128.51,High
6,c1ccccc1,2,128.51,High
7,c1ccccc1,3,128.51,High
8,c1ccccc1,4,128.51,High
9,c1ccccc1,5,128.51,High
