In [1]:
from gtda.homology import EuclideanCechPersistence,VietorisRipsPersistence
from gtda.diagrams import PersistenceImage
from gtda.pipeline import Pipeline
from gtda.diagrams import Filtering

In [2]:
import tensorflow as tf
import gudhi as gd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils
import scipy.spatial as spatial
import json
from rich import print
from random import choice, sample
from tqdm.notebook import tqdm
from rdkit import Chem
from rdkit.Chem import Descriptors
from gc import collect
from pickle import load

with open("9701_cech_persistence_images_012_50x50.pickle", mode = "rb") as data:
    save_dict = load(data)
    
    test_molecules = save_dict["test_molecules"]
    test_data = save_dict["test_data"]
    
%matplotlib inline
del save_dict
collect()

4

# Extract descriptors
(This can be done either with RDKit or by reading the precomputed descriptor `.csv`; this notebook reads the `.csv`.)

In [3]:
# Descriptor table; the later cells look molecules up through its "SMILES" column.
df = pd.read_csv("data/Drugbank_some_descriptors.csv")

# Number of columns in the table (iterating a DataFrame yields its column
# labels, so this equals what a manual counting loop would produce).
cntr = len(df.columns)
print(cntr)

### Filter out descriptors with too many missing values

In [4]:
# A descriptor column is kept only if it has fewer than this many missing values.
MAX_MISSING_VALUES = 800

# Consider only descriptors which have few missing values and which are floating point.
descriptors = [
    d for d in df
    if df[d].isna().sum() < MAX_MISSING_VALUES and df[d].dtype == np.float64
]

num_descriptors = len(descriptors)
print(descriptors)

# Narrow the test set down to the SMILES for which every kept descriptor
# has at least one non-missing entry in the table.
test_smiles = set(test_molecules)

for d in descriptors:
    avail_rows = df[df[d].notna()]
    test_smiles = set(avail_rows["SMILES"]).intersection(test_smiles)

# Position of the first occurrence of each SMILES in test_molecules.
# This mirrors list.index() but costs one pass instead of one scan per lookup,
# and it also works once test_molecules has become an ndarray, so the cell
# can safely be re-run.
first_position = {}
for position, smiles in enumerate(test_molecules):
    first_position.setdefault(smiles, position)

# Sorted so the surviving molecules keep their original relative order.
# The explicit integer dtype keeps an empty selection usable as an index array.
test_idxs = np.asarray(sorted(first_position[s] for s in test_smiles), dtype=int)

# Apply the same row selection to the molecule identifiers and their data.
test_molecules = np.asarray(test_molecules)[test_idxs]
test_data = test_data[test_idxs]

### Build numpy arrays of descriptors

In [5]:
collect()

# Label matrix with one row per test molecule and one column per descriptor,
# pre-filled with NaN so a row that was never written is easy to spot.
test_labels = np.full((test_data.shape[0], num_descriptors), np.nan)

for row_idx, smiles in enumerate(tqdm(test_molecules)):
    # All table rows for this SMILES, restricted to the kept descriptor columns.
    matches = df.loc[df['SMILES'] == smiles, descriptors]
    # When a SMILES occurs more than once, the last matching row is used.
    test_labels[row_idx, :] = matches.iloc[-1].to_numpy()

  0%|          | 0/1802 [00:00<?, ?it/s]

In [6]:
# Restore the previously saved Keras model from 'saved_model/my_model'
# (path is relative to the notebook's working directory).
# NOTE(review): the evaluation cells reshape its predictions to
# (n_samples, num_descriptors) — presumably one output per descriptor; confirm.
model = tf.keras.models.load_model('saved_model/my_model')

2022-01-08 17:39:21.832427: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-08 17:39:22.881157: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6939 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1070 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1


In [7]:
collect()

n_test_samples = test_data.shape[0]
predicted = model.predict(test_data).reshape((n_test_samples, num_descriptors))

# Last axis: index 0 = reference descriptor values, index 1 = model predictions.
compare = np.stack((test_labels, predicted), axis=-1)

# One table row per descriptor. "Average" and "Standard deviation" are taken
# over the whole descriptor table (NaNs skipped); the two "Test ..." columns
# are taken over the test molecules only.
out_dict = {
    "Descriptor": list(descriptors),
    "Average": [df[name].mean(skipna=True) for name in descriptors],
    "Standard deviation": [df[name].std(skipna=True) for name in descriptors],
    # Mean of the model predictions on the test set, per descriptor.
    "Test Average": compare[:, :, 1].mean(axis=0),
    # Mean absolute difference between reference and prediction, per descriptor.
    "Test Mean Absolute Error": np.abs(compare[:, :, 0] - compare[:, :, 1]).mean(axis=0),
}
#out_dict["Predictability"] = out_dict["Average"]/out_dict["Test Mean Absolute Error"]

# Show every row and column of the table instead of pandas' truncated view.
pd.set_option("display.max_rows", None, "display.max_columns", None)
out_df = pd.DataFrame(data=out_dict)
out_df
#print(out_df.to_latex())

2022-01-08 17:39:23.516118: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 162180000 exceeds 10% of free system memory.
2022-01-08 17:39:23.640535: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 162180000 exceeds 10% of free system memory.
2022-01-08 17:39:25.186875: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8301


Unnamed: 0,Descriptor,Average,Standard deviation,Test Average,Test Mean Absolute Error
0,ACD Descriptors;logP,2.055326,3.002862,2.363056,1.990135
1,ACD logD-logP;logP,2.053207,3.001606,2.167280,1.994233
2,MOE Descriptors;h_ema,8.864515,9.691163,7.450187,2.837067
3,MOE Descriptors;h_emd,12.580101,10.821464,11.510098,4.215574
4,MOE Descriptors;h_emd_c,6.492526,5.020204,6.005712,1.964621
...,...,...,...,...,...
69,AZlogD74 (NN);nearest_neighbours[]- GT similarity,0.628486,0.201104,-0.116276,0.751250
70,AZlogD74 (NN);nearest_neighbours[]- GT measured,1.637720,1.561973,1.347385,1.375594
71,clogP;clogp,1.655366,3.447059,1.522084,2.222104
72,clogP;clogp_error,20.365517,24.318937,9.325663,17.864670


In [46]:
# Small sanity check: shuffle the rows of a 3x3 matrix with random.sample.
A = np.arange(1, 10).reshape(3, 3)
# Sampling all rows without replacement yields a random row permutation.
A = np.asarray(sample(list(A), k=len(A)))
print(A)

In [51]:
collect()

# Fixed seed so the shuffled-label baseline below gives the same numbers on
# every run (Restart Kernel -> Run All).
SHUFFLE_SEED = 0
rng = np.random.default_rng(SHUFFLE_SEED)

# Baseline: compare the predictions against the labels of randomly permuted
# molecules. Whole rows are permuted, so each label vector stays intact but
# is paired with the prediction for a different molecule.
compare = np.zeros((test_data.shape[0], num_descriptors, 2))
compare[:, :, 0] = test_labels[rng.permutation(len(test_labels))]
compare[:, :, 1] = model.predict(test_data).reshape((test_data.shape[0], num_descriptors))

out_dict = dict()
out_dict["Descriptor"] = list()
out_dict["Average"] = list()
out_dict["Standard deviation"] = list()

for d in descriptors:
    out_dict["Descriptor"].append(d)
    out_dict["Average"].append(df[d].mean(skipna=True))  # mean over the whole descriptor table
    out_dict["Standard deviation"].append(df[d].std(skipna=True))  # standard deviation over the whole descriptor table

# Mean of the model predictions on the test set, per descriptor.
out_dict["Test Average"] = compare[:, :, 1].mean(axis=0)
# Mean absolute difference between the shuffled labels and the predictions, per descriptor.
out_dict["Test Mean Absolute Error"] = np.abs(compare[:, :, 0] - compare[:, :, 1]).mean(axis=0)
#out_dict["Predictability"] = out_dict["Average"]/out_dict["Test Mean Absolute Error"]

# Show every row and column of the table instead of pandas' truncated view.
pd.set_option("display.max_rows", None, "display.max_columns", None)
# Single summary number: the per-descriptor errors averaged over all descriptors.
print(out_dict["Test Mean Absolute Error"].mean())
out_df = pd.DataFrame(data=out_dict)
out_df

Unnamed: 0,Descriptor,Average,Standard deviation,Test Average,Test Mean Absolute Error
0,ACD Descriptors;logP,2.055326,3.002862,2.363056,2.582372
1,ACD logD-logP;logP,2.053207,3.001606,2.16728,2.613927
2,MOE Descriptors;h_ema,8.864515,9.691163,7.450187,4.134048
3,MOE Descriptors;h_emd,12.580101,10.821464,11.510098,5.869087
4,MOE Descriptors;h_emd_c,6.492526,5.020204,6.005712,3.480841
5,MOE Descriptors;h_logp,6.583703,4.622931,6.163813,3.751346
6,MOE Descriptors;h_logs,9.838051,5.975325,9.926347,4.413926
7,MOE Descriptors;h_log_dbo,-3.8365,3.06657,-2.55885,2.69055
8,MOE Descriptors;h_log_pbo,1.932621,3.788166,1.20984,2.639968
9,MOE Descriptors;h_mr,0.327136,0.829737,-0.279508,0.711346
