In [1]:
import tensorflow as tf
import gudhi as gd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils
import scipy.spatial as spatial
import json
from rich import print
from random import choice, sample
from tqdm.notebook import tqdm
from rdkit import Chem
from rdkit.Chem import Descriptors
from gc import collect
from pickle import load

with open("9701_cech_persistence_entropy_012.pickle", mode = "rb") as data:
    save_dict = load(data)
    train_molecules = save_dict["train_molecules"]
    train_data = save_dict["train_data"]
    
    test_molecules = save_dict["test_molecules"]
    test_data = save_dict["test_data"]
    
%matplotlib inline
del save_dict
collect()

4

In [2]:
print(train_data.shape, test_data.shape)

# Extract descriptors
(this can be done from rdkit or from the descriptor .csv)

In [3]:
df = pd.read_csv("data/Drugbank_some_descriptors.csv")
cntr = 0
for d in df:
    cntr += 1
print(cntr)

### Filter descriptors which are unavailable

In [4]:
descriptors = list()

#---- consider only descriptors which have few missing values and which are floating point
for d in df:
    if df[d].isna().sum() < 800 and df[d].dtype == np.float64:
        descriptors.append(d)

#filter_descriptors = filter(lambda d : "OEselma Descriptors" not in d, descriptors)
#descriptors = list(set(descriptors) - set(filter_descriptors))
num_descriptors = len(descriptors)

#---- select only smiles which have these descriptor values
train_smiles = set(train_molecules)
test_smiles = set(test_molecules)

for i,d in enumerate(descriptors):
    avail_rows = df[np.logical_not(df[d].isna())]
    train_smiles = set(avail_rows["SMILES"]).intersection(train_smiles)
    test_smiles = set(avail_rows["SMILES"]).intersection(test_smiles)

train_idxs = np.asarray(sorted([ train_molecules.index(t) for t in train_smiles ]))
test_idxs = np.asarray(sorted([ test_molecules.index(t) for t in test_smiles ]))

train_molecules = np.asarray(train_molecules)
train_molecules = train_molecules[train_idxs]
train_data = train_data[train_idxs]

test_molecules = np.asarray(test_molecules)
test_molecules = test_molecules[test_idxs]
test_data = test_data[test_idxs]
#----
print(descriptors)

In [5]:
print(train_data.shape, test_data.shape)

### Build numpy arrays of descriptors

In [6]:
collect()
train_labels = np.empty((train_data.shape[0], num_descriptors))
test_labels = np.empty((test_data.shape[0], num_descriptors))
train_labels[:] = np.nan
test_labels[:] = np.nan

for i,mol in enumerate(tqdm(train_molecules)):
    data_row = df.loc[df['SMILES'] == mol, descriptors ]
    data_vec = [ r.to_list()[-1] for l,r in data_row.items() ]
    train_labels[i,:] = np.asarray(data_vec)

for i,mol in enumerate(tqdm(test_molecules)):
    data_row = df.loc[df['SMILES'] == mol, descriptors ]
    data_vec = [ r.to_list()[-1] for l,r in data_row.items() ]
    test_labels[i,:] = np.asarray(data_vec)

  0%|          | 0/7205 [00:00<?, ?it/s]

  0%|          | 0/1787 [00:00<?, ?it/s]

In [7]:
print(train_labels.shape, test_labels.shape)

In [8]:
print(not np.isnan(train_labels).any(), not np.isnan(test_labels).any())

In [9]:
with np.printoptions(2, suppress = True):
    print(choice(train_labels))

# Setup OLS model

In [10]:
from sklearn import linear_model

#creating a regression object
reg = linear_model.LinearRegression()

#runnin OLS on your data, assuming that you already have arrays x and y
reg.fit( train_data, train_labels )
reg.score(train_data, train_labels)

0.21731159396226127

In [11]:
reg.coef_.shape

(74, 9)

In [12]:
# Using 'auto'/'sum_over_batch_size' reduction type.
mae = tf.keras.losses.MeanAbsoluteError()
mae(test_labels, reg.predict(test_data)).numpy()

2022-01-11 19:26:46.179471: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-11 19:26:46.567211: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6884 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1070 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1


19.298789978027344

In [13]:
predicted = reg.predict(test_data)

out_dict = dict()
out_dict["Descriptor"] = list()
out_dict["_sigma"] = list()
out_dict["Average"] = list()
out_dict["Test Average"] = test_labels.mean(axis = 0)
out_dict["MAE"] = np.abs(test_labels-predicted).mean(axis = 0)
out_dict["Gauss MAE"] = list()

for i,d in enumerate(descriptors):
    out_dict["Descriptor"].append(d)
    out_dict["Average"].append(df[d].mean(skipna = True)) # population mean for this descriptor
    out_dict["_sigma"].append(df[d].std(skipna = True)) # population mean for this descriptor
    out_dict["Gauss MAE"].append(np.abs(
        test_labels[:,i] - np.random.normal(
            loc = df[d].mean(skipna = True),
            scale = df[d].std(skipna = True),
            size = (test_labels.shape[0],1)
        )
    ).mean())
    
out_dict["MAE/sigma"] = out_dict["MAE"]/out_dict["_sigma"]
out_dict["(GMAE*MAE)/sigma"] = out_dict["MAE/sigma"] * out_dict["Gauss MAE"]
    
#out_dict["abs avg diff"] = np.abs(out_dict["Average"] - out_dict["Test Average"])

pd.set_option("display.max_rows", None, "display.max_columns", None)
out_df = pd.DataFrame(data = out_dict)
print(out_dict["MAE"].sum()/len(descriptors)) # THIS IS WHAT tf.keras.losses.MeanAbsoluteError() DOES
out_df = out_df.sort_values("MAE/sigma")
out_df

Unnamed: 0,Descriptor,_sigma,Average,Test Average,MAE,Gauss MAE,MAE/sigma,(GMAE*MAE)/sigma
45,OEselma Descriptors;silicon_count,0.108439,0.004239,0.00056,0.002336,0.088288,0.021538,0.001902
20,OEselma Descriptors;iodine_count,0.287843,0.02845,0.039731,0.068471,0.262941,0.237878,0.062548
10,OEselma Descriptors;bromine_count,0.265308,0.034856,0.033016,0.067484,0.235247,0.254362,0.059838
62,Mol Weight,301.265463,374.51252,371.636732,102.311704,280.672032,0.339606,95.318041
64,Polarizability,29.516618,37.783778,37.861987,10.196873,27.783745,0.345462,9.598232
63,Molar Refractivity,74.060489,95.559089,95.94068,25.766911,69.45443,0.347917,24.164384
2,MOE Descriptors;h_ema,9.691163,8.864515,8.293012,3.458272,9.000534,0.356848,3.211822
65,TPSA,133.333536,107.810334,104.397905,50.274115,121.795053,0.377055,45.923469
47,OEselma Descriptors;tsa,261.688962,414.13886,413.91662,104.243409,260.178975,0.398349,103.641908
56,Rotatable Bond Count;Rotatable Bond Count,7.213711,6.681677,6.670397,2.915731,7.208879,0.404193,2.913778


In [14]:
latex_dict = dict()
latex_dict["Descriptors"] = out_dict["Descriptor"]
latex_dict["sigma"]       = out_dict["_sigma"]
latex_dict["Avg"]         = out_dict["Average"]
latex_dict["Test avg"]    = out_dict["Test Average"]
latex_dict["MAE"]         = out_dict["MAE"] 
latex_dict["MAE/sigma"]   = out_dict["MAE/sigma"]
latex_df = pd.DataFrame(data = latex_dict)
latex_df = latex_df.sort_values("MAE/sigma")
latex_df

Unnamed: 0,Descriptors,sigma,Avg,Test avg,MAE,MAE/sigma
45,OEselma Descriptors;silicon_count,0.108439,0.004239,0.00056,0.002336,0.021538
20,OEselma Descriptors;iodine_count,0.287843,0.02845,0.039731,0.068471,0.237878
10,OEselma Descriptors;bromine_count,0.265308,0.034856,0.033016,0.067484,0.254362
62,Mol Weight,301.265463,374.51252,371.636732,102.311704,0.339606
64,Polarizability,29.516618,37.783778,37.861987,10.196873,0.345462
63,Molar Refractivity,74.060489,95.559089,95.94068,25.766911,0.347917
2,MOE Descriptors;h_ema,9.691163,8.864515,8.293012,3.458272,0.356848
65,TPSA,133.333536,107.810334,104.397905,50.274115,0.377055
47,OEselma Descriptors;tsa,261.688962,414.13886,413.91662,104.243409,0.398349
56,Rotatable Bond Count;Rotatable Bond Count,7.213711,6.681677,6.670397,2.915731,0.404193


In [15]:
print(latex_df.to_latex())