In [1]:
import tensorflow as tf
import gudhi as gd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils
import scipy.spatial as spatial
import json
from rich import print
from random import choice, sample
from tqdm.notebook import tqdm
from rdkit import Chem
from rdkit.Chem import Descriptors
from gc import collect
from pickle import load
from sklearn.preprocessing import MinMaxScaler

with open("9701_cech_persistence_entropy_012.pickle", mode = "rb") as data:
    save_dict = load(data)
    train_molecules = save_dict["train_molecules"]
    train_data = save_dict["train_data"]
    
    test_molecules = save_dict["test_molecules"]
    test_data = save_dict["test_data"]
    
%matplotlib inline
del save_dict
collect()

4

In [2]:
print(train_data.shape, test_data.shape)

# Extract descriptors
(this can be done from rdkit or from the descriptor .csv)

In [3]:
df = pd.read_csv("data/Drugbank_some_descriptors.csv")
cntr = 0
for d in df:
    cntr += 1
print(cntr)

### Filter descriptors which are unavailable

In [4]:
descriptors = list()

#---- consider only descriptors which have few missing values and which are floating point
for d in df:
    if df[d].isna().sum() < 800 and df[d].dtype == np.float64:
        descriptors.append(d)

#filter_descriptors = filter(lambda d : "OEselma Descriptors" not in d, descriptors)
#descriptors = list(set(descriptors) - set(filter_descriptors))
num_descriptors = len(descriptors)

#---- select only smiles which have these descriptor values
train_smiles = set(train_molecules)
test_smiles = set(test_molecules)

for i,d in enumerate(descriptors):
    avail_rows = df[np.logical_not(df[d].isna())]
    train_smiles = set(avail_rows["SMILES"]).intersection(train_smiles)
    test_smiles = set(avail_rows["SMILES"]).intersection(test_smiles)

train_idxs = np.asarray(sorted([ train_molecules.index(t) for t in train_smiles ]))
test_idxs = np.asarray(sorted([ test_molecules.index(t) for t in test_smiles ]))

train_molecules = np.asarray(train_molecules)
train_molecules = train_molecules[train_idxs]
train_data = train_data[train_idxs]

test_molecules = np.asarray(test_molecules)
test_molecules = test_molecules[test_idxs]
test_data = test_data[test_idxs]
#----
print(descriptors)

In [5]:
print(train_data.shape, test_data.shape)

### Build numpy arrays of descriptors

In [6]:
collect()
train_labels = np.empty((train_data.shape[0], num_descriptors))
test_labels = np.empty((test_data.shape[0], num_descriptors))
train_labels[:] = np.nan
test_labels[:] = np.nan

for i,mol in enumerate(tqdm(train_molecules)):
    data_row = df.loc[df['SMILES'] == mol, descriptors ]
    data_vec = [ r.to_list()[-1] for l,r in data_row.items() ]
    train_labels[i,:] = np.asarray(data_vec)

for i,mol in enumerate(tqdm(test_molecules)):
    data_row = df.loc[df['SMILES'] == mol, descriptors ]
    data_vec = [ r.to_list()[-1] for l,r in data_row.items() ]
    test_labels[i,:] = np.asarray(data_vec)

  0%|          | 0/7205 [00:00<?, ?it/s]

  0%|          | 0/1787 [00:00<?, ?it/s]

In [7]:
print(train_labels.shape, test_labels.shape)

In [8]:
print(not np.isnan(train_labels).any(), not np.isnan(test_labels).any())

In [9]:
with np.printoptions(2, suppress = True):
    _idx = choice(list(range(train_labels.shape[0])))
    print(train_labels[_idx])

# Normalize labels

In [10]:
train_scaler = MinMaxScaler(feature_range=(0, 1)).fit(train_labels)
train_labels = train_scaler.transform(train_labels)

test_scaler = MinMaxScaler(feature_range=(0, 1)).fit(test_labels)
test_labels = test_scaler.transform(test_labels)

In [11]:
with np.printoptions(2, suppress = True):
    print(train_scaler.inverse_transform(train_labels)[_idx])
    print(train_labels[_idx])

# Setup OLS model

In [12]:
from sklearn import linear_model

#creating a regression object
reg = linear_model.LinearRegression()

#runnin OLS on your data, assuming that you already have arrays x and y
reg.fit( train_data, train_labels )
reg.score(train_data, train_labels)

0.21731159396225624

In [13]:
reg.coef_.shape

(74, 9)

In [14]:
# Using 'auto'/'sum_over_batch_size' reduction type.
mae = tf.keras.losses.MeanAbsoluteError()
mae(
    test_scaler.inverse_transform(test_labels),
    test_scaler.inverse_transform(reg.predict(test_data))
).numpy()

2022-01-13 17:19:56.960224: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-13 17:19:57.375603: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6912 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1070 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1


20.665943145751953

In [15]:
predicted = reg.predict(test_data)

In [16]:
test_labels = test_scaler.inverse_transform(test_labels)
predicted = test_scaler.inverse_transform(predicted)

In [17]:
out_dict = dict()
out_dict["Descriptor"] = list()
out_dict["_sigma"] = list()
out_dict["Average"] = list()
out_dict["Test Average"] = test_labels.mean(axis = 0)
out_dict["MAE"] = np.abs(test_labels - predicted).mean(axis = 0)

for i,d in enumerate(descriptors):
    out_dict["Descriptor"].append(d)
    out_dict["Average"].append(df[d].mean(skipna = True)) # population mean for this descriptor
    out_dict["_sigma"].append(df[d].std(skipna = True)) # population mean for this descriptor
    
out_dict["MAE/sigma"] = out_dict["MAE"]/out_dict["_sigma"]

pd.set_option("display.max_rows", None, "display.max_columns", None)
out_df = pd.DataFrame(data = out_dict)
print(out_dict["MAE"].sum()/len(descriptors)) # THIS IS WHAT tf.keras.losses.MeanAbsoluteError() DOES
out_df = out_df.sort_values("MAE/sigma")
out_df

Unnamed: 0,Descriptor,_sigma,Average,Test Average,MAE,MAE/sigma
45,OEselma Descriptors;silicon_count,0.108439,0.004239,0.00056,0.001152,0.01062
10,OEselma Descriptors;bromine_count,0.265308,0.034856,0.033016,0.046804,0.176412
20,OEselma Descriptors;iodine_count,0.287843,0.02845,0.039731,0.068471,0.237878
62,Mol Weight,301.265463,374.51252,371.636732,102.591485,0.340535
65,TPSA,133.333536,107.810334,104.397905,50.234762,0.37676
64,Polarizability,29.516618,37.783778,37.861987,11.185619,0.37896
63,Molar Refractivity,74.060489,95.559089,95.94068,28.155237,0.380165
47,OEselma Descriptors;tsa,261.688962,414.13886,413.91662,102.083594,0.390095
2,MOE Descriptors;h_ema,9.691163,8.864515,8.293012,3.832488,0.395462
51,Molecular Volume (2D);Molecular Volume (2D),371.986946,592.330629,587.763424,157.475143,0.423335


In [18]:
latex_dict = dict()
latex_dict["Descriptors"]                              = out_dict["Descriptor"]
latex_dict["$\sigma$"]                                 = out_dict["_sigma"]
latex_dict["$\mu$"]                                    = out_dict["Average"]
latex_dict["$\overline{\mathbf{f}_d}$"]                = out_dict["Test Average"]
latex_dict["$\overline{|\mathbf{y}_d-\mathbf{f}_d|}$"] = out_dict["MAE"] 
latex_dict["Score"]                                    = out_dict["MAE/sigma"]
latex_df = pd.DataFrame(data = latex_dict)
latex_df = latex_df.sort_values("Score")
latex_df

Unnamed: 0,Descriptors,$\sigma$,$\mu$,$\overline{\mathbf{f}_d}$,$\overline{|\mathbf{y}_d-\mathbf{f}_d|}$,Score
45,OEselma Descriptors;silicon_count,0.108439,0.004239,0.00056,0.001152,0.01062
10,OEselma Descriptors;bromine_count,0.265308,0.034856,0.033016,0.046804,0.176412
20,OEselma Descriptors;iodine_count,0.287843,0.02845,0.039731,0.068471,0.237878
62,Mol Weight,301.265463,374.51252,371.636732,102.591485,0.340535
65,TPSA,133.333536,107.810334,104.397905,50.234762,0.37676
64,Polarizability,29.516618,37.783778,37.861987,11.185619,0.37896
63,Molar Refractivity,74.060489,95.559089,95.94068,28.155237,0.380165
47,OEselma Descriptors;tsa,261.688962,414.13886,413.91662,102.083594,0.390095
2,MOE Descriptors;h_ema,9.691163,8.864515,8.293012,3.832488,0.395462
51,Molecular Volume (2D);Molecular Volume (2D),371.986946,592.330629,587.763424,157.475143,0.423335


In [19]:
latex_string = latex_df.to_latex(index = False, escape = False)
latex_string = latex_string.replace("_", " ")
latex_string = latex_string.replace("OEselma Descriptors;", "OEselma ")
latex_string = latex_string.replace("&\n"," & ")
latex_string = latex_string.replace("\n&"," & ")
latex_string = latex_string.replace("& \n"," & ")
latex_string = latex_string.replace("MOE Descriptors;", "MOE ")

lines = latex_string.split("\n")
for i,l in enumerate(lines):
    if i < 4 or i > len(lines)-4:
        continue
    
    a = l.split("&")
    b = a[1:]
    c = str.title(a[0].split(";")[0])
    d = [ c ] + b
    lines[i] = "&".join(d)
latex_string = "\n".join(lines)

latex_string = latex_string.replace("&", " & ")

for i in range(40):
    latex_string = latex_string.replace("  ", " ")


print(latex_string)