In [1]:
from gtda.homology import EuclideanCechPersistence,VietorisRipsPersistence
#from gtda.diagrams import PersistenceImage
from gtda.pipeline import Pipeline
from gtda.diagrams import Filtering

In [2]:
import tensorflow as tf
import gudhi as gd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils
import scipy.spatial as spatial
import json
from rich import print
from random import choice, sample
from tqdm.notebook import tqdm
from rdkit import Chem
from rdkit.Chem import Descriptors
from gc import collect
from pickle import load

with open("9701_cech_persistence_entropy_012.pickle", mode = "rb") as data:
    save_dict = load(data)
    train_molecules = save_dict["train_molecules"]
    train_data = save_dict["train_data"]
    
    test_molecules = save_dict["test_molecules"]
    test_data = save_dict["test_data"]
    
%matplotlib inline
del save_dict
collect()

4

In [3]:
print(train_data.shape, test_data.shape)

# Extract descriptors
(this can be done from rdkit or from the descriptor .csv)

In [4]:
df = pd.read_csv("data/Drugbank_some_descriptors.csv")
cntr = 0
for d in df:
    cntr += 1
print(cntr)

### Filter descriptors which are unavailable

In [5]:
descriptors = list()

#---- consider only descriptors which have few missing values and which are floating point
for d in df:
    if df[d].isna().sum() < 800 and df[d].dtype == np.float64:
        descriptors.append(d)

filter_descriptors = filter(lambda d : "OEselma Descriptors;ring_count" not in d and "OEselma Descriptors;carbon_count" not in d, descriptors)
descriptors = list(set(descriptors) - set(filter_descriptors))
num_descriptors = len(descriptors)

#---- select only smiles which have these descriptor values
train_smiles = set(train_molecules)
test_smiles = set(test_molecules)

for i,d in enumerate(descriptors):
    avail_rows = df[np.logical_not(df[d].isna())]
    train_smiles = set(avail_rows["SMILES"]).intersection(train_smiles)
    test_smiles = set(avail_rows["SMILES"]).intersection(test_smiles)

train_idxs = np.asarray(sorted([ train_molecules.index(t) for t in train_smiles ]))
test_idxs = np.asarray(sorted([ test_molecules.index(t) for t in test_smiles ]))

train_molecules = np.asarray(train_molecules)
train_molecules = train_molecules[train_idxs]
train_data = train_data[train_idxs]

test_molecules = np.asarray(test_molecules)
test_molecules = test_molecules[test_idxs]
test_data = test_data[test_idxs]
#----
print(descriptors)

In [6]:
print(train_data.shape, test_data.shape)

### Build numpy arrays of descriptors

In [7]:
collect()
train_labels = np.empty((train_data.shape[0], num_descriptors))
test_labels = np.empty((test_data.shape[0], num_descriptors))
train_labels[:] = np.nan
test_labels[:] = np.nan

for i,mol in enumerate(tqdm(train_molecules)):
    data_row = df.loc[df['SMILES'] == mol, descriptors ]
    data_vec = [ r.to_list()[-1] for l,r in data_row.items() ]
    train_labels[i,:] = np.asarray(data_vec)

for i,mol in enumerate(tqdm(test_molecules)):
    data_row = df.loc[df['SMILES'] == mol, descriptors ]
    data_vec = [ r.to_list()[-1] for l,r in data_row.items() ]
    test_labels[i,:] = np.asarray(data_vec)

  0%|          | 0/7754 [00:00<?, ?it/s]

  0%|          | 0/1939 [00:00<?, ?it/s]

In [8]:
print(train_labels.shape, test_labels.shape)

In [9]:
print(not np.isnan(train_labels).any(), not np.isnan(test_labels).any())

In [10]:
with np.printoptions(2, suppress = True):
    print(choice(train_labels))

# Setup OLS model

In [11]:
from sklearn import linear_model

#creating a regression object
reg = linear_model.LinearRegression()

#runnin OLS on your data, assuming that you already have arrays x and y
reg.fit( train_data, train_labels )
reg.score(train_data, train_labels)

0.3585688571168597

In [12]:
reg.coef_.shape

(2, 9)

In [13]:
# Using 'auto'/'sum_over_batch_size' reduction type.
mae = tf.keras.losses.MeanAbsoluteError()
mae(test_labels, reg.predict(test_data)).numpy()

2022-01-11 15:48:39.812265: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-11 15:48:40.247443: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 17 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1070 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1


3.438525915145874

In [14]:
predicted = reg.predict(test_data)

out_dict = dict()
out_dict["Descriptor"] = list()
out_dict["_sigma"] = list()
out_dict["Average"] = list()
out_dict["Test Average"] = test_labels.mean(axis = 0)
out_dict["MAE"] = np.abs(test_labels-predicted).mean(axis = 0)
out_dict["Gauss MAE"] = list()

for i,d in enumerate(descriptors):
    out_dict["Descriptor"].append(d)
    out_dict["Average"].append(df[d].mean(skipna = True)) # population mean for this descriptor
    out_dict["_sigma"].append(df[d].std(skipna = True)) # population mean for this descriptor
    out_dict["Gauss MAE"].append(np.abs(
        test_labels[:,i] - np.random.normal(
            loc = df[d].mean(skipna = True),
            scale = df[d].std(skipna = True),
            size = (test_labels.shape[0],1)
        )
    ).mean())
    
out_dict["MAE/sigma"] = out_dict["MAE"]/out_dict["_sigma"]
out_dict["(GMAE*MAE)/sigma"] = out_dict["MAE/sigma"] * out_dict["Gauss MAE"]
    
#out_dict["abs avg diff"] = np.abs(out_dict["Average"] - out_dict["Test Average"])

pd.set_option("display.max_rows", None, "display.max_columns", None)
out_df = pd.DataFrame(data = out_dict)
print(out_dict["MAE"].sum()/len(descriptors)) # THIS IS WHAT tf.keras.losses.MeanAbsoluteError() DOES
out_df = out_df.sort_values("MAE/sigma")
out_df

Unnamed: 0,Descriptor,_sigma,Average,Test Average,MAE,Gauss MAE,MAE/sigma,(GMAE*MAE)/sigma
1,OEselma Descriptors;carbon_count,11.377305,17.636929,18.03507,5.706187,12.058898,0.501541,6.048034
0,OEselma Descriptors;ring_count,1.811629,2.602073,2.65704,1.170865,1.990166,0.646305,1.286253


In [15]:
latex_dict = dict()
latex_dict["Descriptors"] = out_dict["Descriptor"]
latex_dict["sigma"]       = out_dict["_sigma"]
latex_dict["Avg"]         = out_dict["Average"]
latex_dict["Test avg"]    = out_dict["Test Average"]
latex_dict["MAE"]         = out_dict["MAE"] 
latex_dict["MAE/sigma"]   = out_dict["MAE/sigma"]
latex_df = pd.DataFrame(data = latex_dict)
latex_df = latex_df.sort_values("MAE/sigma")
latex_df

Unnamed: 0,Descriptors,sigma,Avg,Test avg,MAE,MAE/sigma
1,OEselma Descriptors;carbon_count,11.377305,17.636929,18.03507,5.706187,0.501541
0,OEselma Descriptors;ring_count,1.811629,2.602073,2.65704,1.170865,0.646305


In [16]:
print(latex_df.to_latex())