In [None]:
import tensorflow as tf
import gudhi as gd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils
import scipy.spatial as spatial
import json
from rich import print
from random import choice, sample
from tqdm.notebook import tqdm
from rdkit import Chem
from rdkit.Chem import Descriptors
from gc import collect
from pickle import load

with open("9701_cech_persistence_images_012_50x50.pickle", mode = "rb") as data:
    save_dict = load(data)
    train_molecules = save_dict["train_molecules"]
    train_data = save_dict["train_data"]
    
    test_molecules = save_dict["test_molecules"]
    test_data = save_dict["test_data"]
    
%matplotlib inline
del save_dict
collect()

In [None]:
# Restrict TensorFlow to a single virtual device with a 4096 memory limit
# (megabytes, per the TensorFlow documentation) on the first physical GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    first_gpu = gpus[0]
    try:
        memory_cap = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)
        tf.config.experimental.set_virtual_device_configuration(first_gpu, [memory_cap])
    except RuntimeError as e:
        # Report the problem but keep going with TensorFlow's default configuration.
        print(e)

In [None]:
print(train_data.shape, test_data.shape)

def _normalize_images_inplace(image_stacks):
    """Rescale every image of ``image_stacks`` in place by its own maximum.

    ``image_stacks`` is iterated twice: the outer iteration yields the group
    of images belonging to one sample, the inner iteration yields the single
    images.  Each image whose maximum pixel is positive is multiplied by
    ``1/max`` so that its maximum becomes 1; images whose maximum is zero are
    left untouched.  An ``AssertionError`` is raised if an image has a
    negative maximum.

    The arrays are modified in place; nothing is returned.
    """
    for ims in tqdm(image_stacks):
        for im in ims:
            max_px = im.max()
            assert not (max_px < 0.0), "image maximum must not be negative"
            if max_px > 0.0:
                # In-place multiply so the change is written into the parent array.
                im *= 1/max_px

# Same normalisation for both splits (previously two copy-pasted loops).
_normalize_images_inplace(train_data)
_normalize_images_inplace(test_data)

In [None]:
# Visual sanity check of one normalised image: image 3 of sample 10.
sample_idx, image_idx = 10, 3
plt.imshow(train_data[sample_idx, image_idx])

# Extract descriptors
Descriptors can be computed with RDKit or read from the descriptor `.csv` file.

In [None]:
# Read the descriptor table and report how many columns it contains.
df = pd.read_csv("data/Drugbank_some_descriptors.csv")
cntr = len(df.columns)
print(cntr)

### Filter out unavailable descriptors

In [None]:
# Upper bound (exclusive) on the number of missing values a descriptor column may have.
MAX_MISSING_VALUES = 800

# Consider only descriptors which have few missing values and which are floating point.
candidate_descriptors = [
    d for d in df
    if df[d].isna().sum() < MAX_MISSING_VALUES and df[d].dtype == np.float64
]

# Of those, keep the OEselma descriptors.  A list comprehension (rather than a
# set difference) keeps the columns in table order, so the descriptor order --
# and with it the column order of the label arrays -- is the same in every
# kernel instead of depending on string-hash randomisation.
descriptors = [d for d in candidate_descriptors if "OEselma Descriptors" in d]
num_descriptors = len(descriptors)
print(descriptors)

# Select only SMILES for which every selected descriptor has a value in at
# least one row of the table.
train_smiles = set(train_molecules)
test_smiles = set(test_molecules)

for d in descriptors:
    available_smiles = set(df.loc[df[d].notna(), "SMILES"])
    train_smiles &= available_smiles
    test_smiles &= available_smiles

def _first_positions(molecules, wanted):
    """Return the sorted first-occurrence positions in ``molecules`` of every entry of ``wanted``.

    Works for lists and arrays alike, so the cell can be re-run after
    ``train_molecules``/``test_molecules`` have been turned into arrays.
    If a SMILES string occurs several times in ``molecules`` only its first
    position is returned (the behaviour of ``list.index``).
    """
    first_seen = {}
    for position, smiles in enumerate(molecules):
        first_seen.setdefault(smiles, position)
    # Explicit integer dtype: an empty selection must still be a valid index array.
    return np.asarray(sorted(first_seen[s] for s in wanted), dtype = np.intp)

train_idxs = _first_positions(train_molecules, train_smiles)
test_idxs = _first_positions(test_molecules, test_smiles)

# Restrict molecules and images to the selected rows (original order is kept).
train_molecules = np.asarray(train_molecules)[train_idxs]
train_data = train_data[train_idxs]

test_molecules = np.asarray(test_molecules)[test_idxs]
test_data = test_data[test_idxs]

In [None]:
# Show the shapes of the train and test image arrays at this point in the notebook.
print(train_data.shape, test_data.shape)

### Build NumPy arrays of descriptors

In [None]:
collect()

# Index the selected descriptor columns by SMILES once, instead of scanning the
# whole table for every molecule.  keep="last" reproduces the previous choice
# of taking the last matching row when a SMILES string occurs more than once.
descriptor_table = (
    df.drop_duplicates(subset = "SMILES", keep = "last")
      .set_index("SMILES")[descriptors]
)

def _descriptor_labels(molecules):
    """Return a float array with one row per molecule and one column per descriptor.

    Rows follow the order of ``molecules``; columns follow ``descriptors``.
    Raises ``KeyError`` if a molecule is not present in the descriptor table.
    """
    smiles = np.asarray(molecules).tolist()
    labels = descriptor_table.loc[smiles].to_numpy(dtype = np.float64)
    assert labels.shape == (len(smiles), num_descriptors)
    return labels

train_labels = _descriptor_labels(train_molecules)
test_labels = _descriptor_labels(test_molecules)

In [None]:
# Show the shapes of the train and test label arrays.
print(train_labels.shape, test_labels.shape)

In [None]:
# Each printed flag is True when the corresponding label array contains no NaN.
print(not np.isnan(train_labels).any(), not np.isnan(test_labels).any())

In [None]:
# Print one randomly chosen row of training labels, rounded to two decimals
# and without scientific notation.
random_label_row = choice(train_labels)
with np.printoptions(precision = 2, suppress = True):
    print(random_label_row)

# Set up the CNN

In [None]:
# Every convolution and pooling layer must agree on which axis holds the
# channels.  The input is declared channels-first for the first convolution,
# so the same format is passed to all later spatial layers; leaving them at
# the Keras default ("channels_last") would make them treat the height axis
# of the feature maps as channels.
IMAGE_DATA_FORMAT = "channels_first"

model = tf.keras.Sequential([
    # NOTE(review): this layer is never adapt()-ed and gets no mean/variance,
    # so it presumably acts as an identity -- confirm whether that is intended.
    tf.keras.layers.Normalization(
        input_shape = train_data.shape[1:],
    ),
    tf.keras.layers.Conv2D(
        filters = 32,
        kernel_size = 3, # sliding window
        data_format = IMAGE_DATA_FORMAT,
        activation = "relu",
    ),
    tf.keras.layers.MaxPooling2D(data_format = IMAGE_DATA_FORMAT),
    tf.keras.layers.Conv2D(
        filters = 32,
        kernel_size = 5,
        data_format = IMAGE_DATA_FORMAT,
        activation = "relu"
    ),
    tf.keras.layers.MaxPooling2D(data_format = IMAGE_DATA_FORMAT),
    tf.keras.layers.Flatten(),
    # Linear output layer: one regression target per descriptor.
    tf.keras.layers.Dense(num_descriptors),
])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Train with Adam (learning rate 0.001) on the mean absolute error.
adam = tf.keras.optimizers.Adam(learning_rate = 1e-3)
model.compile(optimizer = adam, loss = "mean_absolute_error")

In [None]:
%%time
history = model.fit(
    train_data,
    train_labels,
    verbose = 1,
    epochs = 100,
    # Calculate validation results on 20% of the training data.
    validation_split = 0.2
)

In [None]:
def plot_loss(history, ylim = (7, 25), title = "(OEselma) Čech complex persistence entropy, Regression CNN loss"):
    """Plot training and validation loss per epoch.

    Parameters
    ----------
    history
        Object whose ``history`` attribute is a mapping with the keys
        ``'loss'`` and ``'val_loss'`` (as returned by ``model.fit``).
    ylim
        Lower and upper limit of the y axis.  Pass ``None`` to let matplotlib
        choose the limits automatically.
    title
        Title shown above the plot.

    The training loss is drawn dashed, the validation loss solid.  A new
    figure is created on every call; nothing is returned.
    """
    fig = plt.figure(figsize=(20,10))
    ax = fig.add_subplot()
    ax.plot(history.history['loss'], "--", color = "black")
    ax.plot(history.history['val_loss'], color = "black")
    if ylim is not None:
        ax.set_ylim(ylim)
    ax.set_xlabel('Epoch', fontsize = 20)
    ax.set_ylabel('Mean Absolute Error', fontsize = 20)
    ax.tick_params(axis='both', which='major', labelsize=20)
    # Legend entries are matched to the lines in plotting order.
    ax.legend(["Training loss", "Validation loss"], fontsize = 20)
    ax.grid(True)
    ax.set_title(title, fontsize = 20)
# Render all text with LaTeX in a Times font, then draw the loss curves.
plt.rcParams["text.usetex"] = True
plt.rcParams["font.family"] = "Times"
plot_loss(history)

In [None]:
# Predict the test set and make sure the result has one row per test sample
# and one column per descriptor.
raw_predictions = model.predict(test_data)
predicted = raw_predictions.reshape((test_data.shape[0], num_descriptors))

In [None]:
# Mean and standard deviation of every selected descriptor over the whole
# descriptor table (missing values skipped).
population_sigma = [df[name].std(skipna = True) for name in descriptors]
population_mean = [df[name].mean(skipna = True) for name in descriptors]

# Per-descriptor mean absolute error of the predictions on the test set.
test_mae = np.abs(test_labels - predicted).mean(axis = 0)

# The key order determines the column order of the DataFrame built below.
out_dict = {
    "Descriptor": list(descriptors),
    "_sigma": population_sigma,
    "Average": population_mean,
    "Test Average": test_labels.mean(axis = 0),
    "MAE": test_mae,
    # MAE expressed in units of the descriptor's standard deviation.
    "MAE/sigma": test_mae / population_sigma,
}

pd.set_option("display.max_rows", None, "display.max_columns", None)
out_df = pd.DataFrame(data = out_dict)
# Mean of the per-descriptor MAEs, i.e. the mean absolute error over all
# entries (noted by the author as what tf.keras.losses.MeanAbsoluteError() does).
print(out_dict["MAE"].sum()/len(descriptors))
out_df = out_df.sort_values("MAE/sigma")
out_df

In [None]:
# Same columns as the results table, relabelled with LaTeX headers.
# The headers are raw strings: sequences such as "\s", "\m" and "\o" are not
# valid escape sequences, so the non-raw form only worked by accident and
# triggers a warning on newer Python versions.  The resulting text is unchanged.
latex_dict = dict()
latex_dict["Descriptors"]                               = out_dict["Descriptor"]
latex_dict[r"$\sigma$"]                                 = out_dict["_sigma"]
latex_dict[r"$\mu$"]                                    = out_dict["Average"]
latex_dict[r"$\overline{\mathbf{f}_d}$"]                = out_dict["Test Average"]
latex_dict[r"$\overline{|\mathbf{y}_d-\mathbf{f}_d|}$"] = out_dict["MAE"]
latex_dict["Score"]                                     = out_dict["MAE/sigma"]
latex_df = pd.DataFrame(data = latex_dict)
# Smallest score (MAE relative to the standard deviation) first.
latex_df = latex_df.sort_values("Score")
latex_df

In [None]:
# Turn the table into LaTeX and tidy it up for inclusion in a document.
# escape=False keeps the math-mode column headers intact.
latex_string = latex_df.to_latex(index = False, escape = False)
latex_string = latex_string.replace("OEselma Descriptors;", "OEselma ")
# Re-join rows that were wrapped around a column separator.
latex_string = latex_string.replace("&\n"," & ")
latex_string = latex_string.replace("\n&"," & ")
latex_string = latex_string.replace("& \n"," & ")
latex_string = latex_string.replace("MOE Descriptors;", "MOE ")

lines = latex_string.split("\n")
# Only the body rows are rewritten: the first four lines (tabular header,
# rule, column titles, rule) and the last three (rule, end of tabular,
# trailing empty line) are left alone.
for i in range(4, len(lines) - 3):
    cells = lines[i].split("&")
    # Underscores are replaced in the descriptor name only.  Replacing them in
    # the whole table would also strip the "_d" subscripts from the math-mode
    # column headers.
    name = cells[0].replace("_", " ")
    # Drop everything after the first ';' and title-case the descriptor name.
    cells[0] = str.title(name.split(";")[0])
    lines[i] = "&".join(cells)
latex_string = "\n".join(lines)

latex_string = latex_string.replace("&", " & ")

# Collapse every run of spaces into a single space.
while "  " in latex_string:
    latex_string = latex_string.replace("  ", " ")


print(latex_string)