<a href="https://colab.research.google.com/github/alejogiley/ChemGraphs/blob/prototype/notebooks/playground.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
%%bash

url='https://raw.githubusercontent.com/alejogiley/ChemGraphs/prototype/datasets/estrogen_receptor_alpha.sdf'
curl $url --output estrogen_receptor_alpha.sdf 

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 34.6M  100 34.6M    0     0   194M      0 --:--:-- --:--:-- --:--:--  194M


In [2]:
%%bash

x86='/usr/lib/x86_64-linux-gnu'
url='https://anaconda.org/rdkit/rdkit/2018.09.1.0/download/linux-64/rdkit-2018.09.1.0-py36h71b666b_1.tar.bz2'

# download & extract
curl -L $url | tar xj lib

# move to python packages directory
mv lib/python3.6/site-packages/rdkit /usr/local/lib/python3.6/dist-packages/
mv lib/*.so.* $x86/

# rdkit need libboost
ln -s $x86/libboost_python3-py36.so.1.65.1 $x86/libboost_python3.so.1.65.1

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  3681    0  3681    0     0  36810      0 --:--:-- --:--:-- --:--:-- 37181
  3 20.2M    3  628k    0     0  1544k      0  0:00:13 --:--:--  0:00:13 1544k 42 20.2M   42 8889k    0     0  6247k      0  0:00:03  0:00:01  0:00:02 8138k 75 20.2M   75 15.2M    0     0  6501k      0  0:00:03  0:00:02  0:00:01 7513k100 20.2M  100 20.2M    0     0  6712k      0  0:00:03  0:00:03 --:--:-- 7500k


In [3]:
import sys

sys.path.append('/usr/local/lib/python3.6/site-packages')

In [5]:
%%capture

!pip install spektral

In [50]:
import os

import numpy as np
import tensorflow as tf
import scipy.sparse as sp

from rdkit import Chem
from rdkit.Chem import AllChem

from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import (
    Dense, Input, 
    Activation, Dropout,
    BatchNormalization)

from spektral.data import BatchLoader, Dataset, Graph
from spektral.transforms import LayerPreprocess
from spektral.layers import (
    ECCConv, GCSConv, 
    MinCutPool, GlobalSumPool)

In [135]:
def get_nodes(mol):
    
    AllChem.ComputeGasteigerCharges(mol)
    nodes = np.concatenate((
        np.array([(
            atom.GetAtomicNum(), 
            atom.GetDoubleProp("_GasteigerCharge")) 
        for atom in mol.GetAtoms()]),
        mol.GetConformer().GetPositions()[:,:2]),
        axis=1
    )
    return nodes

def symmetrize(matrix):
    return matrix + matrix.T - np.diag(matrix.diagonal())

def get_edges(mol):
    
    natms = mol.GetNumAtoms()
    edges = np.zeros((natms, natms))
    
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        edges[i, j] = bond.GetBondTypeAsDouble()
    
    return symmetrize(edges)[:, :, None]

def isfloat(s):
    
    try:
        float(s)
        return True
    
    except ValueError:
        pass
 
    try:
        import unicodedata
        unicodedata.numeric(s)
        return True
    
    except (TypeError, ValueError):
        pass
 
    return False

def get_labels(mol, key='IC50 (nM)'):
    """Generate label data for each molecule
    
    "rank" indicates precense or absence of angle brackets,
    which are reported for concentrations beyond detection limits.
    rank = 0 when "<", 1 when ">", and 2 when none
    
    "conc" containts the reported concentration values
    angle brackets are removed and boundary values are saved.
    when conc value is 0, it means metric was not reported.
    
    """
    # read potency metric
    sample = mol.GetPropsAsDict()[key]
    # remove leading and trailing whitespaces
    sample = sample.strip()
        
    # below exp. range
    if "<" in sample: 
        
        rank = 0
        conc = sample.replace('<', '')

    # outside exp. range
    elif ">" in sample:
        
        rank = 1
        conc = sample.replace('>', '')

    # inside exp. range
    elif isfloat(sample):
        
        rank = 2
        conc = sample

    # no data provided
    else:
        rank = 2
        conc = 0.0
    
    return np.array([rank, np.log10(float(conc))])

In [10]:
# create instance of sdf reader
suppl = Chem.SDMolSupplier('estrogen_receptor_alpha.sdf', sanitize=True, strictParsing=True)

# read all molecules besides ones with errors into a list
mols = [mol for mol in suppl if mol is not None]

# Get nodes
x = [get_nodes(mol) for mol in mols]
    
# Adjacency matrices
a = [Chem.rdmolops.GetAdjacencyMatrix(mol) for mol in mols]

# Edge features: bond types
e = [get_edges(mol) for mol in mols]

# Labels: (rank, IC50s)
# this metric is less reliable than e.g. Kd as 
# it depends on the of the substrates used in 
# the essay and it is cell type dependent.
y = [get_labels(mol) for mol in mols]

In [132]:
class EstrogenDB(Dataset):
    """Dataset from BindingDB
    """
    def __init__(self, 
                 n_samples,
                 dpath=None, 
                 nodes=None, 
                 edges=None, 
                 adjcs=None, 
                 feats=None,
                 **kwargs):
        self.n_samples = n_samples
        self.nodes = nodes
        self.edges = edges
        self.adjcs = adjcs
        self.feats = feats
        # dataset to load
        self.dpath = dpath
        
        super().__init__(**kwargs)
        
    @Dataset.path.getter
    def path(self):
        return self.dpath
        
    def read(self):
        # create Graph objects
        data = np.load(os.path.join(
            self.dpath, f'EstrogenDB.npz'), 
                       allow_pickle=True)
        
        output = [
            self.make_graph(
                node=data['x'][i],
                adjc=data['a'][i], 
                edge=data['e'][i],
                feat=data['y'][i])
            for i in range(self.n_samples)
            if data['y'][i][1] != 0
        ]
        
        self.n_samples = len(output)
        
        return output
    
    def download(self):
        # save graph arrays into directory
        filename = os.path.join(self.dpath, f'EstrogenDB')
        
        np.savez_compressed(
            filename, 
            x=self.nodes, 
            a=self.adjcs, 
            e=self.edges, 
            y=self.feats)
    
    @staticmethod
    def make_graph(node, adjc, edge, feat):
        # The node features
        x = node.astype(float)
        
        # The adjacency matrix
        # convert to scipy.sparse matrix
        a = adjc.astype(int)
        a = sp.csr_matrix(a)
        # check shape (n_nodes, n_nodes)
        assert a.shape[0] == len(node)
        assert a.shape[1] == len(node)
        
        # The labels
        y = feat.astype(float)
        # transform IC50 values
        # into pIC50 logscaled
        y[1] = np.log10(y[1])
        
        # The edge features 
        e = edge.astype(float)
        # check shape (n_nodes, n_nodes, ..)
        assert e.shape[0] == len(node)
        assert e.shape[1] == len(node)
        
        return Graph(x=x, a=a, e=e, y=y)

In [133]:
url = "../datasets"

# dataset = EstrogenDB(
#     n_samples=1000,
#     nodes=x, edges=e, 
#     adjcs=a, feats=y, 
#     dpath=url)

dataset = EstrogenDB(n_samples=1000, dpath=url)

In [186]:
# Transform the adjacency matrix 
# according to ECCConv
dataset.apply(LayerPreprocess(ECCConv))

# randomize indexes
indxs = np.random.permutation(len(dataset))

# split 90%/10%
split = int(0.9 * len(dataset))

# Train/test indexes
trnxs, tesxs = np.split(indxs, [split])

# Dataset partition
train, tests = dataset[trnxs], dataset[tesxs]

In [284]:
epochs = 4  # Number of training epochs
batch_size = 6 # MiniBatch sizes
learning_rate = 1e-4 # Optimizer learning rate

n_layers = 3  # number of ECCConv layers
n_neurons = 8  # number of Dense channels
n_channels = [64, 32, 32]  # number of Hidden units

In [285]:
def gcn_model(nodes_shape, edges_shape, n_channels, n_layers, n_neurons):
    
    X = Input(shape=(None, nodes_shape))
    A = Input(shape=(None, None))
    E = Input(shape=(None, None, edges_shape))

    y = ECCConv(n_channels[0])([X, A, E])
    y = Activation('relu')(y)
    
    for i in range(n_layers - 1):
        y = ECCConv(n_channels[i + 1])([y, A, E])
        y = BatchNormalization(renorm=True)(y)
        y = Activation('relu')(y)
        y = Dropout(0.05)(y)
    
    # pooling graphs to 4 nodes
    y, Z = MinCutPool(4, mlp_hidden=[8, 16])([y, A])
    y = GCSConv(48)([y, Z])
    y = Activation('relu')(y)
    
    # pooling
    y = GlobalSumPool()(y)
    y = Dense(n_neurons)(y)
    y = Activation('relu')(y)
    y = Dropout(0.25)(y)
    
    # prediction
    O = Dense(4)(y)
    
    return Model(inputs=[X, A, E], outputs=O)


def msent_loss(y_true, y_pred):
    
    c_true, c_pred = y_true[:, 0], y_pred[:, 1:]
    p_true, p_pred = y_true[:, 1], y_pred[:, :1]
    
    # categorical cross-entropy for classes: 0, 1, 2
    ent = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)  
    # regression error for pIC50 values
    mse = tf.keras.losses.MeanSquaredError()
    
    # return the overal error
    return tf.reduce_mean(
        ent(c_true, c_pred) + mse(p_true, p_pred))


def train_model(dataset, epochs, learning_rate, n_channels, n_layers, n_neurons): 
    
    # Parameters
    F = dataset.n_node_features  # Dimension of node features
    S = dataset.n_edge_features  # Dimension of edge features

    # Create GCN model
    model = gcn_model(
        nodes_shape=F, 
        edges_shape=S, 
        n_layers=n_layers, 
        n_neurons=n_neurons,
        n_channels=n_channels)
    
    # Compile GCN
    model.compile(
        optimizer=Adam(lr=learning_rate), 
        #metrics=["mae"],
        loss=msent_loss)
    
    # Print network summary
    model.summary()
    
    loader = BatchLoader(
        dataset, 
        batch_size=batch_size)
    
    # Trains the model
    history = model.fit(
        loader.load(),
        epochs=epochs,
        steps_per_epoch=loader.steps_per_epoch)
    
    return model, history

In [286]:
model, history = train_model(dataset, epochs, learning_rate, n_channels, n_layers, n_neurons)

Model: "model_26"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_79 (InputLayer)           [(None, None, 4)]    0                                            
__________________________________________________________________________________________________
input_80 (InputLayer)           [(None, None, None)] 0                                            
__________________________________________________________________________________________________
input_81 (InputLayer)           [(None, None, None,  0                                            
__________________________________________________________________________________________________
ecc_conv_78 (ECCConv)           (None, None, 64)     768         input_79[0][0]                   
                                                                 input_80[0][0]            

In [287]:
print("Testing model")
loader = BatchLoader(tests, batch_size=batch_size, shuffle=False)

model_loss = model.evaluate(loader.load(), steps=loader.steps_per_epoch)
print("Done. Test loss: {}".format(model_loss))

Testing model
Done. Test loss: 0.7720264792442322


In [288]:
def softmax(x):
    return np.exp(x) / np.sum(np.exp(x))

prediction = model.predict(loader.load(), steps=loader.steps_per_epoch)

pIC50_true = [tests[i]['y'][1] for i in range(tests.n_graphs)]
class_true = [tests[i]['y'][0] for i in range(tests.n_graphs)] 

pIC50_pred = prediction[:, :1]
class_pred = np.argmax(np.apply_along_axis(softmax, 0, prediction[:, 1:]), axis=1)



In [289]:
pIC50_true, pIC50_pred

([-0.8239087409443188,
  -0.6989700043360187,
  -0.8600013417252057,
  -0.6989700043360187],
 array([[-0.68673813],
        [-1.5253204 ],
        [-1.2745371 ],
        [-1.7179643 ]], dtype=float32))

In [290]:
class_true, class_pred

([2.0, 2.0, 2.0, 2.0], array([1, 0, 1, 2]))