In [2]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

#from rdkit.Chem.Draw import IPythonConsole
#from rdkit.Chem import Draw

# Build the SDF reader: sanitize every record and parse the file strictly
suppl = Chem.SDMolSupplier(
    'datasets/estrogen_receptor_alpha.sdf',
    sanitize=True,
    strictParsing=True,
)

# Materialize the molecules, skipping the entries the reader handed back
# as None (records that could not be read without errors)
mols = list(filter(lambda mol: mol is not None, suppl))

def get_nodes(mol):
    """Build the per-atom feature matrix of a molecule.

    Gasteiger charges are computed on ``mol`` first, so that the
    ``_GasteigerCharge`` atom property read below is available.

    Returns a 2-D array with one row per atom:
    ``[atomic number, Gasteiger charge, coord 0, coord 1]``, where the two
    coordinates are the first two columns of
    ``mol.GetConformer().GetPositions()``.
    """
    AllChem.ComputeGasteigerCharges(mol)

    # (atomic number, partial charge) for every atom
    atom_props = np.array([
        (atom.GetAtomicNum(), atom.GetDoubleProp("_GasteigerCharge"))
        for atom in mol.GetAtoms()
    ])

    # only the first two coordinate columns of the conformer are kept
    coords = mol.GetConformer().GetPositions()[:, :2]

    return np.concatenate((atom_props, coords), axis=1)

def get_edges(mol):
    """Collect the bond types of ``mol`` into a NumPy array.

    The array holds one entry per bond, in the order yielded by
    ``mol.GetBonds()``; each entry is ``bond.GetBondTypeAsDouble()``.
    """
    bond_orders = []
    for bond in mol.GetBonds():
        bond_orders.append(bond.GetBondTypeAsDouble())
    return np.array(bond_orders)

def str_is_number(s):
    """Tell whether ``s`` can be interpreted as a number.

    Two interpretations are tried in order:

    1. ``float(s)`` - ordinary numeric literals such as ``"3.5"`` or
       ``" 42 "``.
    2. ``unicodedata.numeric(s)`` - a single Unicode character that carries
       a numeric value (e.g. a vulgar fraction).

    Args:
        s: Value to inspect, normally a string.

    Returns:
        True if either interpretation succeeds, False otherwise. Inputs
        that are not strings or numbers at all (e.g. ``None``) yield False
        instead of raising.
    """
    # Local import, as in the original: keeps the helper self-contained.
    import unicodedata

    try:
        float(s)
        return True
    except (TypeError, ValueError):
        # ValueError: text that is not a float literal.
        # TypeError: s is not a string/number (e.g. None); fall through so
        # the function answers False rather than propagating the error.
        pass

    try:
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        # TypeError: s is not a single character; ValueError: the character
        # has no numeric value.
        pass

    return False

def _parse_concentration(text):
    """Convert a concentration string to a float; 0.0 if it is not a finite number."""
    import math
    import unicodedata

    text = text.strip()
    try:
        value = float(text)
    except ValueError:
        try:
            # single Unicode character with a numeric value
            value = unicodedata.numeric(text)
        except (TypeError, ValueError):
            return 0.0
    # "nan"/"inf" parse as floats but are not usable concentrations
    return value if math.isfinite(value) else 0.0


def get_labels(mol, key='IC50 (nM)'):
    """Generate the label data for one molecule.

    "rank" refers to the presence of angle brackets, which mark
    concentrations that are beyond detection limits:
    1: "<", 2: ">", 3: none.

    "conc" contains the reported concentration without angle brackets.
    A value of 0 means the metric was not reported (or could not be
    parsed as a finite number).

    Args:
        mol: Molecule exposing ``GetPropsAsDict()``.
        key: Name of the property holding the potency metric.

    Returns:
        Float array ``[rank, conc]``. Always numeric, so callers can
        compare ``conc`` against numbers directly.

    Raises:
        KeyError: If the molecule has no property named ``key``.
    """
    # read potency metric
    raw = mol.GetPropsAsDict()[key]
    # str() because the property may already have been converted to an
    # int/float (RDKit can auto-convert numeric strings - verify for the
    # installed version); then remove leading and trailing whitespace
    sample = str(raw).strip()

    if "<" in sample:
        # below exp. range
        rank = 1
        conc = _parse_concentration(sample.replace('<', ''))
    elif ">" in sample:
        # outside exp. range
        rank = 2
        conc = _parse_concentration(sample.replace('>', ''))
    else:
        # inside exp. range, or no data provided (conc == 0)
        rank = 3
        conc = _parse_concentration(sample)

    # Explicit float dtype: mixing an int rank with a string concentration
    # would otherwise produce a string array.
    return np.array([rank, conc], dtype=float)

# Node features: one per-atom feature matrix for every molecule
x = list(map(get_nodes, mols))

# Connectivity: one adjacency matrix for every molecule
a = list(map(Chem.rdmolops.GetAdjacencyMatrix, mols))

# Edge features: the bond types of every molecule
e = list(map(get_edges, mols))

# Labels: (rank, IC50) for every molecule.
# IC50 is a less reliable metric than e.g. Kd because it depends on
# the substrates used in the assay and on the cell type.
y = list(map(get_labels, mols))

RDKit ERROR: [13:14:30] Explicit valence for atom # 0 B, 6, is greater than permitted
RDKit ERROR: [13:14:30] ERROR: Could not sanitize molecule ending on line 1040
RDKit ERROR: [13:14:30] ERROR: Explicit valence for atom # 0 B, 6, is greater than permitted
RDKit ERROR: [13:14:30] Explicit valence for atom # 0 B, 6, is greater than permitted
RDKit ERROR: [13:14:30] ERROR: Could not sanitize molecule ending on line 10198
RDKit ERROR: [13:14:30] ERROR: Explicit valence for atom # 0 B, 6, is greater than permitted
RDKit ERROR: [13:14:30] Explicit valence for atom # 0 B, 5, is greater than permitted
RDKit ERROR: [13:14:30] ERROR: Could not sanitize molecule ending on line 285480
RDKit ERROR: [13:14:30] ERROR: Explicit valence for atom # 0 B, 5, is greater than permitted
RDKit ERROR: [13:14:30] Explicit valence for atom # 0 B, 6, is greater than permitted
RDKit ERROR: [13:14:30] ERROR: Could not sanitize molecule ending on line 299686
RDKit ERROR: [13:14:30] ERROR: Explicit valence for atom

In [None]:
import scipy.sparse as sp
from spektral.data import Dataset, Graph

class EstrogenDB(Dataset):
    """Dataset from BindingDB.

    Wraps pre-computed per-molecule arrays into Spektral ``Graph`` objects.

    Args:
        n_samples: Number of leading molecules to consider.
        nodes: Sequence of node-feature arrays, one per molecule.
        edges: Sequence of edge-feature arrays, one per molecule.
        adjcs: Sequence of adjacency matrices, one per molecule.
        feats: Sequence of label arrays ``[rank, conc]``, one per molecule.
        **kwargs: Forwarded to ``Dataset.__init__``.
    """

    def __init__(self, n_samples, nodes, edges, adjcs, feats, **kwargs):
        # Attributes are assigned before super().__init__() because the
        # base initializer is documented to call read() - verify against
        # the installed Spektral version.
        self.nsamples = n_samples
        self.nodes = nodes
        self.edges = edges
        self.adjcs = adjcs
        self.feats = feats
        super().__init__(**kwargs)

    def read(self):
        """Return the graphs of the first ``n_samples`` molecules.

        Molecules whose concentration label (``feats[i][1]``) is not
        positive are skipped.
        """
        return [
            self.make_graph(
                node=self.nodes[i],
                adjc=self.adjcs[i],
                edge=self.edges[i],
                feat=self.feats[i])
            for i in range(self.nsamples)
            # float() so labels stored as numeric strings compare correctly
            if float(self.feats[i][1]) > 0
        ]

    def save(self):
        """Placeholder; does nothing."""
        pass

    def load(self):
        """Placeholder; does nothing."""
        pass

    @staticmethod
    def make_graph(node, adjc, edge, feat):
        """Assemble one ``Graph`` from the arrays of a single molecule.

        Args:
            node: Node-feature array; its first axis indexes the nodes.
            adjc: Adjacency matrix (array-like with ``astype``).
            edge: Edge-feature array.
            feat: Label array.

        Returns:
            ``Graph(x, a, e, y)`` with float features/labels and a
            ``scipy.sparse.csr_matrix`` adjacency.

        Raises:
            AssertionError: If the adjacency matrix is not
                ``(n_nodes, n_nodes)``.
        """
        # The node features
        x = node.astype(float)

        # The adjacency matrix, converted to a scipy.sparse matrix
        a = sp.csr_matrix(adjc.astype(int))
        # check shape (n_nodes, n_nodes)
        n_nodes = node.shape[0]
        assert n_nodes == a.shape[0], "adjacency rows != number of nodes"
        assert n_nodes == a.shape[1], "adjacency cols != number of nodes"

        # The labels
        y = feat.astype(float)

        # The edge features
        # NOTE(review): Spektral documents `e` as (n_edges, n_edge_features)
        # aligned with the non-zero entries of `a`; confirm that `edge`
        # matches that layout before training.
        e = edge.astype(float)

        return Graph(x=x, a=a, e=e, y=y)

In [None]:
#dataset = EstrogenDB(n_samples=1000, nodes=x, edges=e, adjcs=a, feats=y)

In [1]:
# Inspect the adjacency matrix of the first molecule. `a` is created by the
# featurisation cell, so that cell has to be run first (otherwise NameError).
a[0]

NameError: name 'a' is not defined