In [1]:
!pip install rdkit==2023.9.3 mordred==1.2.0 networkx==2.8.4 numpy==1.23.5 nose-py3 pandas pyyaml tensorflow requests



In [2]:
from os import environ
environ['CUDA_VISIBLE_DEVICES']='-1'
import tensorflow as tf

class GraphConvolution(tf.keras.layers.Layer):
  """Aggregate neighbour features: per sample, sparse ``adjacent @ annotations`` plus a learned bias.

  The layer is called with a two-element sequence ``[adjacent, annotations]``:
    adjacent:    sparse tensor of shape (batch, atom_num, atom_num)
    annotations: dense tensor of shape (batch, atom_num, in_channel)
  and returns a dense tensor of shape (batch, atom_num, in_channel).
  """
  def __init__(self, **kwargs):
    super().__init__(**kwargs)
  def build(self, input_shape):
    # input_shape[1] describes the annotations input; its last entry is the channel count
    in_channel = input_shape[1][-1]
    self.bias = self.add_weight(
      name = 'bias',
      shape = (1, 1, in_channel),
      initializer = tf.keras.initializers.GlorotUniform(),
      trainable = True)
  def call(self, inputs):
    adjacent, annotations = inputs
    batch_size = tf.shape(adjacent)[0]
    num_rows = tf.shape(adjacent)[1]
    num_cols = tf.shape(adjacent)[2]
    per_sample = list()
    # NOTE: sparse_dense_matmul doesn't support matrix with batch dimension,
    # so every sample is sliced out, squeezed to rank 2 and multiplied on its own
    for sample in range(batch_size):
      sliced = tf.sparse.slice(adjacent, [sample,0,0], [1,num_rows,num_cols])
      matrix = tf.sparse.reshape(sliced, [num_rows, num_cols])
      aggregated = tf.sparse.sparse_dense_matmul(matrix, annotations[sample]) # (atom_num, in_channel)
      per_sample.append(aggregated)
    stacked = tf.stack(per_sample, axis = 0) # (batch, atom_num, in_channel)
    # the (1, 1, in_channel) bias broadcasts over batch and atoms
    return stacked + self.bias

class GatedGraphConvolution(tf.keras.Model):
  """One gated graph step: a graph convolution whose output drives a single GRU update per atom.

  Args:
    channels: feature width of the annotations; also the number of GRU units.
  """
  def __init__(self, channels, **kwargs):
    super().__init__(**kwargs)
    self.gc = GraphConvolution()
    self.gru = tf.keras.layers.GRU(channels)
    self.channels = channels
  def call(self, adjacent, annotations):
    """Return updated annotations with the same shape as the graph convolution output."""
    messages = self.gc([adjacent, annotations]) # (batch, atom_num, channels)
    output_shape = tf.shape(messages)
    # every atom becomes its own length-1 sequence: the aggregated message is the GRU input,
    # the atom's current annotation is the GRU initial state
    gru_input = tf.reshape(messages, (-1, 1, self.channels)) # (batch * atom_num, 1, channels)
    gru_state = tf.reshape(annotations, (-1, self.channels)) # (batch * atom_num, channels)
    updated = self.gru(gru_input, initial_state = gru_state) # (batch * atom_num, channels)
    return tf.reshape(updated, output_shape) # (batch, atom_num, channels)

class FeatureExtractor(tf.keras.Model):
  """Embed atom ids, refine them with stacked gated graph convolutions and mean-pool per molecule.

  Args:
    channels:   embedding width and hidden width of every gated graph convolution.
    num_layers: how many gated graph convolutions are applied in sequence.
  """
  def __init__(self, channels = 32, num_layers = 4, **kwargs):
    super().__init__(**kwargs)
    # embedding table with 118 rows, i.e. valid ids are 0..117
    # NOTE(review): an id of 118 would be out of range for this table - confirm the data never contains it
    self.embed = tf.keras.layers.Embedding(118, channels)
    self.ggnns = [GatedGraphConvolution(channels) for _ in range(num_layers)]
    self.pool = tf.keras.layers.Lambda(lambda x: tf.math.reduce_mean(x, axis = 1))
  def call(self, adjacent, annotations):
    """Map (adjacent, integer annotations) to one feature vector per batch entry."""
    features = self.embed(annotations) # (batch, atom_num, channels)
    for layer in self.ggnns:
      features = layer(adjacent, features)
    # graph pooling: average over the atom axis
    return self.pool(features) # (batch, channels)

class Predictor(tf.keras.Model):
  """Binary classifier head: FeatureExtractor followed by a single sigmoid unit.

  Args:
    channels:   forwarded to FeatureExtractor.
    num_layers: forwarded to FeatureExtractor.
    **kwargs:   passed to both the Keras base class and the FeatureExtractor.
  """
  def __init__(self, channels = 32, num_layers = 4, **kwargs):
    super().__init__(**kwargs)
    self.extractor = FeatureExtractor(channels, num_layers, **kwargs)
    self.dense = tf.keras.layers.Dense(1, activation = tf.keras.activations.sigmoid)
  def call(self, adjacent, annotations):
    """Return a (batch, 1) tensor of sigmoid outputs."""
    pooled = self.extractor(adjacent, annotations)
    return self.dense(pooled)


2024-02-02 11:18:32.607150: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-02 11:18:32.608632: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-02 11:18:32.630876: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-02 11:18:32.630899: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-02 11:18:32.630924: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

In [6]:
from requests import get
from shutil import rmtree
from os import system
from os.path import exists,join
from hashlib import md5

# Download the pretrained checkpoint archive, verify its MD5, unpack it and restore the model.
response = get('https://gitee.com/breadbread1984/molecule_attributes_prediction/raw/master/ckpt.tar.gz', timeout = 60)
# fail on HTTP errors instead of hashing/unpacking an error page
response.raise_for_status()
digest = md5(response.content).hexdigest()
if digest != '20e6d4c678e7530e93dfad875559796b':
  raise ValueError('ckpt.tar.gz checksum mismatch: got %s' % digest)
# persist the verified archive; mode 'wb' overwrites any stale copy left by a previous run
with open('ckpt.tar.gz', 'wb') as archive:
  archive.write(response.content)
# start from a clean directory so files from an older checkpoint cannot linger
if exists('ckpt'): rmtree('ckpt')
if system('tar xzvf ckpt.tar.gz') != 0:
  raise RuntimeError('failed to extract ckpt.tar.gz')
predictor = Predictor(channels = 256, num_layers = 4)
# the optimizer is only created so the checkpoint's optimizer slot has an object to map onto
optimizer = tf.keras.optimizers.Adam(1e-2)
checkpoint = tf.train.Checkpoint(model = predictor, optimizer = optimizer)
latest_ckpt = tf.train.latest_checkpoint('ckpt')
# restore(None) would silently leave the model randomly initialised
if latest_ckpt is None:
  raise FileNotFoundError('no checkpoint found under ckpt/')
checkpoint.restore(latest_ckpt)

ckpt/
ckpt/ckpt-21.index
ckpt/events.out.tfevents.1706780381.dgxa100svr02.540029.0.v2
ckpt/checkpoint
ckpt/ckpt-21.data-00000-of-00001


<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7f9630f87090>

In [7]:
from rdkit import Chem

def smiles_to_graph(smiles: str):
  """Convert a SMILES string into the (adjacency, atom ids) pair consumed by the model.

  Args:
    smiles: SMILES representation of one molecule.

  Returns:
    adjacent:    float32 tf.sparse.SparseTensor of shape (atom_num, atom_num). Every bond
                 contributes two entries (one per direction) and each row is divided by its
                 row sum. Bond types are not encoded.
    annotations: int32 tensor of shape (atom_num,) holding each atom's atomic number.

  Raises:
    ValueError: if the SMILES string cannot be parsed or describes a molecule without atoms.
  """
  molecule = Chem.MolFromSmiles(smiles)
  # RDKit signals a parse failure by returning None rather than raising
  if molecule is None:
    raise ValueError('invalid SMILES string: %r' % (smiles,))
  atom_num = molecule.GetNumAtoms()
  if atom_num == 0:
    raise ValueError('SMILES string contains no atoms: %r' % (smiles,))
  annotations = list()
  indices = list()
  for atom in molecule.GetAtoms():
    idx = atom.GetIdx()
    annotations.append(atom.GetAtomicNum())
    for neighbor_atom in atom.GetNeighbors():
      # FIXME: bond type is not shown in adjacent matrix
      indices.append((idx, neighbor_atom.GetIdx()))
  # the explicit reshape keeps the index matrix at (N, 2) even when the molecule has no bonds
  index_matrix = tf.reshape(tf.constant(indices, dtype = tf.int64), (len(indices), 2))
  values = tf.ones((len(indices),), dtype = tf.float32)
  adjacent = tf.sparse.reorder(tf.sparse.SparseTensor(indices = index_matrix, values = values, dense_shape = (atom_num, atom_num)))
  if indices:
    row_sum = tf.sparse.reduce_sum(adjacent, axis = -1, keepdims = True) # row_sum.shape = (atom_num, 1)
    # sparse / dense only touches stored entries, so atoms without neighbours are left untouched
    adjacent = adjacent / row_sum # normalization
  annotations = tf.constant(annotations, dtype = tf.int32) # annotations.shape = (atom_num)
  return adjacent, annotations

# Run the restored predictor over every molecule of the test CSV and write one 0/1 label per SMILES.
# Both files are managed by `with`, so they are closed even if a molecule fails to process.
with open('/bohr/ai4scup-cns-5zkz/v3/mol_test.csv', 'r') as csv, open('submission.csv', 'w') as output:
  output.write('SMILES,TARGET\n')
  next(csv, None) # skip the header line
  for row in csv:
    row = row.strip()
    if not row: continue # tolerate blank lines (e.g. a trailing newline at end of file)
    # only the first column (the SMILES) is needed; any further columns are ignored
    smiles = row.split(',')[0]
    adjacent, atoms = smiles_to_graph(smiles)
    # the model expects a leading batch dimension
    adjacent = tf.sparse.expand_dims(adjacent, axis = 0)
    atoms = tf.expand_dims(atoms, axis = 0)
    pred = predictor(adjacent, atoms) # pred.shape = (1, 1)
    # threshold the sigmoid output at 0.5 to obtain the hard label
    output.write("%s,%d" % (smiles, 1 if float(pred[0, 0]) > 0.5 else 0) + '\n')

CC1OC(=O)CC(O)CC(O)CC(O)CCC(O)C(O)CC(=O)CC(O)C(C(O)CC(OC2OC(C)C(O)C(N)C2O)C=CC=CC=CC=CCCC=CC=CC(C)C(O)C1C)C(=O)O tf.Tensor([[0.01881538]], shape=(1, 1), dtype=float32)
NCCCCC(NC(CCc1ccccc1)C(=O)O)C(=O)N2CCCC2C(=O)O tf.Tensor([[0.01332816]], shape=(1, 1), dtype=float32)
c1ccc(cc1)C2=NCC(=O)Nc3c2cc(cc3)[N+](=O)[O] tf.Tensor([[0.9857293]], shape=(1, 1), dtype=float32)
CCN(CC)C(C)CN1c2ccccc2Sc3c1cccc3 tf.Tensor([[0.9994578]], shape=(1, 1), dtype=float32)
CC(CCCC(C)(C)O)C1CCC2C(=CC=C3CC(O)CC(O)C3=C)CCCC12C tf.Tensor([[0.01678237]], shape=(1, 1), dtype=float32)
c1ccc(cc1)C(c2ccc(cc2)Cl)N3CCN(CC3)CCOCCO tf.Tensor([[0.9758645]], shape=(1, 1), dtype=float32)
CCCSc1ccc2[nH]c(NC(=O)OC)nc2c1 tf.Tensor([[0.00329394]], shape=(1, 1), dtype=float32)
NC12CC3CC(CC(C3)C1)C2 tf.Tensor([[0.6055918]], shape=(1, 1), dtype=float32)
CC1CCc2cc(F)cc3C(=O)C(=CN1c23)C(=O)O tf.Tensor([[0.00417894]], shape=(1, 1), dtype=float32)
CC1OC(CC(OC(=O)C)C1O)OC2C(O)CC(OC3C(O)CC(OC4CCC5(C)C(CCC6C5CCC7(C)C(CCC67O)C8=CC(=O)OC8)