In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel.
# requests is listed because the checkpoint download cell imports it.
%pip install rdkit tensorflow tensorflow_gnn absl-py requests

In [1]:
from os import environ
environ['CUDA_VISIBLE_DEVICES']='-1'
import tensorflow as tf
import tensorflow_gnn as tfgnn

def graph_tensor_spec():
  """Return the GraphTensorSpec describing one molecule graph.

  The spec has a single node set "atom" and a single edge set "bond" whose
  endpoints are both in "atom". Each set carries one float32 feature under
  tfgnn.HIDDEN_STATE (width 118 for atoms, width 22 for bonds) and a sizes
  tensor of shape (1,), i.e. one graph component.
  """
  # per-atom features: (node_num, 118)
  atom_spec = tfgnn.NodeSetSpec.from_field_specs(
    features_spec = {tfgnn.HIDDEN_STATE: tf.TensorSpec((None, 118), tf.float32)},
    sizes_spec = tf.TensorSpec((1,), tf.int32)
  )
  # per-bond features: (edge_num, 22); bonds connect atoms to atoms
  bond_spec = tfgnn.EdgeSetSpec.from_field_specs(
    features_spec = {tfgnn.HIDDEN_STATE: tf.TensorSpec((None, 22), tf.float32)},
    sizes_spec = tf.TensorSpec((1,), tf.int32),
    adjacency_spec = tfgnn.AdjacencySpec.from_incident_node_sets("atom", "atom")
  )
  return tfgnn.GraphTensorSpec.from_piece_specs(
    node_sets_spec = {"atom": atom_spec},
    edge_sets_spec = {"bond": bond_spec}
  )

class GATv2Convolution(tf.keras.layers.Layer):
  """Multi-head attention convolution over one edge set (GATv2 style).

  For every edge the hidden states of its source and target "atom" nodes are
  projected (w1 for the source, w2 for the target), summed, passed through a
  LeakyReLU and scored per head with the vector `a`. Scores are normalised
  with a softmax over the edges sharing a target node, and the projected
  source states, weighted by that attention, are summed into each target node.

  Keyword arguments:
    in_channel: width of the incoming node hidden state (default 64).
    out_channel: width of each attention head's output (default 8).
    head: number of attention heads (default 8).
    drop_rate: dropout rate applied to the attention logits (default 0.1).
    name, trainable, dtype: forwarded to tf.keras.layers.Layer.
  Any other keyword argument is ignored.
  """
  def __init__(self, **kwargs):
    # Forward the standard Keras arguments so that a layer rebuilt through
    # from_config(get_config()) keeps its name, trainable flag and dtype.
    base_kwargs = {key: kwargs[key] for key in ('name', 'trainable', 'dtype') if key in kwargs}
    super(GATv2Convolution, self).__init__(**base_kwargs)
    self.in_channel = kwargs.get('in_channel', 64)
    self.out_channel = kwargs.get('out_channel', 8)
    self.head = kwargs.get('head', 8)
    self.drop_rate = kwargs.get('drop_rate', 0.1)
  def build(self, input_shape):
    """Create the source projection w1, the target projection w2 and the scoring vector a."""
    self.w1 = self.add_weight(name = 'w1', shape = (self.in_channel, self.head * self.out_channel), initializer = tf.keras.initializers.GlorotUniform(), trainable = True)
    self.w2 = self.add_weight(name = 'w2', shape = (self.in_channel, self.head * self.out_channel), initializer = tf.keras.initializers.GlorotUniform(), trainable = True)
    self.a = self.add_weight(name = 'a', shape = (1, self.head * self.out_channel), initializer = tf.keras.initializers.GlorotUniform(), trainable = True)
  def call(self, graph, edge_set_name, training = None):
    """Compute the attention-weighted messages pooled to each target node.

    Args:
      graph: GraphTensor holding the "atom" node set and the edge set.
      edge_set_name: name of the edge set to convolve over.
      training: when truthy, dropout is applied to the attention logits;
        otherwise (including None) the layer is deterministic.

    Returns:
      Tensor of shape (node_num, head * out_channel).
    """
    h = tfgnn.keras.layers.Readout(node_set_name = 'atom', feature_name = tfgnn.HIDDEN_STATE)(graph) # h.shape = (node_num, in_channel)
    hi = tfgnn.broadcast_node_to_edges(graph, edge_set_name, tfgnn.SOURCE, feature_value = h) # hi.shape = (edge_num, in_channel)
    hj = tfgnn.broadcast_node_to_edges(graph, edge_set_name, tfgnn.TARGET, feature_value = h) # hj.shape = (edge_num, in_channel)
    hi = tf.linalg.matmul(hi, self.w1) # hi.shape = (edge_num, head * out_channel)
    hj = tf.linalg.matmul(hj, self.w2) # hj.shape = (edge_num, head * out_channel)
    e = tf.keras.layers.LeakyReLU()(hi + hj) # e.shape = (edge_num, head * out_channel)
    # tf.nn.dropout always drops units, so it must only run while training;
    # otherwise inference results would be random.
    if training:
      e = tf.nn.dropout(e, rate = self.drop_rate) # e.shape = (edge_num, head * out_channel)
    e = e * self.a # e.shape = (edge_num, head * out_channel)
    if training:
      e = tf.nn.dropout(e, rate = self.drop_rate) # e.shape = (edge_num, head * out_channel)
    e = tf.reshape(e, (-1, self.head, self.out_channel)) # e.shape = (edge_num, head, out_channel)
    e = tf.math.reduce_sum(e, axis = -1, keepdims = True) # e.shape = (edge_num, head, 1)
    # normalise the logits over all edges that share the same target node
    attention = tfgnn.softmax(graph, per_tag = tfgnn.TARGET, edge_set_name = edge_set_name, feature_value = e) # attention.shape = (edge_num, head, 1)
    hi = tf.reshape(hi, (-1, self.head, self.out_channel)) # hi.shape = (edge_num, head, out_channel)
    hi = hi * attention # hi.shape = (edge_num, head, out_channel)
    hi = tf.reshape(hi, (-1, self.head * self.out_channel)) # hi.shape = (edge_num, head * out_channel)
    h = tfgnn.pool_edges_to_node(graph, edge_set_name, tfgnn.TARGET, reduce_type = 'sum', feature_value = hi) # h.shape = (node_num, head * out_channel)
    return h
  def get_config(self):
    """Return the base layer config extended with this layer's hyper-parameters."""
    config = super(GATv2Convolution, self).get_config()
    config['in_channel'] = self.in_channel
    config['out_channel'] = self.out_channel
    config['head'] = self.head
    config['drop_rate'] = self.drop_rate
    return config
  @classmethod
  def from_config(cls, config):
    """Rebuild the layer from a dict produced by get_config."""
    return cls(**config)

class UpdateHidden(tf.keras.layers.Layer):
  """Residual next-state layer: ELU(skip(node_state) + pooled 'bond' messages).

  When in_channel differs from head * out_channel the node state is first
  projected with the weight `w` so it can be added to the messages; otherwise
  it is used unchanged.

  Keyword arguments:
    in_channel: width of the incoming node hidden state (default 64).
    out_channel: width of each attention head's output (default 8).
    head: number of attention heads (default 8).
    name, trainable, dtype: forwarded to tf.keras.layers.Layer.
  Any other keyword argument is ignored.
  """
  def __init__(self, **kwargs):
    # Forward the standard Keras arguments so that a layer rebuilt through
    # from_config(get_config()) keeps its name, trainable flag and dtype.
    base_kwargs = {key: kwargs[key] for key in ('name', 'trainable', 'dtype') if key in kwargs}
    super(UpdateHidden, self).__init__(**base_kwargs)
    self.in_channel = kwargs.get('in_channel', 64)
    self.out_channel = kwargs.get('out_channel', 8)
    self.head = kwargs.get('head', 8)

    # the projection is only needed when the skip path changes width
    if self.in_channel != self.head * self.out_channel:
      self.w = self.add_weight(name = 'w', shape = (self.in_channel, self.head * self.out_channel), initializer = tf.keras.initializers.GlorotUniform(), trainable = True)
  def call(self, inputs):
    """Combine a node's own state with its incoming 'bond' messages.

    Args:
      inputs: tuple (node_features, incident_node_features, context_features);
        incident_node_features is indexed by the edge set name 'bond'.
        context_features is not used.

    Returns:
      The updated node state, ELU(skip + incident_node_features['bond']).
    """
    node_features, incident_node_features, context_features = inputs
    # NOTE: this is residual structure
    if self.in_channel != self.head * self.out_channel:
      skip = tf.linalg.matmul(node_features, self.w)
    else:
      skip = node_features
    return tf.keras.layers.ELU()(skip + incident_node_features['bond'])
  def get_config(self):
    """Return the base layer config extended with this layer's hyper-parameters.

    Without these entries a layer rebuilt from its config would silently fall
    back to the default in_channel / out_channel / head.
    """
    config = super(UpdateHidden, self).get_config()
    config['in_channel'] = self.in_channel
    config['out_channel'] = self.out_channel
    config['head'] = self.head
    return config
  @classmethod
  def from_config(cls, config):
    """Rebuild the layer from a dict produced by get_config."""
    return cls(**config)

def GATv2(channel = 8, head = 8, layer_num = 4, drop_rate = 0.3):
  """Build the GATv2 graph classifier as a Keras functional model.

  The model embeds the atom features with a Dense layer of width
  head * channel, applies `layer_num` attention graph updates (the last one
  uses a single head), mean-pools the atom states to the graph context and
  maps the result to one sigmoid output.

  Args:
    channel: output width of each attention head.
    head: number of attention heads in every layer except the last.
    layer_num: number of graph update layers.
    drop_rate: dropout rate passed to every GATv2Convolution.

  Returns:
    A tf.keras.Model taking a GraphTensor matching graph_tensor_spec().
  """
  inputs = tf.keras.Input(type_spec = graph_tensor_spec())
  results = inputs.merge_batch_to_components()
  results = tfgnn.keras.layers.MapFeatures(
    node_sets_fn = lambda node_set, *, node_set_name: tf.keras.layers.Dense(head * channel)(node_set[tfgnn.HIDDEN_STATE]))(results)
  for i in range(layer_num):
    # the final layer collapses to a single attention head
    layer_head = head if i != layer_num - 1 else 1
    results = tfgnn.keras.layers.GraphUpdate(
      node_sets = {
        "atom": tfgnn.keras.layers.NodeSetUpdate(
          edge_set_inputs = {
            "bond": GATv2Convolution(
              in_channel = channel * head,
              out_channel = channel,
              head = layer_head,
              drop_rate = drop_rate)
          },
          # the keyword must be spelled out_channel: UpdateHidden ignores
          # unknown keywords and would otherwise fall back to its default
          next_state = UpdateHidden(
              in_channel = channel * head,
              out_channel = channel,
              head = layer_head)
        )
      }
    )(results)
  results = tfgnn.keras.layers.Pool(tag = tfgnn.CONTEXT, reduce_type = "mean", node_set_name = "atom")(results)
  results = tf.keras.layers.Dense(1, activation = tf.keras.activations.sigmoid)(results)
  return tf.keras.Model(inputs = inputs, outputs = results)


2024-03-06 16:53:13.402493: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-06 16:53:13.439661: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-06 16:53:13.666623: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-06 16:53:13.666702: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-06 16:53:13.668542: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

In [2]:
from os.path import join, exists
from os import system
from shutil import rmtree
from requests import get
from hashlib import md5

# Download the pretrained checkpoint, verify it, unpack it and load the weights.
CKPT_URL = 'https://raw.githubusercontent.com/breadbread1984/GATv2-tf2/classification/ckpt.tar.gz'
CKPT_MD5 = '378cef545b17952dd79d557a37a00b6f'

# a timeout keeps the cell from hanging forever on a stalled connection
response = get(CKPT_URL, timeout = 60)
print(response.status_code)
response.raise_for_status()
# verify the payload before anything is written to disk or extracted
digest = md5(response.content).hexdigest()
if digest != CKPT_MD5:
  raise ValueError('checksum mismatch for ckpt.tar.gz: expected %s, got %s' % (CKPT_MD5, digest))
with open('ckpt.tar.gz', 'wb') as f:
  f.write(response.content)
# remove any previous extraction so stale files cannot be mixed in
if exists('ckpt'): rmtree('ckpt')
if system('tar xzvf ckpt.tar.gz') != 0:
  raise RuntimeError('failed to extract ckpt.tar.gz')

predictor = GATv2()
predictor.load_weights(join('ckpt', 'ckpt', 'variables', 'variables'))

200
ckpt/
ckpt/validation/
ckpt/validation/events.out.tfevents.1709712865.dgxa100svr02.1970136.1.v2
ckpt/ckpt/
ckpt/ckpt/keras_metadata.pb
ckpt/ckpt/assets/
ckpt/ckpt/variables/
ckpt/ckpt/variables/variables.data-00000-of-00001
ckpt/ckpt/variables/variables.index
ckpt/ckpt/saved_model.pb
ckpt/ckpt/fingerprint.pb
ckpt/train/
ckpt/train/events.out.tfevents.1709712858.dgxa100svr02.1970136.0.v2


<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7fc8e814d8d0>

In [8]:
from rdkit import Chem

def smiles_to_sample(smiles):
  """Convert a SMILES string into a single-component GraphTensor.

  Atoms become the "atom" node set, with the atomic number one-hot encoded
  to width 118. Every bond is stored once per direction in the "bond" edge
  set, with the bond type one-hot encoded to width 22.

  Args:
    smiles: SMILES string of the molecule.

  Returns:
    A tfgnn.GraphTensor with node set "atom" and edge set "bond".

  Raises:
    ValueError: if the SMILES string cannot be parsed or has no atoms.
  """
  molecule = Chem.MolFromSmiles(smiles)
  # MolFromSmiles signals a parse failure by returning None
  if molecule is None:
    raise ValueError('invalid SMILES string: %r' % (smiles,))
  # order atoms by index so node position i is the atom with index i
  atoms = sorted(molecule.GetAtoms(), key = lambda atom: atom.GetIdx())
  if len(atoms) == 0:
    raise ValueError('SMILES string contains no atoms: %r' % (smiles,))
  atomic_nums = [atom.GetAtomicNum() for atom in atoms]
  bonds = list()
  for atom in atoms:
    idx = atom.GetIdx()
    for neighbor_atom in atom.GetNeighbors():
      neighbor_idx = neighbor_atom.GetIdx()
      bond = molecule.GetBondBetweenAtoms(idx, neighbor_idx)
      bonds.append((idx, neighbor_idx, int(bond.GetBondType())))
  nodes = tf.constant(atomic_nums, dtype = tf.int32) # nodes.shape = (node_num,)
  # the reshape keeps the (edge_num, 3) layout even for a molecule without bonds
  edges = tf.reshape(tf.constant(bonds, dtype = tf.int32), (-1, 3)) # edges.shape = (edge_num, 3)
  graph = tfgnn.GraphTensor.from_pieces(
    node_sets = {
      "atom": tfgnn.NodeSet.from_fields(
        sizes = tf.constant([nodes.shape[0]]),
        features = {
          tfgnn.HIDDEN_STATE: tf.one_hot(nodes, 118)
        }
      )
    },
    edge_sets = {
      "bond": tfgnn.EdgeSet.from_fields(
        sizes = tf.constant([edges.shape[0]]),
        adjacency = tfgnn.Adjacency.from_indices(
          source = ("atom", edges[:,0]),
          target = ("atom", edges[:,1])
        ),
        features = {
          tfgnn.HIDDEN_STATE: tf.one_hot(edges[:,2], 22)
        }
      )
    }
  )
  return graph

In [9]:
# Predict a binary label for every SMILES in the test CSV and write submission.csv.
# Adjust TEST_CSV_PATH when running outside the original competition environment.
TEST_CSV_PATH = '/bohr/ai4scup-cns-5zkz/v3/mol_test.csv'

# the with-statement closes both files even if a prediction raises
with open(TEST_CSV_PATH, 'r') as csv, open('submission.csv', 'w') as output:
  output.write('SMILES,TARGET\n')
  for line, row in enumerate(csv):
    if line == 0: continue # skip the header row
    row = row.strip()
    if not row: continue # tolerate blank lines, e.g. at the end of the file
    # only the first column (SMILES) is needed; further columns are ignored
    smiles = row.split(',')[0]
    graph = smiles_to_sample(smiles)
    pred = predictor(graph)
    output.write('%s,%d' % (smiles, 1 if pred[0] > 0.5 else 0) + '\n')

FileNotFoundError: [Errno 2] No such file or directory: '/bohr/ai4scup-cns-5zkz/v3/mol_test.csv'