In [None]:
import itertools
import pickle
import time

import numpy as np
import tensorflow as tf
import keras

In [None]:
#class for splitting up SMILES strings into tokens and one hot encoding them
class SMILES_Tokenizer(object):
  """Tokenizer and one-hot encoder for SMILES strings.

  The vocabulary is assembled in a fixed order: element symbols (two-letter
  symbols first), then special/aromatic symbols, then the control tokens
  'G' (start), 'A' (padding) and 'E' (end). A token's position in
  ``characters`` is its one-hot index.

  Attributes:
    characters: ordered list of every token in the vocabulary.
    dict_len1: vocabulary tokens that are exactly one character long.
    dict_len2: vocabulary tokens that are exactly two characters long.
    one_hot_dict: maps each token to a float32 one-hot vector; the extra key
      -1 maps to a vector filled with -1.
  """

  def __init__(self):
    #creating list of all characters in SMILES encoding
    atoms = [
        'Li', 'Na', 'Al', 'Si', 'Cl', 'Sc', 'Zn', 'As', 'Se', 'Br', 'Sn',
        'Te', 'Cn', 'H', 'B', 'C', 'N', 'O', 'F', 'P', 'S', 'K', 'V', 'I'
    ]

    special = [
        '(', ')', '[', ']', '=', '#', '%','.', '0', '1', '2', '3', '4', '5',
        '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
    ]

    #control tokens: 'G' starts a string, 'A' pads it, 'E' ends it
    padding = ['G', 'A', 'E']

    #creating total list of characters; sorted() is stable, so two-letter
    #atoms move to the front while the order inside each length is kept
    self.characters = sorted(atoms, key=len, reverse=True) + special + padding
    dict_len = len(self.characters)

    self.dict_len1 = [x for x in self.characters if len(x) == 1]
    self.dict_len2 = [x for x in self.characters if len(x) == 2]

    self.one_hot_dict = {}

    #creating one hot encoding vector for each character
    for i, char in enumerate(self.characters):
      vec = np.zeros(dict_len, dtype=np.float32)
      vec[i] = 1
      self.one_hot_dict[char] = vec
    self.one_hot_dict[-1] = np.full(dict_len, dtype=np.float32, fill_value=-1)

  #splitting SMILES string into tokens
  def tokenize(self, smiles):
    """Split a SMILES string into vocabulary tokens.

    At every position a two-character token is tried before a one-character
    token, so e.g. 'Cl' is emitted as one token. The same rule makes 'Cn',
    'Sc' and 'Sn' single tokens.
    NOTE(review): that greedy match also captures an aliphatic atom followed
    by an aromatic one (e.g. the start of 'Cn1ccnc1') -- confirm the training
    data was tokenized the same way before changing it.

    Args:
      smiles: the SMILES string to split.

    Returns:
      List of token strings, in order of appearance.

    Raises:
      ValueError: if the text at some position matches no vocabulary token.
        (Previously this case never advanced the cursor and looped forever.)
    """
    tokens = []
    length = len(smiles)
    i = 0
    while i < length:
      #two-character tokens take priority over one-character tokens
      pair = smiles[i:i+2]
      if pair in self.dict_len2:
        tokens.append(pair)
        i += 2
        continue

      single = smiles[i:i+1]
      if single in self.dict_len1:
        tokens.append(single)
        i += 1
        continue

      raise ValueError(
          'unrecognised SMILES symbol {!r} at position {} in {!r}'.format(
              single, i, smiles))
    return tokens

  def one_hot_encode(self, tokenized_smiles):
    """One-hot encode a sequence of tokens.

    Args:
      tokenized_smiles: iterable of vocabulary tokens. A plain string is
        iterated character by character, so 'G' encodes the single token 'G'.

    Returns:
      float32 numpy array of shape (number of tokens, vocabulary size); an
      empty input gives an array with zero rows.

    Raises:
      KeyError: if a token is not a key of ``one_hot_dict``.
    """
    rows = [self.one_hot_dict[symbol] for symbol in tokenized_smiles]
    if not rows:
      #np.array([]) would be 1-D; keep the (tokens, vocab) contract instead
      return np.zeros((0, len(self.characters)), dtype=np.float32)
    return np.array(rows, dtype=np.float32)

In [None]:
class Data_Processing(object):
  """Convenience wrapper around SMILES_Tokenizer for single strings and batches.

  Provides tokenizing, padding with control tokens, one-hot encoding and the
  reverse lookup from vocabulary index back to token.

  Attributes:
    tokenizer: the underlying SMILES_Tokenizer instance.
    one_hot_dict: the tokenizer's token -> one-hot vector mapping.
    dictionary: the tokenizer's ordered vocabulary list.
    characters: same list as ``dictionary``; read by the inverse lookups.
    max_len: number of tokens a SMILES string is padded up to.
  """

  def __init__(self, max_len):
    self.tokenizer = SMILES_Tokenizer()
    self.one_hot_dict = self.tokenizer.one_hot_dict
    self.dictionary = self.tokenizer.characters
    #the inverse-lookup methods read self.characters, which used to be
    #undefined and made them raise AttributeError
    self.characters = self.tokenizer.characters

    #setting max length for SMILES strings
    self.max_len = max_len

  #split SMILES string into tokens
  def tokenize(self, smi):
    """Tokenize one SMILES string."""
    return self.tokenizer.tokenize(smi)

  #tokenize for batches
  def tokenize_data(self, data):
    """Tokenize every SMILES string in ``data``; returns a list of token lists."""
    tokenized_smiles = [self.tokenizer.tokenize(smi) for smi in data]
    return tokenized_smiles

  #one hot encode tokenized SMILES string
  def one_hot_encode(self, pad_smi):
    """One-hot encode one token sequence."""
    return self.tokenizer.one_hot_encode(pad_smi)

  #one_hot_encode for batches
  def one_hot_encoding(self, data):
    """One-hot encode every token sequence in ``data``; returns a list."""
    encoded_smiles = [self.tokenizer.one_hot_encode(pad_smi) for pad_smi in data]
    return encoded_smiles

  #adding padding and adding Start and End tokens to SMILES strings
  def pad(self, tokenized_smi):
    """Wrap a token list as 'G' + tokens + 'E' followed by 'A' padding.

    The number of 'A' tokens is ``max_len - len(tokenized_smi)``, so inputs of
    at most ``max_len`` tokens come out with ``max_len + 2`` tokens. Longer
    inputs are not truncated and get no padding.
    """
    return ['G'] + tokenized_smi + ['E'] + [
      'A' for _ in range(self.max_len - len(tokenized_smi))
    ]

  #padding for batches
  def padding(self, data):
    """Apply ``pad`` to every token list in ``data``."""
    padded_smiles = [self.pad(t_smi) for t_smi in data]
    return padded_smiles

  def get_dictionary(self):
    """Return the token -> one-hot vector mapping."""
    return self.one_hot_dict

  #mapping one-hot encoded array to character
  def basic_inverse_dictionary(self, encoded_str):
    """Decode the first one-hot matrix in ``encoded_str`` back to tokens.

    NOTE(review): only element 0 of ``encoded_str`` is decoded, and taking
    index [1] of the np.where result requires that element to be 2-D
    (tokens x vocabulary) -- confirm callers pass a batch of 2-D arrays.
    """
    decoded_str = [np.where(vector == 1) for vector in encoded_str]
    return [self.characters[index] for index in decoded_str[0][1]]

  #mapping index to character
  def inverse_dictionary(self, index):
    """Return the vocabulary token stored at ``index``."""
    return self.characters[index]

  #inverse_dictionary for batches
  def inverse_dictionary_(self, index):
    """Return the vocabulary tokens for an iterable of indices."""
    return [self.characters[i] for i in index]



In [None]:
#network architecture definition
class GRU(tf.keras.Model):
  """Three stacked GRU layers followed by a dense projection to the vocabulary.

  The layer width is read from the module-level ``enc_units``, which must be
  defined before the model is instantiated.

  Args:
    time_steps: stored on the instance as ``time_steps``.
    vocab_size: number of output units of the dense layer.
    batch_size: batch dimension used by ``initialize_hidden_state``.
  """

  def __init__(self, time_steps, vocab_size, batch_size):
    super().__init__()

    #every recurrent layer is built with the same settings
    recurrent_config = dict(
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
        dropout=0.3,
    )
    #first, second and third hidden layers
    self.gru1 = tf.keras.layers.GRU(enc_units, **recurrent_config)
    self.gru2 = tf.keras.layers.GRU(enc_units, **recurrent_config)
    self.gru3 = tf.keras.layers.GRU(enc_units, **recurrent_config)

    self.vocab_size = vocab_size
    self.time_steps = time_steps
    self.batch_size = batch_size

    #the dense layer emits raw logits: softmax is left to inference code,
    #which is meant to keep training more stable
    self.dense = tf.keras.layers.Dense(
        vocab_size, kernel_initializer='glorot_uniform')

  #forward step for calculating prediction for next time step and hidden states
  def call(self, x, hidden_state1, hidden_state2, hidden_state3):
    """Run the three GRU layers and the dense layer.

    Each hidden state is passed to its own layer as that layer's second
    positional argument, and the layer's returned state replaces it.

    Returns:
      Tuple ``(logits, hidden_state1, hidden_state2, hidden_state3)``, where
      ``logits`` has the batch and time axes merged into one leading axis.
    """
    #thread the sequence through the stack, collecting each layer's new state
    features, hidden_state1 = self.gru1(x, hidden_state1)
    features, hidden_state2 = self.gru2(features, hidden_state2)
    features, hidden_state3 = self.gru3(features, hidden_state3)

    #flatten to 2-D so the dense layer is applied to every time step
    flattened = tf.reshape(features, (-1, features.shape[2]))
    logits = self.dense(flattened)

    return logits, hidden_state1, hidden_state2, hidden_state3

  #function for setting initial hidden states
  def initialize_hidden_state(self):
    """Return an all-zero state of shape (batch_size, enc_units)."""
    state_shape = (self.batch_size, enc_units)
    return tf.zeros(state_shape)


In [None]:
#setting parameters
# Number of sequences per batch; passed to the GRU constructor below.
BATCH_SIZE = 1024
# Width of every GRU layer; the GRU class reads this module-level name
# directly, so it must be defined before the model is instantiated.
enc_units = 1024
# 53 equals the SMILES_Tokenizer vocabulary length
# (24 atom symbols + 26 special symbols + 3 control tokens).
vocab_inp_size = 53
vocab_tar_size = 53
# Step size handed to the Adam optimizer.
learning_rate = 0.0001

In [None]:
#creating loss calculator and optimizer objects
# from_logits=True matches the model's dense layer, which applies no softmax.
# reduction='none' asks Keras not to reduce the loss to a single scalar.
loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True, reduction='none')
# The optimizer is also passed to tf.train.Checkpoint when weights are restored.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

In [None]:
#creating recurrent neural network
# Arguments are (time_steps, vocab_size, batch_size); 53 is the vocabulary size.
# NOTE(review): 62 looks like max_len 60 plus the start and end tokens -- confirm.
gru = GRU(62, 53, BATCH_SIZE)

#loading model weights to sample from
# Despite its name this is a checkpoint path prefix ("ckpt-27"), not a directory.
# NOTE(review): hard-coded Google Drive path; the notebook only runs where
# that drive is mounted.
checkpoint_dir = "/content/drive/MyDrive/GRU_CHEM_V15/ckpt-27"
# The keyword names (optimizer, GRU) presumably have to match the names used
# when the checkpoint was written -- TODO confirm against the training notebook.
checkpoint = tf.train.Checkpoint(optimizer=optimizer, GRU=gru)
options = tf.train.CheckpointOptions(experimental_io_device="/job:localhost")
# NOTE(review): the returned status object is discarded, so a mismatched or
# missing checkpoint is not detected here; the model has not been called yet,
# so restoration of its variables is presumably deferred until first use.
checkpoint.restore(checkpoint_dir, options=options)

In [None]:
#removes padding/end tokens from both ends of a sampled token list
def _strip_control_tokens(tokens):
  """Drop leading/trailing 'E', then 'A', then 'E' tokens from ``tokens``.

  Works on whole tokens rather than characters, so a real token that merely
  starts with the letter 'A' (such as 'Al' or 'As') is left intact.
  """
  lo, hi = 0, len(tokens)
  for marker in ('E', 'A', 'E'):
    while lo < hi and tokens[lo] == marker:
      lo += 1
    while hi > lo and tokens[hi - 1] == marker:
      hi -= 1
  return tokens[lo:hi]


#generates molecules in batches
def generate(sample_temp, batches, max_steps=75):
  """Sample SMILES strings from the module-level ``gru`` model.

  Every batch starts from the 'G' token and zero hidden states, then feeds
  each sampled token back into the model for ``max_steps`` steps. The elapsed
  generation time in seconds is printed.

  Args:
    sample_temp: sampling temperature forwarded to ``sample_with_temp``.
    batches: number of batches to generate; each batch produces
      ``gru.batch_size`` strings.
    max_steps: number of tokens sampled per string (defaults to 75, the value
      that used to be hard-coded).

  Returns:
    List of ``batches * gru.batch_size`` strings with padding/end tokens
    removed from both ends.
    NOTE(review): tokens sampled after the first 'E' in the middle of a
    sequence are kept, as before -- confirm whether sequences should instead
    be cut at the first end token.
  """
  data = Data_Processing(60)
  #use the size the model was built with instead of a hard-coded 1024
  batch_size = gru.batch_size
  sequence = []
  start = time.time()
  for _ in range(batches):
    sampled_steps = []

    #setting initial states for model: every row starts with the 'G' token
    prediction = np.repeat(data.one_hot_encode('G'), batch_size, axis=0)
    GRU_hidden1 = GRU_hidden2 = GRU_hidden3 = gru.initialize_hidden_state()

    for _ in range(max_steps):
      #adding a time axis so the input has the dimensions the GRU expects
      GRU_input = tf.expand_dims(prediction, 1)

      #calculating new prediction and new hidden states
      prediction, GRU_hidden1, GRU_hidden2, GRU_hidden3 = gru(
          GRU_input, GRU_hidden1, GRU_hidden2, GRU_hidden3)

      #choosing class based on model prediction
      idx = sample_with_temp(prediction, sample_temp)
      step_tokens = data.inverse_dictionary_(idx)
      sampled_steps.append(step_tokens)

      #one-hot encode the sampled tokens so they can be fed back into model
      prediction = data.one_hot_encode(step_tokens)

    #reorienting array from (steps, batch) to (batch, steps)
    sequence.append(np.array(sampled_steps).transpose())

  #collapsing the per-batch arrays into one flat list of token rows
  sequence = list(itertools.chain.from_iterable(sequence))
  print(time.time() - start)

  #removing padding characters from SMILES string
  return [''.join(_strip_control_tokens(list(row))) for row in sequence]

#given logits for different classes and a sampling temperature, return a class
def sample_with_temp(prediction, sampling_temp):
  """Sample one class index per row from temperature-scaled logits.

  The probabilities are softmax(logits / sampling_temp), computed directly
  from the logits in float64 with the row maximum subtracted first. This
  avoids the softmax -> log -> exp round trip, which emitted log(0) warnings
  and permanently zeroed classes whose float32 softmax had underflowed.

  Args:
    prediction: 2-D array-like of logits with shape (rows, classes).
    sampling_temp: positive temperature; values below 1 sharpen the
      distribution and values above 1 flatten it.

  Returns:
    List with one sampled class index per row of ``prediction``.

  Raises:
    ValueError: if ``sampling_temp`` is not positive.
  """
  if sampling_temp <= 0:
    raise ValueError(
        'sampling_temp must be positive, got {!r}'.format(sampling_temp))

  #dividing logits by the temperature before the softmax
  scaled = np.asarray(prediction, dtype=np.float64) / sampling_temp
  #shifting by the row maximum keeps exp() from overflowing
  scaled -= scaled.max(axis=1, keepdims=True)

  weights = np.exp(scaled)
  probs = weights / weights.sum(axis=1, keepdims=True)

  #choosing class based on probability
  classes = range(probs.shape[1])
  return [np.random.choice(classes, p=row) for row in probs]

In [None]:
# Generate one batch at sampling temperature 0.5 and keep the first 500 strings.
generated_testing = generate(0.5, 1)
generated_testing = generated_testing[:500]

# Pickle the list to a file named 'generated_testing' in the working directory.
# NOTE(review): relies on the pickle module being imported by the notebook's
# import cell.
with open('generated_testing', 'wb') as fp:
    pickle.dump(generated_testing, fp)

In [None]:
# Generate 98 batches at sampling temperature 0.75; with a batch size of 1024
# that is 100352 strings, of which the first 10**5 are kept.
generated = generate(0.75, 98)
generated_malaria = generated[:10**5]

# Pickle the list to a file named 'generated_malaria' in the working directory.
# NOTE(review): relies on the pickle module being imported by the notebook's
# import cell.
with open('generated_malaria', 'wb') as fp:
    pickle.dump(generated_malaria, fp)