# Rmax prediction of 6-helicenes from neural network models

Linear and non-linear models have been built for the regression of Rmax from a dataset of 6-helicenes with up to 4 halogen elements (F, Cl, Br, I) replacing the hydrogen atoms.

The fundamental 6-helicene is made of 6 carbon rings with 16 bonding hydrogens. For the neural network model, each molecule is represented by a 16-component vector, each component associated with the position of one hydrogen. For example, a 6-helicene with a Cl atom in the position of the 4th hydrogen and a Br atom in the position of the 8th hydrogen is represented as:

$$(H,H,H,Cl,H,H,H,Br,H,H,H,H,H,H,H,H)$$


The models respect specular symmetry with respect to the middle of the molecule, which means that for the model:

$$(H,H,H,Cl,H,H,H,Br,H,H,H,H,H,H,H,H) = \\(H,H,H,H,H,H,H,H,Br,H,H,H,Cl,H,H,H)$$

In [2]:
# Load libraries

import numpy as np
import tensorflow as tf
tf.__version__

2023-05-11 10:49:14.025111: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-11 10:49:14.227233: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-11 10:49:14.878911: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-05-11 10:49:14.879050: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

'2.11.0'

In [193]:
# Load pre-trained models

# Paths are relative to the directory the notebook is run from.
linear_model = tf.keras.models.load_model('./Models/R_model_invariant_linear')
non_linear_model = tf.keras.models.load_model('./Models/R_model_invariant_non-linear_many-body_dropout')

# Model.summary() prints the layer table itself and returns None, so wrapping
# it in print() only appended a stray "None" line after the table.
linear_model.summary()
#non_linear_model.summary()

Model: "model_17"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Pos_1 (InputLayer)             [(None, 5)]          0           []                               
                                                                                                  
 Pos_2 (InputLayer)             [(None, 5)]          0           []                               
                                                                                                  
 Pos_3 (InputLayer)             [(None, 5)]          0           []                               
                                                                                                  
 Pos_4 (InputLayer)             [(None, 5)]          0           []                               
                                                                                           

In [194]:
# Auxiliary tools to define the input molecule of the model

n_pos = 16                # number of substitutable hydrogen positions
x0_int = [0] * n_pos      # fundamental 6-helicene, integer representation
x0_str = ['H'] * n_pos    # fundamental 6-helicene, string representation

# dictionary to convert integer representation to string representation
# (keys are the integer codes written as strings, in the order listed here)
# NOTE(review): 'Fl' appears to be this notebook's label for fluorine
# (standard symbol: F) -- confirm before exposing it to users.
int2str = {str(code): label for code, label in enumerate(("H", "Fl", "Cl", "Br", "I"))}

# dictionary to convert string representation to integer representation
# (exact inverse of int2str; the codes stay stored as strings)
str2int = {label: code for code, label in int2str.items()}

# function to convert integer representation to string representation
def int_to_str(x_int):
  """Translate integer atom codes into atom labels.

  x_int: iterable of atom codes; each one is looked up in int2str through
         its str() form.
  Returns a new list with one label per code, in the same order.
  Raises KeyError for a code that has no entry in int2str.
  """
  labels = []
  for code in x_int:
    labels.append(int2str[str(code)])
  return labels

# function to convert string representation to integer representation
def str_to_int(x_str):
  """Translate atom labels into integer atom codes.

  x_str: iterable of atom labels, each a key of str2int.
  Returns a new list of int codes, in the same order.
  Raises KeyError for a label that has no entry in str2int.
  """
  # str2int stores the codes as strings, hence the int() conversion
  return list(map(lambda label: int(str2int[label]), x_str))

# Dimensions of the one-hot encoding of the input features
n_atom_types = 5  # hydrogen plus the 4 halogens
n_features = n_pos * n_atom_types  # one slot per (position, atom type) pair

# functions to define one-hot encoded vector being input of the model
def int_to_ohe(x_int):
  """One-hot encode a molecule given as integer atom codes.

  Position j owns the n_atom_types consecutive slots starting at
  j*n_atom_types; the slot selected by the atom code at that position is
  set to 1.0 and every other slot stays 0.0.

  x_int: sequence of exactly n_pos integer atom codes, each in
         [0, n_atom_types).
  Returns a 1-D numpy array of length n_features.
  Raises ValueError if the sequence length or any atom code is out of range.
  """
  if len(x_int) != n_pos:
    raise ValueError(
      "expected {} positions, got {}".format(n_pos, len(x_int)))
  x_ohe = np.zeros(n_features)
  for j, atom_id in enumerate(x_int):
    # Without this check an out-of-range code would silently switch on a slot
    # that belongs to a neighbouring position (negative codes wrap backwards).
    if not 0 <= atom_id < n_atom_types:
      raise ValueError(
        "invalid atom code {!r} at position {}; expected 0..{}".format(
          atom_id, j, n_atom_types - 1))
    x_ohe[j*n_atom_types + atom_id] = 1.0
  return x_ohe

def str_to_ohe(x_str):
  """One-hot encode a molecule given as atom labels.

  The labels are converted to integer atom codes with str_to_int and then
  encoded with int_to_ohe, so both representations share one encoder.

  x_str: sequence of atom labels accepted by str_to_int.
  Returns whatever int_to_ohe returns for the converted codes (the one-hot
  vector). Nothing is printed.
  """
  return int_to_ohe(str_to_int(x_str))

# function to reshape the input as (1,n_features) and split it into multiple inputs
# The model architecture is designed to respect symmetries
def int_to_input(x_int):
  """Build the list of model inputs from integer atom codes.

  The one-hot vector from int_to_ohe is laid out as a single row of shape
  (1, n_features) and cut along the columns into n_pos equal blocks.

  Returns a list of n_pos arrays, each of shape (1, n_features // n_pos).
  """
  row = np.array(int_to_ohe(x_int), dtype=float).reshape(1, n_features)
  return np.split(row, n_pos, axis=1)

def str_to_input(x_str):
  """Build the list of model inputs from atom labels.

  Converts the labels to integer atom codes with str_to_int and hands them
  to int_to_input, which one-hot encodes them and splits the (1, n_features)
  row into n_pos per-position arrays.

  Returns the list produced by int_to_input.
  """
  return int_to_input(str_to_int(x_str))

""" # Examples
print()
x_int = x0_int
x_int[2] = 3
#x_str2 = [int2str[str(k)] for k in x_int]
x_str2 = int_to_str(x_int)
print(x_str2)
x_ohe2 = int_to_ohe(x_int)
print(x_ohe2)

print()
x_str = x0_str
x_str[2] = 'Br'
#x_str2 = [int2str[str(k)] for k in x_int]
x_int2 = str_to_int(x_str)
print(x_int2)
x_ohe2 = int_to_ohe(x_int2)
print(x_ohe2)

# Rmax calculation example (integer representation)
n_data = 1
x_input = np.zeros((n_data,n_features))
x = x0_int.copy()
x[2] = 3
x_ohe = int_to_ohe(x)
print(x)
print(x_ohe)
x_input[0,:] = x_ohe[:]
x_input = np.split(x_input,n_pos,axis=1)
#print(x_input)
Rmax = model.predict(x_input)
Rmax = float(Rmax)
print(Rmax) 

# Example of molecule with integer representation
x = x0_int.copy()
x[2] = 3
x_input = int_to_input(x)
print(float(model.predict(x_input))) """


" # Examples\nprint()\nx_int = x0_int\nx_int[2] = 3\n#x_str2 = [int2str[str(k)] for k in x_int]\nx_str2 = int_to_str(x_int)\nprint(x_str2)\nx_ohe2 = int_to_ohe(x_int)\nprint(x_ohe2)\n\nprint()\nx_str = x0_str\nx_str[2] = 'Br'\n#x_str2 = [int2str[str(k)] for k in x_int]\nx_int2 = str_to_int(x_str)\nprint(x_int2)\nx_ohe2 = int_to_ohe(x_int2)\nprint(x_ohe2)\n\n# Rmax calculation example (integer representation)\nn_data = 1\nx_input = np.zeros((n_data,n_features))\nx = x0_int.copy()\nx[2] = 3\nx_ohe = int_to_ohe(x)\nprint(x)\nprint(x_ohe)\nx_input[0,:] = x_ohe[:]\nx_input = np.split(x_input,n_pos,axis=1)\n#print(x_input)\nRmax = model.predict(x_input)\nRmax = float(Rmax)\nprint(Rmax) \n\n# Example of molecule with integer representation\nx = x0_int.copy()\nx[2] = 3\nx_input = int_to_input(x)\nprint(float(model.predict(x_input))) "

In [189]:
# Auxiliary functions to evaluate Rmax with original units

# Shift and scale applied to the raw model output in calculate_Rmax.
# NOTE(review): presumably the mean and standard deviation of Rmax over the
# training set -- confirm against the training notebook.
R_mean = 591.451949
R_std = 85.381995

def calculate_Rmax(x,rep,mod):
    """Predict Rmax for one molecule and return it in original units.

    Parameters
    ----------
    x : sequence
        The molecule, as integer atom codes (rep='int') or atom labels
        (rep='str').
    rep : {'int', 'str'}
        Representation used by ``x``.
    mod : {'linear', 'non_linear'}
        Which pre-trained model to evaluate.

    Returns
    -------
    float
        ``prediction * R_std + R_mean``.

    Raises
    ------
    ValueError
        If ``rep`` or ``mod`` is not one of the accepted values, or if the
        model does not return exactly one number.
    """
    if rep == 'int':  # integer representation of the input vector
        x_input = int_to_input(x)
    elif rep == 'str':  # string representation of the input vector
        x_input = str_to_input(x)
    else:
        # previously fell through and failed later with UnboundLocalError
        raise ValueError(
            "rep must be 'int' or 'str', got {!r}".format(rep))

    if mod == 'linear':
        model = linear_model
    elif mod == 'non_linear':
        model = non_linear_model
    else:
        raise ValueError(
            "mod must be 'linear' or 'non_linear', got {!r}".format(mod))

    # predict() returns an array; float() on an array with ndim > 0 is
    # deprecated in recent NumPy, so extract the single element explicitly.
    Rmax = float(np.asarray(model.predict(x_input)).item())
    return Rmax * R_std + R_mean

In [197]:
# Examples

print('Fundamental 6-helicene:')
print(x0_int)
print(x0_str)

# Each case: (reference molecule, representation, substituted index, substituent)
#   1. Br at index 5, integer representation
#   2. the same molecule, string representation
#   3. Br at index 10, string representation; index 10 is the mirror of
#      index 5 (i <-> 15 - i), so the symmetric model is expected to give
#      the same Rmax as cases 1 and 2
examples = [
    (x0_int, 'int', 5, 3),
    (x0_str, 'str', 5, 'Br'),
    (x0_str, 'str', 10, 'Br'),
]

for reference, rep, position, substituent in examples:
    x = reference.copy()  # leave the fundamental molecule untouched
    x[position] = substituent
    Rmax = calculate_Rmax(x, rep=rep, mod='linear')
    print()
    print(x)
    print(Rmax)



Fundamental 6-helicene:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']

[0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
647.629036339589

['H', 'H', 'H', 'H', 'H', 'Br', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']
647.629036339589

['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'Br', 'H', 'H', 'H', 'H', 'H']
647.629036339589
