In [3]:
import numpy as np
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
from rdkit import Chem
import pandas as pd

from sklearn.model_selection import train_test_split


### Loading data

SMILES strings are read from a CSV file; for each one, a PIL image and an adjacency matrix are generated.

Each adjacency matrix is zero-padded to the image dimensions. Both the image and the padded matrix are converted into NumPy arrays.

In [None]:
def get_x_y(smiles_string, image_dims=(60, 60)):
    """Build one (image, adjacency matrix) pair from a SMILES string.

    The molecule is drawn with RDKit, binarized (pixels brighter than the
    threshold become 0, all others 255, stored as a 1-bit image) and returned
    as a numpy array. The adjacency matrix is placed in the top-left corner
    of a zero array of shape ``image_dims``.

    Parameters
    ----------
    smiles_string : str
        SMILES representation of the molecule.
    image_dims : tuple of int, optional
        Size passed to the drawing routine and shape of the zero-padded
        adjacency matrix.
        NOTE(review): for non-square values the image array and the padded
        matrix may not share the same axis order - confirm before using
        anything other than square sizes.

    Returns
    -------
    tuple of numpy.ndarray
        ``(image, padded_adjacency)``, each with a trailing axis of size 1
        appended.

    Raises
    ------
    ValueError
        If the SMILES string cannot be parsed, or if the molecule has more
        atoms than fit into ``image_dims``.
    """
    # `from rdkit import Chem` does not load the Draw submodule, so
    # `Chem.Draw` is not guaranteed to exist; import it explicitly.
    from rdkit.Chem import Draw

    mol = Chem.MolFromSmiles(smiles_string)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        raise ValueError(f"could not parse SMILES string: {smiles_string!r}")

    adj = Chem.GetAdjacencyMatrix(mol)
    n_atoms = adj.shape[0]
    if n_atoms > min(image_dims):
        raise ValueError(
            f"molecule has {n_atoms} atoms, which does not fit into a "
            f"padded adjacency matrix of shape {tuple(image_dims)}"
        )

    img = Draw.MolToImage(mol, size=image_dims)
    thresh = 200
    # Grayscale -> 1-bit image: bright background -> 0, dark strokes -> 255.
    binarized = img.convert('L').point(
        lambda x: 0 if x > thresh else 255, mode='1')

    # Zero-pad the adjacency matrix up to image_dims.
    padded_adj = np.zeros(image_dims)
    padded_adj[:n_atoms, :n_atoms] = adj
    return np.expand_dims(np.array(binarized), -1), np.expand_dims(padded_adj, -1)
# data
# Read the SMILES column and turn every entry into an (image, adjacency
# matrix) pair. Entries that cannot be converted are skipped, but recorded
# so that failures are visible instead of silently disappearing.
df = pd.read_csv('data/SMILES.csv')
output_shape = (300, 300)
x_data, y_data = [], []
skipped_smiles = []  # (SMILES, exception) for every entry that failed
for SMILES in df['SMILES']:
    try:
        img, adj_matrix = get_x_y(SMILES, image_dims=output_shape)
    except Exception as e:
        # `except Exception` (not a bare `except`) keeps the best-effort
        # behaviour but no longer traps KeyboardInterrupt / SystemExit.
        skipped_smiles.append((SMILES, e))
        continue
    x_data.append(img)
    y_data.append(adj_matrix)

if skipped_smiles:
    print(
        f"Skipped {len(skipped_smiles)} of {len(df)} SMILES; "
        f"first error: {skipped_smiles[0][1]!r}"
    )

x_data, y_data = np.array(x_data), np.array(y_data)

### Split data

The data are split into training and test sets using scikit-learn's `train_test_split` helper function.

In [None]:
# Hold out 33% of the samples for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.33, random_state=42)
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
# The first axis is the sample axis, so shape[0] is the number of samples.
print(x_train.shape[0], "train samples")
print(x_test.shape[0], "test samples")

### Model creation

Adding layers

In [None]:
# Model parameters

# One single-channel image per sample: output_shape plus a channel axis.
input_shape = (*output_shape, 1)

# The targets have the same shape as the inputs (output_shape + (1,)), so the
# network has to end with that shape as well; otherwise the loss cannot be
# computed between predictions and targets.
model = keras.Sequential(
    [
        keras.layers.Input(input_shape),
        # Dense layers operate on the last (channel) axis only.
        keras.layers.Dense(30, activation='relu'),
        keras.layers.Dense(20, activation="relu"),
        # Pooling halves both spatial dimensions ...
        keras.layers.MaxPooling2D(pool_size=(2, 2)),
        # ... so upsample by the same factor to restore them. This restores
        # the exact input size only when both spatial dimensions are even.
        keras.layers.UpSampling2D(size=(2, 2)),
        # Project back to a single channel to match the target's last axis.
        keras.layers.Dense(1, activation="relu"),
    ]
)


### Compiling model 

Getting the model summary after compilation.

In [None]:

# Configure training: plain SGD on a mean-squared-error loss.
# NOTE(review): the metric is registered under the name "acc" although it is
# a concordance correlation, not an accuracy - confirm the name is intended.
model.compile(
    optimizer='sgd',
    loss='mean_squared_error',
    metrics=[
        keras.metrics.ConcordanceCorrelation(name="acc"),
    ],
)

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
model.summary()

batch_size = 8
epochs = 3

# Save the model after every epoch and stop early once the validation loss
# has not improved for two consecutive epochs.
callbacks = [
    keras.callbacks.ModelCheckpoint(filepath="model_at_epoch_{epoch}.keras"),
    keras.callbacks.EarlyStopping(monitor="val_loss", patience=2),
]


### Training

In [None]:

# Train on the training split; 15% of it is set aside for validation so the
# callbacks can monitor the validation loss.
model.fit(
    x=x_train, y=y_train,
    batch_size=batch_size, epochs=epochs,
    validation_split=0.15, callbacks=callbacks,
)

# Evaluate the trained model on the held-out test split.
score = model.evaluate(x=x_test, y=y_test, verbose=1)

### Prediction test

Test the shape of the final output.

In [76]:
# Build the sample at the same size the model was defined with; the default
# image_dims of get_x_y (60x60) would not match the model's input shape.
x_real, y_real = get_x_y('CCCO', image_dims=output_shape)
print('real x shape: ', x_real.shape)
print('real y shape: ', y_real.shape)
# predict() expects a batch, so add a leading batch axis of size 1.
prd = model.predict(np.expand_dims(x_real, 0))
print('predicted y shape: ', prd.shape)