In [1]:
import numpy as np
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
from rdkit import Chem
import pandas as pd

from sklearn.model_selection import train_test_split

### Generating data

SMILES are fetched from CSV and for each a PIL image and an adjacency matrix is generated.

Adjacency matrix is padded with zeros. Both are loaded and converted into numpy arrays.

In [2]:
def get_x_y(smiles_string, image_dims=(60, 60)):
    # TODO: needs assertion that the string is good   
    mol = Chem.MolFromSmiles(smiles_string)
    img = Chem.Draw.MolToImage(mol, size=image_dims)
    thresh = 200
    fn = lambda x : 0 if x > thresh else 255
    r = img.convert('L').point(fn, mode='1')
    #r = img.convert('L')
    adj = Chem.GetAdjacencyMatrix(mol)
    padded_adj = np.zeros(image_dims)
    for index, bond_val in np.ndenumerate(adj):
        padded_adj[index] += bond_val
    return np.expand_dims(np.array(r), -1), np.expand_dims(padded_adj, -1)

In [None]:
# generte data
#df = pd.read_excel('data/SMILES_ID.xlsx')
df = pd.read_csv('data/SMILES.csv')
output_shape = (60, 60)
x_data, y_data = [],[]
#sel_smiles = []
for SMILES in df['SMILES']:
    try:
        img, adj_matrix = get_x_y(SMILES, image_dims=output_shape)
        x_data.append(img)
        y_data.append(adj_matrix)
        #sel_smiles.append(SMILES)
    #except Exception as e:
        #print(SMILES, e)
    except:
        continue

x_data, y_data = np.array(x_data), np.array(y_data)
with open('test_np_X.npy', 'wb') as f:
    np.save(f, x_data, allow_pickle=True)
with open('test_np_Y.npy', 'wb') as f:
    np.save(f, y_data, allow_pickle=True)

### Load pregenerated data

mmap mode in numpy uses data from disk


In [3]:
# load pregenerated data
output_shape = (60, 60)
x_data = np.load('test_np_X.npy', mmap_mode='r', allow_pickle=True)
y_data = np.load('test_np_y.npy', mmap_mode='r', allow_pickle=True)

### Split data

into train and test using sk-learn helper function

In [4]:
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.33, random_state=42)
#x_train = np.expand_dims(x_train, -1)
#x_test = np.expand_dims(x_test, -1)
#y_train = np.expand_dims(y_train, -1)
#y_test = np.expand_dims(y_test, -1)
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print(x_train.shape[0], "train samples")
print(x_test.shape[0], "test samples")

x_train shape: (6600, 60, 60, 1)
y_train shape: (6600, 60, 60, 1)
6600 train samples
3251 test samples


### Model creation

Adding layers

In [5]:
# Model parameters

input_shape = (*output_shape, 1)

model = keras.Sequential(
    [   
        keras.layers.Input(shape=input_shape),
        keras.layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        keras.layers.MaxPooling2D(pool_size=(2, 2)),
        keras.layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        keras.layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        keras.layers.MaxPooling2D(pool_size=(3, 3)),
        keras.layers.Dense(57, activation='linear'),
        keras.layers.Flatten(),
        keras.layers.Reshape((3648, 1)),
        keras.layers.Dropout(0.5),
        keras.layers.Cropping1D(cropping=24),
        #keras.layers.Dense(5),
        #keras.layers.Dropout(0.5),
        keras.layers.Reshape((60, 60, 1)),
        keras.layers.Dense(1, activation=keras.activations.hard_sigmoid),
    ]
)


### Compiling model 

geting summary after compilation

In [6]:

model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[
        keras.metrics.Accuracy(name="acc"),
    ],
)

print(model.summary())

batch_size = 256
epochs = 5

callbacks = [
    keras.callbacks.ModelCheckpoint(filepath="model_at_epoch_{epoch}.keras"),
    keras.callbacks.EarlyStopping(monitor="val_loss", patience=2),
]


None


### Training

In [7]:

model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.15,
    callbacks=callbacks,
)
score = model.evaluate(x_test, y_test, verbose=1)

Epoch 1/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 329ms/step - acc: 0.0291 - loss: 0.6372 - val_acc: 0.4711 - val_loss: 0.2417
Epoch 2/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 331ms/step - acc: 0.4088 - loss: 0.4212 - val_acc: 0.9068 - val_loss: 0.1499
Epoch 3/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 328ms/step - acc: 0.4926 - loss: 0.4178 - val_acc: 0.9547 - val_loss: 0.1554
Epoch 4/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 325ms/step - acc: 0.4927 - loss: 0.4145 - val_acc: 0.9593 - val_loss: 0.1566
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - acc: 0.9589 - loss: 0.1576


### Prediction test

test shape of final output

In [8]:
x_real, y_real = get_x_y('CCCO')
print('real x shape: ', x_real.shape)
print('real y shape: ', y_real.shape)
prd = model.predict(x_real.reshape(1, 60, 60, 1))
print('predicted y shape: ', prd.shape)

real x shape:  (60, 60, 1)
real y shape:  (60, 60, 1)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step
predicted y shape:  (1, 60, 60, 1)
