## Artificial Neural Networks Trained on Molecular Descriptors for Predicting Toxicity

In [8]:
# Python ≥ 3.7 is required
import sys
assert sys.version_info >= (3, 7)

# Scikit-Learn ≥ 1.0.1 is required
import sklearn
assert sklearn.__version__ >= '1.0.1'

# TensorFlow ≥ 2.0 is required
import tensorflow as tf
print('Tensorflow version:', tf.__version__)

# Keras ≥ 2.4 is required
from tensorflow import keras
print('Keras version:', keras.__version__) 

# Common imports
import numpy as np
import os
import pandas as pd

# Sci-kit Learn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# RDKit
from rdkit import Chem
from rdkit.Chem import RDKFingerprint
from rdkit.Chem import rdMolDescriptors

Tensorflow version: 2.11.0
Keras version: 2.11.0


## ANN 1: Trained on Descriptors from RDKit and Mordred Libraries

(with stratified k-fold cross validation)

In [9]:
# to produce the X dataset containing Morgan Fingerprint + RDkit descriptors 

rdkit = pandas.read_csv("data/rdkit_descriptors.csv")
rdkit = rdkit.drop(rdkit.columns[[0,1]],axis = 1) # remove first column (numbers) and second column (SMILES)
rdkit_array = np.array(rdkit)

mordred = pd.read_csv('data/mordred_descriptors.csv')
mordred = mordred.to_numpy()
mordred_array = mordred[:,2::].astype(float)

descriptors = np.append(rdkit_array, mordred_array, axis=1)

In [10]:
# rdkit.shape
# mordred.shape
descriptors.shape # check the shapes

(554, 1329)

In [11]:
# to get the y dataset i.e., binarised toxicity vector
original = pandas.read_csv("data/fathead_minnow_dataset.csv")
lc50 = original["LC50_(mg/L)"]

lc50_binary_list = []
for value in lc50:
    if value > 0.5:
        lc50_binary_list.append(0) # not high toxicity
    elif value <= 0.5:
        lc50_binary_list.append(1)

lc50_binary = np.array(lc50_binary_list)

In [12]:
seed = 42 # manually set the seed value as it is needed for the kfold set up
tf.random.set_seed(seed)  

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) # sets up the k-fold, n_splits = 5 so its 5-fold cross validation
cvscores = [] # to record accuracy scores from each run

X = descriptors # assign your x dataset (molecular parameters)
y = lc50_binary # assign your y dataset i.e. response variables

In [18]:
keras.backend.clear_session() # This ensures that any previous runs don't interfere.

for train, test in kfold.split(X, y):

    ann_model1 = keras.models.Sequential([
        keras.layers.Dense(4000, input_shape=(1329,), activation = 'relu'),
        keras.layers.Dense(1000, activation = 'relu'),
        keras.layers.Dense(500, activation = 'relu'),
        keras.layers.Dense(10, activation = 'relu'),
        keras.layers.Dense(1, activation = 'sigmoid')
    ])

#     model.summary()

    ann_model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    ann_model1.fit(X[train], y[train], epochs=20, verbose=1)
    
    scores = ann_model1.evaluate(X[test], y[test], verbose=1) 
    print("%s: %.2f%%" % (ann_model1.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
    
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
accuracy: 91.89%
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
accuracy: 91.89%
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
accuracy: 91.89%
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
accuracy: 91.89%
Epoch 1/

In [22]:
# view the results:

print(cvscores) # shows scores of each run
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores))) # shows final cross validated accuracy

[91.89189076423645, 91.89189076423645, 91.89189076423645, 91.89189076423645, 91.89189076423645, 91.8181836605072]
91.88% (+/- 0.03%)


## ANN 2: Trained on Morgan Fingerprints

(with stratified k-fold cross validation)

In [23]:
with open("data/morgan_fingerprints.csv", 'r') as file:
    file.readline()
    content = list(file)
    
fingerprints = []
for line in content:
    line_list = line.strip("\n").split(",")
    fingerprint = np.array([int(i) for i in line_list[2:]])
    fingerprints.append(fingerprint)
    
fingerprints = np.array(fingerprints)

In [24]:
fingerprints.shape

(554, 2048)

In [25]:
# same code as before to curate y dataset
original = pandas.read_csv("data/fathead_minnow_dataset.csv")
lc50 = original["LC50_(mg/L)"]

lc50_binary_list = []
for value in lc50:
    if value > 0.5:
        lc50_binary_list.append(0) # not high toxicity
    elif value <= 0.5:
        lc50_binary_list.append(1)

lc50_binary = np.array(lc50_binary_list)

In [26]:
seed = 42 # manually set the seed value as it is needed for the kfold set up
tf.random.set_seed(seed)  

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) # sets up the k-fold, n_splits = 5 so its 5-fold cross validation
cvscores = [] # to record accuracy scores from each run

X2 = fingerprints # assign your x dataset (in this case morgan fingerprints)
y = lc50_binary # assign your y dataset i.e. response variables

In [27]:
keras.backend.clear_session() # This ensures that any previous runs don't interfere.

for train, test in kfold.split(X2, y):

    ann_model2 = keras.models.Sequential([
        keras.layers.Dense(4000, input_shape=(2048,), activation = 'relu'),
        keras.layers.Dense(1500, activation = 'relu'),
        keras.layers.Dense(500, activation = 'relu'),
        keras.layers.Dense(10, activation = 'relu'),
        keras.layers.Dense(1, activation = 'sigmoid')
    ])

#     model.summary()

    ann_model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    ann_model2.fit(X2[train], y[train], epochs=20, verbose=1)
    
    scores = ann_model2.evaluate(X2[test], y[test], verbose=1) 
    print("%s: %.2f%%" % (ann_model2.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
    
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
accuracy: 91.89%
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
accuracy: 92.79%
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
accuracy: 92.79%
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
accuracy: 92.79%
Epoch 1/

In [28]:
# view the results:

print(cvscores) # shows scores of each run
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores))) # shows final cross validated accuracy

[91.89189076423645, 92.79279112815857, 92.79279112815857, 92.79279112815857, 92.72727370262146]
92.60% (+/- 0.35%)
