In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import model_selection
# Reproducibility: seed both NumPy's and TensorFlow's global random generators
# with the same value before any model is built.
from numpy.random import seed
seedval = 13  # single seed value shared by both libraries
seed(seedval)
from tensorflow.random import set_seed
set_seed(seedval)

In [None]:
import json
# Locations of the JSON-encoded datasets, one group of four files per assay.
# For every assay the "*_x" file is later turned into the model input tensor
# and the "*_y" file into the class-label array; "wholetraining" files are
# used for fitting and "score" files for the final evaluation.

# SR-ARE
pathAREtrainx = '/content/drive/My Drive/SMILES/SR-ARE_wholetraining_x.txt'
pathAREtrainy = '/content/drive/My Drive/SMILES/SR-ARE_wholetraining_y.txt'
pathAREscorex = '/content/drive/My Drive/SMILES/SR-ARE_score_x.txt'
pathAREscorey = '/content/drive/My Drive/SMILES/SR-ARE_score_y.txt'

# SR-MMP
pathMMPtrainx = '/content/drive/My Drive/SMILES/SR-MMP_wholetraining_x.txt'
pathMMPtrainy = '/content/drive/My Drive/SMILES/SR-MMP_wholetraining_y.txt'
pathMMPscorex = '/content/drive/My Drive/SMILES/SR-MMP_score_x.txt'
pathMMPscorey = '/content/drive/My Drive/SMILES/SR-MMP_score_y.txt'

# NR-ER
pathERtrainx = '/content/drive/My Drive/SMILES/NR-ER_wholetraining_x.txt'
pathERtrainy = '/content/drive/My Drive/SMILES/NR-ER_wholetraining_y.txt'
pathERscorex = '/content/drive/My Drive/SMILES/NR-ER_score_x.txt'
pathERscorey = '/content/drive/My Drive/SMILES/NR-ER_score_y.txt'

# NR-AhR
pathAhRtrainx = '/content/drive/My Drive/SMILES/NR-AhR_wholetraining_x.txt'
pathAhRtrainy = '/content/drive/My Drive/SMILES/NR-AhR_wholetraining_y.txt'
pathAhRscorex = '/content/drive/My Drive/SMILES/NR-AhR_score_x.txt'
pathAhRscorey = '/content/drive/My Drive/SMILES/NR-AhR_score_y.txt'

# function to load data from path
def load_from_path(path):
  """Parse the JSON document stored at `path` and return the decoded object.

  The file is opened in text mode with the platform default encoding and is
  closed again before the function returns.
  """
  with open(path) as source:
    return json.load(source)

# function taking a data list and creating a tensor of (n molecules, length, features)
def molecular_tensor(moldata, maxlen=400, numfeat=42):
  """Stack per-molecule feature data into one 3-D array.

  Args:
    moldata: iterable of molecules; each item must hold exactly
      `maxlen * numfeat` values (flat or already nested) so that it can be
      reshaped to `(maxlen, numfeat)`.
    maxlen: number of positions (rows) per molecule.
    numfeat: number of features (columns) per position.

  Returns:
    numpy.ndarray of shape `(n_molecules, maxlen, numfeat)`. An empty
    `moldata` yields shape `(0, maxlen, numfeat)`.

  Raises:
    ValueError: if a molecule cannot be reshaped to `(maxlen, numfeat)`.
  """
  # Honour the caller-supplied dimensions instead of a fixed (400, 42).
  M_list = [np.reshape(mol, (maxlen, numfeat)) for mol in moldata]
  # The final reshape is a no-op for non-empty input; it only makes sure an
  # empty dataset keeps the documented 3-D rank.
  return np.array(M_list).reshape(-1, maxlen, numfeat)

In [None]:
# Training sets: for each assay, the x file becomes a molecule tensor and the
# y file becomes a NumPy array of class labels.

AREtraindata = molecular_tensor(load_from_path(pathAREtrainx))
AREtrainclass = np.array(load_from_path(pathAREtrainy))

MMPtraindata = molecular_tensor(load_from_path(pathMMPtrainx))
MMPtrainclass = np.array(load_from_path(pathMMPtrainy))

ERtraindata = molecular_tensor(load_from_path(pathERtrainx))
ERtrainclass = np.array(load_from_path(pathERtrainy))

AhRtraindata = molecular_tensor(load_from_path(pathAhRtrainx))
AhRtrainclass = np.array(load_from_path(pathAhRtrainy))


In [None]:
# Score (held-out evaluation) sets, built exactly like the training sets:
# x file -> molecule tensor, y file -> NumPy array of class labels.
AREscoredata = molecular_tensor(load_from_path(pathAREscorex))
AREscoreclass = np.array(load_from_path(pathAREscorey))

MMPscoredata = molecular_tensor(load_from_path(pathMMPscorex))
MMPscoreclass = np.array(load_from_path(pathMMPscorey))

ERscoredata = molecular_tensor(load_from_path(pathERscorex))
ERscoreclass = np.array(load_from_path(pathERscorey))

AhRscoredata = molecular_tensor(load_from_path(pathAhRscorex))
AhRscoreclass = np.array(load_from_path(pathAhRscorey))

In [None]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import LSTM, Conv1D, MaxPooling1D, AveragePooling1D, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding

def class_weight_calculator(molclass):
  """Compute balancing weights for a two-class label array.

  Each class receives `total / (2 * class_count)`, so the rarer class gets
  the larger weight.

  Args:
    molclass: NumPy array of labels that `astype(int)` can turn into 0/1.

  Returns:
    dict with the string keys '0' and '1' mapping to the weight of the
    negative and positive class respectively.
    NOTE(review): string keys presumably match string-typed labels in the
    y files -- confirm before changing them to integers.
  """
  n_total = len(molclass)
  n_positive = sum(molclass.astype(int))
  n_negative = n_total - n_positive

  return {
      '0': (1. / n_negative) * n_total / 2.,
      '1': (1. / n_positive) * n_total / 2.,
  }

# One class-weight dictionary per assay, derived from its training labels.
cwARE, cwMMP, cwER, cwAhR = (
    class_weight_calculator(train_labels)
    for train_labels in (AREtrainclass, MMPtrainclass, ERtrainclass, AhRtrainclass)
)

def deep_model():
  """Build and compile the "deep" 1-D CNN binary classifier.

  Architecture: three Conv1D layers (120 filters, kernel 15, ReLU, 'same'
  padding) with max-pooling (size 3) after the first two, global average
  pooling, 50% dropout, a 64-unit ReLU dense layer and a single sigmoid
  output. Input shape is (400, 42).

  Returns:
    A freshly initialised model compiled with binary cross-entropy, the Adam
    optimizer, and accuracy + AUC metrics.
  """
  conv_args = dict(filters=120, kernel_size=15, activation='relu', padding='same')
  network = Sequential([
      Conv1D(input_shape=(400, 42), **conv_args),
      MaxPooling1D(3),
      Conv1D(**conv_args),
      MaxPooling1D(3),
      Conv1D(**conv_args),
      GlobalAveragePooling1D(),
      Dropout(0.5),
      Dense(64, activation='relu'),
      Dense(1, activation='sigmoid'),
  ])
  network.compile(
      loss='binary_crossentropy',
      optimizer='adam',
      metrics=['accuracy', tf.keras.metrics.AUC()],
  )
  return network

def wide_model_bigk():
  """Build and compile the "wide" 1-D CNN binary classifier.

  Architecture: two Conv1D layers (300 filters, kernel 30, ReLU, 'same'
  padding) with one max-pooling (size 3) between them, global average
  pooling, 50% dropout, a 64-unit ReLU dense layer and a single sigmoid
  output. Input shape is (400, 42).

  Returns:
    A freshly initialised model compiled with binary cross-entropy, the Adam
    optimizer, and accuracy + AUC metrics.
  """
  conv_args = dict(filters=300, kernel_size=30, activation='relu', padding='same')
  network = Sequential([
      Conv1D(input_shape=(400, 42), **conv_args),
      MaxPooling1D(3),
      Conv1D(**conv_args),
      GlobalAveragePooling1D(),
      Dropout(0.5),
      Dense(64, activation='relu'),
      Dense(1, activation='sigmoid'),
  ])
  network.compile(
      loss='binary_crossentropy',
      optimizer='adam',
      metrics=['accuracy', tf.keras.metrics.AUC()],
  )
  return network

def LSTM_400_200(embed_dim=6, feats=26):
  """Build and compile an Embedding + two-layer LSTM binary classifier.

  NOTE: a later cell in this notebook defines `LSTM_400_200` again; once that
  cell has run, the later definition is the one that gets called.

  Args:
    embed_dim: size of each embedding vector.
    feats: embedding vocabulary size, i.e. the number of distinct integer
      indices the input sequences may contain. The default of 26 equals the
      number of feature columns kept by the index conversion further down in
      this notebook (5 + 21).

  Returns:
    A freshly initialised model (Embedding -> LSTM(400) -> LSTM(200) ->
    Dropout(0.5) -> Dense(64, relu) -> Dense(1, sigmoid)) compiled with
    binary cross-entropy, Adam, and accuracy + AUC metrics. Inputs are
    integer sequences of length 400.
  """
  model = Sequential()
  # The vocabulary size is now an explicit argument. The previous body read it
  # from a global `Toembed_tensor` that is never defined at module level
  # (NameError on call) and used the number of samples rather than the number
  # of distinct indices.
  model.add(Embedding(feats, embed_dim, input_length=400))
  model.add(LSTM(400, return_sequences=True))
  model.add(LSTM(200))
  model.add(Dropout(0.5))
  model.add(Dense(64, activation='relu'))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', tf.keras.metrics.AUC()])
  return model

Using TensorFlow backend.


In [None]:
def model_evaluator(modelling, M_tensor, molclass, Test_tensor, testclass, cw, epos=3, runs=10, title='EVALUATION'):
  """Train a model several times from scratch and print mean +/- std test metrics.

  Args:
    modelling: zero-argument callable returning a new compiled model that
      exposes `fit` and `predict`.
    M_tensor: training inputs, passed to `fit`.
    molclass: training labels, passed to `fit`.
    Test_tensor: test inputs, passed to `predict`.
    testclass: test labels; converted with `astype(int)` before scoring.
    cw: class-weight mapping forwarded to `fit(class_weight=...)`.
    epos: number of training epochs per run.
    runs: number of independent train/evaluate repetitions.
    title: heading printed above the summary.

  Returns:
    None. Prints 'One run complete' after each run, then the title and the
    mean +/- standard deviation of positive-class precision, recall and
    f1-score, overall accuracy, and ROC-AUC.
  """
  metric_names = ('precision', 'recall', 'f1-score', 'accuracy', 'roc-auc')
  scores = {name: [] for name in metric_names}
  y_true = testclass.astype(int)  # loop-invariant, so computed once

  for _ in range(runs):
    model = modelling()
    model.fit(M_tensor, molclass,
              epochs=epos, batch_size=256,
              verbose=False,
              class_weight=cw)

    # Flatten the (n, 1) sigmoid output to a 1-D vector of scores.
    y_score = np.asarray(model.predict(Test_tensor)).reshape(-1)
    # Hard labels at the 0.5 threshold for the threshold-dependent metrics.
    y_pred = np.where(y_score > 0.5, 1, 0)

    # Accessed through `metrics` so the function does not rely on a separate
    # `classification_report` import having been executed first.
    rep = metrics.classification_report(y_true, y_pred, output_dict=True)
    scores['precision'].append(rep['1']['precision'])
    scores['recall'].append(rep['1']['recall'])
    scores['f1-score'].append(rep['1']['f1-score'])
    scores['accuracy'].append(rep['accuracy'])
    # ROC-AUC is a ranking metric: it must see the continuous scores.
    # Feeding it thresholded 0/1 labels collapses the curve to a single
    # operating point and under-reports the model.
    scores['roc-auc'].append(metrics.roc_auc_score(y_true, y_score))
    print('One run complete')

  print(title)
  for name in metric_names:
    print('%s: %.2f +/- %.2f' % (name, np.mean(scores[name]), np.std(scores[name])))


In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

**EVALUATING MODELS ON SR-ARE**

In [None]:
# Deep CNN, SR-ARE: fit on the training set, report on the score set.
model_evaluator(
    modelling=deep_model,
    M_tensor=AREtraindata,
    molclass=AREtrainclass,
    Test_tensor=AREscoredata,
    testclass=AREscoreclass,
    cw=cwARE,
    epos=15,
    title='TEST (SCORE) STATISTICS FOR DEEP CNN MODEL ON SR-ARE',
)

One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
TEST (SCORE) STATISTICS FOR DEEP CNN MODEL ON SR-ARE
precision: 0.40 +/- 0.04
recall: 0.54 +/- 0.10
f1-score: 0.45 +/- 0.02
accuracy: 0.78 +/- 0.04
roc-auc: 0.69 +/- 0.02


In [None]:
# Wide CNN, SR-ARE: fit on the training set, report on the score set.
model_evaluator(
    modelling=wide_model_bigk,
    M_tensor=AREtraindata,
    molclass=AREtrainclass,
    Test_tensor=AREscoredata,
    testclass=AREscoreclass,
    cw=cwARE,
    epos=10,
    title='TEST (SCORE) STATISTICS FOR WIDE CNN MODEL ON SR-ARE',
)

One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
TEST (SCORE) STATISTICS FOR WIDE CNN MODEL ON SR-ARE
precision: 0.35 +/- 0.03
recall: 0.59 +/- 0.10
f1-score: 0.43 +/- 0.02
accuracy: 0.74 +/- 0.03
roc-auc: 0.68 +/- 0.02


**EVALUATING MODELS ON SR-MMP**

In [None]:
# Deep CNN, SR-MMP: fit on the training set, report on the score set.
model_evaluator(
    modelling=deep_model,
    M_tensor=MMPtraindata,
    molclass=MMPtrainclass,
    Test_tensor=MMPscoredata,
    testclass=MMPscoreclass,
    cw=cwMMP,
    epos=15,
    title='TEST (SCORE) STATISTICS FOR DEEP CNN MODEL ON SR-MMP',
)

One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
TEST (SCORE) STATISTICS FOR DEEP CNN MODEL ON SR-MMP
precision: 0.43 +/- 0.05
recall: 0.77 +/- 0.06
f1-score: 0.55 +/- 0.03
accuracy: 0.86 +/- 0.02
roc-auc: 0.82 +/- 0.02


In [None]:
# Wide CNN, SR-MMP: fit on the training set, report on the score set.
model_evaluator(
    modelling=wide_model_bigk,
    M_tensor=MMPtraindata,
    molclass=MMPtrainclass,
    Test_tensor=MMPscoredata,
    testclass=MMPscoreclass,
    cw=cwMMP,
    epos=10,
    title='TEST (SCORE) STATISTICS FOR WIDE CNN MODEL ON SR-MMP',
)

One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
TEST (SCORE) STATISTICS FOR WIDE CNN MODEL ON SR-MMP
precision: 0.40 +/- 0.06
recall: 0.81 +/- 0.10
f1-score: 0.52 +/- 0.03
accuracy: 0.84 +/- 0.03
roc-auc: 0.83 +/- 0.03


**EVALUATING MODELS ON NR-ER**

In [None]:
# Deep CNN, NR-ER: fit on the training set, report on the score set.
model_evaluator(
    modelling=deep_model,
    M_tensor=ERtraindata,
    molclass=ERtrainclass,
    Test_tensor=ERscoredata,
    testclass=ERscoreclass,
    cw=cwER,
    epos=15,
    title='TEST (SCORE) STATISTICS FOR DEEP CNN MODEL ON NR-ER',
)

One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
TEST (SCORE) STATISTICS FOR DEEP CNN MODEL ON NR-ER
precision: 0.28 +/- 0.09
recall: 0.40 +/- 0.12
f1-score: 0.30 +/- 0.02
accuracy: 0.82 +/- 0.05
roc-auc: 0.63 +/- 0.03


In [None]:
# Wide CNN, NR-ER: fit on the training set, report on the score set.
model_evaluator(
    modelling=wide_model_bigk,
    M_tensor=ERtraindata,
    molclass=ERtrainclass,
    Test_tensor=ERscoredata,
    testclass=ERscoreclass,
    cw=cwER,
    epos=10,
    title='TEST (SCORE) STATISTICS FOR WIDE CNN MODEL ON NR-ER',
)

One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
TEST (SCORE) STATISTICS FOR WIDE CNN MODEL ON NR-ER
precision: 0.27 +/- 0.13
recall: 0.47 +/- 0.11
f1-score: 0.31 +/- 0.03
accuracy: 0.80 +/- 0.04
roc-auc: 0.65 +/- 0.03


**EVALUATING MODELS ON NR-AhR**

In [None]:
# Deep CNN, NR-AhR: fit on the training set, report on the score set.
model_evaluator(
    modelling=deep_model,
    M_tensor=AhRtraindata,
    molclass=AhRtrainclass,
    Test_tensor=AhRscoredata,
    testclass=AhRscoreclass,
    cw=cwAhR,
    epos=15,
    title='TEST (SCORE) STATISTICS FOR DEEP CNN MODEL ON NR-AhR',
)

One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
TEST (SCORE) STATISTICS FOR DEEP CNN MODEL ON NR-AhR
precision: 0.36 +/- 0.08
recall: 0.80 +/- 0.13
f1-score: 0.48 +/- 0.05
accuracy: 0.79 +/- 0.07
roc-auc: 0.79 +/- 0.02


In [None]:
# Wide CNN, NR-AhR: fit on the training set, report on the score set.
model_evaluator(
    modelling=wide_model_bigk,
    M_tensor=AhRtraindata,
    molclass=AhRtrainclass,
    Test_tensor=AhRscoredata,
    testclass=AhRscoreclass,
    cw=cwAhR,
    epos=10,
    title='TEST (SCORE) STATISTICS FOR WIDE CNN MODEL ON NR-AhR',
)

One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
One run complete
TEST (SCORE) STATISTICS FOR WIDE CNN MODEL ON NR-AhR
precision: 0.35 +/- 0.07
recall: 0.78 +/- 0.13
f1-score: 0.47 +/- 0.04
accuracy: 0.79 +/- 0.07
roc-auc: 0.79 +/- 0.03


In [None]:
# LSTM EVALUATIONS

# Feature columns fed to the embedding: columns 0-4 and 21-41 of the feature
# axis are kept, columns 5-20 are left out.
# NOTE(review): the reason for excluding columns 5-20 is not stated here.
toembedslice = [*range(5), *range(21, 42)]
# Number of kept columns; used as the default embedding vocabulary size.
features = len(toembedslice)

from keras.preprocessing import sequence

def index_maker(M_tensor):
  """Convert one-hot feature rows into padded integer index sequences.

  For every molecule, each position's feature row is restricted to the
  columns in the module-level `toembedslice`; the position of the first
  entry equal to 1 within that restricted row becomes the token for that
  position. Rows that contain no 1 are skipped, so a molecule's sequence can
  be shorter than its number of positions.

  Args:
    M_tensor: array indexable as `[molecule, position, feature]`.

  Returns:
    Integer array of shape `(n_molecules, 400)` produced by
    `sequence.pad_sequences(..., maxlen=400)` with its default settings.
    NOTE(review): with the default pad value of 0, padding cannot be told
    apart from a genuine index 0 -- confirm this is intended.
  """
  Toembed_tensor = np.asarray(M_tensor)[:, :, toembedslice]

  # Vectorised replacement for `list(row).index(1)` inside try/except:
  # `argmax` on a boolean mask gives the first True along the feature axis,
  # and `any` marks rows that actually contain a 1 (argmax alone would return
  # 0 for all-False rows).
  is_one = (Toembed_tensor == 1)
  has_one = is_one.any(axis=2)
  first_one = is_one.argmax(axis=2)

  Toembed_idxs = [
      mol_first[mol_has].tolist()
      for mol_first, mol_has in zip(first_one, has_one)
  ]
  # The (possibly ragged) list of lists is handed over directly; wrapping it
  # in np.array first fails on NumPy versions that reject ragged input.
  return sequence.pad_sequences(Toembed_idxs, maxlen=400)

def LSTM_400_200(embed_dim=10, feats=features):
  """Build and compile an Embedding + two-layer LSTM binary classifier.

  Args:
    embed_dim: size of each embedding vector.
    feats: embedding vocabulary size; defaults to the module-level
      `features` value captured when this function is defined.

  Returns:
    A freshly initialised model (Embedding -> LSTM(400) -> LSTM(200) ->
    Dropout(0.5) -> Dense(64, relu) -> Dense(1, sigmoid)) compiled with
    binary cross-entropy, Adam, and accuracy + AUC metrics. Inputs are
    integer sequences of length 400.
  """
  layer_stack = [
      Embedding(feats, embed_dim, input_length=400),
      LSTM(400, return_sequences=True),
      LSTM(200),
      Dropout(0.5),
      Dense(64, activation='relu'),
      Dense(1, activation='sigmoid'),
  ]
  network = Sequential(layer_stack)
  network.compile(
      loss='binary_crossentropy',
      optimizer='adam',
      metrics=['accuracy', tf.keras.metrics.AUC()],
  )
  return network

def lstm_evaluator(modelling, Toembed_idxs, molclass, Toembtest_idxs, testclass, cw, epos=80, runs=3, title='EVALUATION'):
  """Repeatedly train and score an LSTM model on index-sequence inputs.

  The procedure is the same as `model_evaluator` (train `runs` fresh models,
  score each on the test set, print mean +/- std of the metrics); only the
  defaults differ (80 epochs, 3 runs). The work is therefore delegated to
  `model_evaluator` instead of keeping a second copy of its body.

  Args:
    modelling: zero-argument callable returning a new compiled model.
    Toembed_idxs: training index sequences, passed to `fit`.
    molclass: training labels.
    Toembtest_idxs: test index sequences, passed to `predict`.
    testclass: test labels.
    cw: class-weight mapping forwarded to `fit(class_weight=...)`.
    epos: number of training epochs per run.
    runs: number of independent train/evaluate repetitions.
    title: heading printed above the summary.

  Returns:
    None. All results are printed.
  """
  model_evaluator(modelling, Toembed_idxs, molclass,
                  Toembtest_idxs, testclass, cw,
                  epos=epos, runs=runs, title=title)

In [None]:
# Index-sequence versions of every training and score tensor for the LSTM.
# Each name must be bound to the array itself: a trailing comma after the
# call would wrap the result in a 1-tuple, and the ER training name has to be
# spelled `ER2embed_traindata` because that is what the NR-ER cell reads.
ARE2embed_traindata = index_maker(AREtraindata)
MMP2embed_traindata = index_maker(MMPtraindata)
ER2embed_traindata = index_maker(ERtraindata)
AhR2embed_traindata = index_maker(AhRtraindata)

ARE2embed_scoredata = index_maker(AREscoredata)
MMP2embed_scoredata = index_maker(MMPscoredata)
ER2embed_scoredata = index_maker(ERscoredata)
AhR2embed_scoredata = index_maker(AhRscoredata)



**EVALUATING LSTM ON SR-ARE**

In [None]:
# LSTM, SR-ARE: default epochs/runs/title of lstm_evaluator are used.
lstm_evaluator(
    modelling=LSTM_400_200,
    Toembed_idxs=ARE2embed_traindata,
    molclass=AREtrainclass,
    Toembtest_idxs=ARE2embed_scoredata,
    testclass=AREscoreclass,
    cw=cwARE,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


**EVALUATING LSTM ON SR-MMP**

In [None]:
# LSTM, SR-MMP: default epochs/runs/title of lstm_evaluator are used.
lstm_evaluator(
    modelling=LSTM_400_200,
    Toembed_idxs=MMP2embed_traindata,
    molclass=MMPtrainclass,
    Toembtest_idxs=MMP2embed_scoredata,
    testclass=MMPscoreclass,
    cw=cwMMP,
)

**EVALUATING LSTM ON NR-ER**

In [None]:
# LSTM, NR-ER: default epochs/runs/title of lstm_evaluator are used.
lstm_evaluator(
    modelling=LSTM_400_200,
    Toembed_idxs=ER2embed_traindata,
    molclass=ERtrainclass,
    Toembtest_idxs=ER2embed_scoredata,
    testclass=ERscoreclass,
    cw=cwER,
)

**EVALUATING LSTM ON NR-AhR**

In [None]:
# LSTM, NR-AhR: default epochs/runs/title of lstm_evaluator are used.
lstm_evaluator(
    modelling=LSTM_400_200,
    Toembed_idxs=AhR2embed_traindata,
    molclass=AhRtrainclass,
    Toembtest_idxs=AhR2embed_scoredata,
    testclass=AhRscoreclass,
    cw=cwAhR,
)