In [None]:
import keras 
import keras.backend as K
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers
from keras.wrappers.scikit_learn import KerasRegressor # this is for making a model like every other in scikit
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from time import time
import matplotlib.pyplot as plt

from keras.models import Model
from sklearn.model_selection import GridSearchCV

# Seed NumPy's global RNG. Note that the data-loading cell further down
# re-seeds it with a different value (random_seed = 2019).
np.random.seed(42)
from tensorflow.random import set_seed 
# alternative import left by the author (presumably for older TensorFlow versions):
# from tensorflow import set_random_seed
# Seed TensorFlow's global RNG as well.
set_seed(42)

In [None]:
def _read_interactions(path):
    """Read a whitespace-separated interaction file.

    Every non-blank line must hold at least three fields, in this order:
    'Target-ID', 'Compound-ID', 'pIC50'. The third field is parsed as float;
    any further fields are ignored.

    Returns a list of [target_id, compound_id, value] lists in file order.
    Raises IndexError / ValueError on a malformed non-blank line.
    """
    records = []
    with open(path, 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # tolerate blank lines instead of failing with IndexError
                continue
            records.append([tokens[0], tokens[1], float(tokens[2])])
    return records


Interactions_train = _read_interactions("Interactions_Trainset.tab")
Interactions_valid = _read_interactions("Interactions_Validset.tab")

# All known interactions: training records first, then validation records.
Interactions = Interactions_train + Interactions_valid
# Put the interactions in a temporary DataFrame so the targets can be ordered
# by their number of interaction records (ascending).
interaction_frame = pd.DataFrame(Interactions, columns=['Target-ID', 'Compound-ID', 'Std-value'])
records_per_target = (
    interaction_frame
    .groupby(['Target-ID'])
    .agg('count')
    .sort_values(by='Compound-ID')
)
Targets = list(records_per_target.index)
# distinct compound IDs (np.unique also sorts them)
Compounds = np.unique(interaction_frame['Compound-ID'])
# the frame and the counts were only needed to derive the two lists above
del records_per_target, interaction_frame

nT = len(Targets)
nC = len(Compounds)

print("There are {0} targets and {1} compounds currently loaded with {2} interactions.".format(nT, nC, len(Interactions)))
print("A DTI matrix would be {0:.4}% dense!".format(100.0 * len(Interactions) / nT / nC))

# Map every target ID to its position in Targets; this position is the
# column index used for that target in the DTI matrix.
Labels_Targ = {target_id: position for position, target_id in enumerate(Targets)}

# Map every compound ID to its position in Compounds; this position is the
# row index used for that compound in the DTI matrix.
Labels_Comp = {compound_id: position for position, compound_id in enumerate(Compounds)}

# Dense (compounds x targets) matrix of measured values. Every cell starts at
# the sentinel 10, which is also the default MissingVal of masked_loss_function.
DTI = np.full((nC, nT), 10, dtype=float)

# Only the training interactions are written in; each record is
# [target ID, compound ID, value].
for target_id, compound_id, value in Interactions_train:
    DTI[Labels_Comp[compound_id], Labels_Targ[target_id]] = value

# Mean and standard deviation of the values over ALL interactions (train + validation).
values = [record[2] for record in Interactions]
print("New data: {0} | {1}".format(np.mean(values), np.std(values)))

# Read the compound fingerprints: compound ID -> list of integer digits.
# Storing one Python list per fingerprint is simple but not memory-efficient.
Fingerprints = {}
with open('Compound_Fingerprints.tab', 'r') as f:
    header = f.readline()  # consume the first line so the loop starts at the data rows
    for line in f:
        # each line is Comp-ID, SMILES, FP
        fields = line.split()
        compound_id, bitstring = fields[0], fields[2]
        if bitstring == 'NOFP':
            # compounds without a fingerprint are left out of the dictionary
            continue
        Fingerprints[compound_id] = [int(bit) for bit in bitstring]
print("%d fingerprints were loaded!" % len(Fingerprints))

# The train/validation split is given by the two input files, so nothing is
# split here; this only re-seeds NumPy's global RNG and reports the set sizes.
random_seed = 2019
np.random.seed(random_seed)

n_train, n_valid = len(Interactions_train), len(Interactions_valid)
print("The sizes for train and validation sets are {0} and {1} respectivelly".format(n_train, n_valid))

There are 110 targets and 23167 compounds currently loaded with 56392 interactions.
A DTI matrix would be 2.213% dense!
New data: -4.604582905766776 | 2.5887050795505413
23167 fingerprints were loaded!
The sizes for train and validation sets are 45114 and 11278 respectivelly


In [None]:
import tensorflow as tf 
# tf.optimizers.Adam(learning_rate)
def MTL(lamda=0.02, wsl=200, whl=20, lr=0.0001):
    """Build and compile the multi-task regression network (variant without dropout).

    Architecture: a 2048-wide input feeds one shared tanh layer; every entry of
    the global `Targets` list then gets its own tanh hidden layer followed by a
    single linear output unit, so the model has len(Targets) outputs.

    Parameters
    ----------
    lamda : float
        L2 penalty applied to the kernels of the shared and per-target hidden layers.
    wsl : int
        Width of the shared layer.
    whl : int
        Width of each per-target hidden layer.
    lr : float
        Learning rate of the Adam optimizer.

    Returns
    -------
    A compiled Keras model using `masked_loss_function` as the loss of every output.
    """
    inputs = keras.Input(shape=(2048,))
    shared = keras.layers.Dense(wsl, activation='tanh', kernel_regularizer=regularizers.l2(lamda))(inputs)
    # NOTE(review): this constant initialises the output *kernel*, not the bias;
    # if the aim was to start predictions near -4, confirm this is intended.
    output_init = keras.initializers.Constant(-4.)
    outputs = []
    for _ in range(len(Targets)):
        head = Dense(units=whl, activation='tanh', kernel_regularizer=regularizers.l2(lamda))(shared)
        outputs.append(Dense(1, kernel_initializer=output_init, activity_regularizer=regularizers.l1(0.0001))(head))

    model = Model(inputs=inputs, outputs=outputs)
    # Honour the `lr` argument (it used to be ignored in favour of a hard-coded
    # 0.0001) and pass it as `learning_rate`, since `lr` is a deprecated alias.
    model.compile(loss=masked_loss_function, optimizer=tf.optimizers.Adam(learning_rate=lr))
    return model

def masked_loss_function(y_true, y_pred, MissingVal=10):
    """Mean squared error that ignores the error at unknown entries.

    Entries of `y_true` equal to `MissingVal` (10 by default) are treated as
    missing: both the true and the predicted tensors are multiplied by a 0/1
    mask, so those positions yield a zero error term. The mask only zeroes the
    terms; it does not remove them from the mean.
    """
    is_known = K.not_equal(y_true, MissingVal)
    keep = K.cast(is_known, K.floatx())
    masked_true = y_true * keep
    masked_pred = y_pred * keep
    return keras.losses.mean_squared_error(masked_true, masked_pred)

def MTL_Drop(wsl, whl, drop_rate=0.1, lr=0.001):
    """Build and compile the multi-task network with dropout after the shared layer.

    Architecture: a 2048-wide input feeds one shared tanh layer of `wsl` units,
    followed by dropout; every entry of the global `Targets` list then gets its
    own tanh hidden layer of `whl` units (L2 penalty 0.05) and a single linear
    output unit, so the model has len(Targets) outputs.

    The dropout layer is called with training=True, so it stays active at
    prediction time as well and repeated predictions for one input differ.

    Parameters
    ----------
    wsl : int
        Width of the shared layer.
    whl : int
        Width of each per-target hidden layer.
    drop_rate : float
        Dropout rate applied to the shared layer's output.
    lr : float
        Learning rate of the Adam optimizer.

    Returns
    -------
    A compiled Keras model using `masked_loss_function` as the loss of every output.
    """
    inputs = keras.Input(shape=(2048,))
    shared = keras.layers.Dense(wsl, activation='tanh')(inputs)
    dropped = keras.layers.Dropout(drop_rate)(shared, training=True)
    output_init = keras.initializers.Constant(-4.)
    outputs = []
    for _ in range(len(Targets)):
        head = Dense(units=whl, activation='tanh', kernel_regularizer=regularizers.l2(0.05))(dropped)
        outputs.append(Dense(1, kernel_initializer=output_init, activity_regularizer=regularizers.l1(0.0001))(head))

    model = Model(inputs=inputs, outputs=outputs)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
    # triggers a warning on construction.
    model.compile(loss=masked_loss_function, optimizer=tf.optimizers.Adam(learning_rate=lr))
    return model
from scipy.stats import sem
from scipy.stats import t as tstat

def mulpredict(model, x_test, Ntargets, N=10, conf_flag=False, conf_level=0.95):
    """Average N repeated predictions of a multi-output model for one input.

    Parameters
    ----------
    model : object with a `predict(x)` method that returns one array per
        target; only element [0][0] of each array is used.
    x_test : input passed unchanged to `model.predict`.
    Ntargets : int
        Number of outputs returned by the model.
    N : int
        Number of prediction passes to average.
    conf_flag : bool
        If True, also return the half-width of a confidence interval.
    conf_level : float
        Two-sided confidence level of that interval, strictly between 0 and 1.

    Returns
    -------
    The per-target mean over the N passes (array of length Ntargets). With
    conf_flag=True, a tuple (mean, half_width), where half_width is the
    standard error times the Student-t quantile with N-1 degrees of freedom.
    With N < 2 the standard error is undefined and half_width is NaN.

    Raises
    ------
    ValueError if conf_flag is set and conf_level is not strictly between 0 and 1.
    """
    if conf_flag and not 0 < conf_level < 1:
        raise ValueError("conf_level must be strictly between 0 and 1")

    preds = np.zeros((N, Ntargets))
    for i in range(N):
        preds[i, :] = [output[0][0] for output in model.predict(x_test)]

    # column-wise average: one value per target
    mean_pred = np.mean(preds, axis=0)
    if not conf_flag:
        return mean_pred

    std_err = sem(preds, axis=0)
    h = std_err * tstat.ppf((1 + conf_level) / 2, len(preds) - 1)
    return mean_pred, h
def Evaluate(Inter_list, Comp_list, Model, Fingerprints, Ntar=110, Niter=10):
    """Predict every known (target, compound) pair of `Inter_list` and report R2.

    Parameters
    ----------
    Inter_list : iterable of records indexed as [target ID, compound ID, true value].
        If a (target, compound) pair occurs more than once, the last value wins.
    Comp_list : iterable of compound IDs to evaluate, processed in this order.
        Compounds without any record in `Inter_list` are skipped.
    Model : the model handed to `mulpredict` (this parameter name shadows the
        imported keras `Model` class inside the function).
    Fingerprints : dict mapping compound ID to its fingerprint; a compound
        that has records but no fingerprint raises KeyError.
    Ntar : int
        Number of model outputs, forwarded to `mulpredict`.
    Niter : int
        Number of prediction passes per compound, forwarded to `mulpredict`.

    Returns
    -------
    List of [target ID, compound ID, true value, predicted value]. The
    prediction for a target is looked up through the global `Labels_Targ`.
    """
    # Group the records per compound in one pass instead of rescanning the
    # whole interaction list for every compound.
    known = {}
    for tokens in Inter_list:
        known.setdefault(tokens[1], {})[tokens[0]] = tokens[2]

    Predictions = []
    report_every = 1000
    next_report = report_every
    for test_case in Comp_list:
        measured = known.get(test_case)
        if not measured:
            continue
        # we've got some values for this compound, now produce predictions:
        preds = mulpredict(Model, np.array(Fingerprints[test_case]).reshape(1, -1), Ntar, Niter)
        for target, true_value in measured.items():
            Predictions.append([target, test_case, true_value, preds[Labels_Targ[target]]])

        # Report each time the pair count passes another multiple of 1000.
        # Testing for an exact multiple could skip a report, repeat one, or
        # score an empty list.
        if len(Predictions) >= next_report:
            r2 = r2_score([x[2] for x in Predictions], [x[3] for x in Predictions])
            print("\rMore than ", len(Predictions), " pairs have been parsed. Mean performance so far =", r2, end=" ")
            next_report = (len(Predictions) // report_every + 1) * report_every
    print(" ")

    if not Predictions:
        # nothing to score; r2_score would raise on empty input
        print("No (target, compound) pairs could be evaluated.")
        return Predictions

    r2 = r2_score([x[2] for x in Predictions], [x[3] for x in Predictions])
    print("Performance for MTL-D NN = %f" % r2)
    return Predictions
# (Re)build the dense (compounds x targets) matrix of measured values, exactly
# as in the data-loading cell. Every cell starts at the sentinel 10, the
# default MissingVal of masked_loss_function.
DTI = np.full((nC, nT), 10, dtype=float)

# Only the training interactions are written in; each record is
# [target ID, compound ID, value].
for target_id, compound_id, value in Interactions_train:
    DTI[Labels_Comp[compound_id], Labels_Targ[target_id]] = value
DTI.shape

(23167, 110)

2. Model selection for the MTL model with dropout (grid search scored on a hold-out validation split)


In [None]:
BS=64; Nepochs=50

# Results are appended to this text file as the search progresses; start it empty.
filetosave = "Cross-val-MTLD"+str(Nepochs)+'.'+str(BS)+".txt"
with open(filetosave, 'w') as f:
    f.write('')

param_grid={'wsl':[200,300,2000],'whl':[100,50,20], 'drop_rate':[0.05,0.1,0.2]}

# The training data are identical for every configuration, so build them once.
X_all = np.array([Fingerprints[x] for x in Compounds])
Y_all = [column for column in DTI.T]  # one target vector per model output

Loss_history_MTLD = {}
best_loss = np.inf  # any finite loss beats this, so best_params is always filled
best_params = {}
# NOTE(review): MTL_Drop is called with its default lr (0.001) here, while the
# final training cell passes lr=0.0001 - confirm which one is intended.
for wsl in param_grid['wsl']:
    for whl in param_grid['whl']:
        for dr in param_grid['drop_rate']:
            MTLR = MTL_Drop(wsl=wsl, whl=whl, drop_rate=dr)
            # validation_split holds out the LAST 25% of the rows (no shuffling
            # before the split). use_multiprocessing is not passed: it has no
            # effect on in-memory arrays and newer Keras versions reject it.
            history = MTLR.fit(X_all, Y_all, epochs=Nepochs, batch_size=BS,
                               validation_split=0.25, verbose=0).history
            # Score a configuration on validation losses only: the mean of the
            # final-epoch per-output validation losses. Fall back to the total
            # 'val_loss' when no per-output entries are logged.
            val_keys = [key for key in history if key.startswith('val_') and key != 'val_loss']
            if not val_keys:
                val_keys = ['val_loss']
            loss = np.mean([history[key][-1] for key in val_keys])
            print("Fitting for (shared, hidden, drop_rate)=({0},{1},{2}) is {3}".format(wsl, whl, dr, loss) )
            # append right away so finished runs survive an interrupted search
            with open(filetosave, 'a') as f:
                f.write("Fitting for (shared, hidden, drop_rate)=({0},{1},{2}) is {3}\n".format(wsl, whl, dr, loss))
            if loss < best_loss:
                best_loss = loss
                best_params['wsl'] = wsl
                best_params['whl'] = whl
                best_params['drop_rate'] = dr
            Loss_history_MTLD[(wsl,whl,dr)] = history

  super(Adam, self).__init__(name, **kwargs)


Fitting for (shared, hidden, drop_rate)=(200,100,0.05) is 0.7788933850388934


KeyboardInterrupt: ignored

In [None]:
# Hyperparameters set by hand; this replaces whatever the grid search stored in best_params.
# NOTE(review): presumably taken from an earlier, completed search, since the
# recorded run of the search cell was interrupted - confirm.
best_params = {'wsl': 200, 'whl': 20, 'drop_rate': 0.1}

In [None]:
# Display the hyperparameters that the final training cell reads.
best_params

{'drop_rate': 0.1, 'whl': 20, 'wsl': 200}

In [None]:
wsl=best_params['wsl']; whl=best_params['whl']; dr=best_params['drop_rate']
print(wsl,whl,dr,Nepochs,BS)

# we assume that model selection has already been performed!
# NOTE(review): lr=0.0001 here differs from the default lr (0.001) that the
# grid-search cell trains with - confirm this is intended.
MTLD = MTL_Drop(wsl, whl, dr, lr=0.0001)
t0=time()
# Train on every compound; only training interactions are present in DTI, all
# other cells hold the missing-value sentinel. use_multiprocessing is not
# passed: it has no effect on in-memory arrays and newer Keras versions reject it.
MTLD.fit( np.array([Fingerprints[x] for x in Compounds]), [x for x in DTI.T], epochs=Nepochs, batch_size=BS, verbose=0 )
print("Training length with {0} epochs and BS={1} is {2}".format(Nepochs, BS, time()-t0))

# Pass the real number of model outputs instead of relying on Evaluate's default of 110.
Predictions_MTLD = Evaluate( Interactions_valid, Compounds, MTLD, Fingerprints, Ntar=len(Targets) )
# each prediction record is [target, compound, true value, predicted value]
valid_true = [x[2] for x in Predictions_MTLD]
valid_pred = [x[3] for x in Predictions_MTLD]
print("MSE- accuracy for MTL NN = %f" % MSE(valid_true, valid_pred) )

with open(filetosave, 'a') as f:
    # end the record with a newline so later appends start on their own line
    f.write('R2 on validation set = {0:.6f}\n'.format(r2_score(valid_true, valid_pred)))

name = 'MTLD-'+str(wsl)+'-'+str(whl)+'-'+str(dr)+'-model.h5'
print('Saving model as '+name)
MTLD.save(name)

200 20 0.1 50 64


  super(Adam, self).__init__(name, **kwargs)


Training length with 50 epochs and BS=64 is 924.1095578670502
More than  11000  pairs have been parsed. Mean performance so far = 0.6331742163553805  
Performance for MTL-D NN = 0.631359
MSE- accuracy for MTL NN = 2.440843
Saving model as MTLD-200-20-0.1-model.h5
