In [None]:
import numpy as np
import pandas as pd
import keras, os, pickle

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as MSE

from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from time import time

from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers
from keras.wrappers.scikit_learn import KerasRegressor # this is for making a model like every other in scikit
import  matplotlib.pyplot as plt

# Seed NumPy's global random generator so NumPy-driven randomness repeats across runs.
np.random.seed(42)
from tensorflow.random import set_seed 
# Alternative import for older TensorFlow releases (presumably the 1.x API - verify before re-enabling):
# from tensorflow import set_random_seed
# Seed TensorFlow's global random generator with the same value.
set_seed(42)

In [None]:
def load_interactions(path):
    """Read a whitespace-separated interaction file into a list of records.

    The first three columns of every non-empty line are read as
    'Target-ID', 'Compound-ID' and 'pIC50'; any further columns are ignored.

    Parameters
    ----------
    path : str
        Location of the .tab file to read.

    Returns
    -------
    list of [str, str, float]
        One ``[target_id, compound_id, value]`` entry per data line, in file order.

    Raises
    ------
    IndexError
        If a non-empty line has fewer than three columns.
    ValueError
        If the third column cannot be parsed as a float.
    """
    records = []
    with open(path, 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # tolerate blank lines (e.g. a trailing empty line at the end of the file)
                continue
            records.append([tokens[0], tokens[1], float(tokens[2])])
    return records


Interactions_train = load_interactions("Interactions_Trainset.tab")
Interactions_valid = load_interactions("Interactions_Validset.tab")

# train + validation together: used for the target/compound vocabularies and the summary stats
Interactions = Interactions_train + Interactions_valid

# we use a dataframe to quickly sort targets wrt #compounds:
DF = pd.DataFrame(Interactions, columns=['Target-ID', 'Compound-ID', 'Std-value'])
temp = DF.groupby(['Target-ID']).agg('count').sort_values(by='Compound-ID')  # count the number of molecules
Targets = list(temp.index)                  # targets ordered by ascending number of compounds
Compounds = np.unique(DF['Compound-ID'])    # sorted array of distinct compound ids
del temp, DF

nT = len(Targets); nC = len(Compounds)

print("There are {0} targets and {1} compounds currently loaded with {2} interactions.".format(nT,nC,len(Interactions)))
print("A DTI matrix would be {0:.4}% dense!".format(100.0*len(Interactions)/nT/nC ))

# first we need to prepare each fp as a feature vector
Fingerprints = {}  # compound id -> list of ints (one per fingerprint character) - not efficient...
with open('Compound_Fingerprints.tab', 'r') as f:
    next(f, None)  # skip the header line
    for line in f:
        # each line is Comp-ID, SMILES, FP
        tokens = line.split()
        if not tokens:
            continue  # tolerate blank lines
        # we keep only those compounds which have FPs
        if tokens[2] != 'NOFP':
            Fingerprints[tokens[0]] = [int(c) for c in tokens[2]]
print("%d fingerprints were loaded!" % len(Fingerprints))

# Later cells index Fingerprints by every entry of Compounds, so report gaps here
# instead of failing there with a bare KeyError.
missing_fp = [c for c in Compounds if c not in Fingerprints]
if missing_fp:
    print("WARNING: %d compounds have no fingerprint, e.g. %s" % (len(missing_fp), missing_fp[:5]))

# data standardisation - no need after using pIC50 !
values = [x[2] for x in Interactions]
print("Stats for values : {0} | {1}".format(np.mean(values), np.std(values)))

There are 110 targets and 23167 compounds currently loaded with 56392 interactions.
A DTI matrix would be 2.213% dense!
23167 fingerprints were loaded!
Stats for values : -4.604582905766776 | 2.5887050795505413


In [None]:
from scipy.stats import sem
from scipy.stats import t as tstat
from keras import Model
import keras
import keras.backend as K
import tensorflow as tf 
def mulpredict(model, x_test, Ntargets, N=10, conf_flag=False, confidence=0.95):
    """Average N repeated predictions of a multi-output model for one input.

    ``model.predict(x_test)`` is called N times. Each call must return an
    iterable with one entry per target, and the element ``[0][0]`` of every
    entry (first sample, first value) is used as that target's prediction.
    Repeating the call is only informative when the model is stochastic at
    prediction time.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(x_test)`` with the output layout above.
    x_test : array-like
        Input handed to ``model.predict`` unchanged.
    Ntargets : int
        Number of outputs produced by the model.
    N : int, default 10
        Number of repeated predictions.
    conf_flag : bool, default False
        When True, also return the half-width of a Student-t confidence
        interval for the mean of the N predictions of each target.
    confidence : float, default 0.95
        Confidence level of that interval; must lie strictly between 0 and 1.
        Only used when ``conf_flag`` is True.

    Returns
    -------
    numpy.ndarray of shape (Ntargets,)
        Column-wise mean of the N predictions, when ``conf_flag`` is False.
    (numpy.ndarray, numpy.ndarray)
        ``(mean, half_width)``, both of shape (Ntargets,), when ``conf_flag`` is True.

    Raises
    ------
    ValueError
        If ``conf_flag`` is True and ``confidence`` is not in (0, 1).
    """
    if conf_flag and not 0.0 < confidence < 1.0:
        raise ValueError("confidence must be strictly between 0 and 1, got %r" % (confidence,))

    preds = np.zeros((N, Ntargets))
    for i in range(N):
        preds[i, :] = [head[0][0] for head in model.predict(x_test)]

    # we need the column-wise average of this matrix
    mean_pred = np.mean(preds, axis=0)
    if not conf_flag:
        return mean_pred

    std_err = sem(preds, axis=0)
    # two-sided interval: t quantile with N-1 degrees of freedom times the standard error
    h = std_err * tstat.ppf((1 + confidence) / 2, N - 1)
    return mean_pred, h
    
def Evaluate(Inter_list, Comp_list, Model, Fingerprints, Ntar=110, Niter=10, target_index=None):
    """Predict every known (target, compound) pair and report the R2 score.

    For each compound of ``Comp_list`` that has at least one entry in
    ``Inter_list``, the model is queried once through ``mulpredict`` and the
    prediction of every target with a known value is collected.

    Parameters
    ----------
    Inter_list : iterable of [target_id, compound_id, value]
        Interactions with known values. If a (target, compound) pair occurs
        more than once, the last value is kept.
    Comp_list : iterable
        Compound ids to evaluate, processed in this order.
    Model : object
        Model handed to ``mulpredict``. Inside this function the name shadows
        the ``Model`` class imported from keras.
    Fingerprints : dict
        Maps a compound id to its feature vector.
    Ntar : int, default 110
        Number of model outputs, forwarded to ``mulpredict``.
    Niter : int, default 10
        Number of repeated predictions, forwarded to ``mulpredict``.
    target_index : dict, optional
        Maps a target id to its position in the model output. Defaults to the
        module-level ``Labels_Targ``.

    Returns
    -------
    list of [target_id, compound_id, true_value, predicted_value]
        One entry per evaluated pair; empty when nothing could be evaluated.
    """
    if target_index is None:
        target_index = Labels_Targ

    # Group the known values by compound in a single pass, so the interaction
    # list is not rescanned for every compound.
    known = {}  # compound id -> {target id: true value}
    for tokens in Inter_list:
        known.setdefault(tokens[1], {})[tokens[0]] = tokens[2]

    Predictions = []
    y_true, y_pred = [], []
    next_report = 1000
    for test_case in Comp_list:
        per_target = known.get(test_case)
        if not per_target:
            continue
        # we've got some values for this compound, now produce predictions:
        preds = mulpredict(Model, np.array(Fingerprints[test_case]).reshape(1, -1), Ntar, Niter)
        for target, true_val in per_target.items():
            pred_val = preds[target_index[target]]
            Predictions.append([target, test_case, true_val, pred_val])
            y_true.append(true_val)
            y_pred.append(pred_val)
        # Report once per completed thousand. This never scores an empty list
        # and does not rescore when no new predictions were added.
        if len(Predictions) >= next_report:
            done = (len(Predictions) // 1000) * 1000
            r2 = r2_score(y_true, y_pred)
            print(f"\rMore than ", done," predictions have been parsed. Mean performance so far =",r2, end=" ")
            next_report = done + 1000
    print(" ")
    if not Predictions:
        print("No predictions were produced: none of the compounds has a known interaction.")
        return Predictions
    r2 = r2_score(y_true, y_pred)
    print("Performance for MTL-D NN = %f" % r2)
    return Predictions

def masked_loss_function(y_true, y_pred):
    """Mean squared error that ignores entries whose label equals 10.

    Positions where ``y_true`` is exactly 10 get weight 0 and all others
    weight 1. Both tensors are multiplied by these weights before the squared
    error is taken, so the weighted-out positions contribute zero error.
    """
    is_labelled = K.not_equal(y_true, 10)
    weights = K.cast(is_labelled, K.floatx())
    masked_true = y_true * weights
    masked_pred = y_pred * weights
    return keras.losses.mean_squared_error(masked_true, masked_pred)

def MTL_Drop(wsl, whl, drop_rate=0.1, lr=0.0001, n_features=2048, n_targets=None):
    """Build and compile a multi-task network with always-on dropout.

    Architecture: one shared tanh layer, a dropout layer that stays active at
    prediction time (``training=True``), then for every target its own tanh
    hidden layer followed by a single linear output unit.

    Parameters
    ----------
    wsl : int
        Width of the shared layer.
    whl : int
        Width of each per-target hidden layer.
    drop_rate : float, default 0.1
        Dropout rate applied to the shared layer's output.
    lr : float, default 0.0001
        Learning rate of the Adam optimizer.
    n_features : int, default 2048
        Length of the input feature vector.
    n_targets : int, optional
        Number of output heads. Defaults to ``len(Targets)``, the module-level
        target list.

    Returns
    -------
    keras Model
        Compiled with ``masked_loss_function`` as the loss of every output.
    """
    if n_targets is None:
        n_targets = len(Targets)

    inputs = keras.Input(shape=(n_features,))
    sharedlayer = keras.layers.Dense(wsl, activation='tanh')(inputs)
    # training=True keeps dropout switched on in predict(), so repeated predictions differ
    dropout = keras.layers.Dropout(drop_rate)(sharedlayer, training=True)
    # every output kernel starts at -4 (presumably chosen close to the typical label value - verify)
    myinit = keras.initializers.Constant(-4.)
    outputs = []
    for _ in range(n_targets):
        hl = Dense(units=whl, activation='tanh', kernel_regularizer=regularizers.l2(0.05))(dropout)
        outputs.append(Dense(1, kernel_initializer=myinit, activity_regularizer=regularizers.l1(0.0001))(hl))

    MTL = Model(inputs=inputs, outputs=outputs)
    # `learning_rate` is the current argument name; the old `lr` alias triggers a deprecation warning
    MTL.compile(loss=masked_loss_function, optimizer=tf.optimizers.Adam(learning_rate=lr))
    return MTL

# Position of every target (column) and every compound (row) in the DTI matrix.
Labels_Targ = {target: col for col, target in enumerate(Targets)}
Labels_Comp = {compound: row for row, compound in enumerate(Compounds)}

# Dense compounds x targets matrix of training labels. Cells without a measured
# value hold the sentinel 10, which masked_loss_function excludes from the loss.
DTI = np.full((nC, nT), 10.0)

# each training edge is [target id, compound id, value]
for target_id, compound_id, value in Interactions_train:
    DTI[Labels_Comp[compound_id], Labels_Targ[target_id]] = value
DTI.shape

(23167, 110)

In [None]:
wsh=200; whl=20; dr=0.04

# we assume that CV for model selection has already been performed!
MTLDSF = MTL_Drop(wsh,whl,dr,0.0001)
t0=time()
# one label vector per target: the columns of DTI
MTLDSF.fit( np.array([Fingerprints[x] for x in Compounds]), [x for x in DTI.T], epochs=1, batch_size=64, verbose=0, use_multiprocessing=True )
print("Duration for fitting = ", time()-t0)
temp = Evaluate( Interactions_valid, Compounds, MTLDSF, Fingerprints, nT)

# a prediction is accepted as a label when its 95% confidence half-width is below theta
theta=0.1

# NOTE(review): this starts from train + validation interactions, although DTI
# only holds the training ones - confirm which size is intended.
train_size = len(Interactions)
count = 1 # just a trigger for the next loop
# NOTE(review): the model is not refitted inside this loop (the refit lives in
# the next cell), so every pass imputes with the same weights - confirm intent.
while train_size < 2e6 and count > 0:
    # we need to stop after we have no new predictions or we have enough
    count = 0
    for idx, x_new in enumerate(Compounds):
        preds, H = mulpredict(MTLDSF, np.array( Fingerprints[x_new]).reshape(1,-1), nT, 10, True)
        # impute accordingly: only cells still holding the "missing" sentinel (10)
        # and whose prediction is confident enough are overwritten
        row = Labels_Comp[x_new]
        fill = (H < theta) & (DTI[row, :] == 10)
        DTI[row, fill] = preds[fill] # update the train set
        count += int(fill.sum())
        if idx % 100 == 0:
            print(f"\rMore than", idx ,"compounds have been parsed with",count,"new values.", end =" ")
    print(count," new values where imputed.")
    train_size += count
    MTLDSF.save("MTLDSF0.h5")


# save model
MTLDSF.save("MTLDSF.h5")

  super(Adam, self).__init__(name, **kwargs)


Duration for fitting =  64.2316210269928
More than  11000  predictions have been parsed. Mean performance so far = -0.15085318008232185  
Performance for MTL-D NN = -0.159116
More than 20500 compounds have been parsed with 5 new values. 

KeyboardInterrupt: ignored

In [None]:
# Refit and re-evaluate only when the last imputation pass added new labels.
if count > 0:
    t0 = time()
    MTLDSF.fit(
        np.array([Fingerprints[c] for c in Compounds]),
        list(DTI.T),  # one label vector per target
        epochs=1,
        batch_size=128,
        verbose=0,
        use_multiprocessing=True,
    )
    print("Duration for fitting = ", time()-t0)
    t0 = time()
    temp = Evaluate(Interactions_valid, Compounds, MTLDSF, Fingerprints)
    # elapsed time scaled by (number of compounds x number of targets)
    print("Duration per 1000 predictions = ",1000*(time()-t0)/len(Compounds)/len(Targets) )

Duration for fitting =  25.075188875198364
More than  11000  predictions have been parsed. Mean performance so far = 0.04810860441275866  
Performance for MTL-D NN = 0.040442
Duration per 1000 predictions =  2.0037954349470146


In [None]:
# Save the model in its current state to MTLDSF.h5, the same path the imputation cell saves to.
MTLDSF.save("MTLDSF.h5")