In [None]:
import numpy as np
import pandas as pd
import keras
import os, pickle

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE

from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers
from keras.wrappers.scikit_learn import KerasRegressor # this is for making a model like every other in scikit
from keras.models import load_model, model_from_json
# from sklearn.decomposition import TruncatedSVD as tSVD

import  matplotlib.pyplot as plt
from time import time

# ---- Notebook-wide configuration ----

# Seed NumPy's global RNG so repeated runs draw the same random numbers.
random_seed = 2022
np.random.seed(random_seed)

# Number of cross-validation folds and parallel workers handed to the grid searches.
nfolds, njobs = 4, 3

# Directory prefix under which the pickled per-target models are cached.
# (An alternative location is kept below for reference.)
pathtosaved = 'D:/Sem8_FYP/TrainedModals/'
#pathtosaved = 'D:/Sem8_FYP/Kanner/'
print(pathtosaved)

In [None]:
##

In [None]:
def _read_interactions(path):
    """Parse a whitespace-separated interaction file.

    Every non-blank line must start with three columns:
    'Target-ID', 'Compound-ID', 'pIC50'.

    Returns a list of [target_id, compound_id, pIC50] records, with pIC50
    converted to float.
    Raises FileNotFoundError if `path` does not exist, and ValueError if a
    line has fewer than three columns or a non-numeric third column.
    """
    records = []
    with open(path, 'r') as f:
        for lineno, line in enumerate(f, start=1):
            tokens = line.split()
            if not tokens:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            if len(tokens) < 3:
                raise ValueError("%s, line %d: expected 3 columns, got %d" % (path, lineno, len(tokens)))
            records.append([tokens[0], tokens[1], float(tokens[2])])
    return records


# Both files are required. Previously a missing train file was skipped silently
# and the notebook failed a few lines later with a NameError; now open() raises
# a FileNotFoundError that names the missing file.
print("Loading train/valid sets...")
Interactions_train = _read_interactions("Interactions_Trainset.tab")
Interactions_valid = _read_interactions("Interactions_Validset.tab")

# Whole set = train followed by validation (new list; the two inputs are untouched).
Interactions = list(Interactions_train)
Interactions.extend(Interactions_valid)

# Values are printed in the order announced by the header: whole, train, validation.
_pic50_whole = [x[2] for x in Interactions]
_pic50_train = [x[2] for x in Interactions_train]
_pic50_valid = [x[2] for x in Interactions_valid]
print("Basic stats about whole - train - validation sets:")
print( np.mean(_pic50_whole), '\t', np.mean(_pic50_train), '\t', np.mean(_pic50_valid) )
print( np.std(_pic50_whole) , '\t', np.std(_pic50_train) , '\t', np.std(_pic50_valid)  )

In [None]:
# Tabulate every loaded interaction (train + validation) as one frame.
DF = pd.DataFrame(Interactions, columns=['Target-ID', 'Compound-ID', 'Std-value'])

# Per-target row counts, sorted by the count column, give the target ordering.
temp = (
    DF.groupby(['Target-ID'])
      .agg('count')
      .sort_values(by='Compound-ID')
)
Targets = list(temp.index)
Compounds = np.unique(DF['Compound-ID'])

nT, nC = len(Targets), len(Compounds)

print("There are {0} targets and {1} compounds currently loaded with {2} interactions.".format(nT, nC, len(Interactions)))
# Share of the nT x nC target/compound grid that has a recorded interaction.
density_percent = 100.0*len(Interactions)/nT/nC
print("A DTI matrix would be {0:.4}% dense!".format(density_percent))


# Compound-ID -> fingerprint as a list of ints (one int per character of the FP column).
Fingerprints = {}
with open('Compound_Fingerprints.tab', 'r') as f:
    header = f.readline()  # first line is a header row, not data
    for line in f:
        # each line is Comp-ID, SMILES, FP
        tokens = line.split()
        bitstring = tokens[2]
        if bitstring == 'NOFP':
            # compound has no fingerprint: leave it out of the lookup
            continue
        Fingerprints[tokens[0]] = list(map(int, bitstring))
print("%d fingerprints were loaded!" % len(Fingerprints))

#del temp, DF, Interactions

## Random Forests

In [None]:
# Train (or load from cache) one Random Forest regressor per target.
#   Target_info     : per-target dict of bookkeeping values (reset here)
#   RF_all          : target -> fitted RandomForestRegressor
#   Scores_RF_train : score() of each model on its own training data
Target_info = {}

RF_all = dict()
Scores_RF_train=[]
count=0
# NOTE(review): `iid=` (GridSearchCV) and max_features='auto' are rejected by recent
# scikit-learn releases -- confirm the pinned version before upgrading.
param_grid={'n_estimators':[10,25,50,100,150], 'max_depth':[3,4,5,7,10,15,20], 'max_features':['sqrt','auto']}

# Group the training interactions by target once, instead of rescanning the
# whole training list for every target. Order within each target is preserved.
train_by_target = {}
for point in Interactions_train:
    train_by_target.setdefault(point[0], []).append(point)

for target in Targets:
    Target_info[target] = {}

    # define the train set for this target
    points = train_by_target.get(target, [])
    X_train = [Fingerprints[p[1]] for p in points]
    Y_train = [float(p[2]) for p in points]
    Target_info[target]['train_size']=len(Y_train) # add info

    # only targets with more than 40 training points get a model
    if len(Y_train)>40:
        model_path = pathtosaved+'RF_'+target+'_'+'pIC50new.sav'
        if os.path.isfile(model_path):
            # model is already trained - just load
            # NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
            with open(model_path, 'rb') as f:
                RFR = pickle.load( f )
        else:
            print("training...")

            cvr = GridSearchCV(RandomForestRegressor(random_state=2019), param_grid, cv=nfolds, n_jobs=njobs, iid=True)
            cvr.fit(X_train, Y_train)

            # train the best parametrisation on the complete train set
            RFR = RandomForestRegressor( n_estimators= cvr.best_params_['n_estimators'],max_features=cvr.best_params_['max_features'], max_depth=cvr.best_params_['max_depth'], random_state=2019)
            RFR.fit(X_train,Y_train)
            # save model; the context manager guarantees the file is flushed and closed
            with open(model_path, 'wb') as f:
                pickle.dump(RFR, f)
        RF_all[target] = RFR
        Scores_RF_train.append( RFR.score( X_train,  Y_train))
        Target_info[target]['RF_train_r2'] = Scores_RF_train[-1] # add info
    else:
        print("Not enough data for %s" % target)
    if count%25==0:
        print("More than %d targets are processed" % count)
        # No model may exist yet (small targets are skipped); report nan
        # without triggering NumPy's empty-mean RuntimeWarning.
        print("Mean score so far: %f" % (np.mean(Scores_RF_train) if Scores_RF_train else float('nan')))
    count+=1

print("Mean score for RF during training = %f" % (np.mean(Scores_RF_train) if Scores_RF_train else float('nan')) )

## Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

# Train (or load from cache) one Lasso regressor per target.
#   LR_all          : target -> fitted Lasso model
#   Scores_LR_train : score() of each model on its own training data
LR_all = dict()
Scores_LR_train=[]
param_grid={'alpha':[1, 0.5, 0.1, 0.01]}
count=0

# Group the training interactions by target once, instead of rescanning the
# whole training list for every target. Order within each target is preserved.
train_by_target = {}
for point in Interactions_train:
    train_by_target.setdefault(point[0], []).append(point)

for target in Targets:
    # define the train set
    points = train_by_target.get(target, [])
    X_train = [Fingerprints[p[1]] for p in points]
    Y_train = [float(p[2]) for p in points]

    # NOTE(review): unlike the Random Forest cell, there is no minimum
    # training-size check here -- confirm every target has enough points for
    # `nfolds`-fold cross-validation.
    model_path = pathtosaved+'LR_'+target+'_'+'pIC50new.sav'
    if os.path.isfile(model_path):
        # model is already trained - just load
        # NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
        with open(model_path, 'rb') as f:
            LR = pickle.load( f )
    else:
        print("cross validation")
        cvr = GridSearchCV(Lasso(random_state=2019, max_iter=3000), param_grid, cv=nfolds, n_jobs=njobs, iid=True)
        cvr.fit(X_train, Y_train)
        # select best parametrisation and train on the complete train set
        LR = Lasso( alpha= cvr.best_params_['alpha'], max_iter=3000, random_state=2019)
        LR.fit(X_train,Y_train)
        # save model; the context manager guarantees the file is flushed and closed
        with open(model_path, 'wb') as f:
            pickle.dump(LR, f)

    Scores_LR_train.append( LR.score( X_train,  Y_train))
    Target_info[target]['LR_train_r2'] = Scores_LR_train[-1] # add info
    LR_all[target] = LR
    if count%25==0:
        print("More than %d targets are processed" % count)
        print("Mean score so far: %f" % np.mean(Scores_LR_train))
    count+=1

print("Mean score for LR during training = %f" % np.mean(Scores_LR_train) )

## Neural Networks


In [None]:
# Train (or load from cache) one MLP regressor per target.
#   NN_all          : target -> fitted MLPRegressor
#   Scores_NN_train : score() of each model on its own training data
NN_all = dict()
Scores_NN_train=[]
# (50,) is a one-element tuple; the previous `(50)` was just the integer 50.
param_grid={'hidden_layer_sizes':[(50,),(100,20),(100,50),(500,20,10)] }
count=0

# Group the training interactions by target once, instead of rescanning the
# whole training list for every target. Order within each target is preserved.
train_by_target = {}
for point in Interactions_train:
    train_by_target.setdefault(point[0], []).append(point)

for target in Targets:
    # define the train set
    points = train_by_target.get(target, [])
    X_train = [Fingerprints[p[1]] for p in points]
    Y_train = [float(p[2]) for p in points]

    model_path = pathtosaved+'NN_'+target+'_'+'pIC50new.sav'
    if os.path.isfile(model_path):
        # model is already trained - just load
        # NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
        with open(model_path, 'rb') as f:
            MLPR = pickle.load( f )
    else:
        # NOTE(review): nothing here checks that there is enough data for
        # `nfolds`-fold CV (the Random Forest cell requires > 40 points) -- confirm.
        cvr = GridSearchCV(MLPRegressor(activation='tanh', solver='lbfgs', random_state=2019), param_grid, cv=nfolds, n_jobs=njobs, iid=True)
        cvr.fit(X_train, Y_train)
        # select best parametrisation and train to the complete train-set
        MLPR = MLPRegressor( hidden_layer_sizes = cvr.best_params_['hidden_layer_sizes'], activation='tanh', solver='lbfgs', random_state=2019)
        MLPR.fit(X_train,Y_train)
        # save model; the context manager guarantees the file is flushed and closed
        with open(model_path, 'wb') as f:
            pickle.dump(MLPR, f)
    NN_all[target] = MLPR
    Scores_NN_train.append( MLPR.score( X_train,  Y_train))
    Target_info[target]['NN_train_r2'] = Scores_NN_train[-1] # add info
    if count%25==0:
        print("More than %d targets are processed" % count)
        print("Mean score so far: %f" % np.mean(Scores_NN_train))
    count+=1

print("Mean score for NN during training = %f" % np.mean(Scores_NN_train))

### Evaluating RF model

In [14]:

# Predict every validation interaction with its target's Random Forest model.
#   Pred_RF        : raw predict() outputs, one per validation point
#   True_vals      : measured values, aligned with Pred_RF
#   Pred_pertarget : target -> list of (true value, predicted value) pairs
Pred_RF, True_vals = [], []
Pred_pertarget = {}

# Accumulated prediction time per method; only the RF timer is updated in this cell.
Time_RF = Time_NN = Time_LR = Time_my = 0

with open("SingleTL_final_results.txt",'w') as f:

    f.write("Target\tCompound\tTrue\tRFR\n")
    for point in Interactions_valid:
        # point = [ target, compound, pIC50 ]
        target_id, compound_id = point[0], point[1]
        True_vals.append( float(point[2]) )
        # a single sample must be a 1 x n_features array for predict()
        x_test = np.array( Fingerprints[compound_id] ).reshape(1,-1)

        # time the model lookup together with the prediction itself
        t0 = time()
        model = RF_all[target_id]
        Pred_RF.append( model.predict( x_test ) )
        Time_RF += time() - t0

        predicted = Pred_RF[-1][0]
        f.write("{0}\t{1}\t{2}\t{3}\n".format(target_id, compound_id, point[2], predicted))

        # setdefault creates the list the first time a target is seen
        Pred_pertarget.setdefault(target_id, []).append( (True_vals[-1], predicted) )

print("Performance for RF = %f" % r2_score( True_vals, Pred_RF ))


Performance for RF = 0.648058


In [15]:
# Accumulated RF prediction time per validation point, scaled to 1000 predictions.
rf_time_per_1000 = 1000*Time_RF/len(Interactions_valid)
print("RF: Duration per 1000 predictions = {0}".format(rf_time_per_1000))

RF: Duration per 1000 predictions = 17.15110715641834


In [16]:
# R2 of the Random Forest predictions, computed separately for every target
# that has validation predictions.
Scores_RF_valid_pertarget = []

for target, pairs in Pred_pertarget.items():
    # split the stored (true, predicted) pairs into two parallel lists
    true = [pair[0] for pair in pairs]
    pred_RF = [pair[1] for pair in pairs]

    info = Target_info[target]
    info['test_size'] = len(true) # add info

    # calculate performance for this target
    r2 = r2_score(true, pred_RF)
    info['RF_valid_r2'] = r2 # add info
    Scores_RF_valid_pertarget.append( r2 )

    print("R2 score for {0}, RF = {1:.2f}".format(target, r2))

R2 score for CHEMBL260, RF = 0.59
R2 score for CHEMBL4722, RF = 0.68
R2 score for CHEMBL2695, RF = 0.72
R2 score for CHEMBL3038477, RF = 0.44
R2 score for CHEMBL2996, RF = 0.51
R2 score for CHEMBL2148, RF = 0.74
R2 score for CHEMBL2147, RF = 0.66
R2 score for CHEMBL5147, RF = 0.42
R2 score for CHEMBL308, RF = 0.51
R2 score for CHEMBL3234, RF = 0.54
R2 score for CHEMBL4523, RF = 0.62
R2 score for CHEMBL2358, RF = 0.31
R2 score for CHEMBL1936, RF = 0.46
R2 score for CHEMBL3629, RF = 0.47
R2 score for CHEMBL279, RF = 0.51
R2 score for CHEMBL1824, RF = 0.60
R2 score for CHEMBL203, RF = 0.58
R2 score for CHEMBL1862, RF = 0.64
R2 score for CHEMBL3553, RF = 0.50
R2 score for CHEMBL3529, RF = 0.55
R2 score for CHEMBL299, RF = 0.83
R2 score for CHEMBL4040, RF = 0.33
R2 score for CHEMBL2828, RF = 0.79
R2 score for CHEMBL2599, RF = 0.62
R2 score for CHEMBL3973, RF = 0.65
R2 score for CHEMBL1957, RF = 0.64
R2 score for CHEMBL2095942, RF = 0.29
R2 score for CHEMBL2185, RF = 0.63
R2 score for CHEMBL