In [5]:
import numpy as np
import pandas as pd
import keras, os, pickle

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as MSE

from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from time import time

from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers
from keras.wrappers.scikit_learn import KerasRegressor # this is for making a model like every other in scikit
import  matplotlib.pyplot as plt

np.random.seed(42)
from tensorflow.random import set_seed
set_seed(42)

In [6]:
Interactions_train = []    
with open("Interactions_Trainset.tab",'r') as f:
    for line in f:
        tokens = line.split()
        # 'Target-ID', 'Compound-ID', 'pIC50'  
        Interactions_train.append( [tokens[0], tokens[1], float(tokens[2]) ])

Interactions_valid = []        
with open("Interactions_Validset.tab",'r') as f:
    for line in f:
        tokens = line.split()
        # 'Target-ID', 'Compound-ID', 'pIC50'  
        Interactions_valid.append( [tokens[0], tokens[1], float(tokens[2]) ])

Interactions = [x for x in Interactions_train]
Interactions.extend(Interactions_valid)
# we use a dataframe to quickly sort targets wrt #compounds:
DF = pd.DataFrame( Interactions, columns =['Target-ID', 'Compound-ID','Std-value']) 
temp = DF.groupby(['Target-ID']).agg('count').sort_values(by='Compound-ID') # count the number of molecules
Targets = list(temp.index)
Compounds = np.unique(DF['Compound-ID'])
del temp, DF

nT=len(Targets); nC=len(Compounds)

print("There are {0} targets and {1} compounds currently loaded with {2} interactions.".format(nT,nC,len(Interactions)))
print("A DTI matrix would be {0:.4}% dense!".format(100.0*len(Interactions)/nT/nC ))

# first we need to prepare each fp as a feature vector
Fingerprints={} # this contains one list per fingerprint - not efficient...
with open('Compound_Fingerprints.tab', 'r') as f:
    header = f.readline()
    for line in f:
        # each line is Comp-ID, SMILES, FP
        tokens = line.split()
        # we keep only those compounds which have FPs
        if tokens[2] != 'NOFP':
            fp = [int(c) for c in tokens[2] ]
            Fingerprints[ tokens[0] ] = fp
print("%d fingerprints were loaded!" % len(Fingerprints))

# data standardisation - no need after using pIC50 !
values = [x[2] for x in Interactions]
print("Stats for values : {0} | {1}".format(np.mean(values), np.std(values)))

There are 110 targets and 23167 compounds currently loaded with 56392 interactions.
A DTI matrix would be 2.213% dense!
23167 fingerprints were loaded!
Stats for values : -4.604582905766776 | 2.5887050795505413


## Random Forests with Self-Train

In [12]:
from scipy.stats import sem
from scipy.stats import t as tstat
def Imputer(model, NewID, Fingerprints, threshold=0.2):
    
    X_new = []
    for cid in NewID:
        x_test = np.array( Fingerprints[cid] ).reshape(1,-1)
        preds=[]
        for DTR in model.estimators_:
            preds.append(DTR.predict(x_test) )
        std_err = sem(preds)
        h = std_err * tstat.ppf((1 + 0.95) / 2, len(preds) - 1)
        if 2*h<=threshold:
            X_new.append( [ cid, np.mean(preds)] )
    if len(X_new)>0:
        print("{0} new values were imputed!".format(len(X_new)))
    else:
        print("No confident values were found.")
    return X_new

def Evaluate_RF( TARGET, MODEL, validationset, prnt=False ):
    True_temp = []; Pred_temp = []
    with open( validationset, 'r') as file:
        # no header on this file
        for line in file:
            tokens = line.split()
            if tokens[0]==TARGET:
                True_temp.append( float(tokens[2]) )
                x_test = np.array( Fingerprints[tokens[1]] ).reshape(1,-1)
                Pred_temp.append( MODEL.predict( x_test ) )
    r2 = r2_score(True_temp,Pred_temp)
    if prnt:
        print("R2-score after {0} points = {1:.4f} ".format(len(True_temp), r2 ) )
    return r2

In [None]:
Target_info = {} # this is a "global" variable
theta=0.1
count=0
for target in Targets:
    Target_info[target] = {}

    # define the train set
    X_train=[]; Y_train=[]
    Train_CIDs = []
    for point in Interactions:
        if point[0]==target:
            X_train.append( Fingerprints[point[1]] )
            Y_train.append( float(point[2]) )
            Train_CIDs.append( point[1] )
    Target_info[target]['train_size']=len(Y_train) # add info
    
    with open( 'D:/Sem8_FYP/TrainedModals/RF_'+target+'_'+'pIC50new.sav', 'rb') as f:
            MODEL = pickle.load( f )
            
    Target_info[target]['first_r2'] = MODEL.score( X_train,  Y_train) # add info
    print("Evaluation without imputation = %.4f " % Target_info[target]["first_r2"] )    
    
    print("Imputing confident values...")
    X_new = [1] # just a trigger for the next loop
    while  (len(X_new)>0) & (len(Train_CIDs)<2000):
        # we need to stop after we have no new predictions or we have enough (10K+)
        # update the train set
        NewIDs = [x for x in Compounds if x not in Train_CIDs] # terra incognito
        X_new = Imputer(MODEL, NewIDs, Fingerprints, threshold=theta)
        
        for point in X_new:
            X_train.append( Fingerprints[point[0]] )
            Y_train.append( float(point[1]) )
            Train_CIDs.append( point[0] )
        # re-train
        MODEL.fit(np.array( X_train ),Y_train)

    Target_info[target]["model"] = MODEL
    # evaluate again as before 
    True_temp = []
    Pred_temp = []
    for point in Interactions_valid:
        if point[0]==target:
            True_temp.append( float(point[2]) )
            x_test = np.array( Fingerprints[point[1]] ).reshape(1,-1)
            Pred_temp.append( MODEL.predict( x_test ) )
    Target_info[target]["after_r2"] = r2_score(True_temp,Pred_temp)
    print("Re-evaluate after imputation: %.4f " % Target_info[target]["after_r2"] )
    
    if count%25==0:
        print("More than %d targets are processed" % count)
        temp = np.mean( [Target_info[t]["after_r2"] for t in Target_info.keys()] )
        print("Mean score so far: %f" %  temp)
    count+=1
print("Overall accuracy for self-trained RF = ",  np.mean( [Target_info[t]["after_r2"] for t in Targets] ))

Evaluation without imputation = 0.8617 
Imputing confident values...
No confident values were found.
Re-evaluate after imputation: 0.9250 
More than 0 targets are processed
Mean score so far: 0.924978
Evaluation without imputation = 0.6512 
Imputing confident values...
No confident values were found.
Re-evaluate after imputation: 0.8970 
Evaluation without imputation = 0.8648 
Imputing confident values...
No confident values were found.
Re-evaluate after imputation: 0.8834 
Evaluation without imputation = 0.8377 
Imputing confident values...
No confident values were found.
Re-evaluate after imputation: 0.8971 
Evaluation without imputation = 0.6192 
Imputing confident values...
No confident values were found.
Re-evaluate after imputation: 0.7425 
Evaluation without imputation = 0.9859 
Imputing confident values...
No confident values were found.
Re-evaluate after imputation: 0.9984 
Evaluation without imputation = 0.8527 
Imputing confident values...
No confident values were found.
Re-