### DRUG REPURPOSING ON CNS DRUGS

In [6]:
import numpy as np
import pandas as pd
import keras, os, pickle

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as MSE

from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from time import time

from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers
from keras.wrappers.scikit_learn import KerasRegressor # this is for making a model like every other in scikit
import  matplotlib.pyplot as plt

from tensorflow.random import set_seed

# Give NumPy's and TensorFlow's global random generators the same fixed seed
# (NumPy first, then TensorFlow) so repeated runs draw the same numbers.
for seed_fn in (np.random.seed, set_seed):
    seed_fn(42)

In [7]:
def _load_interactions(path):
    """Read a whitespace-separated interaction table into a list of rows.

    Every non-blank line must hold at least three fields, in this order:
    'Target-ID', 'Compound-ID', 'pIC50'.  The first two are kept as strings,
    the third is converted to float; any further fields are ignored.

    Parameters
    ----------
    path : str
        Location of the .tab file to read.

    Returns
    -------
    list of [str, str, float]
        One row per non-blank line, in file order.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three fields.
    ValueError
        If the third field cannot be parsed as a float.
    """
    rows = []
    with open(path, 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # blank / trailing line: nothing to parse
                continue
            rows.append([tokens[0], tokens[1], float(tokens[2])])
    return rows


Interactions_train = _load_interactions("Interactions_Trainset.tab")

# One single-element row [Compound-ID] per non-blank line of drugs.tab.
# (The variable name is misspelled, but other cells refer to it by this name.)
Anitiviral_train = []
with open("drugs.tab", 'r') as f:
    for line in f:
        tokens = line.split()
        if tokens:
            Anitiviral_train.append([tokens[0]])

Interactions_valid = _load_interactions("Interactions_Validset.tab")

# Train and validation interactions together, as a new list.
Interactions = Interactions_train + Interactions_valid

# we use a dataframe to quickly sort targets wrt #compounds:
DF = pd.DataFrame(Interactions, columns=['Target-ID', 'Compound-ID', 'Std-value'])
# rows per target, sorted ascending by that count
temp = DF.groupby(['Target-ID']).agg('count').sort_values(by='Compound-ID')
Targets = list(temp.index)
Compounds = np.unique(DF['Compound-ID'])
del temp, DF

nT = len(Targets); nC = len(Compounds)

print("There are {0} targets are present".format(nT))


# first we need to prepare each fp as a feature vector
Fingerprints = {}  # Compound-ID -> list of ints, one per fingerprint character
with open('drugs_Fingerprints.tab', 'r') as f:
    header = f.readline()  # first line is a header, not data
    for line in f:
        # each line is Comp-ID, SMILES, FP
        tokens = line.split()
        if len(tokens) < 3:
            # we keep only those compounds which have FPs
            continue
        Fingerprints[tokens[0]] = [int(c) for c in tokens[2]]

print("%d fingerprints were loaded!" % len(Fingerprints))
# data standardisation - no need after using pIC50 !
values = [x[2] for x in Interactions]

There are 110 targets are present
1690 fingerprints were loaded!


In [8]:
from scipy.stats import sem
from scipy.stats import t as tstat
def Imputer(model, NewID, Fingerprints, threshold=0.2, confidence=0.95):
    """Predict a value for each compound and keep only the confident ones.

    Every estimator in ``model.estimators_`` predicts a value for every
    compound.  For each compound, a two-sided Student-t confidence interval
    of the mean prediction is built from the spread across estimators; the
    compound is accepted when the full width of that interval (twice the
    half-width) does not exceed ``threshold``.

    Parameters
    ----------
    model : object
        Fitted ensemble exposing ``estimators_``, an iterable of objects with
        a ``predict(X)`` method (X is 2-D: one row per compound).
    NewID : iterable
        Compound IDs to evaluate; each must be a key of ``Fingerprints``.
    Fingerprints : dict
        Maps a compound ID to its feature vector (a sequence of numbers).
    threshold : float, default 0.2
        Largest accepted confidence-interval width.
    confidence : float, default 0.95
        Confidence level of the interval (previously fixed at 0.95).

    Returns
    -------
    list of [compound_id, mean_prediction]
        Accepted compounds, in the order they appear in ``NewID``.

    Raises
    ------
    KeyError
        If a compound ID has no entry in ``Fingerprints``.
    """
    compound_ids = list(NewID)
    estimators = list(model.estimators_)
    X_new = []

    # The interval needs at least two estimators (the t distribution has
    # len(estimators) - 1 degrees of freedom); with fewer, nothing is confident.
    if compound_ids and len(estimators) >= 2:
        # One batched predict() call per estimator instead of one call per
        # (compound, estimator) pair.
        X = np.array([Fingerprints[cid] for cid in compound_ids])
        tree_preds = np.array(
            [np.ravel(est.predict(X)) for est in estimators], dtype=float
        )  # shape: (n_estimators, n_compounds)

        means = tree_preds.mean(axis=0)
        # The critical value depends only on the ensemble size, so compute it once.
        t_crit = tstat.ppf((1 + confidence) / 2, len(estimators) - 1)
        half_widths = sem(tree_preds, axis=0) * t_crit

        for cid, mean, h in zip(compound_ids, means, half_widths):
            if 2 * h <= threshold:
                X_new.append([cid, mean])
                print("Interaction found for Compound-id : ", cid)
                print("Prediction Score : ", mean)

    if len(X_new) > 0:
        print("{0} new values were imputed!".format(len(X_new)))
    else:
        print("No confident values were found.")
    return X_new

In [10]:
Target_info = {} # this is a "global" variable
theta = 0.2  # passed to Imputer as its `threshold`

# Path of the pickled model for one target; {0} is replaced by the Target-ID.
# NOTE(review): absolute, machine-specific location - change it here when the
# notebook is run elsewhere.
MODEL_PATH_TEMPLATE = 'D:/Sem8_FYP/TrainedModals/RF_{0}_pIC50new.sav'


def listToString(s):
    """Join the strings in `s` into one string, separated by single spaces."""
    str1 = " "
    return (str1.join(s))


# The candidate compounds are the same for every target, so collect them once
# instead of rebuilding the lists on every loop iteration.
X_train = []; Y_train = []
Train_CIDs = []
for point in Anitiviral_train:
    # 'Abacavir' is deliberately left out.
    # NOTE(review): presumably because it has no fingerprint - confirm.
    if point[0] != 'Abacavir':
        X_train.append(Fingerprints[point[0]])
        Train_CIDs.append(point[0])

count = 1
for target in Targets:
    Target_info[target] = {}
    print("--------------------------------------------------------------------------------------------------------------------")
    print("Target {0} is: {1}".format(count, target))
    count = count + 1

    # SECURITY: pickle.load can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open(MODEL_PATH_TEMPLATE.format(target), 'rb') as f:
        MODEL = pickle.load(f)

    print("Imputing confident values...")
    # A single imputation pass per target (the former while-loop always
    # stopped after its first iteration).
    NewIDs = list(Train_CIDs)
    # NOTE(review): the result is only printed by Imputer; X_new is overwritten
    # on the next target and nothing is stored in Target_info.
    X_new = Imputer(MODEL, NewIDs, Fingerprints, threshold=theta)

--------------------------------------------------------------------------------------------------------------------
Target 1 is: CHEMBL5122
Imputing confident values...
No confident values were found.
--------------------------------------------------------------------------------------------------------------------
Target 2 is: CHEMBL2095942
Imputing confident values...
No confident values were found.
--------------------------------------------------------------------------------------------------------------------
Target 3 is: CHEMBL3116
Imputing confident values...
No confident values were found.
--------------------------------------------------------------------------------------------------------------------
Target 4 is: CHEMBL3430907
Imputing confident values...
No confident values were found.
--------------------------------------------------------------------------------------------------------------------
Target 5 is: CHEMBL4203
Imputing confident values...
No confident val