#### This notebook ranks the top 100 CAS antiviral compounds that fall within the applicability domain of the models, as described in applicability_domain_and_blinded_molecules_pIC50_computation_from_best_models.ipynb

In [1]:
from keras.models import Sequential,model_from_json
from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D,UpSampling2D, Dropout
from keras.callbacks import EarlyStopping,ModelCheckpoint
import pandas as pd
import numpy as np
import sys
import pickle
pd.set_option('display.max_rows', None)
from scipy.stats import norm
import math
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from scipy import stats
import math
from scipy.stats import chisquare
from urllib.request import urlopen
import re
from numpy import percentile

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Ensemble members: the models ranked best by test-set R^2. This is the same list that is used in
# applicability_domain_and_blinded_molecules_pIC50_computation_from_best_models.ipynb.
best_models = {
    "best_models": [
        '1', '10', '13', '14', '15', '18', '20',
        '21', '23', '26', '28', '3', '31', '32',
        '35', '38', '39', '40', '42', '45', '49',
        '5', '54', '56', '57', '59', '63', '9',
        '24', '53', '62', '16', '61', '4', '7',
    ]
}

In [3]:
# Load the scaled molecular descriptors of the CAS antiviral compounds.
cas_antiviral_csv = pd.read_csv(
    filepath_or_buffer='molecular_descriptors_csv/min_max_scaled_cas_antiviral_molecular_descriptors.csv'
)

In [4]:
# Split the table into the compound labels and the descriptor matrix.
compound_names = cas_antiviral_csv.loc[:, 'Name']

# .assign() returns a new frame, so the padding column is never written into a slice of
# cas_antiviral_csv (this avoids pandas' SettingWithCopyWarning).
# The all-zero 'zeros' column pads every descriptor row so that it fills a 35x32 image.
df_attributes = cas_antiviral_csv.loc[:, 'nAcid':'Zagreb'].assign(zeros=0)

# Guard the reshape below: with a different column count np.reshape(-1, ...) could still succeed
# and silently mix values from neighbouring compounds into one image.
assert df_attributes.shape[1] == 35 * 32, (
    "expected 35*32 = 1120 columns after padding, got %d" % df_attributes.shape[1]
)

# One single-channel 35x32 image per compound.
df_attributes_imgs = np.reshape(np.array(df_attributes), (-1, 35, 32, 1))

In [5]:
# Predict a value for every CAS antiviral compound with each ensemble model, once on the scaled
# axis and once mapped back through the stored scaler.

# NOTE(review): pickle.load can execute arbitrary code contained in the file; only load scaler
# files that come from a trusted source.
with open('scaler_data/scaler.dat', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

predicted_antiviral_values_modelwise = []
predicted_antiviral_values_modelwise_inverse_transform = []
model_names = []

for key, values in best_models.items():
    for value in values:
        model_names.append(key + "," + value)

        # Rebuild the network from its JSON architecture, then load the trained weights.
        with open(key + '/model_' + value + '.json') as json_file:
            loaded_model_json = json_file.read()
        loaded_model = model_from_json(loaded_model_json)
        # NOTE(review): the architecture is read from 'model_<id>.json' but the weights from
        # 'model<id>.h5' (no underscore) - confirm this matches the files on disk.
        loaded_model.load_weights(key + '/model' + value + '.h5')

        # Flatten the model output to one prediction per compound.
        predicted_antiviral_values = np.reshape(loaded_model.predict(df_attributes_imgs), -1)

        # The scaler is fed the descriptor columns (without the 'zeros' padding) followed by the
        # predictions as a trailing 'values' column; presumably this mirrors the column layout
        # the scaler was fitted on - TODO confirm.
        # .copy() keeps the new column from being written into a slice of df_attributes.
        antiviral_data_predicted = df_attributes.loc[:, 'nAcid':'Zagreb'].copy()
        antiviral_data_predicted['values'] = predicted_antiviral_values
        antiviral_data_inverse_transform_predicted = scaler.inverse_transform(antiviral_data_predicted)
        # Only the last column (the inverse-transformed prediction) is kept.
        antiviral_value_predicted_inverse_transform = antiviral_data_inverse_transform_predicted[:, -1]

        predicted_antiviral_values_modelwise.append(predicted_antiviral_values)
        predicted_antiviral_values_modelwise_inverse_transform.append(antiviral_value_predicted_inverse_transform)

# Rows are compounds, columns are models (named "<key>,<model id>").
df_modelwise_antiviral = pd.DataFrame(np.transpose(np.array(predicted_antiviral_values_modelwise)),columns = model_names)
# pickle.dump(df_modelwise_antiviral,open('df_modelwise_antiviral_prediction','wb'))

df_modelwise_antiviral_inverse_transform = pd.DataFrame(np.transpose(np.array(predicted_antiviral_values_modelwise_inverse_transform)),columns = model_names)
# pickle.dump(df_modelwise_antiviral_inverse_transform,open('df_modelwise_antiviral_prediction_inverse_transform','wb'))




#### Distance to model for the CAS antiviral compounds. The distance-to-model algorithm is described in applicability_domain_and_blinded_molecules_pIC50_computation_from_best_models.ipynb

In [14]:
num_of_models = len(model_names)

# Non-parametric 95% interval of the ensemble predictions (rows: compounds, columns: models).
df_modelwise_antiviral_inverse_transform_array = np.array(df_modelwise_antiviral_inverse_transform)
two_point_five_percentile, ninety_seven_point_five_percentile = (
    np.percentile(df_modelwise_antiviral_inverse_transform_array, q, axis=1) for q in (2.5, 97.5)
)
lower_df = pd.DataFrame({'lower': two_point_five_percentile})
upper_df = pd.DataFrame({'upper': ninety_seven_point_five_percentile})

# The ensemble median is the reported prediction for each compound.
median = df_modelwise_antiviral_inverse_transform.median(axis=1).values.reshape(-1, 1)
median_df = pd.DataFrame(median, columns=['predicted_pIC50_cas_values'])

# Per compound, sort the model predictions and keep the central slice.
sorted_df_modelwise_antiviral_inverse_transform_array = np.sort(
    df_modelwise_antiviral_inverse_transform_array, axis=1
)
lower_index = int(round(num_of_models * 0.025))
# NOTE(review): the upper cut-off uses 0.95, not 0.975, so the slice spans the 2.5%-95% range
# rather than a symmetric middle 95%. Left unchanged because threshold_dm is applied to values
# produced by this formula - confirm against the companion applicability-domain notebook.
upper_index = int(round(num_of_models * 0.95))

mid_ninety_five_percent_values = sorted_df_modelwise_antiviral_inverse_transform_array[
    :, lower_index:upper_index
]
number_of_values_in_mid_95_percent = len(mid_ninety_five_percent_values[0])

# Repeat the median so it has the same width as the retained predictions.
median_tiled = np.tile(median, (1, number_of_values_in_mid_95_percent))

# Distance to model = ||median - retained predictions|| / ||tiled median||, row by row.
subtract_median_from_predicted_values = np.sqrt(
    np.square(median_tiled - mid_ninety_five_percent_values).sum(axis=1, keepdims=True)
)
magnitude_of_median_vector = np.sqrt(np.square(median_tiled).sum(axis=1, keepdims=True))

weighted_subtraction_values = subtract_median_from_predicted_values / magnitude_of_median_vector
distance_to_model = pd.DataFrame(
    weighted_subtraction_values,
    columns=['distance_to_ensemble_models_for_cas_compounds'],
)

In [15]:
# Side-by-side table: compound name, ensemble-median prediction, distance to model.
cas_antiviral_predicted_df_with_compound_names_and_dm_info = pd.concat(
    objs=[compound_names, median_df, distance_to_model],
    axis='columns',
)

In [18]:
# Predictions whose distance to model is at or above threshold_dm are treated as unreliable;
# only compounds strictly below the threshold are kept.
threshold_dm = 0.3
cas_antiviral_predicted_df_with_compound_names_and_dm_info = (
    cas_antiviral_predicted_df_with_compound_names_and_dm_info.loc[
        lambda frame: frame['distance_to_ensemble_models_for_cas_compounds'] < threshold_dm
    ]
)

In [19]:
# The 100 remaining compounds (distance to model below the threshold) with the highest
# predicted pIC50, in descending order of prediction.
cas_top_100 = cas_antiviral_predicted_df_with_compound_names_and_dm_info.nlargest(
    n=100,
    columns='predicted_pIC50_cas_values',
)
cas_top_100

Unnamed: 0,Name,predicted_pIC50_cas_values,distance_to_ensemble_models_for_cas_compounds
16702,"148312-35-0:1,3,5-Triazin-2-amine, N,4-dicyclo...",-0.177451,0.296504
16693,"148238-42-0:Furo[3Ã¢â‚¬Â²,4Ã¢â‚¬Â²:6,7]naphtho...",-0.314458,0.196064
35646,"39497-40-0:Hydrazinecarboximidothioic acid, 2-...",-0.376986,0.265043
21346,"173987-47-8:Thymidine, 3Ã¢â‚¬Â²-azido-3Ã¢â‚¬Â²...",-0.559947,0.219156
35517,"38748-49-1:Inosine, 8-[[(2-methylphenyl)methyl...",-0.60622,0.078232
35633,394729-77-2:3-Azabicyclo[3.1.0]hexane-2-carbox...,-0.60714,0.277818
44247,"865104-59-2:Carbamic acid, [(1S,2R)-3-[[[(3Z)-...",-0.642733,0.154817
5092,"1184957-98-9:2-Naphthalenesulfonic acid, 5-ami...",-0.665529,0.218043
20371,"1651207-29-2:L-Alanine-3,3,3-d3, N-[[P(S),2Ã¢â...",-0.728287,0.248241
2537,"1075210-13-7:Carbamic acid, N-[(1S,2R)-2-hydro...",-0.734387,0.250679


In [20]:
# Each 'Name' entry has the form "<CAS id>:<compound name>"; split on the first colon only,
# because the compound name itself may contain colons.
cas_ids_and_names = list(cas_top_100['Name'])
id_name_pairs = [entry.split(':', 1) for entry in cas_ids_and_names]
cas_ids = [pair[0] for pair in id_name_pairs]
comp_names = [pair[1] for pair in id_name_pairs]
print(cas_ids)

['148312-35-0', '148238-42-0', '39497-40-0', '173987-47-8', '38748-49-1', '394729-77-2', '865104-59-2', '1184957-98-9', '1651207-29-2', '1075210-13-7', '1184942-58-2', '1228966-45-7', '2243714-80-7', '2242826-11-3', '1184942-57-1', '2242826-17-9', '2242429-36-1', '1192538-53-6', '672288-88-9', '1184942-74-2', '1184942-60-6', '2230747-29-0', '2243581-19-1', '2243715-67-3', '69264-38-6', '1948282-09-4', '2242429-39-4', '2242453-16-1', '1185110-50-2', '2243715-83-3', '2243715-86-6', '144848-58-8', '502623-93-0', '723263-22-7', '937397-88-1', '1951464-81-5', '1312908-77-2', '853792-82-2', '2243245-34-1', '2243245-37-4', '2363060-74-4', '2243245-33-0', '2243245-35-2', '2361527-81-1', '853792-92-4', '2243079-72-1', '2243106-01-4', '1184942-48-0', '1985607-02-0', '2364589-90-0', '72687-19-5', '1453350-73-6', '612485-40-2', '103025-08-7', '890312-32-0', '1208118-97-1', '1860842-89-2', '1338781-08-0', '1418741-52-2', '622865-55-8', '1432060-78-0', '2361932-53-6', '2243245-28-3', '2243245-32-9',

In [21]:
# Function to generate SMILES from cas ids
def CIRconvert(ids, timeout=30):
    """Look up the SMILES string for one identifier via the cactus.nci.nih.gov resolver.

    Parameters
    ----------
    ids : str
        Identifier inserted into the request URL (the notebook passes CAS ids).
    timeout : float, optional
        Seconds to wait for the service before giving up, so that a single
        stalled request cannot hang the whole notebook.

    Returns
    -------
    str
        The response body decoded as UTF-8, or the literal string
        'Not available' when the lookup fails for any reason.
    """
    try:
        url = 'http://cactus.nci.nih.gov/chemical/structure/' + ids + '/smiles'
        # The context manager closes the HTTP response even if read/decode fails.
        with urlopen(url, timeout=timeout) as response:
            return response.read().decode('utf8')
    except Exception:
        # Best-effort lookup: any ordinary failure maps to the sentinel value, while
        # KeyboardInterrupt / SystemExit still propagate so the loop can be interrupted.
        return 'Not available'

In [22]:
# One resolver lookup per CAS id, in the same order as cas_ids.
smiles_from_cas_ids = [CIRconvert(cas_id) for cas_id in cas_ids]

In [25]:
# Export the top 100 CAS antiviral compounds with their predicted pIC50, distance to model and
# SMILES. Plain lists/arrays are used so the rows are matched by position and the written index
# is 0..n-1 rather than the original row labels of cas_top_100.
pd.DataFrame(
    {
        'CAS-ID': cas_ids,
        'Compound Names': comp_names,
        "Smiles from CAS IDs": smiles_from_cas_ids,
        'predicted_pIC50_cas_values': np.array(cas_top_100['predicted_pIC50_cas_values']),
        'distance_to_ensemble_models_for_cas_compounds': np.array(
            cas_top_100['distance_to_ensemble_models_for_cas_compounds']
        ),
    },
    columns=[
        'CAS-ID',
        'Compound Names',
        "Smiles from CAS IDs",
        'predicted_pIC50_cas_values',
        'distance_to_ensemble_models_for_cas_compounds',
    ],
).to_csv('pIC50_values_csv/cas_antiviral_top_100_pIC50_vals.csv')