# Virtual screening (VS) of compounds in clinical trials for SARS-CoV-2 using QSAR models of SARS-CoV M<sup>pro</sup>

## Importing modules and functions        
    

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import _pickle as cPickle
import gzip

from BalanceBySim import *
from stats import *

from collections import Counter

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.model_selection import permutation_test_score, StratifiedKFold

import warnings


def warn(*_args, **_kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments, does nothing."""
    return None


# Silence warnings twice over: install an "ignore" filter and replace
# warnings.warn itself with the no-op above.
warnings.filterwarnings("ignore")
warnings.warn = warn

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
%reload_ext autoreload
%autoreload 2
Draw.DrawingOptions.atomLabelFontFace = "DejaVu Sans"
Draw.DrawingOptions.atomLabelFontSize = 18

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Import screening data

In [2]:
# Gzipped SDF holding the compounds to screen
file = '../datasets/curated_data/clinical_trials.sdf.gz'

# Parse the SDF into a DataFrame; RDKit molecules are stored in the 'ROMol' column
sdfInfo = {'molColName': 'ROMol'}
moldf = PandasTools.LoadSDF(file, **sdfInfo)

# Rename the molecule column, keep only rows that have a molecule, and drop
# the 'StandardizerResult' column when the file carries one
moldf = moldf.rename(columns={'ROMol': 'Mol'})
moldf = moldf.loc[moldf['Mol'].notna()]
moldf = moldf.drop(columns='StandardizerResult', errors='ignore')

In [3]:
# Columns
print('Kept data: ', moldf.shape)
moldf.head(1)
from molvs.validate import Validator
fmt = '%(asctime)s - %(levelname)s - %(validation)s - %(message)s'
validator = Validator(log_format=fmt)
# Validator.validate() checks ONE RDKit molecule per call. Passing the whole
# DataFrame (as before) does not validate any structure, so each molecule is
# validated on its own and every message is kept with its row label.
# Each result is consumed before the next validate() call is made.
problems = [
    (row_label, message)
    for row_label, mol in moldf['Mol'].items()
    for message in validator.validate(mol)
]
print('\n Problematic structures: \n', problems)

Kept data:  (23, 4)


Unnamed: 0,Compound_name,InChIKey,ID,Mol
0,Sildenafil,BNRNXUUZRGQAQC-UHFFFAOYSA-N,,



 Problematic structures: 
 []


### Import SiRMS descriptors

##### Import descriptors of the training set

In [4]:
# Training-set SiRMS descriptors; their column names are the feature list
# used below to select the screening-set columns.
train_desc = pd.read_csv(
    '../descriptors/sirms-chembl-sars-cov-3C-like-proteinase-processed.txt',
    sep='\t',
)
desc_list = list(train_desc.columns)
train_desc.head()

Unnamed: 0,Fr1(chg)/A,Fr1(chg)/B,Fr1(chg)/C,Fr1(elm)/Cl,Fr1(elm)/F,Fr1(elm)/N,Fr1(elm)/O,Fr1(elm)/S,Fr1(lip)/A,Fr1(lip)/B,...,"S_A(type)/C.AR_H_H_N.AM/2_4s,3_4s/4","S_A(type)/C.AR_H_N.AM_N.AM/1_3s,2_4s/3","S_A(type)/C.AR_H_N.AM_O.2/1_3s,2_3s/4","S_A(type)/C.AR_H_N.AR_N.AR/1_2s,1_4a/4","S_A(type)/C.AR_H_N.AR_O.3/1_2s,1_3a/4","S_A(type)/C.AR_H_N.AR_O.3/1_3a,2_3s/4","S_A(type)/C.AR_N.AR_N.AR_O.3/1_2a,1_3a/4","S_A(type)/C.AR_N.AR_N.AR_O.3/1_3a,2_3a/4","S_A(type)/C.AR_N.AR_O.3_S.3/1_2a,1_4s/4","S_A(type)/H_N.2_N.3_O.2/1_3s,2_4d/3"
0,14,10,36,0,0,7,7,0,8,12,...,0,0,0,1,3,2,1,0,0,0
1,9,16,37,0,0,7,3,0,8,7,...,0,3,3,3,0,0,0,0,0,0
2,12,13,34,0,0,7,6,0,7,10,...,3,0,0,0,0,0,0,0,0,0
3,5,22,26,0,0,2,3,0,2,6,...,0,0,0,0,0,0,0,0,0,0
4,18,10,30,0,0,6,12,0,9,16,...,6,0,0,0,0,0,0,0,0,0


##### Import descriptors of the VS set

In [6]:
# SiRMS descriptors of the screening set: skip the first two lines of the
# file, then discard the first two columns.
vs_desc = pd.read_csv(
    '../descriptors/sirms-clinical-trials.txt',
    sep='\t',
    skiprows=[0, 1],
)
vs_desc = vs_desc.drop(columns=vs_desc.columns[:2])
vs_desc.head()

Unnamed: 0,Fr1(chg)/A,Fr1(chg)/B,Fr1(chg)/C,Fr1(chg)/D,Fr2(chg)/A_A/1_2a/,Fr2(chg)/A_B/1_2a/,Fr2(chg)/A_B/1_2s/,Fr2(chg)/A_C/1_2a/,Fr2(chg)/A_C/1_2s/,Fr2(chg)/A_D/1_2a/,...,"S_A(type)/N.AR_O.2_O.2_S.O2/2_4d,3_4d/4","S_A(type)/N.AR_O.2_O.3_P.3D/2_4d,3_4s/4","S_A(type)/N.AR_O.3_O.3_P.3D/2_4s,3_4s/4","S_A(type)/O.2_O.2_O.2_S.O2/2_4d,3_4d/4","S_A(type)/O.2_O.2_O.3_P.3D/2_4d,3_4s/4","S_A(type)/O.2_O.2_O.3_S.O2/1_4d,2_4d/4","S_A(type)/O.2_O.3_O.3_P.3D/1_4d,2_4s,3_4s/5","S_A(type)/O.2_O.3_O.3_P.3D/1_4d,3_4s/4","S_A(type)/O.2_O.3_O.3_P.3D/2_4s,3_4s/4","S_A(type)/O.3_O.3_O.3_P.3D/2_4s,3_4s/4"
0,10,7,37,9,1,0,0,0,6,2,...,2,0,0,1,0,1,0,0,0,0
1,10,16,38,11,0,0,0,0,4,0,...,0,0,0,1,0,4,0,0,0,0
2,9,0,7,13,1,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,6,9,35,4,0,0,0,2,6,0,...,0,0,0,0,0,0,0,0,0,0
4,9,26,48,11,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,0,0


#### Align VS descriptors with the model (add missing descriptors as zeros, drop unused ones)

In [7]:
# Training descriptors absent from the screening table are added as all-zero
# columns, then the columns are selected in the training order.
miss_desc = train_desc.columns.difference(vs_desc.columns).tolist()
miss_desc = pd.DataFrame(0, index=range(vs_desc.shape[0]), columns=miss_desc)
vs_desc = pd.concat([vs_desc, miss_desc], axis=1)
X_vs = vs_desc.loc[:, desc_list]
X_vs.shape

(23, 1262)

# Virtual screening SiRMS

##### Load the model

In [8]:
# Load the gzip-compressed, pickled SiRMS model.
# NOTE: unpickling can execute arbitrary code; only load model files you trust.
with gzip.open('../model/sars-cov-3clpro-sirms_RF_ad_balanced.pgz', mode='rb') as model_file:
    model = cPickle.load(model_file)

##### Predict molecules

In [9]:
%%time
ad_threshold = 0.70

y_pred = model.predict(X_vs)
ad = model.predict_proba(X_vs)
ad = np.amax(ad, axis=1) >= ad_threshold

CPU times: user 61 ms, sys: 9.19 ms, total: 70.2 ms
Wall time: 219 ms


In [10]:
# 'Prediction' holds the predicted class for every compound; 'AD' repeats it
# (as an integer value) only where `ad` is True and is NaN elsewhere.
# The column is built in one step because the former `pred.AD[mask] = ...`
# chained assignments raise SettingWithCopyWarning and are not guaranteed to
# write back to `pred` (they do not under pandas copy-on-write).
pred = pd.DataFrame({
    'Prediction': y_pred,
    'AD': np.where(ad, np.asarray(y_pred).astype(int), np.nan),
})

In [11]:
# Rows with a non-missing 'AD' value, and the share of compounds they represent
pred_ad = pred.dropna(axis=0, how='any').astype(int)
coverage_ad = 100 * len(pred_ad) / len(pred)

# Class counts for all predictions and for the AD subset
print('VS pred: %s' % Counter(pred['Prediction']))
print('VS pred AD: %s' % Counter(pred_ad['Prediction']))
print('Coverage of AD: %.2f%%' % coverage_ad)

VS pred: Counter({0: 14, 1: 9})
VS pred AD: Counter({1: 4, 0: 2})
Coverage of AD: 26.09%


### Combine predictions with compound data

In [12]:
# Place the SiRMS predictions next to the compound table (aligned on the row index)
sirms_predictions = pd.concat((moldf, pred), axis='columns')

### Import DRAGON descriptors

##### Import descriptors of the training set

In [13]:
# Training-set Dragon descriptors; their column names are the feature list
# used below to select the screening-set columns.
train_desc = pd.read_csv(
    '../descriptors/dragon-chembl-sars-cov-3C-like-proteinase-processed.txt',
    sep='\t',
)
desc_list = list(train_desc.columns)
train_desc.head()

Unnamed: 0,AECC,ALOGP,ALOGP2,AMW,ATS4m,ATSC1s,ATSC3e,B01[C-Cl],B01[C-S],B01[N-O],...,nCconj,nCrs,nCrt,nCsp2,nR06,nR=Cs,nRCONR2,nRCOOR,nThiophenes,piPC08
0,14.718,-1.294,1.675,6.897,4.419,26.457,1.207,0,0,0,...,0,0,0,9,0,0,0,0,0,4.953
1,10.789,3.748,14.05,7.263,4.435,12.193,0.642,0,0,0,...,0,0,0,19,2,0,1,0,0,7.126
2,14.514,0.317,0.1,6.864,4.317,26.744,1.05,0,0,1,...,1,0,0,9,0,0,0,0,0,4.894
3,12.187,4.107,16.863,7.143,3.955,12.555,0.391,0,0,0,...,3,0,0,22,3,2,0,0,0,5.923
4,14.8,-3.732,13.929,7.695,4.561,78.238,1.917,0,0,0,...,0,0,0,14,1,0,0,0,0,5.424


##### Import descriptors of the VS set

In [15]:
# Dragon descriptors of the screening set, with the first column discarded
vs_desc = pd.read_csv(
    '../descriptors/dragon-clinical_trials.txt',
    sep='\t',
)
vs_desc = vs_desc.drop(columns=vs_desc.columns[:1])
vs_desc.head()

Unnamed: 0,MW,AMW,Sv,Se,Sp,Si,Mv,Me,Mp,Mi,...,Psychotic-80,Psychotic-50,Hypertens-80,Hypertens-50,Hypnotic-80,Hypnotic-50,Neoplastic-80,Neoplastic-50,Infective-80,Infective-50
0,474.65,7.534,38.496,63.6,40.637,71.73,0.611,1.01,0.645,1.139,...,0,0,1,0,0,0,1,0,0,0
1,547.74,7.303,45.21,75.694,47.79,84.939,0.603,1.009,0.637,1.133,...,0,0,0,0,0,0,0,0,0,0
2,244.24,8.422,17.767,30.578,17.341,33.701,0.613,1.054,0.598,1.162,...,0,0,0,0,0,0,0,0,0,0
3,477.46,8.842,34.721,54.094,37.512,60.369,0.643,1.002,0.695,1.118,...,0,0,0,0,0,0,0,0,0,0
4,628.89,6.69,56.249,93.483,60.046,106.175,0.598,0.994,0.639,1.13,...,0,0,0,0,0,0,0,0,0,0


#### Align VS descriptors with the model (add missing descriptors as zeros, drop unused ones)

In [16]:
# Training descriptors absent from the screening table are added as all-zero
# columns, then the columns are selected in the training order.
miss_desc = train_desc.columns.difference(vs_desc.columns).tolist()
miss_desc = pd.DataFrame(0, index=range(vs_desc.shape[0]), columns=miss_desc)
vs_desc = pd.concat([vs_desc, miss_desc], axis=1)
# Coerce every descriptor to a number; values that cannot be parsed become 0
X_vs = (
    vs_desc.loc[:, desc_list]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)
X_vs.shape

(23, 378)

# Virtual screening Dragon

##### Load the model

In [17]:
# Load the gzip-compressed, pickled Dragon model.
# NOTE: unpickling can execute arbitrary code; only load model files you trust.
with gzip.open('../model/sars-cov-3clpro-dragon_RF_ad_balanced.pgz', mode='rb') as model_file:
    model = cPickle.load(model_file)

##### Predict molecules

In [18]:
%%time
ad_threshold = 0.70

y_pred = model.predict(X_vs)
ad = model.predict_proba(X_vs)
ad = np.amax(ad, axis=1) >= ad_threshold

CPU times: user 188 ms, sys: 52.6 ms, total: 241 ms
Wall time: 209 ms


In [19]:
# 'Prediction' holds the predicted class for every compound; 'AD' repeats it
# (as an integer value) only where `ad` is True and is NaN elsewhere.
# The column is built in one step because the former `pred.AD[mask] = ...`
# chained assignments raise SettingWithCopyWarning and are not guaranteed to
# write back to `pred` (they do not under pandas copy-on-write).
pred = pd.DataFrame({
    'Prediction': y_pred,
    'AD': np.where(ad, np.asarray(y_pred).astype(int), np.nan),
})

In [20]:
# Rows with a non-missing 'AD' value, and the share of compounds they represent
pred_ad = pred.dropna(axis=0, how='any').astype(int)
coverage_ad = 100 * len(pred_ad) / len(pred)

# Class counts for all predictions and for the AD subset
print('VS pred: %s' % Counter(pred['Prediction']))
print('VS pred AD: %s' % Counter(pred_ad['Prediction']))
print('Coverage of AD: %.2f%%' % coverage_ad)

VS pred: Counter({0: 22, 1: 1})
VS pred AD: Counter({0: 3})
Coverage of AD: 13.04%


### Combine predictions with compound data

In [21]:
# Place the Dragon predictions next to the compound table (aligned on the row index)
dragon_predictions = pd.concat((moldf, pred), axis='columns')

# Consensus predictions

In [22]:
# Give each model's prediction columns a model-specific name
sirms = sirms_predictions.rename(columns={'Prediction': 'sirms', 'AD': 'sirms_ad'})
dragon = dragon_predictions.rename(columns={'Prediction': 'dragon', 'AD': 'dragon_ad'})

# One table: compound data + SiRMS columns + Dragon columns, without 'ID'
dragon_cols = dragon.loc[:, ['dragon', 'dragon_ad']]
predictions = pd.concat([sirms, dragon_cols], axis=1).drop(columns='ID')

In [23]:
# Preview the first five rows of the combined table
predictions.head(n=5)

Unnamed: 0,Compound_name,InChIKey,Mol,sirms,sirms_ad,dragon,dragon_ad
0,Sildenafil,BNRNXUUZRGQAQC-UHFFFAOYSA-N,,0,,0,
1,Darunavir,CJBJHOAVZSMMDJ-UHFFFAOYNA-N,,0,,0,
2,Ribavirin,IWUCXVSUMQZMFG-UHFFFAOYNA-N,,1,,0,
3,Arbidol,KCFYEAOKVJSACF-UHFFFAOYSA-N,,0,,0,0.0
4,Lopinavir,KJHKTHWMRKYKJE-UHFFFAOYNA-N,,1,1.0,1,


In [23]:
def _vote_to_label(mean_vote):
    """Map the mean of 0/1 votes to 1 (> 0.5), 0 (< 0.5) or NaN (tie or no vote)."""
    return np.where(mean_vote > 0.5, 1, np.where(mean_vote < 0.5, 0, np.nan))


# Consensus over all predictions: both models vote; a 1-vs-0 split (mean 0.5) is NaN.
predictions['consensus'] = _vote_to_label((predictions.sirms + predictions.dragon) / 2)

# Consensus restricted to the AD columns. The row mean skips NaN, so:
#   both values present -> mean of the two (a split gives NaN),
#   one value present   -> that value,
#   none present        -> NaN.
# Vectorised instead of looping over range(n) with label-based Series[i]
# look-ups, which only works when the index is exactly 0..n-1.
ad_votes = predictions[['sirms_ad', 'dragon_ad']].astype(float)
predictions['consensus_ad'] = _vote_to_label(ad_votes.mean(axis=1))

In [24]:
# Turn the 0/1 codes into readable labels (NaN is left untouched).
# Done as an explicit assignment: calling `.replace(..., inplace=True)` on
# `predictions[col]` is a chained operation that is not guaranteed to modify
# `predictions`. Only the prediction columns are relabelled; the compound
# name, InChIKey and molecule columns are left as they are.
label_cols = ['sirms', 'sirms_ad', 'dragon', 'dragon_ad', 'consensus', 'consensus_ad']
predictions[label_cols] = predictions[label_cols].replace({0: 'Inactive', 1: 'Active'})

In [25]:
# Final column order for display
column_order = [
    'Compound_name', 'InChIKey', 'Mol',
    'sirms_ad', 'sirms',
    'dragon', 'dragon_ad',
    'consensus', 'consensus_ad',
]
predictions = predictions.loc[:, column_order]
predictions

Unnamed: 0,Compound_name,InChIKey,Mol,sirms_ad,sirms,dragon,dragon_ad,consensus,consensus_ad
0,Sildenafil,BNRNXUUZRGQAQC-UHFFFAOYSA-N,,,Inactive,Inactive,,Inactive,
1,Darunavir,CJBJHOAVZSMMDJ-UHFFFAOYNA-N,,,Inactive,Inactive,,Inactive,
2,Ribavirin,IWUCXVSUMQZMFG-UHFFFAOYNA-N,,,Active,Inactive,,,
3,Arbidol,KCFYEAOKVJSACF-UHFFFAOYSA-N,,,Inactive,Inactive,Inactive,Inactive,Inactive
4,Lopinavir,KJHKTHWMRKYKJE-UHFFFAOYNA-N,,Active,Active,Active,,Active,Active
5,Fingolimod,KKGQTZUTZRNORY-UHFFFAOYSA-N,,,Inactive,Inactive,,Inactive,
6,Thymosin,LCJVIYPJPCBWKS-UHFFFAOYNA-N,,,Inactive,Inactive,Inactive,Inactive,Inactive
7,Nitric Oxide,MWUXSHHQAYIFBG-UHFFFAOYSA-N,,,Inactive,Inactive,,Inactive,
8,Ritonavir,NCDNCNXCDXHOMX-UHFFFAOYNA-N,,Active,Active,Inactive,,,Active
9,Bromhexine,OJGDCBLYJGHCIH-UHFFFAOYSA-N,,,Inactive,Inactive,,Inactive,


## Export Predictions

In [None]:
# Every other project path in this notebook (input data, descriptors, models)
# is relative to the parent directory, so the export goes there as well; the
# previous path lacked the leading '../'.
# NOTE(review): confirm that '../datasets/screened_compounds/' exists.
output_path = '../datasets/screened_compounds/clinical-trials_qsar_pred-sirms-dragon.xlsx'

# Columns written to the sheet (the molecule column is not exported)
export_cols = ['Compound_name', 'InChIKey', 'sirms', 'sirms_ad', 'dragon', 'dragon_ad', 'consensus', 'consensus_ad']

with pd.ExcelWriter(output_path) as writer:
    predictions.to_excel(writer, columns=export_cols, sheet_name='consensus', index=False)