# Virtual Screening of DrugBank by QSAR models of SARS-CoV (M<sup>pro</sup>) 

## Importing modules and functions        
    

In [1]:
import bz2
import gzip
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import _pickle as cPickle
import sys

sys.path.append('../')

from cheminformatics import *
from collections import Counter

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole

from sklearn import metrics

def warn(*args, **kwargs):
    pass
import warnings
warnings.filterwarnings("ignore")
warnings.warn = warn

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
%reload_ext autoreload
%autoreload 2
Draw.DrawingOptions.atomLabelFontFace = "DejaVu Sans"
Draw.DrawingOptions.atomLabelFontSize = 18

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Import screening data

In [2]:
# Set file path and format
file = '../datasets/curated_data/drugbank.sdf.gz'

# Read SDF
moldf = PandasTools.LoadSDF(file, molColName='Mol', smilesName='SMILES');
print('Original data: ', moldf.shape)

# Remove missing RDKit molecules
moldf = moldf[pd.notnull(moldf['Mol'])]
if 'StandardizerResult' in moldf.columns:
    moldf = moldf.drop(columns='StandardizerResult')

moldf.drop(columns='ID', inplace=True)
# Columns
print('Kept data: ', moldf.shape)
moldf.head(1)
from molvs.validate import Validator
fmt = '%(asctime)s - %(levelname)s - %(validation)s - %(message)s'
validator = Validator(log_format=fmt)
print('\n Problematic structures: \n', validator.validate(moldf))

RDKit ERROR: [17:15:00] Explicit valence for atom # 36 N, 5, is greater than permitted
RDKit ERROR: [17:15:00] ERROR: Could not sanitize molecule ending on line 660489


Original data:  (9614, 7)
Kept data:  (9614, 6)


Unnamed: 0,GENERIC_NAME,DRUGBANK_ID,DRUG_GROUPS,InChIKey,SMILES,Mol
0,Elenbecestat,DB15391,investigational,AACUJFVOHGRMTR-DPXNYUHVSA-N,CC1OCC2(c3cc(NC(=O)c4cnc(C(F)F)cn4)ccc3F)N=C(N...,



 Problematic structures: 
 []


### Calculate Morgan Fingerprints

In [3]:
def calcfp(mol,funcFPInfo=dict(radius=3, nBits=2048, useFeatures=False, useChirality=False)):
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, **funcFPInfo)
    fp = pd.Series(np.asarray(fp))
    fp = fp.add_prefix('Bit_')
    return fp

X_vs = moldf.Mol.apply(calcfp)
X_vs.shape

(9614, 2048)

# Virtual screening Morgan

In [4]:
##### Load the model and model parameters

with gzip.open('../model/sars-cov-3clpro-morgan_RF_ad_unbalanced.pgz', 'rb') as f:
    model = cPickle.load(f)
    
with bz2.BZ2File('../model/sars-cov-3clpro-morgan_RF_ad_unbalanced.pbz2', 'rb') as f:
    model_data = cPickle.load(f)  

X_train = model_data['Descriptors']
D_cutoff = model_data['D_cutoff']
del model_data

##### Predict molecules

In [8]:
conf_threshold = 0.7

#### Make predictions
y_pred = model.predict(X_vs)
pred_prob = model.predict_proba(X_vs)
pred_prob = np.amax(pred_prob, axis=1).round(2)

#### Estimate AD
pred_dist = calc_test_distances(X_vs, X_train)
pred_ad = []
for i in np.arange(0, len(pred_dist), 1):
    if pred_dist[i] < D_cutoff:
        pred_ad.append('Inside')
    else:
        pred_ad.append('Outside')

#### Prepare data
pred = pd.DataFrame({'Prediction': y_pred, 'Confidence': pred_prob, 'AD': pred_ad})
# AD
pred.AD[pred.AD == 'Outside'] = np.nan
pred.AD[pred.AD == 'Inside'] = pred.Prediction
pred.sort_index(inplace=True)
pred_ad = pred.dropna().astype(int)
coverage_pred = len(pred_ad) / len(pred)

# Confidence
high_conf = pred[(pred['Confidence'] >= conf_threshold)]
high_conf.sort_index(inplace=True)
high_conf_ad = high_conf.dropna().astype(int)
coverage_high_conf = len(high_conf_ad) / len(high_conf)

#### Print results
pred_count = Counter(pred.Prediction)
pred_ad_count = Counter(pred_ad.Prediction)
pred_high_conf_count = Counter(high_conf.Prediction)
pred_high_conf_count_ad = Counter(high_conf_ad.Prediction)

print('\033[1m' + 'Virtual screening results:' + '\n' + '\033[0m')
for key, value in pred_count.items():
    print('\t\t Class %d: %d' % (key, value))
for key, value in pred_ad_count.items():
    print('\t\t Class %d (AD): %d' % (key, value))

print('\t\t AD Coverage: %.2f%%\n' % coverage_pred)

for key, value in pred_high_conf_count.items():
    print('\t\t Class %d (high-confidence): %d' % (key, value))
    
for key, value in sorted(pred_high_conf_count_ad.items()):
    print('\t\t Class %d (high-confidence AD): %d' % (key, value))

[1mVirtual screening results:
[0m
		 Class 0: 7955
		 Class 1: 1659
		 Class 0 (AD): 6254
		 Class 1 (AD): 1131
		 AD Coverage: 0.77%

		 Class 0 (high-confidence): 37
		 Class 1 (high-confidence): 8
		 Class 0 (high-confidence AD): 24
		 Class 1 (high-confidence AD): 7


###  Visualize predictions

In [22]:
predictions = pd.concat([moldf, pred], axis=1)
for col in ['Prediction', 'AD']:
    predictions[col].replace(0,'Inactive',inplace=True)
    predictions[col].replace(1,'Active',inplace=True)
    
predictions[(predictions['Confidence'] >= conf_threshold) & (predictions['Prediction'] == 'Active')].sort_values(by=['Confidence'], ascending=False)

Unnamed: 0,GENERIC_NAME,DRUGBANK_ID,DRUG_GROUPS,InChIKey,SMILES,Mol,Prediction,Confidence,AD
465,Diloxanide furoate,DB14638,approved,BDYYDXJSHYEDGB-UHFFFAOYSA-N,CN(C(=O)C(Cl)Cl)c1ccc(OC(=O)c2ccco2)cc1,,Active,0.79,Active
2817,"(E)-(4S,6S)-8-METHYL-6-((S)-3-METHYL-2-{(S)-2-...",DB04595,experimental,IDBWWEGDLCFCTD-VNEMRZQUSA-N,Cc1cc(C(=O)NC(C)C(=O)NC(C(=O)NC(CC(C)C)C(=O)NC...,,Active,0.76,
1870,Tolnaftate,DB00525,approved; investigational; vet_approved,FUSNMLFNXJSCDI-UHFFFAOYSA-N,Cc1cccc(N(C)C(=S)Oc2ccc3ccccc3c2)c1,,Active,0.73,Active
4457,Litoxetine,DB15038,investigational,MJJDYOLPMGIWND-UHFFFAOYSA-N,c1ccc2cc(COC3CCNCC3)ccc2c1,,Active,0.73,Active
4031,CARBOBENZYLOXY-(L)-LEUCINYL-(L)LEUCINYL METHOX...,DB08526,experimental,LHCNZPLAATYYPI-SLFFLAALSA-N,COCC(O)C(CC(C)C)NC(=O)C(CC(C)C)NC(=O)OCc1ccccc1,,Active,0.72,Active
6116,Netivudine,DB12606,investigational,QLOCVMVCRJOTTM-SDNRWEOFSA-N,CC#Cc1cn(C2OC(CO)C(O)C2O)c(=O)[nH]c1=O,,Active,0.72,Active
2275,Cryptotanshinone,DB15579,experimental,GVKKJJOMQCNPGB-JTQLQIEISA-N,CC1COC2=C1C(=O)C(=O)c1c2ccc2c1CCCC2(C)C,,Active,0.7,Active
2441,"Dibenzyl (carbonylbis{2,1-hydrazinediyl[(2S)-4...",DB03891,experimental,HGDUWJVGIGLVOH-ZEQRLZLVSA-N,CC(C)CC(NC(=O)OCc1ccccc1)C(=O)NNC(=O)NNC(=O)C(...,,Active,0.7,Active


## Export SDF and Excel

In [25]:
predictions.drop(columns='Mol', inplace=True)
with pd.ExcelWriter('../datasets/screened_compounds/drugbank_hits_qsar_morgan.xlsx') as writer:
    predictions.to_excel(writer, sheet_name='morgan', index=False)