# Virtual screening (VS) of compounds phenotypically tested against SARS-CoV-2 using SARS-CoV main protease (M<sup>pro</sup>) QSAR models

## Importing modules and functions        
    

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import _pickle as cPickle
import gzip

from stats import *

from collections import Counter

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.model_selection import permutation_test_score, StratifiedKFold

import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and discards them."""
    return None


# Silence warnings in two ways: replace ``warnings.warn`` with the no-op above
# and install a global "ignore" filter.
warnings.warn = warn
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
%reload_ext autoreload
%autoreload 2
Draw.DrawingOptions.atomLabelFontFace = "DejaVu Sans"
Draw.DrawingOptions.atomLabelFontSize = 18

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Import screening data

In [2]:
# Location of the curated screening set (gzipped SDF)
file = '../datasets/curated_data/ncats_cpe_final.sdf.gz'

# Load the SDF into a DataFrame; RDKit molecules are placed in the 'ROMol' column
sdfInfo = {'molColName': 'ROMol'}
moldf = PandasTools.LoadSDF(file, **sdfInfo)

# Use 'Mol' as the molecule column name and keep only rows with a parsed molecule
moldf = moldf.rename(columns={'ROMol': 'Mol'})
has_mol = moldf['Mol'].notna()
moldf = moldf[has_mol]

# Drop the 'StandardizerResult' column when the file provides it
moldf = moldf.drop(columns='StandardizerResult', errors='ignore')

In [3]:
# Columns
print('Kept data: ', moldf.shape)
moldf.head(1)

from molvs.validate import Validator
fmt = '%(asctime)s - %(levelname)s - %(validation)s - %(message)s'
validator = Validator(log_format=fmt)

# Validator.validate() checks ONE molecule at a time. Passing the whole
# DataFrame (as before) does not validate any structure, so the empty result
# was not meaningful. Validate each molecule and collect (row index, messages)
# for every molecule that produced at least one validation message.
problematic = []
for idx, mol in moldf['Mol'].items():
    messages = validator.validate(mol)
    if messages:
        problematic.append((idx, messages))
print('\n Problematic structures: \n', problematic)

Kept data:  (3957, 15)


Unnamed: 0,SAMPLE_ID,SAMPLE_ID_dup,SAMPLE_NAME,InChIKey,Prim_Assay_Outcome,Prim_Assay_pAC50_Median,Prim_Assay_curve_class,Molecular weight,GENERIC_NAMES,Outcome,ID,Mol,GENERIC_NAME,DRUGBANK_ID,DRUG_GROUPS
0,NCGC00013037-01,,Thanite,IXEVGHXRXDBAOB-UHFFFAOYNA-N,Inactive,,4.0,253.36045837402344,"(1,7,7-trimethyl-6-bicyclo[2.2.1]heptanyl) 2-t...",Inactive,,,,,



 Problematic structures: 
 []


### Calculate Morgan Fingerprints

In [4]:
def calcfp(mol, funcFPInfo=None):
    """Compute the Morgan fingerprint of one molecule as a labelled Series.

    Parameters
    ----------
    mol
        Molecule passed to ``AllChem.GetMorganFingerprintAsBitVect``.
    funcFPInfo : dict, optional
        Keyword arguments forwarded to ``GetMorganFingerprintAsBitVect``.
        When omitted, ``radius=3, nBits=2048, useFeatures=False,
        useChirality=False`` is used.

    Returns
    -------
    pandas.Series
        One entry per fingerprint bit, labelled ``Bit_0``, ``Bit_1``, ...
    """
    # Build the defaults per call instead of sharing one mutable dict
    # across all calls through the function signature.
    if funcFPInfo is None:
        funcFPInfo = dict(radius=3, nBits=2048, useFeatures=False, useChirality=False)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, **funcFPInfo)
    # Bit vector -> array -> Series with 'Bit_<position>' labels.
    return pd.Series(np.asarray(fp)).add_prefix('Bit_')

# Fingerprint every molecule; each Series returned by calcfp becomes one row.
X_vs = moldf['Mol'].apply(calcfp)
X_vs.shape

(3957, 2048)

# Virtual screening with the Morgan fingerprint model

##### Load the model

In [5]:
# NOTE: unpickling executes arbitrary code; only load model files from a trusted source.
model_path = '../model/sars-cov-3clpro-morgan_RF_ad_balanced.pgz'
with gzip.open(model_path, 'rb') as model_file:
    model = cPickle.load(model_file)

##### Predict molecules

In [6]:
%%time
ad_threshold = 0.70

y_pred = model.predict(X_vs)
confidence = model.predict_proba(X_vs)
confidence = np.amax(confidence, axis=1).round(2)
ad = confidence >= ad_threshold

pred = pd.DataFrame({'Prediction': y_pred, 'AD': ad, 'Confidence': confidence}, index=None)
pred.AD[pred.AD == False] = np.nan
pred.AD[pred.AD == True] = pred.Prediction.astype(int)

CPU times: user 531 ms, sys: 53.8 ms, total: 585 ms
Wall time: 433 ms


In [7]:
# Rows with a non-missing 'AD' value are the compounds inside the applicability domain.
inside_ad = pred.dropna()
pred_ad = inside_ad.astype(int)

# Percentage of screened compounds that fall inside the applicability domain.
n_screened = len(pred)
coverage_ad = 100 * len(pred_ad) / n_screened

all_counts = Counter(pred.Prediction)
ad_counts = Counter(pred_ad.Prediction)
print('VS pred: %s' % all_counts)
print('VS pred AD: %s' % ad_counts)
print('Coverage of AD: %.2f%%' % coverage_ad)

VS pred: Counter({0: 3926, 1: 31})
VS pred AD: Counter({0: 2102})
Coverage of AD: 53.12%


###  Visualize predictions

In [8]:
# `pred` has one row per row of `moldf`, in the same order. concat(axis=1)
# aligns on index labels, so give the prediction rows moldf's index first;
# otherwise rows could be mismatched if moldf's index is not 0..n-1.
pred_aligned = pred.copy()
pred_aligned.index = moldf.index
predictions = pd.concat([moldf, pred_aligned], axis=1)

# Map numeric classes to labels. The result is assigned back to the column
# instead of calling replace(..., inplace=True) on `predictions[col]`, which
# is not guaranteed to modify `predictions` itself.
class_labels = {0: 'Inactive', 1: 'Active'}
for col in ['Prediction', 'AD']:
    predictions[col] = predictions[col].replace(class_labels)
predictions.head()

Unnamed: 0,SAMPLE_ID,SAMPLE_ID_dup,SAMPLE_NAME,InChIKey,Prim_Assay_Outcome,Prim_Assay_pAC50_Median,Prim_Assay_curve_class,Molecular weight,GENERIC_NAMES,Outcome,ID,Mol,GENERIC_NAME,DRUGBANK_ID,DRUG_GROUPS,Prediction,AD,Confidence
0,NCGC00013037-01,,Thanite,IXEVGHXRXDBAOB-UHFFFAOYNA-N,Inactive,,4.0,253.36045837402344,"(1,7,7-trimethyl-6-bicyclo[2.2.1]heptanyl) 2-t...",Inactive,,,,,,Inactive,Inactive,0.72
1,NCGC00013082-04,,trans-Aconitic acid,GTZCVFVGUGFEME-UHFFFAOYSA-N,Inactive,,4.0,174.10824584960938,,Inactive,,,,,,Inactive,Inactive,0.81
2,NCGC00013095-10,,Geraniol,GLZPCOQZEFWAFX-UHFFFAOYSA-N,Inactive,,4.0,154.24932861328125,,Inactive,,,,,,Inactive,Inactive,0.85
3,NCGC00013109-03,,Phenol red,BELBBZDIHDAJOR-UHFFFAOYSA-N,Inactive,,4.0,354.37646484375,"4-[3-(4-hydroxyphenyl)-1,1-dioxobenzo[c]oxathi...",Inactive,,,Phenolsulfonphthalein,DB13212,experimental,Inactive,Inactive,0.76
4,NCGC00013216-05,,"2,3-Dimercapto-1-propanol",WQABCVAJNWAXTE-UHFFFAOYNA-N,Inactive,,4.0,124.22502136230467,"2,3-bis-sulfanylpropan-1-ol\n2,3-Dimercaptopro...",Inactive,,,Dimercaprol,DB06782,approved,Inactive,Inactive,0.87


## Export SDF and Excel

In [9]:
# SDF
#PandasTools.WriteSDF(predictions, 'datasets/screened_compounds/drugbank_hits_qsar_sirms_dragon.sdf.gz',
#                     molColName='Mol', properties=list(predictions.columns))

# Excel: the molecule-object column is dropped before writing.
# errors='ignore' keeps this cell re-runnable: once 'Mol' has been dropped,
# running the cell again no longer raises KeyError.
predictions = predictions.drop(columns='Mol', errors='ignore')
with pd.ExcelWriter('../datasets/screened_compounds/cpe_ncats_hits_qsar_morgan.xlsx') as writer:
    predictions.to_excel(writer, sheet_name='morgan', index=False)