# Preprocess the unknown compounds and generate features

Clean, check and featurize the unknown commercially available drugs for model validation

In [1]:
__author__ = "Jing-Quan Wang"

In [11]:
# Autoreload modules
%load_ext autoreload
%autoreload 2

import pandas as pd
from mrp7pred.mrp7pred import MRP7Pred
from mrp7pred.utils import (
    DATA,
    MODEL_DIR,
    OUTPUT,
)
from mrp7pred.feats.gen_all_features import featurize
import pickle

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Folder holding the input CSV read below. The path is relative, so it resolves
# against the directory the notebook kernel was started in.
DATA_FOLDER = "../data/manual"

## 1. Load data

In [4]:
# Read unknown.csv from DATA_FOLDER into a DataFrame; the bare `df` on the last
# line renders the (truncated) table as the cell output.
df = pd.read_csv(f"{DATA_FOLDER}/unknown.csv")
df

Unnamed: 0,name,synonym,cas,target,status,url,smiles
0,Fluphenazine,Prolixin,146-56-5,D1DR and D2DR inhibitor,On market,http://www.selleckchem.com/products/fluphenazi...,C1CN(CCN1CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)C(F)(F...
1,Citarinostat,ACY241,1316215-12-9,histone deacetylase (HDAC) inhibitor,Phase 1,https://www.medchemexpress.com//Citarinostat.html,C1=CC=C(C=C1)N(C2=CC=CC=C2Cl)C3=NC=C(C=N3)C(=O...
2,Chloroquine diphosphate,,50-63-5,autophagy and toll-like receptors (TLRs) inhib...,Phase 1/2/3/4,https://www.medchemexpress.com/Chloroquine-dip...,CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl.OP(=O)(...
3,Arbidol hydrochloride,Umifenovir hydrochloride,131707-23-8,anti-influenza virus agent,Phase 4,https://www.medchemexpress.com/Arbidol-hydroch...,CCOC(=O)C1=C(N(C2=CC(=C(C(=C21)CN(C)C)O)Br)C)C...
4,Nitazoxanide,,55981-09-4,synthetic nitrothiazolyl-salicylamide derivati...,Phase 1/2/3/4,https://www.medchemexpress.com/nitazoxanide.html,CC(=O)OC1=CC=CC=C1C(=O)NC2=NC=C(S2)[N+](=O)[O-]
...,...,...,...,...,...,...,...
75,Gefitinib,ZD1839,184475-35-2,EGFR inhibitor,Phase 1/2/3,https://www.selleckchem.com/products/Gefitinib...,COC1=C(C=C2C(=C1)N=CN=C2NC3=CC(=C(C=C3)F)Cl)OC...
76,Erlotinib,CP358774,183321-74-6,EGFR inhibitor,Phase 1/2/4,https://www.selleckchem.com/products/erlotinib...,COCCOC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=CC(=C3)C#C...
77,FRAX486,,1232030-35-1,PAK inhibitor,,https://www.selleckchem.com/products/frax486.html,CCN1C2=NC(=NC=C2C=C(C1=O)C3=C(C=C(C=C3)Cl)Cl)N...
78,AZD4635,HTL1071,1321514-06-0,A2AR antagonist,Phase 1/2,https://www.medchemexpress.com/AZD4635.html,NC1=NC(C2=CC=C(F)C=C2)=C(C3=CC(Cl)=NC(C)=C3)N=N1


In [5]:
# Number of compounds (rows) loaded from the CSV.
df.shape[0]

80

## 2. Check for null SMILES

In [6]:
# Show every row whose SMILES entry is missing; an empty table means none are.
df.loc[df["smiles"].isnull()]

Unnamed: 0,name,synonym,cas,target,status,url,smiles


## 3. Check duplicates

In [7]:
# Count the rows left after collapsing repeated structures. Comparing whole rows
# would miss the same compound listed twice under a different name, synonym or
# URL, so duplicates are keyed on the SMILES string instead (this still catches
# fully identical rows). A result equal to the total row count means no
# duplicates. Note this is an exact string match: two different SMILES spellings
# of one molecule are not recognised as duplicates here.
len(df.drop_duplicates(subset=["smiles"]))

80

## 4. Standardize SMILES and generate features

In [12]:
# Unpickle a previously saved object from the notebook's working directory and
# preview its first rows.
# NOTE(review): presumably the feature table written by an earlier featurization
# run -- confirm where ./df_feats.pkl is produced.
# Caution: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that you created yourself.
with open("./df_feats.pkl", "rb") as fi:
    df_feats_prev = pickle.load(fi)
df_feats_prev.head()

Unnamed: 0,FractionCSP3,HeavyAtomCount,HeavyAtomMolWt,NHOHCount,NOCount,RingCount,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticCarbocycles,...,ATSe7,ATSe8,ATSp1,ATSp2,ATSp3,ATSp4,ATSp5,ATSp6,ATSp7,ATSp8
0,0.25,33.0,441.749,3.0,8.0,3.0,0.0,0.0,0.0,2.0,...,3.671,3.562,3.424,3.643,3.663,3.678,3.775,3.713,3.438,3.179
1,0.083333,21.0,298.215,1.0,8.0,2.0,0.0,0.0,0.0,1.0,...,3.133,2.964,2.934,3.173,3.062,3.005,2.977,2.858,2.673,2.456
2,0.181818,28.0,350.276,2.0,6.0,5.0,0.0,1.0,1.0,1.0,...,3.834,3.538,3.362,3.707,3.748,3.724,3.779,3.764,3.591,3.219
3,0.28,37.0,522.218,3.0,9.0,4.0,0.0,1.0,1.0,2.0,...,4.043,3.96,3.523,3.851,3.916,3.784,3.701,3.703,3.626,3.474
4,0.26087,30.0,385.269,3.0,6.0,5.0,1.0,0.0,1.0,2.0,...,3.818,3.641,3.373,3.663,3.63,3.575,3.582,3.555,3.498,3.3


In [8]:
# Keep only the two columns handed to the predictor: compound name and SMILES.
df_data = df.loc[:, ["name", "smiles"]]

In [9]:
# Point the predictor at the pickled classifier stored under MODEL_DIR, then run
# it on the name/SMILES table prepared above.
model_path = f"{MODEL_DIR}/best_model_20210112-023538.pkl"
m7p = MRP7Pred(clf_dir=model_path)
out = m7p.predict(df_all=df_data)

Loading trained model ... Done!
Generating features ... 
Featurization failed
Smiles: C1CN(CCN1CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)C(F)(F)F)CCO.Cl.Cl
Error: Time out (10s)
1. Citarinostat
SMILES: C1=CC=C(C=C1)N(C2=CC=CC=C2Cl)C3=NC=C(C=N3)C(=O)NCCCCCCC(=O)NO

Featurization failed
Smiles: CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl.OP(=O)(O)O.OP(=O)(O)O
Error: Time out (10s)
Featurization failed
Smiles: CCOC(=O)C1=C(N(C2=CC(=C(C(=C21)CN(C)C)O)Br)C)CSC3=CC=CC=C3.Cl
Error: Time out (10s)
4. Nitazoxanide
SMILES: CC(=O)OC1=CC=CC=C1C(=O)NC2=NC=C(S2)[N+](=O)[O-]

5. Galunisertib
SMILES: CC1=NC(=CC=C1)C2=NN3CCCC3=C2C4=C5C=C(C=CC5=NC=C4)C(=O)N

6. Ensartinib
SMILES: CC(C1=C(C=CC(=C1Cl)F)Cl)OC2=CC(=NN=C2N)C(=O)NC3=CC=C(C=C3)C(=O)N4CCN(CC4)C

7. Anlotinib
SMILES: CC1=CC2=C(N1)C=CC(=C2F)OC3=C4C=C(C(=CC4=NC=C3)OCC5(CC5)N)OC

8. Berzosertib
SMILES: CC(C)S(=O)(=O)C1=CC=C(C=C1)C2=CN=C(C(=N2)C3=CC(=NO3)C4=CC=C(C=C4)CNC)N

9. Ribociclib
SMILES: CN(C)C(=O)C1=CC2=CN=C(N=C2N1C3CCCC3)NC4=NC=C(C=C4)N5CCNCC5

10. Binime

In [13]:
# Hard-coded list of feature (descriptor) column names, written out in full.
# NOTE(review): presumably the column set and order that a feature table such as
# df_feats_prev is expected to have -- confirm; no comparison is made in this cell.
expected = ['FractionCSP3', 'HeavyAtomCount', 'HeavyAtomMolWt', 'NHOHCount', 'NOCount', 'RingCount', 'NumAliphaticCarbocycles', 
            'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAromaticCarbocycles', 'NumAromaticHeterocycles', 'NumAromaticRings', 
            'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRadicalElectrons', 'NumRotatableBonds', 'NumSaturatedCarbocycles', 
            'NumSaturatedHeterocycles', 'NumSaturatedRings', 'NumValenceElectrons', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN',
            'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH', 'fr_Ar_OH', 'fr_COO', 'fr_COO2', 'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 
            'fr_NH0', 'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH', 'fr_aldehyde', 
            'fr_alkyl_carbamate', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amide', 'fr_amidine', 'fr_aniline', 'fr_aryl_methyl', 'fr_azide', 
            'fr_azo', 'fr_barbitur', 'fr_benzene', 'fr_benzodiazepine', 'fr_bicyclic', 'fr_diazo', 'fr_dihydropyridine', 'fr_epoxide', 
            'fr_ester', 'fr_ether', 'fr_furan', 'fr_guanido', 'fr_halogen', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide', 
            'fr_isocyan', 'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_methoxy', 'fr_morpholine', 
            'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho', 'fr_nitroso', 'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation', 
            'fr_phenol', 'fr_phenol_noOrthoHbond', 'fr_phos_acid', 'fr_phos_ester', 'fr_piperdine', 'fr_piperzine', 'fr_priamide', 
            'fr_prisulfonamd', 'fr_pyridine', 'fr_quatN', 'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole', 
            'fr_thiazole', 'fr_thiocyan', 'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi1', 'Chi0v', 
            'Chi1v', 'Chi2v', 'Chi3v', 'Chi4v', 'Chi0n', 'Chi1n', 'Chi2n', 'Chi3n', 'Chi4n', 'EState_VSA1', 'EState_VSA2', 'EState_VSA3', 
            'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'EState_VSA10', 'EState_VSA11', 'ExactMolWt', 
            'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'MolLogP', 'MolMR', 'MolWt', 'PEOE_VSA1', 'PEOE_VSA2', 'PEOE_VSA3', 
            'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 
            'PEOE_VSA14', 'SMR_VSA1', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SMR_VSA10', 
            'SlogP_VSA1', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9', 'SlogP_VSA10', 
            'SlogP_VSA11', 'SlogP_VSA12', 'TPSA', 'VSA_EState1', 'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 
            'VSA_EState8', 'VSA_EState9', 'VSA_EState10', 'MaxAbsEStateIndex', 'MaxAbsPartialCharge', 'MaxEStateIndex', 'MaxPartialCharge', 
            'MinAbsEStateIndex', 'MinAbsPartialCharge', 'MinEStateIndex', 'MinPartialCharge', 'Weight', 'AWeight', 'nhyd', 'nhal', 'nhet', 'nhev', 
            'ncof', 'ncocl', 'ncobr', 'ncoi', 'ncarb', 'nphos', 'nsulph', 'noxy', 'nnitro', 'nring', 'nrot', 'ndonr', 'naccr', 'nsb', 'ndb', 'naro', 
            'ntb', 'nta', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'W', 'AW', 'J', 'Tigdi', 'Xu', 'GMTI', 'Pol', 'DZ', 'Thara', 'Tsch', 'ZM1', 'ZM2', 
            'MZM1', 'MZM2', 'Qindex', 'Platt', 'diametert', 'radiust', 'petitjeant', 'Sito', 'Hato', 'Geto', 'Arto', 'ISIZ', 'TIAC', 'IDET', 'IDE', 'IVDE', 
            'Gravto', 'Hatov', 'Sitov', 'Getov', 'GMTIV', 'mChi1', 'Chi2', 'Chi3', 'Chi4', 'Chi5', 'Chi6', 'Chi7', 'Chi8', 'Chi9', 'Chi10', 'Chi3c', 'Chi4c', 
            'Chi4pc', 'Chi3ch', 'Chi4ch', 'Chi5ch', 'Chi6ch', 'knotp', 'Chiv0', 'Chiv1', 'Chiv2', 'Chiv3', 'Chiv4', 'Chiv5', 'Chiv6', 'Chiv7', 'Chiv8', 'Chiv9', 
            'Chiv10', 'dchi0', 'dchi1', 'dchi2', 'dchi3', 'dchi4', 'Chiv3c', 'Chiv4c', 'Chiv4pc', 'Chiv3ch', 'Chiv4ch', 'Chiv5ch', 'Chiv6ch', 'knotpv', 'kappa1', 
            'kappa2', 'kappa3', 'kappam1', 'kappam2', 'kappam3', 'phi', 'bcutm16', 'bcutm15', 'bcutm14', 'bcutm13', 'bcutm12', 'bcutm11', 'bcutm10', 'bcutm9', 
            'bcutm8', 'bcutm7', 'bcutm6', 'bcutm5', 'bcutm4', 'bcutm3', 'bcutm2', 'bcutm1', 'bcutv16', 'bcutv15', 'bcutv14', 'bcutv13', 'bcutv12', 'bcutv11', 'bcutv10', 
            'bcutv9', 'bcutv8', 'bcutv7', 'bcutv6', 'bcutv5', 'bcutv4', 'bcutv3', 'bcutv2', 'bcutv1', 'bcute16', 'bcute15', 'bcute14', 'bcute13', 'bcute12', 'bcute11', 
            'bcute10', 'bcute9', 'bcute8', 'bcute7', 'bcute6', 'bcute5', 'bcute4', 'bcute3', 'bcute2', 'bcute1', 'bcutp16', 'bcutp15', 'bcutp14', 'bcutp13', 'bcutp12', 
            'bcutp11', 'bcutp10', 'bcutp9', 'bcutp8', 'bcutp7', 'bcutp6', 'bcutp5', 'bcutp4', 'bcutp3', 'bcutp2', 'bcutp1', 'CIC0', 'CIC1', 'CIC2', 'CIC3', 'CIC4', 'CIC5', 
            'CIC6', 'SIC0', 'SIC1', 'SIC2', 'SIC3', 'SIC4', 'SIC5', 'SIC6', 'IC0', 'IC1', 'IC2', 'IC3', 'IC4', 'IC5', 'IC6', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 
            'S8', 'S9', 'S10', 'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19', 'S20', 'S21', 'S22', 'S23', 'S24', 'S25', 'S26', 'S27', 'S28', 'S29', 'S30', 
            'S31', 'S32', 'S33', 'S34', 'S35', 'S36', 'S37', 'S38', 'S39', 'S40', 'S41', 'S42', 'S43', 'S44', 'S45', 'S46', 'S47', 'S48', 'S49', 'S50', 'S51', 'S52', 'S53', 
            'S54', 'S55', 'S56', 'S57', 'S58', 'S59', 'S60', 'S61', 'S62', 'S63', 'S64', 'S65', 'S66', 'S67', 'S68', 'S69', 'S70', 'S71', 'S72', 'S73', 'S74', 'S75', 'S76', 
            'S77', 'S78', 'S79', 'Smax0', 'Smax1', 'Smax2', 'Smax3', 'Smax4', 'Smax5', 'Smax6', 'Smax7', 'Smax8', 'Smax9', 'Smax10', 'Smax11', 'Smax12', 'Smax13', 'Smax14', 
            'Smax15', 'Smax16', 'Smax17', 'Smax18', 'Smax19', 'Smax20', 'Smax21', 'Smax22', 'Smax23', 'Smax24', 'Smax25', 'Smax26', 'Smax27', 'Smax28', 'Smax29', 'Smax30', 
            'Smax31', 'Smax32', 'Smax33', 'Smax34', 'Smax35', 'Smax36', 'Smax37', 'Smax38', 'Smax39', 'Smax40', 'Smax41', 'Smax42', 'Smax43', 'Smax44', 'Smax45', 'Smax46', 
            'Smax47', 'Smax48', 'Smax49', 'Smax50', 'Smax51', 'Smax52', 'Smax53', 'Smax54', 'Smax55', 'Smax56', 'Smax57', 'Smax58', 'Smax59', 'Smax60', 'Smax61', 'Smax62', 
            'Smax63', 'Smax64', 'Smax65', 'Smax66', 'Smax67', 'Smax68', 'Smax69', 'Smax70', 'Smax71', 'Smax72', 'Smax73', 'Smax74', 'Smax75', 'Smax76', 'Smax77', 'Smax78', 
            'Smin0', 'Smin1', 'Smin2', 'Smin3', 'Smin4', 'Smin5', 'Smin6', 'Smin7', 'Smin8', 'Smin9', 'Smin10', 'Smin11', 'Smin12', 'Smin13', 'Smin14', 'Smin15', 'Smin16', 
            'Smin17', 'Smin18', 'Smin19', 'Smin20', 'Smin21', 'Smin22', 'Smin23', 'Smin24', 'Smin25', 'Smin26', 'Smin27', 'Smin28', 'Smin29', 'Smin30', 'Smin31', 'Smin32', 
            'Smin33', 'Smin34', 'Smin35', 'Smin36', 'Smin37', 'Smin38', 'Smin39', 'Smin40', 'Smin41', 'Smin42', 'Smin43', 'Smin44', 'Smin45', 'Smin46', 'Smin47', 'Smin48', 
            'Smin49', 'Smin50', 'Smin51', 'Smin52', 'Smin53', 'Smin54', 'Smin55', 'Smin56', 'Smin57', 'Smin58', 'Smin59', 'Smin60', 'Smin61', 'Smin62', 'Smin63', 'Smin64', 
            'Smin65', 'Smin66', 'Smin67', 'Smin68', 'Smin69', 'Smin70', 'Smin71', 'Smin72', 'Smin73', 'Smin74', 'Smin75', 'Smin76', 'Smin77', 'Smin78', 'Shev', 'Scar', 
            'Shal', 'Shet', 'Save', 'Smax', 'Smin', 'DS', 'MATSm1', 'MATSm2', 'MATSm3', 'MATSm4', 'MATSm5', 'MATSm6', 'MATSm7', 'MATSm8', 'MATSv1', 'MATSv2', 'MATSv3', 
            'MATSv4', 'MATSv5', 'MATSv6', 'MATSv7', 'MATSv8', 'MATSe1', 'MATSe2', 'MATSe3', 'MATSe4', 'MATSe5', 'MATSe6', 'MATSe7', 'MATSe8', 'MATSp1', 'MATSp2', 'MATSp3', 
            'MATSp4', 'MATSp5', 'MATSp6', 'MATSp7', 'MATSp8', 'GATSm1', 'GATSm2', 'GATSm3', 'GATSm4', 'GATSm5', 'GATSm6', 'GATSm7', 'GATSm8', 'GATSv1', 'GATSv2', 'GATSv3', 
            'GATSv4', 'GATSv5', 'GATSv6', 'GATSv7', 'GATSv8', 'GATSe1', 'GATSe2', 'GATSe3', 'GATSe4', 'GATSe5', 'GATSe6', 'GATSe7', 'GATSe8', 'GATSp1', 'GATSp2', 'GATSp3', 
            'GATSp4', 'GATSp5', 'GATSp6', 'GATSp7', 'GATSp8', 'LogP', 'LogP2', 'MR', 'Hy', 'UI', 'SPP', 'LDI', 'Rnc', 'Rpc', 'Mac', 'Tac', 'Mnc', 'Tnc', 'Mpc', 'Tpc', 'Qass', 
            'QOss', 'QNss', 'QCss', 'QHss', 'Qmin', 'Qmax', 'QOmin', 'QNmin', 'QCmin', 'QHmin', 'QOmax', 'QNmax', 'QCmax', 'QHmax', 'TPSA1', 'slogPVSA0', 'slogPVSA1', 'slogPVSA2', 
            'slogPVSA3', 'slogPVSA4', 'slogPVSA5', 'slogPVSA6', 'slogPVSA7', 'slogPVSA8', 'slogPVSA9', 'slogPVSA10', 'slogPVSA11', 'MRVSA0', 'MRVSA1', 'MRVSA2', 'MRVSA3', 
            'MRVSA4', 'MRVSA5', 'MRVSA6', 'MRVSA7', 'MRVSA8', 'MRVSA9', 'PEOEVSA0', 'PEOEVSA1', 'PEOEVSA2', 'PEOEVSA3', 'PEOEVSA4', 'PEOEVSA5', 'PEOEVSA6', 'PEOEVSA7', 
            'PEOEVSA8', 'PEOEVSA9', 'PEOEVSA10', 'PEOEVSA11', 'PEOEVSA12', 'PEOEVSA13', 'EstateVSA0', 'EstateVSA1', 'EstateVSA2', 'EstateVSA3', 'EstateVSA4', 'EstateVSA5', 
            'EstateVSA6', 'EstateVSA7', 'EstateVSA8', 'EstateVSA9', 'EstateVSA10', 'VSAEstate0', 'VSAEstate1', 'VSAEstate2', 'VSAEstate3', 'VSAEstate4', 'VSAEstate5', 'VSAEstate6', 
            'VSAEstate7', 'VSAEstate8', 'VSAEstate9', 'ATSm1', 'ATSm2', 'ATSm3', 'ATSm4', 'ATSm5', 'ATSm6', 'ATSm7', 'ATSm8', 'ATSv1', 'ATSv2', 'ATSv3', 'ATSv4', 'ATSv5', 'ATSv6',
            'ATSv7', 'ATSv8', 'ATSe1', 'ATSe2', 'ATSe3', 'ATSe4', 'ATSe5', 'ATSe6', 'ATSe7', 'ATSe8', 'ATSp1', 'ATSp2', 'ATSp3', 'ATSp4', 'ATSp5', 'ATSp6', 'ATSp7', 'ATSp8'] 
# Display how many names the list contains.
len(expected)

822