# Preprocess the unknown compounds and generate features

Clean, check, and featurize the unknown, commercially available drugs that are used for model validation.

In [10]:
__author__ = "Jing-Quan Wang"

In [2]:
# Autoreload modules
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from mrp7pred.mrp7pred import MRP7Pred
from mrp7pred.utils import (
    DATA,
    MODEL_DIR,
    OUTPUT,
)
from mrp7pred.feats.gen_all_features import featurize
import pickle

In [5]:
# Directory, relative to the notebook's working directory, that holds the
# input table read in the "Load data" section (unknown.csv).
DATA_FOLDER = "../data/manual"

## 1. Load data

In [6]:
# Read the table of unknown compounds; later cells rely on its `name` and
# `smiles` columns.
df = pd.read_csv(f"{DATA_FOLDER}/unknown.csv")
df

Unnamed: 0,name,synonym,cas,target,status,url,smiles
0,Fluphenazine,Prolixin,146-56-5,D1DR and D2DR inhibitor,On market,http://www.selleckchem.com/products/fluphenazi...,C1CN(CCN1CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)C(F)(F...
1,Citarinostat,ACY241,1316215-12-9,histone deacetylase (HDAC) inhibitor,Phase 1,https://www.medchemexpress.com//Citarinostat.html,C1=CC=C(C=C1)N(C2=CC=CC=C2Cl)C3=NC=C(C=N3)C(=O...
2,Chloroquine diphosphate,,50-63-5,autophagy and toll-like receptors (TLRs) inhib...,Phase 1/2/3/4,https://www.medchemexpress.com/Chloroquine-dip...,CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl.OP(=O)(...
3,Arbidol hydrochloride,Umifenovir hydrochloride,131707-23-8,anti-influenza virus agent,Phase 4,https://www.medchemexpress.com/Arbidol-hydroch...,CCOC(=O)C1=C(N(C2=CC(=C(C(=C21)CN(C)C)O)Br)C)C...
4,Nitazoxanide,,55981-09-4,synthetic nitrothiazolyl-salicylamide derivati...,Phase 1/2/3/4,https://www.medchemexpress.com/nitazoxanide.html,CC(=O)OC1=CC=CC=C1C(=O)NC2=NC=C(S2)[N+](=O)[O-]
...,...,...,...,...,...,...,...
75,Gefitinib,ZD1839,184475-35-2,EGFR inhibitor,Phase 1/2/3,https://www.selleckchem.com/products/Gefitinib...,COC1=C(C=C2C(=C1)N=CN=C2NC3=CC(=C(C=C3)F)Cl)OC...
76,Erlotinib,CP358774,183321-74-6,EGFR inhibitor,Phase 1/2/4,https://www.selleckchem.com/products/erlotinib...,COCCOC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=CC(=C3)C#C...
77,FRAX486,,1232030-35-1,PAK inhibitor,,https://www.selleckchem.com/products/frax486.html,CCN1C2=NC(=NC=C2C=C(C1=O)C3=C(C=C(C=C3)Cl)Cl)N...
78,AZD4635,HTL1071,1321514-06-0,A2AR antagonist,Phase 1/2,https://www.medchemexpress.com/AZD4635.html,NC1=NC(C2=CC=C(F)C=C2)=C(C3=CC(Cl)=NC(C)=C3)N=N1


In [7]:
# Number of compounds (rows) in the loaded table.
df.shape[0]

80

## 2. Check for null SMILES

In [8]:
# Rows with a missing SMILES string; an empty result means every compound
# has a structure to featurize.
df.loc[df["smiles"].isnull()]

Unnamed: 0,name,synonym,cas,target,status,url,smiles


## 3. Check duplicates

In [9]:
# Row count after removing rows that are identical in every column.
# NOTE(review): the same compound listed twice with differing metadata
# (e.g. another url) would not be caught here -- consider also checking
# for duplicates on the `smiles` column alone.
df.drop_duplicates().shape[0]

80

## 4. Standardize SMILES and generate features

In [10]:
# Load a previously pickled feature table and preview its first rows.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that come from a trusted source.
# NOTE(review): df_feats_prev is not referenced again in the visible cells.
with open("./df_feats.pkl", "rb") as fi:
    df_feats_prev = pickle.load(fi)
df_feats_prev.head()

Unnamed: 0,name,smiles,rdk_FractionCSP3,rdk_HeavyAtomCount,rdk_HeavyAtomMolWt,rdk_NHOHCount,rdk_NOCount,rdk_RingCount,rdk_NumAliphaticCarbocycles,rdk_NumAliphaticHeterocycles,...,pychem_ATSe7,pychem_ATSe8,pychem_ATSp1,pychem_ATSp2,pychem_ATSp3,pychem_ATSp4,pychem_ATSp5,pychem_ATSp6,pychem_ATSp7,pychem_ATSp8
0,Fluphenazine,C1CN(CCN1CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)C(F)(F...,0.25,33.0,441.749,3.0,8.0,3.0,0.0,0.0,...,3.671,3.562,3.424,3.643,3.663,3.678,3.775,3.713,3.438,3.179
1,Citarinostat,C1=CC=C(C=C1)N(C2=CC=CC=C2Cl)C3=NC=C(C=N3)C(=O...,0.083333,21.0,298.215,1.0,8.0,2.0,0.0,0.0,...,3.133,2.964,2.934,3.173,3.062,3.005,2.977,2.858,2.673,2.456
2,Chloroquine diphosphate,CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl.OP(=O)(...,0.181818,28.0,350.276,2.0,6.0,5.0,0.0,1.0,...,3.834,3.538,3.362,3.707,3.748,3.724,3.779,3.764,3.591,3.219
3,Arbidol hydrochloride,CCOC(=O)C1=C(N(C2=CC(=C(C(=C21)CN(C)C)O)Br)C)C...,0.28,37.0,522.218,3.0,9.0,4.0,0.0,1.0,...,4.043,3.96,3.523,3.851,3.916,3.784,3.701,3.703,3.626,3.474
4,Nitazoxanide,CC(=O)OC1=CC=CC=C1C(=O)NC2=NC=C(S2)[N+](=O)[O-],0.26087,30.0,385.269,3.0,6.0,5.0,1.0,0.0,...,3.818,3.641,3.373,3.663,3.63,3.575,3.582,3.555,3.498,3.3


In [11]:
# Keep only the identifier and structure columns.
# NOTE: df_data is re-assigned from a feature CSV in the "Feature selector"
# section, so this selection only feeds the commented-out prediction call in
# the next cell.
df_data = df[["name", "smiles"]]

In [12]:
# m7p = MRP7Pred(clf_dir=f"{MODEL_DIR}/best_model_20210112-032455.pkl")
# out = m7p.predict(df_all = df_data)

## Manual data only

The model used in this section was trained on the manual data only.

### Feature selector

In [53]:
# Positional column indices kept by the first feature-selection step.
# Applied with .iloc to the 828-column feature matrix further down, where it
# yields 421 columns (one per index listed here).
# NOTE(review): presumably exported from the selector fitted at training
# time -- confirm these indices match the model that is loaded for prediction.
support_similar = np.array([  0,   7,   8,  15,  16,  17,  18,  19,  22,  23,  24,  25,  30,
        33,  34,  35,  37,  39,  40,  41,  42,  43,  44,  46,  47,  48,
        49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  61,  64,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  82,  83,
        84,  85,  87,  88,  90,  91,  92,  93,  94,  95,  97,  98, 100,
       102, 103, 104, 105, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 133, 138, 160, 162, 163, 164, 165, 173, 174, 175, 184, 190,
       192, 193, 197, 213, 214, 216, 217, 228, 245, 246, 250, 257, 263,
       271, 276, 277, 278, 279, 291, 295, 298, 302, 303, 304, 310, 319,
       325, 326, 327, 335, 343, 351, 359, 374, 384, 385, 388, 390, 391,
       392, 397, 398, 399, 400, 401, 402, 405, 407, 409, 410, 412, 413,
       415, 416, 418, 419, 420, 423, 426, 429, 432, 435, 436, 437, 438,
       439, 440, 441, 443, 444, 451, 452, 453, 454, 455, 456, 457, 458,
       459, 460, 461, 462, 463, 464, 465, 467, 468, 469, 470, 472, 473,
       474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 488, 489,
       491, 492, 495, 496, 497, 498, 499, 500, 502, 503, 504, 505, 506,
       507, 508, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524,
       525, 526, 527, 528, 530, 531, 532, 533, 534, 535, 536, 537, 538,
       539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551,
       552, 553, 554, 555, 556, 557, 558, 559, 560, 563, 564, 565, 566,
       567, 568, 569, 570, 571, 572, 573, 574, 576, 577, 581, 587, 588,
       589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 601, 602,
       608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620,
       621, 622, 623, 625, 626, 627, 628, 630, 631, 632, 633, 635, 636,
       638, 640, 641, 642, 643, 644, 645, 647, 648, 649, 650, 652, 653,
       655, 656, 657, 658, 659, 661, 662, 663, 664, 665, 666, 667, 668,
       670, 671, 674, 675, 676, 677, 678, 679, 680, 681, 684, 685, 686,
       691, 692, 693, 695, 696, 697, 699, 700, 701, 702, 703, 704, 705,
       707, 711, 712, 713, 714, 715, 716, 718, 725, 727, 728, 729, 730,
       731, 732, 733, 734, 735, 736, 738, 740, 741, 742, 743, 744, 745,
       746, 747, 751, 752, 753, 754, 758, 761, 762, 763, 764, 765, 766,
       767, 768, 769, 770, 771, 772, 773, 774, 775, 786, 787, 788, 789,
       790, 791, 793, 794, 826])

# Positional column indices kept by the second feature-selection step.
# These index into the 421-column frame produced by `support_similar`
# (largest value is 420), NOT into the original 828-column matrix; applying
# them yields 258 columns.
# NOTE(review): presumably exported from a low-variance filter fitted at
# training time -- confirm against the training pipeline.
support_lowvar = np.array([  0,   1,   2,   4,   5,   6,   7,   8,   9,  10,  11,  12,  14,
        16,  17,  18,  19,  20,  23,  24,  25,  27,  28,  32,  34,  37,
        40,  44,  45,  47,  48,  53,  54,  55,  56,  57,  60,  62,  63,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  83,  84,  85,  87,  88,  89,  90,  91,  92,  93,
        94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106,
       107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
       120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 137, 138,
       139, 141, 142, 143, 147, 149, 151, 190, 191, 192, 193, 195, 196,
       198, 201, 202, 204, 205, 206, 207, 208, 217, 220, 221, 222, 223,
       224, 240, 245, 256, 257, 258, 259, 260, 262, 263, 264, 265, 266,
       272, 273, 274, 275, 276, 286, 310, 311, 312, 313, 314, 315, 316,
       317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
       330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342,
       343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355,
       356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368,
       369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381,
       382, 383, 384, 385, 386, 387, 388, 389, 390, 392, 393, 394, 395,
       397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409,
       410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420])

In [54]:
# Load the precomputed descriptor table; the first CSV column is used as the
# row index. This replaces the name/smiles-only df_data created earlier.
df_data = pd.read_csv("./unknown_full_features_828_20210121-233159.csv", index_col=0)

In [55]:
# Drop every compound that has at least one missing value.
# Re-bind the name instead of using inplace=True: the result is the same, but
# the step is an explicit assignment and the frame returned by read_csv is
# not mutated behind the reader's back.
df_data = df_data.dropna()

In [56]:
df_data

Unnamed: 0,name,smiles,rdk_FractionCSP3,rdk_HeavyAtomCount,rdk_HeavyAtomMolWt,rdk_NHOHCount,rdk_NOCount,rdk_RingCount,rdk_NumAliphaticCarbocycles,rdk_NumAliphaticHeterocycles,...,pychem_ATSe7,pychem_ATSe8,pychem_ATSp1,pychem_ATSp2,pychem_ATSp3,pychem_ATSp4,pychem_ATSp5,pychem_ATSp6,pychem_ATSp7,pychem_ATSp8
0,Fluphenazine,C1CN(CCN1CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)C(F)(F...,0.250000,33.0,441.749,3.0,8.0,3.0,0.0,0.0,...,3.671,3.562,3.424,3.643,3.663,3.678,3.775,3.713,3.438,3.179
1,Citarinostat,C1=CC=C(C=C1)N(C2=CC=CC=C2Cl)C3=NC=C(C=N3)C(=O...,0.083333,21.0,298.215,1.0,8.0,2.0,0.0,0.0,...,3.133,2.964,2.934,3.173,3.062,3.005,2.977,2.858,2.673,2.456
2,Chloroquine diphosphate,CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl.OP(=O)(...,0.181818,28.0,350.276,2.0,6.0,5.0,0.0,1.0,...,3.834,3.538,3.362,3.707,3.748,3.724,3.779,3.764,3.591,3.219
3,Arbidol hydrochloride,CCOC(=O)C1=C(N(C2=CC(=C(C(=C21)CN(C)C)O)Br)C)C...,0.280000,37.0,522.218,3.0,9.0,4.0,0.0,1.0,...,4.043,3.960,3.523,3.851,3.916,3.784,3.701,3.703,3.626,3.474
4,Nitazoxanide,CC(=O)OC1=CC=CC=C1C(=O)NC2=NC=C(S2)[N+](=O)[O-],0.260870,30.0,385.269,3.0,6.0,5.0,1.0,0.0,...,3.818,3.641,3.373,3.663,3.630,3.575,3.582,3.555,3.498,3.300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Spebrutinib,COCCOC1=CC=C(C=C1)NC2=NC=C(C(=N2)NC3=CC(=CC=C3...,0.363636,31.0,422.718,1.0,7.0,4.0,0.0,1.0,...,3.821,3.779,3.329,3.630,3.658,3.497,3.478,3.485,3.488,3.393
73,AZD3463,COC1=C(C=CC(=C1)N2CCC(CC2)N)NC3=NC=C(C(=N3)C4=...,0.272727,29.0,370.259,1.0,7.0,3.0,0.0,0.0,...,3.852,3.714,3.234,3.502,3.564,3.478,3.522,3.547,3.491,3.318
74,Osimertinib,CN1C=C(C2=CC=CC=C21)C3=NC(=NC=C3)NC4=C(C=C(C(=...,0.240000,35.0,490.220,2.0,7.0,5.0,0.0,1.0,...,3.895,3.760,3.526,3.872,3.961,3.905,3.806,3.700,3.569,3.455
75,Gefitinib,COC1=C(C=C2C(=C1)N=CN=C2NC3=CC(=C(C=C3)F)Cl)OC...,0.066667,22.0,304.651,2.0,5.0,3.0,0.0,0.0,...,3.278,3.001,3.060,3.348,3.372,3.324,3.276,3.192,2.941,2.447


In [57]:
# Positions 0 and 1 hold `name` and `smiles`; everything from position 2
# onwards is a numeric descriptor column.
features = df_data.iloc[:, 2:]
# Shape is (compounds, descriptors).
print(features.shape)
features

(77, 828)


Unnamed: 0,rdk_FractionCSP3,rdk_HeavyAtomCount,rdk_HeavyAtomMolWt,rdk_NHOHCount,rdk_NOCount,rdk_RingCount,rdk_NumAliphaticCarbocycles,rdk_NumAliphaticHeterocycles,rdk_NumAliphaticRings,rdk_NumAromaticCarbocycles,...,pychem_ATSe7,pychem_ATSe8,pychem_ATSp1,pychem_ATSp2,pychem_ATSp3,pychem_ATSp4,pychem_ATSp5,pychem_ATSp6,pychem_ATSp7,pychem_ATSp8
0,0.250000,33.0,441.749,3.0,8.0,3.0,0.0,0.0,0.0,2.0,...,3.671,3.562,3.424,3.643,3.663,3.678,3.775,3.713,3.438,3.179
1,0.083333,21.0,298.215,1.0,8.0,2.0,0.0,0.0,0.0,1.0,...,3.133,2.964,2.934,3.173,3.062,3.005,2.977,2.858,2.673,2.456
2,0.181818,28.0,350.276,2.0,6.0,5.0,0.0,1.0,1.0,1.0,...,3.834,3.538,3.362,3.707,3.748,3.724,3.779,3.764,3.591,3.219
3,0.280000,37.0,522.218,3.0,9.0,4.0,0.0,1.0,1.0,2.0,...,4.043,3.960,3.523,3.851,3.916,3.784,3.701,3.703,3.626,3.474
4,0.260870,30.0,385.269,3.0,6.0,5.0,1.0,0.0,1.0,2.0,...,3.818,3.641,3.373,3.663,3.630,3.575,3.582,3.555,3.498,3.300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,0.363636,31.0,422.718,1.0,7.0,4.0,0.0,1.0,1.0,2.0,...,3.821,3.779,3.329,3.630,3.658,3.497,3.478,3.485,3.488,3.393
73,0.272727,29.0,370.259,1.0,7.0,3.0,0.0,0.0,0.0,2.0,...,3.852,3.714,3.234,3.502,3.564,3.478,3.522,3.547,3.491,3.318
74,0.240000,35.0,490.220,2.0,7.0,5.0,0.0,1.0,1.0,2.0,...,3.895,3.760,3.526,3.872,3.961,3.905,3.806,3.700,3.569,3.455
75,0.066667,22.0,304.651,2.0,5.0,3.0,0.0,0.0,0.0,1.0,...,3.278,3.001,3.060,3.348,3.372,3.324,3.276,3.192,2.941,2.447


In [58]:
# First selection step: keep only the descriptor columns whose positions are
# listed in `support_similar`.
# (Fixes the misspelled `feaatures`, which raised NameError on a fresh kernel.)
features_remove_similar = features.iloc[:, support_similar]

In [59]:
# Sanity check: the column count should equal len(support_similar).
features_remove_similar.shape

(77, 421)

In [60]:
# Second selection step: `support_lowvar` holds positions within the already
# reduced frame, so it must be applied to features_remove_similar and not to
# the full feature matrix.
features_remove_lowvar = features_remove_similar.iloc[:, support_lowvar]

In [61]:
# Sanity check: the column count should equal len(support_lowvar).
features_remove_lowvar.shape

(77, 258)

In [62]:
# Re-attach the identifier columns (aligned on the row index) as the last two
# columns. `.assign` returns a new frame, so nothing is written into the
# .iloc slice created above; item assignment on such a slice can trigger
# pandas' SettingWithCopyWarning.
features_remove_lowvar = features_remove_lowvar.assign(
    name=df_data["name"],
    smiles=df_data["smiles"],
)

In [63]:
features_remove_lowvar

Unnamed: 0,rdk_FractionCSP3,rdk_NumAliphaticHeterocycles,rdk_NumAliphaticRings,rdk_NumRotatableBonds,rdk_NumSaturatedCarbocycles,rdk_NumSaturatedHeterocycles,rdk_NumSaturatedRings,rdk_fr_Al_OH,rdk_fr_Al_OH_noTert,rdk_fr_ArN,...,pychem_VSAEstate1,pychem_VSAEstate2,pychem_VSAEstate3,pychem_VSAEstate4,pychem_VSAEstate5,pychem_VSAEstate7,pychem_VSAEstate8,pychem_ATSp7,name,smiles
0,0.250000,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,...,34.065,11.872,3.563,-0.227,17.063,0.516,0.000,3.438,Fluphenazine,C1CN(CCN1CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)C(F)(F...
1,0.083333,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,36.760,12.877,0.130,-1.023,6.158,1.221,0.000,2.673,Citarinostat,C1=CC=C(C=C1)N(C2=CC=CC=C2Cl)C3=NC=C(C=N3)C(=O...
2,0.181818,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20.901,5.783,12.774,-0.447,13.345,2.887,0.000,3.591,Chloroquine diphosphate,CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl.OP(=O)(...
3,0.280000,1.0,1.0,6.0,0.0,1.0,1.0,0.0,0.0,1.0,...,29.504,10.375,7.050,-1.278,10.441,4.607,2.024,3.626,Arbidol hydrochloride,CCOC(=O)C1=C(N(C2=CC(=C(C(=C21)CN(C)C)O)Br)C)C...
4,0.260870,0.0,1.0,6.0,1.0,0.0,1.0,0.0,0.0,0.0,...,7.533,1.188,8.154,1.329,10.468,2.310,1.572,3.498,Nitazoxanide,CC(=O)OC1=CC=CC=C1C(=O)NC2=NC=C(S2)[N+](=O)[O-]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,0.363636,1.0,1.0,8.0,0.0,1.0,1.0,0.0,0.0,0.0,...,11.035,3.966,1.318,1.311,8.090,5.005,1.600,3.488,Spebrutinib,COCCOC1=CC=C(C=C1)NC2=NC=C(C(=N2)NC3=CC(=CC=C3...
73,0.272727,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.754,4.097,2.343,4.443,11.258,1.726,3.250,3.491,AZD3463,COC1=C(C=CC(=C1)N2CCC(CC2)N)NC3=NC=C(C(=N3)C4=...
74,0.240000,1.0,1.0,5.0,0.0,1.0,1.0,0.0,0.0,0.0,...,24.254,7.882,2.380,-0.043,11.741,5.446,0.000,3.569,Osimertinib,CN1C=C(C2=CC=CC=C21)C3=NC(=NC=C3)NC4=C(C=C(C(=...
75,0.066667,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,...,8.341,8.255,8.796,-0.289,9.407,1.825,0.000,2.941,Gefitinib,COC1=C(C=C2C(=C1)N=CN=C2NC3=CC(=C(C=C3)F)Cl)OC...


In [73]:
# Load the pickled classifier from MODEL_DIR and score the reduced feature
# table. Per the log printed by this cell, predict() also writes the results
# to a CSV whose file name starts with the given prefix.
m7p = MRP7Pred(
    clf_dir=f"{MODEL_DIR}/best_model_20210124-023923.pkl",
)
out = m7p.predict(
    featurized_df=features_remove_lowvar,
    prefix="unknown_",
)

Loading trained model ... Done!
Start predicting ...Done!
Writing output ...Done! Results saved to: ../output/unknown_predicted_20210124-025942.csv


In [12]:
# Descriptor names; these appear to be the feature-table column names without
# their `rdk_` / `pychem_` prefixes.
# NOTE(review): this list has 822 entries while the feature matrix loaded
# above has 828 columns, and `expected` is not referenced again in the
# visible cells -- confirm whether it is still needed.
expected = ['FractionCSP3', 'HeavyAtomCount', 'HeavyAtomMolWt', 'NHOHCount', 'NOCount', 'RingCount', 'NumAliphaticCarbocycles', 
            'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAromaticCarbocycles', 'NumAromaticHeterocycles', 'NumAromaticRings', 
            'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRadicalElectrons', 'NumRotatableBonds', 'NumSaturatedCarbocycles', 
            'NumSaturatedHeterocycles', 'NumSaturatedRings', 'NumValenceElectrons', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN',
            'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH', 'fr_Ar_OH', 'fr_COO', 'fr_COO2', 'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 
            'fr_NH0', 'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH', 'fr_aldehyde', 
            'fr_alkyl_carbamate', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amide', 'fr_amidine', 'fr_aniline', 'fr_aryl_methyl', 'fr_azide', 
            'fr_azo', 'fr_barbitur', 'fr_benzene', 'fr_benzodiazepine', 'fr_bicyclic', 'fr_diazo', 'fr_dihydropyridine', 'fr_epoxide', 
            'fr_ester', 'fr_ether', 'fr_furan', 'fr_guanido', 'fr_halogen', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide', 
            'fr_isocyan', 'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_methoxy', 'fr_morpholine', 
            'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho', 'fr_nitroso', 'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation', 
            'fr_phenol', 'fr_phenol_noOrthoHbond', 'fr_phos_acid', 'fr_phos_ester', 'fr_piperdine', 'fr_piperzine', 'fr_priamide', 
            'fr_prisulfonamd', 'fr_pyridine', 'fr_quatN', 'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole', 
            'fr_thiazole', 'fr_thiocyan', 'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi1', 'Chi0v', 
            'Chi1v', 'Chi2v', 'Chi3v', 'Chi4v', 'Chi0n', 'Chi1n', 'Chi2n', 'Chi3n', 'Chi4n', 'EState_VSA1', 'EState_VSA2', 'EState_VSA3', 
            'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'EState_VSA10', 'EState_VSA11', 'ExactMolWt', 
            'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'MolLogP', 'MolMR', 'MolWt', 'PEOE_VSA1', 'PEOE_VSA2', 'PEOE_VSA3', 
            'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 
            'PEOE_VSA14', 'SMR_VSA1', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SMR_VSA10', 
            'SlogP_VSA1', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9', 'SlogP_VSA10', 
            'SlogP_VSA11', 'SlogP_VSA12', 'TPSA', 'VSA_EState1', 'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 
            'VSA_EState8', 'VSA_EState9', 'VSA_EState10', 'MaxAbsEStateIndex', 'MaxAbsPartialCharge', 'MaxEStateIndex', 'MaxPartialCharge', 
            'MinAbsEStateIndex', 'MinAbsPartialCharge', 'MinEStateIndex', 'MinPartialCharge', 'Weight', 'AWeight', 'nhyd', 'nhal', 'nhet', 'nhev', 
            'ncof', 'ncocl', 'ncobr', 'ncoi', 'ncarb', 'nphos', 'nsulph', 'noxy', 'nnitro', 'nring', 'nrot', 'ndonr', 'naccr', 'nsb', 'ndb', 'naro', 
            'ntb', 'nta', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'W', 'AW', 'J', 'Tigdi', 'Xu', 'GMTI', 'Pol', 'DZ', 'Thara', 'Tsch', 'ZM1', 'ZM2', 
            'MZM1', 'MZM2', 'Qindex', 'Platt', 'diametert', 'radiust', 'petitjeant', 'Sito', 'Hato', 'Geto', 'Arto', 'ISIZ', 'TIAC', 'IDET', 'IDE', 'IVDE', 
            'Gravto', 'Hatov', 'Sitov', 'Getov', 'GMTIV', 'mChi1', 'Chi2', 'Chi3', 'Chi4', 'Chi5', 'Chi6', 'Chi7', 'Chi8', 'Chi9', 'Chi10', 'Chi3c', 'Chi4c', 
            'Chi4pc', 'Chi3ch', 'Chi4ch', 'Chi5ch', 'Chi6ch', 'knotp', 'Chiv0', 'Chiv1', 'Chiv2', 'Chiv3', 'Chiv4', 'Chiv5', 'Chiv6', 'Chiv7', 'Chiv8', 'Chiv9', 
            'Chiv10', 'dchi0', 'dchi1', 'dchi2', 'dchi3', 'dchi4', 'Chiv3c', 'Chiv4c', 'Chiv4pc', 'Chiv3ch', 'Chiv4ch', 'Chiv5ch', 'Chiv6ch', 'knotpv', 'kappa1', 
            'kappa2', 'kappa3', 'kappam1', 'kappam2', 'kappam3', 'phi', 'bcutm16', 'bcutm15', 'bcutm14', 'bcutm13', 'bcutm12', 'bcutm11', 'bcutm10', 'bcutm9', 
            'bcutm8', 'bcutm7', 'bcutm6', 'bcutm5', 'bcutm4', 'bcutm3', 'bcutm2', 'bcutm1', 'bcutv16', 'bcutv15', 'bcutv14', 'bcutv13', 'bcutv12', 'bcutv11', 'bcutv10', 
            'bcutv9', 'bcutv8', 'bcutv7', 'bcutv6', 'bcutv5', 'bcutv4', 'bcutv3', 'bcutv2', 'bcutv1', 'bcute16', 'bcute15', 'bcute14', 'bcute13', 'bcute12', 'bcute11', 
            'bcute10', 'bcute9', 'bcute8', 'bcute7', 'bcute6', 'bcute5', 'bcute4', 'bcute3', 'bcute2', 'bcute1', 'bcutp16', 'bcutp15', 'bcutp14', 'bcutp13', 'bcutp12', 
            'bcutp11', 'bcutp10', 'bcutp9', 'bcutp8', 'bcutp7', 'bcutp6', 'bcutp5', 'bcutp4', 'bcutp3', 'bcutp2', 'bcutp1', 'CIC0', 'CIC1', 'CIC2', 'CIC3', 'CIC4', 'CIC5', 
            'CIC6', 'SIC0', 'SIC1', 'SIC2', 'SIC3', 'SIC4', 'SIC5', 'SIC6', 'IC0', 'IC1', 'IC2', 'IC3', 'IC4', 'IC5', 'IC6', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 
            'S8', 'S9', 'S10', 'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19', 'S20', 'S21', 'S22', 'S23', 'S24', 'S25', 'S26', 'S27', 'S28', 'S29', 'S30', 
            'S31', 'S32', 'S33', 'S34', 'S35', 'S36', 'S37', 'S38', 'S39', 'S40', 'S41', 'S42', 'S43', 'S44', 'S45', 'S46', 'S47', 'S48', 'S49', 'S50', 'S51', 'S52', 'S53', 
            'S54', 'S55', 'S56', 'S57', 'S58', 'S59', 'S60', 'S61', 'S62', 'S63', 'S64', 'S65', 'S66', 'S67', 'S68', 'S69', 'S70', 'S71', 'S72', 'S73', 'S74', 'S75', 'S76', 
            'S77', 'S78', 'S79', 'Smax0', 'Smax1', 'Smax2', 'Smax3', 'Smax4', 'Smax5', 'Smax6', 'Smax7', 'Smax8', 'Smax9', 'Smax10', 'Smax11', 'Smax12', 'Smax13', 'Smax14', 
            'Smax15', 'Smax16', 'Smax17', 'Smax18', 'Smax19', 'Smax20', 'Smax21', 'Smax22', 'Smax23', 'Smax24', 'Smax25', 'Smax26', 'Smax27', 'Smax28', 'Smax29', 'Smax30', 
            'Smax31', 'Smax32', 'Smax33', 'Smax34', 'Smax35', 'Smax36', 'Smax37', 'Smax38', 'Smax39', 'Smax40', 'Smax41', 'Smax42', 'Smax43', 'Smax44', 'Smax45', 'Smax46', 
            'Smax47', 'Smax48', 'Smax49', 'Smax50', 'Smax51', 'Smax52', 'Smax53', 'Smax54', 'Smax55', 'Smax56', 'Smax57', 'Smax58', 'Smax59', 'Smax60', 'Smax61', 'Smax62', 
            'Smax63', 'Smax64', 'Smax65', 'Smax66', 'Smax67', 'Smax68', 'Smax69', 'Smax70', 'Smax71', 'Smax72', 'Smax73', 'Smax74', 'Smax75', 'Smax76', 'Smax77', 'Smax78', 
            'Smin0', 'Smin1', 'Smin2', 'Smin3', 'Smin4', 'Smin5', 'Smin6', 'Smin7', 'Smin8', 'Smin9', 'Smin10', 'Smin11', 'Smin12', 'Smin13', 'Smin14', 'Smin15', 'Smin16', 
            'Smin17', 'Smin18', 'Smin19', 'Smin20', 'Smin21', 'Smin22', 'Smin23', 'Smin24', 'Smin25', 'Smin26', 'Smin27', 'Smin28', 'Smin29', 'Smin30', 'Smin31', 'Smin32', 
            'Smin33', 'Smin34', 'Smin35', 'Smin36', 'Smin37', 'Smin38', 'Smin39', 'Smin40', 'Smin41', 'Smin42', 'Smin43', 'Smin44', 'Smin45', 'Smin46', 'Smin47', 'Smin48', 
            'Smin49', 'Smin50', 'Smin51', 'Smin52', 'Smin53', 'Smin54', 'Smin55', 'Smin56', 'Smin57', 'Smin58', 'Smin59', 'Smin60', 'Smin61', 'Smin62', 'Smin63', 'Smin64', 
            'Smin65', 'Smin66', 'Smin67', 'Smin68', 'Smin69', 'Smin70', 'Smin71', 'Smin72', 'Smin73', 'Smin74', 'Smin75', 'Smin76', 'Smin77', 'Smin78', 'Shev', 'Scar', 
            'Shal', 'Shet', 'Save', 'Smax', 'Smin', 'DS', 'MATSm1', 'MATSm2', 'MATSm3', 'MATSm4', 'MATSm5', 'MATSm6', 'MATSm7', 'MATSm8', 'MATSv1', 'MATSv2', 'MATSv3', 
            'MATSv4', 'MATSv5', 'MATSv6', 'MATSv7', 'MATSv8', 'MATSe1', 'MATSe2', 'MATSe3', 'MATSe4', 'MATSe5', 'MATSe6', 'MATSe7', 'MATSe8', 'MATSp1', 'MATSp2', 'MATSp3', 
            'MATSp4', 'MATSp5', 'MATSp6', 'MATSp7', 'MATSp8', 'GATSm1', 'GATSm2', 'GATSm3', 'GATSm4', 'GATSm5', 'GATSm6', 'GATSm7', 'GATSm8', 'GATSv1', 'GATSv2', 'GATSv3', 
            'GATSv4', 'GATSv5', 'GATSv6', 'GATSv7', 'GATSv8', 'GATSe1', 'GATSe2', 'GATSe3', 'GATSe4', 'GATSe5', 'GATSe6', 'GATSe7', 'GATSe8', 'GATSp1', 'GATSp2', 'GATSp3', 
            'GATSp4', 'GATSp5', 'GATSp6', 'GATSp7', 'GATSp8', 'LogP', 'LogP2', 'MR', 'Hy', 'UI', 'SPP', 'LDI', 'Rnc', 'Rpc', 'Mac', 'Tac', 'Mnc', 'Tnc', 'Mpc', 'Tpc', 'Qass', 
            'QOss', 'QNss', 'QCss', 'QHss', 'Qmin', 'Qmax', 'QOmin', 'QNmin', 'QCmin', 'QHmin', 'QOmax', 'QNmax', 'QCmax', 'QHmax', 'TPSA1', 'slogPVSA0', 'slogPVSA1', 'slogPVSA2', 
            'slogPVSA3', 'slogPVSA4', 'slogPVSA5', 'slogPVSA6', 'slogPVSA7', 'slogPVSA8', 'slogPVSA9', 'slogPVSA10', 'slogPVSA11', 'MRVSA0', 'MRVSA1', 'MRVSA2', 'MRVSA3', 
            'MRVSA4', 'MRVSA5', 'MRVSA6', 'MRVSA7', 'MRVSA8', 'MRVSA9', 'PEOEVSA0', 'PEOEVSA1', 'PEOEVSA2', 'PEOEVSA3', 'PEOEVSA4', 'PEOEVSA5', 'PEOEVSA6', 'PEOEVSA7', 
            'PEOEVSA8', 'PEOEVSA9', 'PEOEVSA10', 'PEOEVSA11', 'PEOEVSA12', 'PEOEVSA13', 'EstateVSA0', 'EstateVSA1', 'EstateVSA2', 'EstateVSA3', 'EstateVSA4', 'EstateVSA5', 
            'EstateVSA6', 'EstateVSA7', 'EstateVSA8', 'EstateVSA9', 'EstateVSA10', 'VSAEstate0', 'VSAEstate1', 'VSAEstate2', 'VSAEstate3', 'VSAEstate4', 'VSAEstate5', 'VSAEstate6', 
            'VSAEstate7', 'VSAEstate8', 'VSAEstate9', 'ATSm1', 'ATSm2', 'ATSm3', 'ATSm4', 'ATSm5', 'ATSm6', 'ATSm7', 'ATSm8', 'ATSv1', 'ATSv2', 'ATSv3', 'ATSv4', 'ATSv5', 'ATSv6',
            'ATSv7', 'ATSv8', 'ATSe1', 'ATSe2', 'ATSe3', 'ATSe4', 'ATSe5', 'ATSe6', 'ATSe7', 'ATSe8', 'ATSp1', 'ATSp2', 'ATSp3', 'ATSp4', 'ATSp5', 'ATSp6', 'ATSp7', 'ATSp8'] 
len(expected)

822

## Predictions

In [19]:
# Load a saved prediction table (columns: name, smiles, pred, score) and
# preview it.
# NOTE(review): the file name carries an earlier timestamp (20210112) than
# the prediction run above -- presumably the results obtained before feature
# selection; confirm.
df_res = pd.read_csv("../output/predicted_20210112-115218.csv", index_col=0)
df_res.head()

Unnamed: 0,name,smiles,pred,score
0,Fluphenazine,C1CN(CCN1CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)C(F)(F...,1,0.6255
1,Citarinostat,C1=CC=C(C=C1)N(C2=CC=CC=C2Cl)C3=NC=C(C=N3)C(=O...,0,0.274
2,Chloroquine diphosphate,CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl.OP(=O)(...,0,0.174
3,Arbidol hydrochloride,CCOC(=O)C1=C(N(C2=CC(=C(C(=C21)CN(C)C)O)Br)C)C...,0,0.374889
4,Nitazoxanide,CC(=O)OC1=CC=CC=C1C(=O)NC2=NC=C(S2)[N+](=O)[O-],0,0.192


In [20]:
df_res.shape

(80, 4)

In [21]:
# Compounds the model labelled as positive (pred == 1).
df_pred_pos = df_res.loc[df_res["pred"].eq(1)]

In [22]:
# Number of predicted positives, followed by the rows themselves.
print(df_pred_pos.shape[0])
df_pred_pos

37


Unnamed: 0,name,smiles,pred,score
0,Fluphenazine,C1CN(CCN1CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)C(F)(F...,1,0.6255
5,Galunisertib,CC1=NC(=CC=C1)C2=NN3CCCC3=C2C4=C5C=C(C=CC5=NC=...,1,0.7915
8,Berzosertib,CC(C)S(=O)(=O)C1=CC=C(C=C1)C2=CN=C(C(=N2)C3=CC...,1,0.857126
10,Binimetinib,CN1C=NC2=C1C=C(C(=C2F)NC3=C(C=C(C=C3)Br)F)C(=O...,1,0.516336
12,Selumetinib,CN1C=NC2=C1C=C(C(=C2F)NC3=C(C=C(C=C3)Br)Cl)C(=...,1,0.706901
14,Selinexor,C1=CN=C(C=N1)NNC(=O)C=CN2C=NC(=N2)C3=CC(=CC(=C...,1,0.910742
16,Cobimetinib,C1CCNC(C1)C2(CN(C2)C(=O)C3=C(C(=C(C=C3)F)F)NC4...,1,0.627
20,Peficitinib,C1C2CC3CC(C2)(CC1C3NC4=C5C=CNC5=NC=C4C(=O)N)O,1,0.530309
21,Buparlisib,C1COCCN1C2=NC(=NC(=C2)C3=CN=C(C=C3C(F)(F)F)N)N...,1,0.873849
22,Semaxanib,CC1=CC(=C(N1)C=C2C3=CC=CC=C3NC2=O)C,1,0.69112


## Predictions (remove similar and lowvar)

In [74]:
# Load the CSV written by the prediction run on the reduced feature set (the
# path matches the "Results saved to" message printed by that cell) and
# preview it. This re-binds df_res, replacing the table loaded in the
# previous section.
df_res = pd.read_csv("../output/unknown_predicted_20210124-025942.csv", index_col=0)
df_res.head()

Unnamed: 0,name,smiles,pred,score
0,Fluphenazine,C1CN(CCN1CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)C(F)(F...,1,0.567
1,Citarinostat,C1=CC=C(C=C1)N(C2=CC=CC=C2Cl)C3=NC=C(C=N3)C(=O...,0,0.425
2,Chloroquine diphosphate,CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl.OP(=O)(...,1,0.652
3,Arbidol hydrochloride,CCOC(=O)C1=C(N(C2=CC(=C(C(=C21)CN(C)C)O)Br)C)C...,1,0.593
4,Nitazoxanide,CC(=O)OC1=CC=CC=C1C(=O)NC2=NC=C(S2)[N+](=O)[O-],1,0.648


In [75]:
df_res.shape

(77, 4)

In [76]:
# Compounds labelled positive (pred == 1) by the reduced-feature model.
df_pred_pos = df_res.loc[df_res["pred"].eq(1)]

In [80]:
# Number of predicted positives, then the rows ranked from highest to lowest
# score.
print(df_pred_pos.shape[0])
df_pred_pos.sort_values("score", ascending=False)

58


Unnamed: 0,name,smiles,pred,score
43,Varespladib,CCC1=C(C2=C(N1CC3=CC=CC=C3)C=CC=C2OCC(=O)O)C(=...,1,0.881
73,AZD3463,COC1=C(C=CC(=C1)N2CCC(CC2)N)NC3=NC=C(C(=N3)C4=...,1,0.846
50,Niraparib,C1CC(CNC1)C2=CC=C(C=C2)N3C=C4C=CC=C(C4=N3)C(=O)N,1,0.821
65,TAE226,CNC(=O)C1=CC=CC=C1NC2=NC(=NC=C2Cl)NC3=C(C=C(C=...,1,0.766
66,Defactinib,CNC(=O)C1=CC=C(C=C1)NC2=NC=C(C(=N2)NCC3=NC=CN=...,1,0.754
60,GSK2982772,CN1C2=CC=CC=C2OCC(C1=O)NC(=O)C3=NNC(=N3)CC4=CC...,1,0.748
26,Disulfiram,CCN(CC)C(=S)SSC(=S)N(CC)CC,1,0.745
34,AZD8186,CC(C1=CC(=CC2=C1OC(=CC2=O)N3CCOCC3)C(=O)N(C)C)...,1,0.738
74,Osimertinib,CN1C=C(C2=CC=CC=C21)C3=NC(=NC=C3)NC4=C(C=C(C(=...,1,0.716
48,IWR-1-endo,C1C2C=CC1C3C2C(=O)N(C3=O)C4=CC=C(C=C4)C(=O)NC5...,1,0.716
