# Preprocess the unknown compounds and generate features

Clean, check and featurize the unknown commercially available drugs for model validation

In [2]:
__author__ = "Jing-Quan Wang"

In [3]:
# Autoreload modules
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from mrp7pred.mrp7pred import MRP7Pred
from mrp7pred.utils import (
    DATA,
    MODEL_DIR,
    OUTPUT,
)
from mrp7pred.feats.gen_all_features import featurize
import pickle

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
DATA_FOLDER = "../data/manual"

## 1. Load data

In [5]:
df = pd.read_csv(f"{DATA_FOLDER}/unknown.csv")
df

Unnamed: 0,name,synonym,cas,target,status,url,smiles
0,Fluphenazine,Prolixin,146-56-5,D1DR and D2DR inhibitor,On market,http://www.selleckchem.com/products/fluphenazi...,C1CN(CCN1CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)C(F)(F...
1,Citarinostat,ACY241,1316215-12-9,histone deacetylase (HDAC) inhibitor,Phase 1,https://www.medchemexpress.com//Citarinostat.html,C1=CC=C(C=C1)N(C2=CC=CC=C2Cl)C3=NC=C(C=N3)C(=O...
2,Chloroquine diphosphate,,50-63-5,autophagy and toll-like receptors (TLRs) inhib...,Phase 1/2/3/4,https://www.medchemexpress.com/Chloroquine-dip...,CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl.OP(=O)(...
3,Arbidol hydrochloride,Umifenovir hydrochloride,131707-23-8,anti-influenza virus agent,Phase 4,https://www.medchemexpress.com/Arbidol-hydroch...,CCOC(=O)C1=C(N(C2=CC(=C(C(=C21)CN(C)C)O)Br)C)C...
4,Nitazoxanide,,55981-09-4,synthetic nitrothiazolyl-salicylamide derivati...,Phase 1/2/3/4,https://www.medchemexpress.com/nitazoxanide.html,CC(=O)OC1=CC=CC=C1C(=O)NC2=NC=C(S2)[N+](=O)[O-]
...,...,...,...,...,...,...,...
75,Gefitinib,ZD1839,184475-35-2,EGFR inhibitor,Phase 1/2/3,https://www.selleckchem.com/products/Gefitinib...,COC1=C(C=C2C(=C1)N=CN=C2NC3=CC(=C(C=C3)F)Cl)OC...
76,Erlotinib,CP358774,183321-74-6,EGFR inhibitor,Phase 1/2/4,https://www.selleckchem.com/products/erlotinib...,COCCOC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=CC(=C3)C#C...
77,FRAX486,,1232030-35-1,PAK inhibitor,,https://www.selleckchem.com/products/frax486.html,CCN1C2=NC(=NC=C2C=C(C1=O)C3=C(C=C(C=C3)Cl)Cl)N...
78,AZD4635,HTL1071,1321514-06-0,A2AR antagonist,Phase 1/2,https://www.medchemexpress.com/AZD4635.html,NC1=NC(C2=CC=C(F)C=C2)=C(C3=CC(Cl)=NC(C)=C3)N=N1


In [6]:
len(df)

80

## 2. Check null smiles

In [7]:
df[df["smiles"].isna()]

Unnamed: 0,name,synonym,cas,target,status,url,smiles


## 3. Check duplicates

In [8]:
len(df.drop_duplicates())

80

## 4. Generate features

In [9]:
# with open("./df_feats.pkl", "rb") as fi:
#     df_feats_prev = pickle.load(fi)
# df_feats_prev.head()

df = df[["name", "smiles"]]
df_feats = featurize(df, prefix="featurized_unknown_")

Fluphenazine Featurization failed
Smiles: Cl.Cl.OCCN1CCN(CCCN2c3ccccc3Sc3ccc(C(F)(F)F)cc32)CC1
Error: Time out (10s)
1. Citarinostat
SMILES: O=C(CCCCCCNC(=O)c1cnc(N(c2ccccc2)c2ccccc2Cl)nc1)NO

Chloroquine diphosphate Featurization failed
Smiles: CCN(CC)CCCC(C)Nc1ccnc2cc(Cl)ccc12.O=P(O)(O)O.O=P(O)(O)O
Error: Time out (10s)
Arbidol hydrochloride Featurization failed
Smiles: CCOC(=O)c1c(CSc2ccccc2)n(C)c2cc(Br)c(O)c(CN(C)C)c12.Cl
Error: Time out (10s)
4. Nitazoxanide
SMILES: CC(=O)Oc1ccccc1C(=O)Nc1ncc([N+](=O)[O-])s1

5. Galunisertib
SMILES: Cc1cccc(-c2nn3c(c2-c2ccnc4ccc(C(N)=O)cc24)CCC3)n1

6. Ensartinib
SMILES: CC(Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2)nnc1N)c1c(Cl)ccc(F)c1Cl

7. Anlotinib
SMILES: COc1cc2c(Oc3ccc4[nH]c(C)cc4c3F)ccnc2cc1OCC1(N)CC1

8. Berzosertib
SMILES: CNCc1ccc(-c2cc(-c3nc(-c4ccc(S(=O)(=O)C(C)C)cc4)cnc3N)on2)cc1

9. Ribociclib
SMILES: CN(C)C(=O)c1cc2cnc(Nc3ccc(N4CCNCC4)cn3)nc2n1C1CCCC1

10. Binimetinib
SMILES: Cn1cnc2c(F)c(Nc3ccc(Br)cc3F)c(C(=O)NOCCO)cc21

11. Odanacati

In [9]:
# m7p = MRP7Pred(clf_dir=f"{MODEL_DIR}/best_model_20210112-032455.pkl")
# out = m7p.predict(df_all = df_data)

# Model trained by manual data only

Model is trained on manual data only

### Feature selector

In [12]:
support_similar = np.array([  0,   7,   8,  15,  16,  17,  18,  19,  22,  23,  24,  25,  30,
        33,  34,  35,  37,  39,  40,  41,  42,  43,  44,  46,  47,  48,
        49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  61,  64,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  82,  83,
        84,  85,  87,  88,  90,  91,  92,  93,  94,  95,  97,  98, 100,
       102, 103, 104, 105, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 133, 138, 160, 162, 163, 164, 165, 173, 174, 175, 184, 190,
       192, 193, 197, 213, 214, 216, 217, 228, 245, 246, 250, 257, 263,
       271, 276, 277, 278, 279, 291, 295, 298, 302, 303, 304, 310, 319,
       325, 326, 327, 335, 343, 351, 359, 374, 384, 385, 388, 390, 391,
       392, 397, 398, 399, 400, 401, 402, 405, 407, 409, 410, 412, 413,
       415, 416, 418, 419, 420, 423, 426, 429, 432, 435, 436, 437, 438,
       439, 440, 441, 443, 444, 451, 452, 453, 454, 455, 456, 457, 458,
       459, 460, 461, 462, 463, 464, 465, 467, 468, 469, 470, 472, 473,
       474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 488, 489,
       491, 492, 495, 496, 497, 498, 499, 500, 502, 503, 504, 505, 506,
       507, 508, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524,
       525, 526, 527, 528, 530, 531, 532, 533, 534, 535, 536, 537, 538,
       539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551,
       552, 553, 554, 555, 556, 557, 558, 559, 560, 563, 564, 565, 566,
       567, 568, 569, 570, 571, 572, 573, 574, 576, 577, 581, 587, 588,
       589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 601, 602,
       608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620,
       621, 622, 623, 625, 626, 627, 628, 630, 631, 632, 633, 635, 636,
       638, 640, 641, 642, 643, 644, 645, 647, 648, 649, 650, 652, 653,
       655, 656, 657, 658, 659, 661, 662, 663, 664, 665, 666, 667, 668,
       670, 671, 674, 675, 676, 677, 678, 679, 680, 681, 684, 685, 686,
       691, 692, 693, 695, 696, 697, 699, 700, 701, 702, 703, 704, 705,
       707, 711, 712, 713, 714, 715, 716, 718, 725, 727, 728, 729, 730,
       731, 732, 733, 734, 735, 736, 738, 740, 741, 742, 743, 744, 745,
       746, 747, 751, 752, 753, 754, 758, 761, 762, 763, 764, 765, 766,
       767, 768, 769, 770, 771, 772, 773, 774, 775, 786, 787, 788, 789,
       790, 791, 793, 794, 826])

support_lowvar = np.array([  0,   1,   2,   4,   5,   6,   7,   8,   9,  10,  11,  12,  14,
        16,  17,  18,  19,  20,  23,  24,  25,  27,  28,  32,  34,  37,
        40,  44,  45,  47,  48,  53,  54,  55,  56,  57,  60,  62,  63,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  83,  84,  85,  87,  88,  89,  90,  91,  92,  93,
        94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106,
       107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
       120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 137, 138,
       139, 141, 142, 143, 147, 149, 151, 190, 191, 192, 193, 195, 196,
       198, 201, 202, 204, 205, 206, 207, 208, 217, 220, 221, 222, 223,
       224, 240, 245, 256, 257, 258, 259, 260, 262, 263, 264, 265, 266,
       272, 273, 274, 275, 276, 286, 310, 311, 312, 313, 314, 315, 316,
       317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
       330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342,
       343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355,
       356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368,
       369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381,
       382, 383, 384, 385, 386, 387, 388, 389, 390, 392, 393, 394, 395,
       397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409,
       410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420])

In [13]:
df_data = pd.read_csv("./featurized_unknown__full_features_828_20210206-132138.csv", index_col=0)

In [14]:
df_data

Unnamed: 0,rdk_FractionCSP3,rdk_HeavyAtomCount,rdk_HeavyAtomMolWt,rdk_NHOHCount,rdk_NOCount,rdk_RingCount,rdk_NumAliphaticCarbocycles,rdk_NumAliphaticHeterocycles,rdk_NumAliphaticRings,rdk_NumAromaticCarbocycles,...,pychem_ATSp1,pychem_ATSp2,pychem_ATSp3,pychem_ATSp4,pychem_ATSp5,pychem_ATSp6,pychem_ATSp7,pychem_ATSp8,name,smiles
0,0.250000,33,441.749,3,8,3,0,0,0,2,...,3.424,3.643,3.663,3.678,3.775,3.713,3.438,3.179,Citarinostat,O=C(CCCCCCNC(=O)c1cnc(N(c2ccccc2)c2ccccc2Cl)nc...
1,0.083333,21,298.215,1,8,2,0,0,0,1,...,2.934,3.173,3.062,3.005,2.977,2.858,2.673,2.456,Nitazoxanide,CC(=O)Oc1ccccc1C(=O)Nc1ncc([N+](=O)[O-])s1
2,0.181818,28,350.276,2,6,5,0,1,1,1,...,3.362,3.707,3.748,3.724,3.779,3.764,3.591,3.219,Galunisertib,Cc1cccc(-c2nn3c(c2-c2ccnc4ccc(C(N)=O)cc24)CCC3)n1
3,0.280000,37,522.218,3,9,4,0,1,1,2,...,3.523,3.851,3.916,3.784,3.701,3.703,3.626,3.474,Ensartinib,CC(Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2)nnc1...
4,0.260870,30,385.269,3,6,5,1,0,1,2,...,3.373,3.663,3.630,3.575,3.582,3.555,3.498,3.300,Anlotinib,COc1cc2c(Oc3ccc4[nH]c(C)cc4c3F)ccnc2cc1OCC1(N)CC1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,0.363636,31,422.718,1,7,4,0,1,1,2,...,3.329,3.630,3.658,3.497,3.478,3.485,3.488,3.393,Gefitinib,COc1cc2ncnc(Nc3ccc(F)c(Cl)c3)c2cc1OCCCN1CCOCC1
73,0.272727,29,370.259,1,7,3,0,0,0,2,...,3.234,3.502,3.564,3.478,3.522,3.547,3.491,3.318,Erlotinib,C#Cc1cccc(Nc2ncnc3cc(OCCOC)c(OCCOC)cc23)c1
74,0.240000,35,490.220,2,7,5,0,1,1,2,...,3.526,3.872,3.961,3.905,3.806,3.700,3.569,3.455,FRAX486,CCn1c(=O)c(-c2ccc(Cl)cc2Cl)cc2cnc(Nc3ccc(N4CCN...
75,0.066667,22,304.651,2,5,3,0,0,0,1,...,3.060,3.348,3.372,3.324,3.276,3.192,2.941,2.447,AZD4635,Cc1cc(-c2nnc(N)nc2-c2ccc(F)cc2)cc(Cl)n1


In [15]:
features = df_data.iloc[:, :-2]
print(features.shape)
features

(77, 828)


Unnamed: 0,rdk_FractionCSP3,rdk_HeavyAtomCount,rdk_HeavyAtomMolWt,rdk_NHOHCount,rdk_NOCount,rdk_RingCount,rdk_NumAliphaticCarbocycles,rdk_NumAliphaticHeterocycles,rdk_NumAliphaticRings,rdk_NumAromaticCarbocycles,...,pychem_ATSe7,pychem_ATSe8,pychem_ATSp1,pychem_ATSp2,pychem_ATSp3,pychem_ATSp4,pychem_ATSp5,pychem_ATSp6,pychem_ATSp7,pychem_ATSp8
0,0.250000,33,441.749,3,8,3,0,0,0,2,...,3.671,3.562,3.424,3.643,3.663,3.678,3.775,3.713,3.438,3.179
1,0.083333,21,298.215,1,8,2,0,0,0,1,...,3.133,2.964,2.934,3.173,3.062,3.005,2.977,2.858,2.673,2.456
2,0.181818,28,350.276,2,6,5,0,1,1,1,...,3.834,3.538,3.362,3.707,3.748,3.724,3.779,3.764,3.591,3.219
3,0.280000,37,522.218,3,9,4,0,1,1,2,...,4.043,3.960,3.523,3.851,3.916,3.784,3.701,3.703,3.626,3.474
4,0.260870,30,385.269,3,6,5,1,0,1,2,...,3.818,3.641,3.373,3.663,3.630,3.575,3.582,3.555,3.498,3.300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,0.363636,31,422.718,1,7,4,0,1,1,2,...,3.821,3.779,3.329,3.630,3.658,3.497,3.478,3.485,3.488,3.393
73,0.272727,29,370.259,1,7,3,0,0,0,2,...,3.852,3.714,3.234,3.502,3.564,3.478,3.522,3.547,3.491,3.318
74,0.240000,35,490.220,2,7,5,0,1,1,2,...,3.895,3.760,3.526,3.872,3.961,3.905,3.806,3.700,3.569,3.455
75,0.066667,22,304.651,2,5,3,0,0,0,1,...,3.278,3.001,3.060,3.348,3.372,3.324,3.276,3.192,2.941,2.447


In [16]:
features_remove_similar = features.iloc[:, support_similar]
features_remove_lowvar = features_remove_similar.iloc[:, support_lowvar]

In [17]:
features_remove_lowvar["name"] = df_data["name"]
features_remove_lowvar["smiles"] = df_data["smiles"]

In [18]:
m7p = MRP7Pred(clf_dir=f"{MODEL_DIR}/best_model_20210112-110103.pkl")
out = m7p.predict(featurized_df = features_remove_lowvar, prefix="unknown_")

Loading trained model ... Done!
Start predicting ...

ValueError: X has 258 features, but Normalizer is expecting 828 features as input.

# -1. Model `roc91_best_model_20210127-220922.pkl`

59/77 positive

In [26]:
support_similar = np.array([  0,   7,   8,  11,  12,  15,  17,  18,  19,  23,  25,  26,  29,
        33,  34,  35,  36,  37,  41,  43,  44,  45,  46,  47,  48,  49,
        50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  61,  62,  64,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  82,  83,
        84,  85,  87,  88,  90,  91,  92,  93,  94,  95,  97,  98, 100,
       102, 103, 104, 105, 130, 133, 162, 163, 164, 165, 173, 174, 175,
       184, 189, 190, 192, 193, 197, 204, 212, 213, 228, 245, 246, 250,
       254, 257, 263, 271, 274, 278, 279, 291, 295, 300, 301, 302, 303,
       311, 325, 326, 335, 351, 359, 373, 374, 375, 385, 386, 389, 390,
       391, 392, 397, 398, 399, 400, 401, 402, 405, 407, 410, 412, 413,
       415, 416, 418, 419, 423, 424, 426, 429, 430, 431, 434, 435, 436,
       437, 438, 439, 440, 441, 443, 444, 451, 452, 453, 454, 455, 456,
       457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469,
       470, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483,
       484, 488, 489, 491, 492, 495, 496, 497, 498, 499, 500, 502, 503,
       508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520,
       522, 523, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540,
       541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553,
       554, 555, 556, 557, 558, 559, 560, 563, 564, 565, 566, 567, 568,
       569, 570, 571, 572, 573, 574, 576, 577, 581, 582, 583, 584, 585,
       586, 587, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603,
       604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616,
       617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 630,
       631, 632, 633, 635, 638, 640, 641, 642, 643, 644, 645, 647, 648,
       649, 650, 652, 655, 657, 658, 660, 661, 663, 664, 665, 666, 667,
       668, 669, 670, 671, 672, 674, 675, 676, 677, 678, 679, 680, 681,
       684, 685, 689, 691, 692, 693, 694, 695, 696, 697, 699, 700, 701,
       702, 703, 704, 706, 707, 711, 712, 713, 714, 715, 716, 718, 723,
       724, 725, 727, 728, 730, 731, 732, 733, 734, 735, 736, 738, 740,
       741, 742, 743, 744, 745, 746, 747, 751, 752, 754, 756, 758, 761,
       762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774,
       775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 786, 789, 790,
       791, 793, 794, 826])

In [27]:
len(support_similar)

420

In [31]:
df_data = pd.read_csv("./featurized_unknown__full_features_828_20210206-132138.csv", index_col=0)

In [32]:
df_data.dropna(inplace=True)
features = df_data.iloc[:, :-2]
print(features.shape)
features_remove_similar = features.iloc[:, support_similar]
features_remove_similar["name"] = df_data["name"]
features_remove_similar["smiles"] = df_data["smiles"]
features_remove_similar.shape

(77, 828)


(77, 422)

In [33]:
m7p = MRP7Pred(clf_dir=f"{MODEL_DIR}/roc91_best_model_20210127-220922.pkl")
out = m7p.predict(featurized_df = features_remove_similar, prefix="unknown_")

Loading trained model ... Done!
Start predicting ...Done!
Writing output ...Done! Results saved to: ../output/unknown_predicted_20210206-132857.csv


In [34]:
df_res_2 = pd.read_csv("../output/unknown_predicted_20210206-132857.csv", index_col=0)
df_pred_pos_2 = df_res_2[df_res_2["pred"]==1]
print(len(df_pred_pos_2), len(df_res_2))
df_pred_pos_2.sort_values(by=["score"], ascending=False)

59 77


Unnamed: 0,name,smiles,pred,score
43,Azacitidine,Nc1ncn(C2OC(CO)C(O)C2O)c(=O)n1,1,0.87
50,Darapladib,CCN(CC)CCN(Cc1ccc(-c2ccc(C(F)(F)F)cc2)cc1)C(=O...,1,0.838
60,RXDX-106,COc1cc2nccc(Oc3ccc(NC(=O)c4cn(C(C)C)c(=O)n(-c5...,1,0.829
46,AZ628,Cc1ccc(NC(=O)c2cccc(C(C)(C)C#N)c2)cc1Nc1ccc2nc...,1,0.769
73,Erlotinib,C#Cc1cccc(Nc2ncnc3cc(OCCOC)c(OCCOC)cc23)c1,1,0.76
10,Pilaralisib,COc1ccc(Cl)c(Nc2nc3ccccc3nc2NS(=O)(=O)c2cccc(N...,1,0.755
67,Rociletinib,C=CC(=O)Nc1cccc(Nc2nc(Nc3ccc(N4CCN(C(C)=O)CC4)...,1,0.742
34,TAE684,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,1,0.738
5,Berzosertib,CNCc1ccc(-c2cc(-c3nc(-c4ccc(S(=O)(=O)C(C)C)cc4...,1,0.729
74,FRAX486,CCn1c(=O)c(-c2ccc(Cl)cc2Cl)cc2cnc(Nc3ccc(N4CCN...,1,0.725


# 0. What if we use the model to predict positive or negative training data?

In [14]:
df_pos = pd.read_csv("../data/manual/positive.csv")
df_neg = pd.read_csv("../data/manual/negative.csv")
pos_l = df_pos["compound_name"].values.tolist()
neg_l = df_neg["compound_name"].values.tolist()
total_l = pos_l + neg_l

In [19]:
df_featurized_all = pd.read_csv("../data/all_compounds_828_features_renamed.csv", index_col=0)
df_featurized_pos = df_featurized_all[(df_featurized_all["name"].isin(total_l))&(df_featurized_all["label"]==1)]
df_featurized_neg = df_featurized_all[(df_featurized_all["name"].isin(total_l))&(df_featurized_all["label"]==0)]

In [20]:
len(df_featurized_pos), len(df_featurized_neg)

(53, 63)

In [21]:
print("positive:", df_featurized_pos["name"].values.tolist())

positive: ['paclitaxel', 'vincristine', 'LTC4', 'E217bG', 'gemcitabine', 'araC', 'docetaxel', 'epothiloneB', 'vinblastine', 'vinorelbine', 'tenofovir', 'Lapatinib', 'Erlotinib', 'PD-173074', 'AlstolucinesB ', 'Nilotinib', 'Vemurafenib', 'Tandutinib', 'BBA', 'Tariquidar', 'Ibrutinib', '\ufeffNVP-BHG712', '\ufeffLinsitinib', '\ufeffPonatinib', '\ufeffCepharanthine', 'Imatinib', 'Sildenafil', 'Vardenafil', 'DNP-SG', 'Doxorubicin', 'Sulfinpyrazone', '\ufeffGlycolithocholate-3-sulfate', 'MK571', '\ufeffTrequinsin', '\ufeffTaurocholate', '17beta-Estradiol 3-sulfate-17-(beta-D-glucuronide)', '\ufeffZaprinast', '\ufeff16alpha,17beta-Estriol 3-sulfate (Estradiol-3-sulfate)', '\ufeff17alpha-Ethynyl-17beta-estradiol (Ethinyl estradiol)', '\ufeff16alpha,17beta-Estriol 16-(beta-d-glucuronide)', '\ufeffGlycocholate', '\ufeff16alpha,17beta-Estriol 3-(beta-d-glucuronide) (Estriol 3-glucuronide)', '\ufeff17beta-Estradiol 3-(beta-d-glucuronide)', '\ufeff17-betaEstradiol', 'Verapamil', '\ufeffCyclosporin

In [22]:
print("negative:", df_featurized_neg["name"].values.tolist())

negative: ['5-azacytidine', '\ufeffProbenecid', 'cAMP', '\ufeffGlucuronic acid', 'PSC833', 'sn38', 'daunorubicin', 'dasatinib', '5-FU', '5-dFUrd', '5-FdUrd', "CdA (2'-chloro-2'-deoxyadenosine)", '6-TG', '6-MP', 'mitoxantrone', 'epothilone A', 'phomopsin A', 'MAC321', 'MST997', 'HTI286', 'tadalafil', 'Sipholenone E', 'Sipholenol L', 'Siphonellinol D', 'Sipholenol J', 'motesanib', 'cabazitaxel', 'ribociclib', 'pd0325901', 'mek162', 'A-803467', 'WHI‐P154', 'icotinib', 'telatinib', 'CCTA-2', 'CCTA-3', 'CCTA-5', 'CCTA-7', 'CCTA-8', 'CCTA-9', 'CCTA-10', 'CCTA-11', 'CCTA-13', 'CCTA-14', 'CCTA-20', 'CCTA-21', 'CCTA-22', 'CCTA-23', 'CCTA-27', 'CCTA-31', 'CCTA-32', 'CCTA-33', 'CCTA-34', 'CCTA-37', 'CCTA-40', 'CCTA-41', 'CCTA-42', 'CCTA-43', 'CCTA-45', 'CCTA-46', 'CCTA-47', 'CCTA-48', 'CCTA-49']


In [24]:
support_similar = np.array([  0,   3,   4,   5,   6,   7,   8,  10,  11,  12,  13,  14,  15,
         17,  18,  19,  21,  22,  23,  24,  25,  26,  29,  31,  32,  33,
         34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,
         47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
         61,  62,  64,  65,  67,  68,  69,  70,  71,  72,  73,  74,  75,
         76,  77,  78,  82,  83,  84,  85,  87,  88,  90,  91,  92,  93,
         94,  95,  96,  97,  98, 100, 101, 102, 103, 104, 105, 106, 107,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 132, 133, 156, 157, 158, 159, 160, 161, 162, 173, 174, 175,
        176, 189, 191, 192, 193, 194, 195, 197, 198, 206, 209, 210, 212,
        215, 216, 225, 227, 228, 231, 235, 236, 237, 240, 241, 242, 244,
        245, 246, 247, 248, 249, 250, 251, 254, 255, 256, 257, 258, 259,
        260, 263, 265, 266, 271, 273, 274, 275, 276, 277, 278, 279, 280,
        282, 290, 292, 293, 294, 295, 296, 297, 298, 299, 301, 302, 303,
        304, 305, 306, 307, 311, 312, 313, 314, 315, 316, 317, 318, 319,
        320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332,
        333, 334, 335, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346,
        347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
        367, 368, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384,
        385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 396, 397, 398,
        399, 400, 401, 402, 403, 405, 407, 408, 409, 410, 411, 412, 413,
        414, 415, 416, 418, 419, 420, 421, 422, 423, 425, 426, 428, 429,
        430, 431, 432, 434, 435, 436, 437, 438, 439, 440, 441, 443, 444,
        450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462,
        463, 464, 465, 467, 468, 469, 470, 472, 473, 474, 475, 476, 477,
        478, 479, 480, 481, 482, 483, 484, 486, 487, 488, 489, 491, 492,
        493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505,
        508, 509, 510, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523,
        524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536,
        537, 538, 539, 540, 541, 542, 543, 544, 546, 547, 548, 549, 551,
        552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 563, 564, 565,
        566, 567, 568, 569, 570, 571, 572, 573, 574, 576, 577, 580, 581,
        584, 585, 586, 587, 589, 590, 591, 592, 593, 594, 595, 596, 597,
        598, 599, 601, 602, 609, 610, 611, 612, 613, 614, 615, 616, 617,
        618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630,
        631, 632, 633, 634, 635, 636, 637, 638, 639, 641, 642, 643, 644,
        645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657,
        658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670,
        671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683,
        684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696,
        697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709,
        710, 711, 712, 713, 714, 715, 716, 718, 720, 721, 722, 723, 724,
        725, 726, 728, 729, 730, 731, 732, 733, 734, 735, 736, 739, 740,
        741, 742, 743, 744, 745, 746, 747, 751, 752, 758, 759, 760, 761,
        762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774,
        786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798,
        799, 803, 813, 814, 816, 817, 818, 819, 820, 823, 824, 827])

In [26]:
features_pos = df_featurized_pos.iloc[:, 2:]
features_neg = df_featurized_neg.iloc[:, 2:]

features_remove_similar_pos = features_pos.iloc[:, support_similar]
features_remove_similar_neg = features_neg.iloc[:, support_similar]

features_remove_similar_pos["name"] = df_featurized_pos["name"]
features_remove_similar_pos["smiles"] = df_featurized_pos["smiles"]

features_remove_similar_neg["name"] = df_featurized_neg["name"]
features_remove_similar_neg["smiles"] = df_featurized_neg["smiles"]

features_remove_similar_pos.shape, features_remove_similar_neg.shape

((53, 612), (63, 612))

In [30]:
m7p = MRP7Pred(clf_dir=f"{MODEL_DIR}/best_model_20210211-031248.pkl")
out_pos = m7p.predict(featurized_df = features_remove_similar_pos, prefix="train_pos_")
out_neg = m7p.predict(featurized_df = features_remove_similar_neg, prefix="train_neg_")

Loading trained model ... Done!
Start predicting ...Done!
Writing output ...Done! Results saved to: ../output/train_pos_predicted_20210211-031558.csv
Start predicting ...Done!
Writing output ...Done! Results saved to: ../output/train_neg_predicted_20210211-031558.csv


In [32]:
df_res_pos = pd.read_csv("../output/train_pos_predicted_20210211-031558.csv", index_col=0)
df_pred_pos_pos = df_res_pos[df_res_pos["pred"]==1]
print(len(df_pred_pos_pos), len(df_res_pos))
df_pred_pos_pos.sort_values(by=["score"], ascending=False)

53 53


Unnamed: 0,name,smiles,pred,score
35,17beta-Estradiol 3-sulfate-17-(beta-D-glucuron...,CC12CCC3c4ccc(OS(=O)(=O)O)cc4CCC3C1CCC2OC1OC(C...,1,0.84477
19,Tariquidar,COc1cc2c(cc1OC)CN(CCc1ccc(NC(=O)c3cc(OC)c(OC)c...,1,0.83625
3,E217bG,CC12CCC3c4ccc(O)cc4CCC3C1CCC2OC1OC(C(=O)O)C(O)...,1,0.827021
20,Ibrutinib,C=CC(=O)N1CCCC(n2nc(-c3ccc(Oc4ccccc4)cc3)c3c(N...,1,0.823265
41,"﻿16alpha,17beta-Estriol 3-(beta-d-glucuronide)...",CC12CCC3c4ccc(OC5OC(C(=O)O)C(O)C(O)C5O)cc4CCC3...,1,0.823003
27,Vardenafil,CCCc1nc(C)c2c(=O)[nH]c(-c3cc(S(=O)(=O)N4CCN(CC...,1,0.819685
9,vinorelbine,CCC1=CC2CN(C1)Cc1c([nH]c3ccccc13)C(C(=O)OC)(c1...,1,0.818585
26,Sildenafil,CCCc1nn(C)c2c(=O)[nH]c(-c3cc(S(=O)(=O)N4CCN(C)...,1,0.81831
8,vinblastine,CCC1(O)CC2CN(CCc3c([nH]c4ccccc34)C(C(=O)OC)(c3...,1,0.818279
42,﻿17beta-Estradiol 3-(beta-d-glucuronide),CC12CCC3c4ccc(OC5OC(C(=O)O)C(O)C(O)C5O)cc4CCC3...,1,0.817264


In [33]:
df_res_neg = pd.read_csv("../output/train_neg_predicted_20210211-031558.csv", index_col=0)
df_pred_pos_neg = df_res_neg[df_res_neg["pred"]==1]
print(len(df_pred_pos_neg), len(df_res_neg))
df_pred_pos_neg.sort_values(by=["score"], ascending=False)

19 63


Unnamed: 0,name,smiles,pred,score
80,ribociclib,CN(C)C(=O)c1cc2cnc(Nc3ccc(N4CCNCC4)cn3)nc2n1C1...,1,0.788399
67,epothilone A,CC(=Cc1csc(C)n1)C1CC2OC2CCCC(C)C(O)C(C)C(=O)C(...,1,0.779341
57,sn38,CCc1c2c(nc3ccc(O)cc13)-c1cc3c(c(=O)n1C2)COC(=O...,1,0.778239
58,daunorubicin,COc1cccc2c1C(=O)c1c(O)c3c(c(O)c1C2=O)CC(O)(C(C...,1,0.763442
79,cabazitaxel,COC1C(=O)C2(C)C(OC)CC3OCC3(OC(C)=O)C2C(OC(=O)c...,1,0.738808
77,motesanib,CC1(C)CNc2cc(NC(=O)c3cccnc3NCc3ccncc3)ccc21,1,0.726574
59,dasatinib,Cc1nc(Nc2ncc(C(=O)Nc3c(C)cccc3Cl)s2)cc(N2CCN(C...,1,0.68949
69,MAC321,CCC(=O)OC1CC2OCC2(OC(C)=O)C2C(OC(=O)c3ccccc3)C...,1,0.683299
85,icotinib,C#Cc1cccc(Nc2ncnc3cc4c(cc23)OCCOCCOCCO4)c1,1,0.635481
72,tadalafil,CN1CC(=O)N2C(Cc3c([nH]c4ccccc34)C2c2ccc3c(c2)O...,1,0.622491


# 1. 2021.2.5 Only remove similar (420 features)

55/77 positive

In [36]:
support_similar = np.array([  0,   6,   7,  11,  15,  16,  18,  19,  23,  24,  25,  29,  33,
        34,  35,  37,  39,  41,  43,  44,  45,  46,  47,  48,  49,  50,
        51,  52,  53,  54,  55,  56,  57,  58,  59,  61,  62,  64,  67,
        68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  82,  83,
        84,  85,  87,  88,  90,  91,  92,  93,  94,  95,  97,  98, 100,
       101, 102, 103, 104, 105, 130, 133, 138, 156, 162, 163, 164, 165,
       168, 173, 174, 175, 178, 184, 190, 192, 193, 197, 199, 213, 214,
       228, 245, 246, 250, 257, 259, 263, 271, 274, 278, 279, 291, 295,
       300, 301, 302, 303, 319, 325, 326, 327, 343, 351, 359, 366, 367,
       374, 385, 386, 389, 390, 391, 392, 397, 398, 399, 400, 401, 402,
       405, 407, 410, 412, 413, 415, 416, 418, 419, 420, 422, 423, 425,
       426, 429, 435, 436, 437, 438, 439, 440, 441, 443, 444, 451, 452,
       453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465,
       467, 468, 469, 470, 472, 473, 474, 475, 476, 477, 478, 479, 480,
       481, 482, 483, 484, 488, 489, 491, 492, 495, 496, 497, 498, 499,
       500, 501, 502, 503, 505, 506, 507, 508, 514, 515, 516, 517, 518,
       519, 520, 522, 523, 530, 531, 532, 533, 534, 535, 536, 537, 538,
       539, 540, 541, 542, 543, 544, 546, 547, 548, 549, 551, 552, 553,
       554, 555, 556, 557, 558, 559, 560, 563, 564, 565, 566, 567, 568,
       569, 570, 571, 572, 573, 574, 576, 577, 580, 581, 587, 588, 589,
       590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602,
       603, 604, 605, 606, 607, 609, 610, 611, 612, 613, 614, 615, 616,
       617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629,
       630, 631, 632, 633, 635, 638, 640, 641, 642, 643, 644, 645, 646,
       647, 648, 649, 652, 654, 656, 657, 658, 659, 660, 661, 663, 664,
       665, 666, 668, 669, 671, 674, 675, 676, 677, 678, 679, 680, 681,
       682, 684, 685, 691, 692, 693, 694, 695, 696, 697, 699, 700, 701,
       702, 703, 704, 705, 707, 711, 712, 713, 714, 715, 716, 718, 720,
       725, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 738, 740,
       741, 743, 744, 745, 746, 747, 751, 752, 753, 754, 758, 761, 762,
       763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775,
       776, 777, 778, 779, 780, 781, 782, 783, 784, 787, 789, 790, 791,
       793, 794, 795, 826])

In [38]:
df_data = pd.read_csv("./featurized_unknown__full_features_828_20210206-132138.csv", index_col=0)
df_data.dropna(inplace=True)
features = df_data.iloc[:, :-2]
features_remove_similar = features.iloc[:, support_similar]
features_remove_similar.shape

(77, 420)

In [39]:
features_remove_similar[["name", "smiles"]] = df_data[["name", "smiles"]]
m7p = MRP7Pred(clf_dir=f"{MODEL_DIR}/best_model_20210205-111603.pkl")
out = m7p.predict(featurized_df = features_remove_similar, prefix="unknown_")

Loading trained model ... Done!
Start predicting ...Done!
Writing output ...Done! Results saved to: ../output/unknown_predicted_20210206-133145.csv


In [40]:
df_res_2 = pd.read_csv("../output/unknown_predicted_20210206-133145.csv", index_col=0)
df_pred_pos_2 = df_res_2[df_res_2["pred"]==1]
print(len(df_pred_pos_2), len(df_res_2))
df_pred_pos_2.sort_values(by=["score"], ascending=False)

55 77


Unnamed: 0,name,smiles,pred,score
50,Darapladib,CCN(CC)CCN(Cc1ccc(-c2ccc(C(F)(F)F)cc2)cc1)C(=O...,1,0.997755
11,Selinexor,O=C(C=Cn1cnc(-c2cc(C(F)(F)F)cc(C(F)(F)F)c2)n1)...,1,0.997692
27,Crenolanib,CC1(COc2ccc3c(c2)ncn3-c2ccc3cccc(N4CCC(N)CC4)c...,1,0.997318
8,Odanacatib,CC(C)(F)CC(NC(c1ccc(-c2ccc(S(C)(=O)=O)cc2)cc1)...,1,0.996955
18,Buparlisib,Nc1cc(C(F)(F)F)c(-c2cc(N3CCOCC3)nc(N3CCOCC3)n2...,1,0.992225
2,Galunisertib,Cc1cccc(-c2nn3c(c2-c2ccnc4ccc(C(N)=O)cc24)CCC3)n1,1,0.991269
37,PF-04691502,COc1ccc(-c2cc3c(C)nc(N)nc3n(C3CCC(OCCO)CC3)c2=...,1,0.989991
30,TG100-115,Nc1nc(N)c2nc(-c3cccc(O)c3)c(-c3cccc(O)c3)nc2n1,1,0.988912
48,AZD8055,COc1ccc(-c2ccc3c(N4CCOCC4C)nc(N4CCOCC4C)nc3n2)...,1,0.986881
65,SCH772984,O=C(Nc1ccc2[nH]nc(-c3ccncc3)c2c1)C1CCN(CC(=O)N...,1,0.986312


# 2. Model trained with heavy grid (60 features)

All positive

In [41]:
feats = [257, 131, 132, 133, 260, 135, 9, 266, 269, 14, 146, 22, 279, 280, 282, 27, 283, 288, 289, 291, 292, 294, 296, 297, 298, 302, 304, 305, 178, 306, 307, 308, 310, 311, 64, 71, 73, 74, 76, 78, 80, 81, 82, 86, 92, 99, 100, 102, 106, 107, 109, 113, 115, 116, 246, 248, 121, 250, 125, 126]


In [42]:
features = df_data.iloc[:, :-2]
features_60 = features.iloc[:, feats]

In [43]:
features_60.shape

(77, 60)

In [44]:
features_60[["name", "smiles"]] = df_data[["name", "smiles"]]
m7p = MRP7Pred(clf_dir=f"{MODEL_DIR}/best_model_20210121-185646_60feats_3day.pkl")
out = m7p.predict(featurized_df = features_60, prefix="unknown_")

Loading trained model ... Done!
Start predicting ...Done!
Writing output ...Done! Results saved to: ../output/unknown_predicted_20210206-133251.csv


In [45]:
df_res_2 = pd.read_csv("../output/unknown_predicted_20210206-133251.csv", index_col=0)
df_pred_pos_2 = df_res_2[df_res_2["pred"]==1]
print(len(df_pred_pos_2), len(df_res_2))
df_pred_pos_2.sort_values(by=["score"], ascending=False)

77 77


Unnamed: 0,name,smiles,pred,score
0,Citarinostat,O=C(CCCCCCNC(=O)c1cnc(N(c2ccccc2)c2ccccc2Cl)nc...,1,1.0
49,GSK2334470,CNc1nc(-c2ccc3c(N)n[nH]c3c2)cc(N2CC(C(=O)NC3CC...,1,1.0
56,PF-06840003,O=C1CC(c2c[nH]c3ccc(F)cc23)C(=O)N1,1,1.0
55,BAY-1816032,COc1cnccc1Nc1nc(-c2nn(Cc3c(F)cc(OCCO)cc3F)c3cc...,1,1.0
54,GNE-7915,CCNc1nc(Nc2cc(F)c(C(=O)N3CCOCC3)cc2OC)ncc1C(F)...,1,1.0
...,...,...,...,...
25,Ganetespib,CC(C)c1cc(-c2n[nH]c(=O)n2-c2ccc3c(ccn3C)c2)c(O...,1,1.0
24,Ixazomib Citrate,CC(C)CC(NC(=O)CNC(=O)c1cc(Cl)ccc1Cl)B1OC(=O)C(...,1,1.0
23,Disulfiram,CCN(CC)C(=S)SSC(=S)N(CC)CC,1,1.0
22,WYE354,COC(=O)Nc1ccc(-c2nc(N3CCOCC3)c3cnn(C4CCN(C(=O)...,1,1.0


# 3. Model trained with remove similar and no MLP light grid

best_model_20210205-180737.pkl

Training score:

Pipeline(steps=[('sclr', DummyScaler(scaler=StandardScaler())),
                ('clf', SVC(probability=True))])
stats
tp: 14.0
fp: 10.0
tn: 10.0
fn: 1.0

score
roc_auc: 0.7516666666666667
accuracy: 0.6857142857142857
precision: 0.5833333333333334
recall: 0.9333333333333333
specificity: 0.5
mcc: 0.461934885437156

66/77 positive

In [46]:
support_similar = np.array([  0,   6,   7,  11,  15,  16,  18,  19,  23,  24,  25,  29,  33,
        34,  35,  37,  39,  41,  43,  44,  45,  46,  47,  48,  49,  50,
        51,  52,  53,  54,  55,  56,  57,  58,  59,  61,  62,  64,  67,
        68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  82,  83,
        84,  85,  87,  88,  90,  91,  92,  93,  94,  95,  97,  98, 100,
       101, 102, 103, 104, 105, 130, 133, 138, 156, 162, 163, 164, 165,
       168, 173, 174, 175, 178, 184, 190, 192, 193, 197, 199, 213, 214,
       228, 245, 246, 250, 257, 259, 263, 271, 274, 278, 279, 291, 295,
       300, 301, 302, 303, 319, 325, 326, 327, 343, 351, 359, 366, 367,
       374, 385, 386, 389, 390, 391, 392, 397, 398, 399, 400, 401, 402,
       405, 407, 410, 412, 413, 415, 416, 418, 419, 420, 422, 423, 425,
       426, 429, 435, 436, 437, 438, 439, 440, 441, 443, 444, 451, 452,
       453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465,
       467, 468, 469, 470, 472, 473, 474, 475, 476, 477, 478, 479, 480,
       481, 482, 483, 484, 488, 489, 491, 492, 495, 496, 497, 498, 499,
       500, 501, 502, 503, 505, 506, 507, 508, 514, 515, 516, 517, 518,
       519, 520, 522, 523, 530, 531, 532, 533, 534, 535, 536, 537, 538,
       539, 540, 541, 542, 543, 544, 546, 547, 548, 549, 551, 552, 553,
       554, 555, 556, 557, 558, 559, 560, 563, 564, 565, 566, 567, 568,
       569, 570, 571, 572, 573, 574, 576, 577, 580, 581, 587, 588, 589,
       590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602,
       603, 604, 605, 606, 607, 609, 610, 611, 612, 613, 614, 615, 616,
       617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629,
       630, 631, 632, 633, 635, 638, 640, 641, 642, 643, 644, 645, 646,
       647, 648, 649, 652, 654, 656, 657, 658, 659, 660, 661, 663, 664,
       665, 666, 668, 669, 671, 674, 675, 676, 677, 678, 679, 680, 681,
       682, 684, 685, 691, 692, 693, 694, 695, 696, 697, 699, 700, 701,
       702, 703, 704, 705, 707, 711, 712, 713, 714, 715, 716, 718, 720,
       725, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 738, 740,
       741, 743, 744, 745, 746, 747, 751, 752, 753, 754, 758, 761, 762,
       763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775,
       776, 777, 778, 779, 780, 781, 782, 783, 784, 787, 789, 790, 791,
       793, 794, 795, 826])

In [48]:
df_data = pd.read_csv("./featurized_unknown__full_features_828_20210206-132138.csv", index_col=0)
df_data.dropna(inplace=True)
features = df_data.iloc[:, :-2]
features_remove_similar = features.iloc[:, support_similar]
features_remove_similar[["name", "smiles"]] = df_data[["name", "smiles"]]
m7p = MRP7Pred(clf_dir=f"{MODEL_DIR}/best_model_20210205-180737.pkl")
out = m7p.predict(featurized_df = features_remove_similar, prefix="unknown_")

Loading trained model ... Done!
Start predicting ...Done!
Writing output ...Done! Results saved to: ../output/unknown_predicted_20210206-133333.csv


In [49]:
df_res_2 = pd.read_csv("../output/unknown_predicted_20210206-133333.csv", index_col=0)
df_pred_pos_2 = df_res_2[df_res_2["pred"]==1]
print(len(df_pred_pos_2), len(df_res_2))
df_pred_pos_2.sort_values(by=["score"], ascending=False)

66 77


Unnamed: 0,name,smiles,pred,score
43,Azacitidine,Nc1ncn(C2OC(CO)C(O)C2O)c(=O)n1,1,0.856650
37,PF-04691502,COc1ccc(-c2cc3c(C)nc(N)nc3n(C3CCC(OCCO)CC3)c2=...,1,0.830362
27,Crenolanib,CC1(COc2ccc3c(c2)ncn3-c2ccc3cccc(N4CCC(N)CC4)c...,1,0.797859
2,Galunisertib,Cc1cccc(-c2nn3c(c2-c2ccnc4ccc(C(N)=O)cc24)CCC3)n1,1,0.792027
65,SCH772984,O=C(Nc1ccc2[nH]nc(-c3ccncc3)c2c1)C1CCN(CC(=O)N...,1,0.790360
...,...,...,...,...
72,Gefitinib,COc1cc2ncnc(Nc3ccc(F)c(Cl)c3)c2cc1OCCCN1CCOCC1,1,0.432580
16,Napabucasin,CC(=O)c1cc2c(o1)C(=O)c1ccccc1C2=O,1,0.420370
56,PF-06840003,O=C1CC(c2c[nH]c3ccc(F)cc23)C(=O)N1,1,0.417191
61,Vatalanib,Clc1ccc(Nc2nnc(Cc3ccncc3)c3ccccc23)cc1,1,0.415965


# 4. More weight in negative

../output/model/best_model_20210206-135046.pkl

Pipeline(steps=[('sclr', StandardScaler()),
                ('clf',
                 RandomForestClassifier(class_weight={0: 0.99, 1: 0.01}))])

stats
tp: 14.0
fp: 9.0
tn: 11.0
fn: 1.0

score
roc_auc: 0.8366666666666667
accuracy: 0.7142857142857143
precision: 0.6086956521739131
recall: 0.9333333333333333
specificity: 0.55
mcc: 0.503909833971264

60/77

In [51]:
support_similar = np.array([  0,   6,   7,   8,   9,  10,  15,  16,  17,  18,  19,  22,  23,
         24,  25,  26,  30,  32,  33,  34,  35,  36,  37,  39,  40,  41,
         42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,
         56,  57,  58,  59,  61,  64,  67,  68,  69,  70,  71,  72,  73,
         74,  75,  76,  77,  78,  82,  83,  84,  85,  87,  88,  90,  91,
         92,  93,  94,  95,  96,  97,  98, 100, 101, 102, 103, 104, 105,
        132, 133, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152,
        153, 154, 156, 162, 163, 164, 165, 166, 167, 168, 173, 174, 175,
        176, 186, 189, 191, 192, 193, 194, 195, 197, 200, 209, 211, 213,
        214, 216, 217, 228, 235, 245, 246, 249, 250, 254, 257, 259, 263,
        271, 274, 278, 279, 280, 291, 292, 295, 298, 299, 300, 301, 302,
        303, 310, 319, 325, 326, 327, 350, 351, 357, 358, 359, 363, 364,
        365, 366, 367, 372, 374, 375, 383, 384, 385, 386, 389, 390, 391,
        392, 395, 397, 398, 399, 400, 401, 402, 405, 407, 408, 409, 410,
        412, 413, 415, 416, 418, 419, 420, 422, 423, 425, 426, 429, 432,
        435, 436, 437, 438, 439, 440, 441, 443, 444, 451, 452, 453, 454,
        455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 467, 468,
        469, 470, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482,
        483, 484, 487, 488, 489, 490, 491, 492, 493, 494, 495, 497, 498,
        501, 502, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518,
        519, 520, 522, 523, 530, 531, 532, 533, 534, 535, 536, 537, 538,
        539, 540, 541, 542, 543, 544, 546, 547, 548, 549, 551, 552, 553,
        554, 555, 556, 557, 558, 559, 560, 563, 564, 565, 566, 567, 568,
        570, 571, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584,
        585, 586, 587, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602,
        603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615,
        616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628,
        629, 630, 631, 632, 633, 635, 636, 637, 638, 639, 641, 642, 643,
        644, 645, 646, 647, 648, 649, 650, 652, 653, 654, 655, 658, 659,
        660, 661, 662, 663, 664, 665, 667, 668, 669, 670, 671, 672, 673,
        674, 675, 676, 677, 678, 679, 680, 681, 682, 684, 685, 686, 687,
        688, 689, 690, 691, 692, 693, 694, 696, 697, 698, 699, 700, 701,
        702, 703, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716,
        718, 720, 721, 723, 724, 725, 726, 728, 729, 730, 731, 732, 733,
        734, 735, 736, 743, 744, 745, 746, 747, 752, 753, 754, 756, 757,
        758, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787,
        788, 789, 790, 791, 792, 793, 826])

In [52]:
df_data = pd.read_csv("./featurized_unknown__full_features_828_20210206-132138.csv", index_col=0)
df_data.dropna(inplace=True)
features = df_data.iloc[:, :-2]
features_remove_similar = features.iloc[:, support_similar]
features_remove_similar[["name", "smiles"]] = df_data[["name", "smiles"]]
m7p = MRP7Pred(clf_dir=f"{MODEL_DIR}/best_model_20210206-135046.pkl")
out = m7p.predict(featurized_df = features_remove_similar, prefix="unknown_")

Loading trained model ... Done!
Start predicting ...Done!
Writing output ...Done! Results saved to: ../output/unknown_predicted_20210206-140649.csv


In [53]:
df_res_2 = pd.read_csv("../output/unknown_predicted_20210206-140649.csv", index_col=0)
df_pred_pos_2 = df_res_2[df_res_2["pred"]==1]
print(len(df_pred_pos_2), len(df_res_2))
df_pred_pos_2.sort_values(by=["score"], ascending=False)

60 77


Unnamed: 0,name,smiles,pred,score
65,SCH772984,O=C(Nc1ccc2[nH]nc(-c3ccncc3)c2c1)C1CCN(CC(=O)N...,1,0.97
35,Gilteritinib,CCc1nc(C(N)=O)c(Nc2ccc(N3CCC(N4CCN(C)CC4)CC3)c...,1,0.92
17,Peficitinib,NC(=O)c1cnc2[nH]ccc2c1NC1C2CC3CC1CC(O)(C3)C2,1,0.92
14,Enzastaurin,Cn1cc(C2=C(c3cn(C4CCN(Cc5ccccn5)CC4)c4ccccc34)...,1,0.91
49,GSK2334470,CNc1nc(-c2ccc3c(N)n[nH]c3c2)cc(N2CC(C(=O)NC3CC...,1,0.9
50,Darapladib,CCN(CC)CCN(Cc1ccc(-c2ccc(C(F)(F)F)cc2)cc1)C(=O...,1,0.88
41,Midostaurin,COC1C(N(C)C(=O)c2ccccc2)CC2OC1(C)n1c3ccccc3c3c...,1,0.87
27,Crenolanib,CC1(COc2ccc3c(c2)ncn3-c2ccc3cccc(N4CCC(N)CC4)c...,1,0.87
74,FRAX486,CCn1c(=O)c(-c2ccc(Cl)cc2Cl)cc2cnc(Nc3ccc(N4CCN...,1,0.86
60,RXDX-106,COc1cc2nccc(Oc3ccc(NC(=O)c4cn(C(C)C)c(=O)n(-c5...,1,0.86


# 5. More weight in negative and score = precision

best_model_20210206-141413
---
Pipeline(steps=[('sclr', StandardScaler()),
                ('clf',
                 SVC(class_weight={0: 0.99, 1: 0.01}, kernel='linear',
                     probability=True))])
stats
tp: 12.0
fp: 7.0
tn: 13.0
fn: 3.0

score
roc_auc: 0.8033333333333333
accuracy: 0.7142857142857143
precision: 0.631578947368421
recall: 0.8
specificity: 0.65
mcc: 0.4470296705094523

52/77

In [54]:
df_data = pd.read_csv("./featurized_unknown__full_features_828_20210206-132138.csv", index_col=0)
df_data.dropna(inplace=True)
features = df_data.iloc[:, :-2]
features_remove_similar = features.iloc[:, support_similar]
features_remove_similar[["name", "smiles"]] = df_data[["name", "smiles"]]
m7p = MRP7Pred(clf_dir=f"{MODEL_DIR}/best_model_20210206-141413.pkl")
out = m7p.predict(featurized_df = features_remove_similar, prefix="unknown_")

Loading trained model ... Done!
Start predicting ...Done!
Writing output ...Done! Results saved to: ../output/unknown_predicted_20210206-141534.csv


In [55]:
df_res_2 = pd.read_csv("../output/unknown_predicted_20210206-141534.csv", index_col=0)
df_pred_pos_2 = df_res_2[df_res_2["pred"]==1]
print(len(df_pred_pos_2), len(df_res_2))
df_pred_pos_2.sort_values(by=["score"], ascending=False)

52 77


Unnamed: 0,name,smiles,pred,score
50,Darapladib,CCN(CC)CCN(Cc1ccc(-c2ccc(C(F)(F)F)cc2)cc1)C(=O...,1,0.926706
0,Citarinostat,O=C(CCCCCCNC(=O)c1cnc(N(c2ccccc2)c2ccccc2Cl)nc...,1,0.916923
30,TG100-115,Nc1nc(N)c2nc(-c3cccc(O)c3)c(-c3cccc(O)c3)nc2n1,1,0.91167
65,SCH772984,O=C(Nc1ccc2[nH]nc(-c3ccncc3)c2c1)C1CCN(CC(=O)N...,1,0.889683
63,Defactinib,CNC(=O)c1ccc(Nc2ncc(C(F)(F)F)c(NCc3nccnc3N(C)S...,1,0.877205
11,Selinexor,O=C(C=Cn1cnc(-c2cc(C(F)(F)F)cc(C(F)(F)F)c2)n1)...,1,0.869196
75,AZD4635,Cc1cc(-c2nnc(N)nc2-c2ccc(F)cc2)cc(Cl)n1,1,0.864318
2,Galunisertib,Cc1cccc(-c2nn3c(c2-c2ccnc4ccc(C(N)=O)cc24)CCC3)n1,1,0.84113
35,Gilteritinib,CCc1nc(C(N)=O)c(Nc2ccc(N3CCC(N4CCN(C)CC4)CC3)c...,1,0.840729
59,Pevonedistat,NS(=O)(=O)OCC1CC(n2ccc3c(NC4CCc5ccccc54)ncnc32...,1,0.840303


# 6. More weight in negative and score = precision, threshold = 0.99

best_model_20210206-142527.pkl
---
Pipeline(steps=[('sclr', StandardScaler()),
                ('clf',
                 SVC(class_weight={0: 0.99, 1: 0.01}, kernel='linear',
                     probability=True))])

stats
tp: 13.0
fp: 7.0
tn: 13.0
fn: 2.0

score
roc_auc: 0.8233333333333334
accuracy: 0.7428571428571429
precision: 0.65
recall: 0.8666666666666667
specificity: 0.65
mcc: 0.5166666666666667

55/77

In [5]:
support_similar = np.array([  0,   3,   4,   5,   6,   7,   8,  10,  11,  12,  13,  14,  15,
         17,  18,  19,  21,  22,  23,  24,  25,  26,  29,  31,  32,  33,
         34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,
         47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
         61,  62,  64,  65,  67,  68,  69,  70,  71,  72,  73,  74,  75,
         76,  77,  78,  82,  83,  84,  85,  87,  88,  90,  91,  92,  93,
         94,  95,  96,  97,  98, 100, 101, 102, 103, 104, 105, 106, 107,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 132, 133, 156, 157, 158, 159, 160, 161, 162, 173, 174, 175,
        176, 189, 191, 192, 193, 194, 195, 197, 198, 206, 209, 210, 212,
        215, 216, 225, 227, 228, 231, 235, 236, 237, 240, 241, 242, 244,
        245, 246, 247, 248, 249, 250, 251, 254, 255, 256, 257, 258, 259,
        260, 263, 265, 266, 271, 273, 274, 275, 276, 277, 278, 279, 280,
        282, 290, 292, 293, 294, 295, 296, 297, 298, 299, 301, 302, 303,
        304, 305, 306, 307, 311, 312, 313, 314, 315, 316, 317, 318, 319,
        320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332,
        333, 334, 335, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346,
        347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
        367, 368, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384,
        385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 396, 397, 398,
        399, 400, 401, 402, 403, 405, 407, 408, 409, 410, 411, 412, 413,
        414, 415, 416, 418, 419, 420, 421, 422, 423, 425, 426, 428, 429,
        430, 431, 432, 434, 435, 436, 437, 438, 439, 440, 441, 443, 444,
        450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462,
        463, 464, 465, 467, 468, 469, 470, 472, 473, 474, 475, 476, 477,
        478, 479, 480, 481, 482, 483, 484, 486, 487, 488, 489, 491, 492,
        493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505,
        508, 509, 510, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523,
        524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536,
        537, 538, 539, 540, 541, 542, 543, 544, 546, 547, 548, 549, 551,
        552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 563, 564, 565,
        566, 567, 568, 569, 570, 571, 572, 573, 574, 576, 577, 580, 581,
        584, 585, 586, 587, 589, 590, 591, 592, 593, 594, 595, 596, 597,
        598, 599, 601, 602, 609, 610, 611, 612, 613, 614, 615, 616, 617,
        618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630,
        631, 632, 633, 634, 635, 636, 637, 638, 639, 641, 642, 643, 644,
        645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657,
        658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670,
        671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683,
        684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696,
        697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709,
        710, 711, 712, 713, 714, 715, 716, 718, 720, 721, 722, 723, 724,
        725, 726, 728, 729, 730, 731, 732, 733, 734, 735, 736, 739, 740,
        741, 742, 743, 744, 745, 746, 747, 751, 752, 758, 759, 760, 761,
        762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774,
        786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798,
        799, 803, 813, 814, 816, 817, 818, 819, 820, 823, 824, 827])

In [57]:
df_data = pd.read_csv("./featurized_unknown__full_features_828_20210206-132138.csv", index_col=0)
df_data.dropna(inplace=True)
features = df_data.iloc[:, :-2]
features_remove_similar = features.iloc[:, support_similar]
features_remove_similar[["name", "smiles"]] = df_data[["name", "smiles"]]
m7p = MRP7Pred(clf_dir=f"{MODEL_DIR}/best_model_20210206-142527.pkl")
out = m7p.predict(featurized_df = features_remove_similar, prefix="unknown_")

Loading trained model ... Done!
Start predicting ...Done!
Writing output ...Done! Results saved to: ../output/unknown_predicted_20210206-142820.csv


In [58]:
df_res_2 = pd.read_csv("../output/unknown_predicted_20210206-142820.csv", index_col=0)
df_pred_pos_2 = df_res_2[df_res_2["pred"]==1]
print(len(df_pred_pos_2), len(df_res_2))
df_pred_pos_2.sort_values(by=["score"], ascending=False)

55 77


Unnamed: 0,name,smiles,pred,score
50,Darapladib,CCN(CC)CCN(Cc1ccc(-c2ccc(C(F)(F)F)cc2)cc1)C(=O...,1,0.937978
30,TG100-115,Nc1nc(N)c2nc(-c3cccc(O)c3)c(-c3cccc(O)c3)nc2n1,1,0.936954
0,Citarinostat,O=C(CCCCCCNC(=O)c1cnc(N(c2ccccc2)c2ccccc2Cl)nc...,1,0.931685
65,SCH772984,O=C(Nc1ccc2[nH]nc(-c3ccncc3)c2c1)C1CCN(CC(=O)N...,1,0.915993
63,Defactinib,CNC(=O)c1ccc(Nc2ncc(C(F)(F)F)c(NCc3nccnc3N(C)S...,1,0.913027
35,Gilteritinib,CCc1nc(C(N)=O)c(Nc2ccc(N3CCC(N4CCN(C)CC4)CC3)c...,1,0.912508
11,Selinexor,O=C(C=Cn1cnc(-c2cc(C(F)(F)F)cc(C(F)(F)F)c2)n1)...,1,0.878506
5,Berzosertib,CNCc1ccc(-c2cc(-c3nc(-c4ccc(S(=O)(=O)C(C)C)cc4...,1,0.876038
59,Pevonedistat,NS(=O)(=O)OCC1CC(n2ccc3c(NC4CCc5ccccc54)ncnc32...,1,0.875756
8,Odanacatib,CC(C)(F)CC(NC(c1ccc(-c2ccc(S(C)(=O)=O)cc2)cc1)...,1,0.873927


# 7. Train : Test = 0.8

Pipeline(steps=[('sclr', StandardScaler()),
                ('clf',
                 RandomForestClassifier(class_weight={0: 0.999, 1: 0.001}))])
                 
stats
tp: 10.0
fp: 4.0
tn: 9.0
fn: 1.0

score
roc_auc: 0.9090909090909092
accuracy: 0.7916666666666666
precision: 0.7142857142857143
recall: 0.9090909090909091
specificity: 0.6923076923076923
mcc: 0.6078080037565888

In [7]:
support_similar = np.array([  0,   3,   4,   5,   6,   7,   8,  10,  11,  12,  13,  14,  15,
        17,  18,  19,  21,  22,  23,  24,  25,  26,  29,  31,  32,  33,
        34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,
        47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
        61,  62,  64,  65,  67,  68,  69,  70,  71,  72,  73,  74,  75,
        76,  77,  78,  81,  82,  83,  84,  85,  87,  88,  90,  91,  92,
        93,  94,  95,  96,  97,  98, 100, 101, 102, 103, 104, 105, 106,
       107, 132, 133, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150,
       151, 152, 153, 154, 155, 156, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 178, 179, 180, 181, 182, 183, 184, 185,
       186, 187, 189, 190, 191, 192, 193, 197, 198, 202, 203, 206, 209,
       210, 212, 215, 216, 225, 227, 228, 229, 235, 236, 237, 240, 241,
       242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 254, 255, 256,
       257, 258, 259, 260, 263, 264, 267, 268, 272, 273, 274, 275, 277,
       278, 279, 280, 282, 283, 284, 285, 290, 292, 293, 294, 295, 296,
       297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 311, 312,
       313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325,
       326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 337, 338, 339,
       340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352,
       353, 354, 355, 356, 357, 358, 359, 367, 368, 374, 375, 376, 377,
       378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390,
       391, 392, 393, 394, 396, 397, 398, 399, 400, 401, 402, 403, 405,
       407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 418, 419, 420,
       421, 422, 423, 425, 426, 427, 428, 429, 430, 431, 432, 435, 436,
       437, 438, 439, 440, 441, 443, 444, 451, 452, 453, 454, 455, 456,
       457, 458, 459, 460, 461, 462, 463, 464, 465, 467, 468, 469, 470,
       472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484,
       486, 487, 488, 489, 491, 492, 493, 494, 495, 496, 497, 498, 499,
       500, 501, 502, 503, 504, 505, 508, 509, 510, 514, 515, 516, 517,
       518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530,
       531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543,
       544, 546, 547, 548, 549, 551, 552, 553, 554, 555, 556, 557, 558,
       559, 560, 561, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572,
       573, 574, 576, 577, 580, 581, 584, 585, 586, 587, 589, 590, 591,
       592, 593, 594, 595, 596, 597, 598, 599, 601, 602, 609, 610, 611,
       612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624,
       625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637,
       638, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651,
       652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664,
       665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677,
       678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690,
       691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703,
       704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716,
       718, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731,
       732, 733, 734, 735, 736, 747, 748, 749, 750, 752, 753, 754, 755,
       756, 757, 758, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784,
       785, 796, 797, 798, 799, 800, 801, 802, 803, 813, 814, 819, 820,
       823, 824, 827])

In [8]:
df_data = pd.read_csv("./featurized_unknown__full_features_828_20210206-132138.csv", index_col=0)
df_data.dropna(inplace=True)
features = df_data.iloc[:, :-2]
features_remove_similar = features.iloc[:, support_similar]
features_remove_similar[["name", "smiles"]] = df_data[["name", "smiles"]]
m7p = MRP7Pred(clf_dir=f"{MODEL_DIR}/best_model_20210208-224145.pkl")
out = m7p.predict(featurized_df = features_remove_similar, prefix="unknown_")

Loading trained model ... Done!
Start predicting ...Done!
Writing output ...Done! Results saved to: ../output/unknown_predicted_20210208-225757.csv


In [9]:
df_res_2 = pd.read_csv("../output/unknown_predicted_20210208-225757.csv", index_col=0)
df_pred_pos_2 = df_res_2[df_res_2["pred"]==1]
print(len(df_pred_pos_2), len(df_res_2))
df_pred_pos_2.sort_values(by=["score"], ascending=False)

52 77


Unnamed: 0,name,smiles,pred,score
14,Enzastaurin,Cn1cc(C2=C(c3cn(C4CCN(Cc5ccccn5)CC4)c4ccccc34)...,1,1.0
65,SCH772984,O=C(Nc1ccc2[nH]nc(-c3ccncc3)c2c1)C1CCN(CC(=O)N...,1,0.99
41,Midostaurin,COC1C(N(C)C(=O)c2ccccc2)CC2OC1(C)n1c3ccccc3c3c...,1,0.96
50,Darapladib,CCN(CC)CCN(Cc1ccc(-c2ccc(C(F)(F)F)cc2)cc1)C(=O...,1,0.94
34,TAE684,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,1,0.92
27,Crenolanib,CC1(COc2ccc3c(c2)ncn3-c2ccc3cccc(N4CCC(N)CC4)c...,1,0.87
60,RXDX-106,COc1cc2nccc(Oc3ccc(NC(=O)c4cn(C(C)C)c(=O)n(-c5...,1,0.86
43,Azacitidine,Nc1ncn(C2OC(CO)C(O)C2O)c(=O)n1,1,0.86
46,AZ628,Cc1ccc(NC(=O)c2cccc(C(C)(C)C#N)c2)cc1Nc1ccc2nc...,1,0.85
35,Gilteritinib,CCc1nc(C(N)=O)c(Nc2ccc(N3CCC(N4CCN(C)CC4)CC3)c...,1,0.83


# Positive has more weight

Pipeline(steps=[('sclr', StandardScaler()),
                ('clf',
                 SVC(class_weight={0: 0.01, 1: 0.99}, kernel='linear',
                     probability=True))])

stats
tp: 10.0
fp: 7.0
tn: 6.0
fn: 1.0

score
roc_auc: 0.7062937062937062
accuracy: 0.6666666666666666
precision: 0.5882352941176471
recall: 0.9090909090909091
specificity: 0.46153846153846156
mcc: 0.40628835067443575

In [10]:
df_data = pd.read_csv("./featurized_unknown__full_features_828_20210206-132138.csv", index_col=0)
df_data.dropna(inplace=True)
features = df_data.iloc[:, :-2]
features_remove_similar = features.iloc[:, support_similar]
features_remove_similar[["name", "smiles"]] = df_data[["name", "smiles"]]
m7p = MRP7Pred(clf_dir=f"{MODEL_DIR}/best_model_20210208-230344.pkl")
out = m7p.predict(featurized_df = features_remove_similar, prefix="unknown_")

Loading trained model ... Done!
Start predicting ...Done!
Writing output ...Done! Results saved to: ../output/unknown_predicted_20210208-230753.csv


In [11]:
df_res_2 = pd.read_csv("../output/unknown_predicted_20210208-225757.csv", index_col=0)
df_pred_pos_2 = df_res_2[df_res_2["pred"]==1]
print(len(df_pred_pos_2), len(df_res_2))
df_pred_pos_2.sort_values(by=["score"], ascending=False)

60 77


Unnamed: 0,name,smiles,pred,score
30,TG100-115,Nc1nc(N)c2nc(-c3cccc(O)c3)c(-c3cccc(O)c3)nc2n1,1,0.965995
8,Odanacatib,CC(C)(F)CC(NC(c1ccc(-c2ccc(S(C)(=O)=O)cc2)cc1)...,1,0.916169
12,Alisertib,COc1cc(Nc2ncc3c(n2)-c2ccc(Cl)cc2C(c2c(F)cccc2O...,1,0.913208
11,Selinexor,O=C(C=Cn1cnc(-c2cc(C(F)(F)F)cc(C(F)(F)F)c2)n1)...,1,0.912918
63,Defactinib,CNC(=O)c1ccc(Nc2ncc(C(F)(F)F)c(NCc3nccnc3N(C)S...,1,0.904948
0,Citarinostat,O=C(CCCCCCNC(=O)c1cnc(N(c2ccccc2)c2ccccc2Cl)nc...,1,0.89475
75,AZD4635,Cc1cc(-c2nnc(N)nc2-c2ccc(F)cc2)cc(Cl)n1,1,0.889083
65,SCH772984,O=C(Nc1ccc2[nH]nc(-c3ccncc3)c2c1)C1CCN(CC(=O)N...,1,0.887094
67,Rociletinib,C=CC(=O)Nc1cccc(Nc2nc(Nc3ccc(N4CCN(C(C)=O)CC4)...,1,0.886627
35,Gilteritinib,CCc1nc(C(N)=O)c(Nc2ccc(N3CCC(N4CCN(C)CC4)CC3)c...,1,0.886437


# New data, leave-one-out

#### best_model_20210211-031248.pkl

cv=38 (leave-one-out)

Pipeline(steps=[('sclr', StandardScaler()),
                ('clf', SVC(gamma=0.001, probability=True))])
stats
tp: 14.0
fp: 11.0
tn: 10.0
fn: 0.0

score
roc_auc: 0.7653061224489796
accuracy: 0.6857142857142857
precision: 0.56
recall: 1.0
specificity: 0.47619047619047616
mcc: 0.5163977794943223

In [42]:
support_similar = np.array([  0,   3,   4,   5,   6,   7,   8,  10,  11,  12,  13,  14,  15,
         17,  18,  19,  21,  22,  23,  24,  25,  26,  29,  31,  32,  33,
         34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,
         47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
         61,  62,  64,  65,  67,  68,  69,  70,  71,  72,  73,  74,  75,
         76,  77,  78,  82,  83,  84,  85,  87,  88,  90,  91,  92,  93,
         94,  95,  96,  97,  98, 100, 101, 102, 103, 104, 105, 106, 107,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 132, 133, 156, 157, 158, 159, 160, 161, 162, 173, 174, 175,
        176, 189, 191, 192, 193, 194, 195, 197, 198, 206, 209, 210, 212,
        215, 216, 225, 227, 228, 231, 235, 236, 237, 240, 241, 242, 244,
        245, 246, 247, 248, 249, 250, 251, 254, 255, 256, 257, 258, 259,
        260, 263, 265, 266, 271, 273, 274, 275, 276, 277, 278, 279, 280,
        282, 290, 292, 293, 294, 295, 296, 297, 298, 299, 301, 302, 303,
        304, 305, 306, 307, 311, 312, 313, 314, 315, 316, 317, 318, 319,
        320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332,
        333, 334, 335, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346,
        347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
        367, 368, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384,
        385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 396, 397, 398,
        399, 400, 401, 402, 403, 405, 407, 408, 409, 410, 411, 412, 413,
        414, 415, 416, 418, 419, 420, 421, 422, 423, 425, 426, 428, 429,
        430, 431, 432, 434, 435, 436, 437, 438, 439, 440, 441, 443, 444,
        450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462,
        463, 464, 465, 467, 468, 469, 470, 472, 473, 474, 475, 476, 477,
        478, 479, 480, 481, 482, 483, 484, 486, 487, 488, 489, 491, 492,
        493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505,
        508, 509, 510, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523,
        524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536,
        537, 538, 539, 540, 541, 542, 543, 544, 546, 547, 548, 549, 551,
        552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 563, 564, 565,
        566, 567, 568, 569, 570, 571, 572, 573, 574, 576, 577, 580, 581,
        584, 585, 586, 587, 589, 590, 591, 592, 593, 594, 595, 596, 597,
        598, 599, 601, 602, 609, 610, 611, 612, 613, 614, 615, 616, 617,
        618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630,
        631, 632, 633, 634, 635, 636, 637, 638, 639, 641, 642, 643, 644,
        645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657,
        658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670,
        671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683,
        684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696,
        697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709,
        710, 711, 712, 713, 714, 715, 716, 718, 720, 721, 722, 723, 724,
        725, 726, 728, 729, 730, 731, 732, 733, 734, 735, 736, 739, 740,
        741, 742, 743, 744, 745, 746, 747, 751, 752, 758, 759, 760, 761,
        762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774,
        786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798,
        799, 803, 813, 814, 816, 817, 818, 819, 820, 823, 824, 827])

In [43]:
np.save(f"{MODEL_DIR}/featureid_best_model_20210211-031248.npy", support_similar)

In [35]:
df_data = pd.read_csv("./featurized_unknown__full_features_828_20210206-132138.csv", index_col=0)
df_data.dropna(inplace=True)
features = df_data.iloc[:, :-2]
features_remove_similar = features.iloc[:, support_similar]
features_remove_similar[["name", "smiles"]] = df_data[["name", "smiles"]]
m7p = MRP7Pred(clf_dir=f"{MODEL_DIR}/best_model_20210211-031248.pkl")
out = m7p.predict(featurized_df = features_remove_similar, prefix="unknown_")

Loading trained model ... Done!
Start predicting ...Done!
Writing output ...Done! Results saved to: ../output/unknown_predicted_20210211-031940.csv


In [38]:
df_res_2 = pd.read_csv("../output/unknown_predicted_20210211-031940.csv", index_col=0)
df_pred_pos_2 = df_res_2[df_res_2["pred"]==1]
print(len(df_pred_pos_2), len(df_res_2))
df_pred_pos_2.sort_values(by=["score"], ascending=False)

64 77


Unnamed: 0,name,smiles,pred,score
65,SCH772984,O=C(Nc1ccc2[nH]nc(-c3ccncc3)c2c1)C1CCN(CC(=O)N...,1,0.862105
35,Gilteritinib,CCc1nc(C(N)=O)c(Nc2ccc(N3CCC(N4CCN(C)CC4)CC3)c...,1,0.842842
14,Enzastaurin,Cn1cc(C2=C(c3cn(C4CCN(Cc5ccccn5)CC4)c4ccccc34)...,1,0.841575
37,PF-04691502,COc1ccc(-c2cc3c(C)nc(N)nc3n(C3CCC(OCCO)CC3)c2=...,1,0.836080
27,Crenolanib,CC1(COc2ccc3c(c2)ncn3-c2ccc3cccc(N4CCC(N)CC4)c...,1,0.827933
...,...,...,...,...
28,Crizotinib,CC(Oc1cc(-c2cnn(C3CCNCC3)c2)cnc1N)c1c(Cl)ccc(F...,1,0.512048
68,GSK2256098,CONC(=O)c1ccccc1Nc1cc(Nc2cc(C)nn2C(C)C)ncc1Cl,1,0.480411
44,Letrozole,N#Cc1ccc(C(c2ccc(C#N)cc2)n2cncn2)cc1,1,0.466487
76,SHP099 hydrochloride,CC1(N)CCN(c2cnc(-c3cccc(Cl)c3Cl)c(N)n2)CC1,1,0.448787


In [41]:
df_res_2 = pd.read_csv("../output/unknown_predicted_20210211-031940.csv", index_col=0)
df_pred_pos_2 = df_res_2[(df_res_2["pred"]==1)&(df_res_2["score"]>0.5)]
print(len(df_pred_pos_2), len(df_res_2))
df_pred_pos_2.sort_values(by=["score"], ascending=False)

60 77


Unnamed: 0,name,smiles,pred,score
65,SCH772984,O=C(Nc1ccc2[nH]nc(-c3ccncc3)c2c1)C1CCN(CC(=O)N...,1,0.862105
35,Gilteritinib,CCc1nc(C(N)=O)c(Nc2ccc(N3CCC(N4CCN(C)CC4)CC3)c...,1,0.842842
14,Enzastaurin,Cn1cc(C2=C(c3cn(C4CCN(Cc5ccccn5)CC4)c4ccccc34)...,1,0.841575
37,PF-04691502,COc1ccc(-c2cc3c(C)nc(N)nc3n(C3CCC(OCCO)CC3)c2=...,1,0.83608
27,Crenolanib,CC1(COc2ccc3c(c2)ncn3-c2ccc3cccc(N4CCC(N)CC4)c...,1,0.827933
49,GSK2334470,CNc1nc(-c2ccc3c(N)n[nH]c3c2)cc(N2CC(C(=O)NC3CC...,1,0.824706
60,RXDX-106,COc1cc2nccc(Oc3ccc(NC(=O)c4cn(C(C)C)c(=O)n(-c5...,1,0.820921
59,Pevonedistat,NS(=O)(=O)OCC1CC(n2ccc3c(NC4CCc5ccccc54)ncnc32...,1,0.817449
66,Acalabrutinib,CC#CC(=O)N1CCCC1c1nc(-c2ccc(C(=O)Nc3ccccn3)cc2...,1,0.806609
55,BAY-1816032,COc1cnccc1Nc1nc(-c2nn(Cc3c(F)cc(OCCO)cc3F)c3cc...,1,0.805924
