In [1]:
import sys
sys.path.append('../')
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
from src.data import make_dataset
from src.features import build_features, build_targets
from src.models import split_data, train_model, predict_model, bitranking



In [2]:
# OPTIONAL: load the IPython "autoreload" extension so that edited modules can be re-imported without restarting the kernel
%load_ext autoreload
# OPTIONAL: mode 2 reloads modules before each cell runs, so changes to the code under src are picked up
%autoreload 2
# Random seed handed to split_data.split and train_model.train_rfc further down
seed=42

In [2]:
# Load data
# NOTE(review): the saved output of this cell is
#   "OSError: File error: Bad input file ../data/raw/ChemDivFull.sdf"
# so the loader presumably reads that relative path -- confirm the file is present
# before running. `molecules` feeds the feature and target builders below.
molecules=make_dataset.load()

OSError: File error: Bad input file ../data/raw/ChemDivFull.sdf

In [4]:
# Build the features
# Xfull is indexed per compound (its entries are queried with .GetBit() in the
# fragment-ranking cell); fpcat is handed to bitranking.getFragRanks there.
Xfull,fpcat=build_features.buildFragment(molecules)

Num functional groups: 39
Have 48716 fragments in library


In [7]:
# Build the targets
# yfull is indexed by aptamer (yfull[apt]) and tfull by compound (tfull[i]) in later cells.
yfull, aptamers, tfull = build_targets.build_hitbyapt(molecules)
n_aptamers, n_targets = len(aptamers), len(tfull)
print("Loaded", n_aptamers, "aptamers and", n_targets, "targets.")

Column names are Target, H960-319, H960-940, H960-003, H960-650, H960-050, H960-616, H960-724, H960-256, H960-920, Amb-767, Amb-751, Amb-816, Amb-720, Amb-318, Amb-6319
Processed 960 targets.
['1A02', '1A03', '1A04', '1A05', '1A06', '1A07', '1A08', '1A09', '1A10', '1A11', '1B02', '1B03', '1B04', '1B05', '1B06', '1B07', '1B08', '1B09', '1B10', '1B11', '1C02', '1C03', '1C04', '1C05', '1C06', '1C07', '1C08', '1C09', '1C10', '1C11', '1D02', '1D03', '1D04', '1D05', '1D06', '1D07', '1D08', '1D09', '1D10', '1D11', '1E02', '1E03', '1E04', '1E05', '1E06', '1E07', '1E08', '1E09', '1E10', '1E11', '1F02', '1F03', '1F04', '1F05', '1F06', '1F07', '1F08', '1F09', '1F10', '1F11', '1G02', '1G03', '1G04', '1G05', '1G06', '1G07', '1G08', '1G09', '1G10', '1G11', '1H02', '1H03', '1H04', '1H05', '1H06', '1H07', '1H08', '1H09', '1H10', '1H11', '11A02', '11A03', '11A04', '11A05', '11A06', '11A07', '11A08', '11A09', '11A10', '11A11', '11B02', '11B03', '11B04', '11B05', '11B06', '11B07', '11B08', '11B09', '11B1

In [10]:
# Choose an aptamer to work on (index into `aptamers`).
# The training data is restricted to compounds with a definite hit/miss
# classification, i.e. those whose label for this aptamer is finite.
apt = 4
print("Testing", aptamers[apt])
labels_for_apt = yfull[apt]
tsel = [bool(np.isfinite(label)) for label in labels_for_apt]
y = labels_for_apt[tsel]
X = [Xfull[i] for i, keep in enumerate(tsel) if keep]
targets = [tfull[i] for i, keep in enumerate(tsel) if keep]

Testing H960-050


In [11]:
# Create train/test sets
# split() also hands back the row indices of each partition, which are used
# here to recover the compound names that belong to the train and test rows.
# NOTE(review): 0.4 looks like the test fraction (the reports below show
# 358 train vs 239 test rows) -- confirm against split_data.split.
(X_train, X_test,
 y_train, y_test,
 ind_train, ind_test) = split_data.split(0.4, X, y, seed)
tgt_train = [targets[row] for row in ind_train]
tgt_test = [targets[row] for row in ind_test]
    

Fraction hits: train: 0.017, test: 0.004


In [12]:
# Train model
# TODO: Need to handle NaNs!
# `models` is a list so that the evaluation cell can loop over it; at the moment
# it holds a single classifier built by train_rfc with n_estimators=10
# (printed below as a RandomForestClassifier).
models=[train_model.train_rfc(X_train,y_train,seed,n_estimators=10)]

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=10, n_jobs=None, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)
Cross-validation score = 0.993 += 0.014


In [13]:
# Test model
# For every trained model, report on the training split, on the held-out split,
# and through predict_model.predictLOO on all labelled compounds
# (presumably leave-one-out -- confirm in predict_model).
for model in models:
    # One call emits the separator, the model repr and the section label,
    # each on its own line.
    print("------", model, "Train:", sep="\n")
    yp_train = predict_model.predict(model, X_train, y_train, tgt_train)
    print("Test:")
    yp_test = predict_model.predict(model, X_test, y_test, tgt_test)
    print("LOO:")
    yp_loo = predict_model.predictLOO(model, X, y, targets)

------
RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=10, n_jobs=None, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)
Train:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       352
         1.0       1.00      1.00      1.00         6

    accuracy                           1.00       358
   macro avg       1.00      1.00      1.00       358
weighted avg       1.00      1.00      1.00       358

[[352   0]
 [  0   6]]
ROC_AUC Score = 1.000
Test:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       239
     

In [12]:
# Refit on every labelled compound for the selected aptamer, then screen the
# whole library for compounds that are predicted to be hits but were never tested.
#
# Use the last trained model explicitly instead of relying on the loop variable
# `model` leaking out of the evaluation cell.
# NOTE: fit() retrains that estimator in place, so models[-1] is no longer the
# train-split model after this cell has run.
final_model = models[-1]
final_model.fit(X, y)

# Keep the predictions in their own variable. `yfull` is the per-aptamer target
# matrix (indexed as yfull[apt] by other cells); overwriting it with a flat
# prediction vector breaks those cells when the notebook is run top to bottom.
ypred_full = final_model.predict(Xfull)

# Set membership keeps the "already tested?" check O(1) per compound.
tested_names = set(targets)
for i, predicted in enumerate(ypred_full):
    if predicted == 1:
        name = molecules[i].GetProp("NAME")
        if name not in tested_names:
            print(name, "may be an untested hit")

51F8 may be an untested hit
91G3 may be an untested hit


In [8]:
# Rank fragments based on observed activity
# For every aptamer: keep the compounds with a finite label, ask bitranking for
# the fragment ranking, then list which compounds carry the returned fragment bit.
# NOTE(review): getFragRanks appears to print the ranking table itself and to
# return a single fragment id (it is passed through int()) -- confirm.
ntop = 20
for apt, apt_name in enumerate(aptamers):
    print("Testing", apt_name)
    tsel = [bool(np.isfinite(label)) for label in yfull[apt]]
    y = yfull[apt][tsel]
    X = [Xfull[i] for i, keep in enumerate(tsel) if keep]
    targets = [tfull[i] for i, keep in enumerate(tsel) if keep]

    frag = bitranking.getFragRanks(fpcat, X, y, ntop=ntop)
    # (name, label, fingerprint) triples; the three lists are the same length
    # because they were all filtered with the same mask.
    rows = list(zip(targets, y, X))
    # Active compounds that contain the fragment.
    print('Postives       : ', [name for name, label, fp in rows if label == 1 and fp.GetBit(int(frag))])
    # Active compounds that lack the fragment.
    print('False negatives: ', [name for name, label, fp in rows if label == 1 and not fp.GetBit(int(frag))])
    # Inactive compounds that contain the fragment.
    print('False positives: ', [name for name, label, fp in rows if label == 0 and fp.GetBit(int(frag))])
    print()

Testing H960-319
Num of targets active: 3, inactive: 891
 6895 0.021 11  3 +++ C<=O>ccccnC
41313 0.021 11  3 +++ c<=N>c(c)C<=O>N
41326 0.021 11  3 +++ C<=O>cc<=N>ncn
41307 0.021 11  3 +++ C<=O>cc<=N>nC
41302 0.021 11  3 +++ C<=O>cc<=N>n
41314 0.021 11  3 +++ c<=N>cC<=O>NC
41304 0.021 11  3 +++ c<=N>cC<=O>N
41330 0.021 11  3 +++ cc(C<=O>)c<=N>nC
41323 0.021 11  3 +++ cccc(c<=N>)C<=O>
41329 0.021 11  3 +++ cn(C)c<=N>cC<=O>
41328 0.021 11  3 +++ C<=O>cc<=N>nCC
41303 0.021 11  3 +++ c<=N>c(c)C<=O>
41310 0.021 11  3 +++ nc<=N>cC<=O>N
41301 0.021 11  3 +++ c<=N>cC<=O>
41332 0.021 11  3 +++ cnc<=N>c(c)C<=O>
41331 0.021 11  3 +++ Cnc<=N>cC<=O>N
41317 0.021 11  3 +++ ccnc<=N>cC<=O>
41312 0.021 11  3 +++ ccc(c<=N>)C<=O>
41309 0.021 11  3 +++ cc(C<=O>)c<=N>n
41308 0.021 11  3 +++ cnc<=N>cC<=O>
Postives       :  ['91B03', '91D02', '91E02']
False negatives:  []
False positives:  ['91A03', '91B04', '91C03', '91D04', '91E03', '91E04', '91F02', '91F03', '91F04', '91G02', '91G04']

Testing H960-940
Num

In [21]:
# Inspect one library compound: molecular formula, SMILES and exact molecular
# weight, all computed on the copy with explicit hydrogens added.
m = molecules[648]
m2 = Chem.AddHs(m)
formula = CalcMolFormula(m2)
smiles = Chem.MolToSmiles(m2)
print(formula, smiles)
# `Descriptors` is the rdkit.Chem.Descriptors module imported at the top of the notebook.
print(Descriptors.ExactMolWt(m2))

C23H26N2O7S [H]/C(=C1\C(=O)N(C([H])([H])C([H])([H])C([H])([H])OC([H])([H])[H])C(C([H])([H])[H])=C1C(=O)OC([H])([H])[H])c1oc(C([H])([H])N([H])S(=O)(=O)c2c([H])c([H])c([H])c([H])c2[H])c([H])c1[H]
474.14607217200063
