In [1]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
from src.data import make_dataset
from src.features import build_features, build_targets
from src.models import split_data, train_model, predict_model, bitranking



ModuleNotFoundError: No module named 'src'

In [None]:
# OPTIONAL: Load the IPython "autoreload" extension so that edited modules can be
# re-imported without restarting the kernel. Must run before the %autoreload line below.
%load_ext autoreload
# OPTIONAL: mode 2 = always reload modules, so that as you change code in src, it gets loaded
%autoreload 2
# Seed value passed to split_data.split and train_model.train_rfc in later cells.
# NOTE(review): unlike the two magics above, this assignment is NOT optional --
# skipping this cell leaves `seed` undefined for the split/train cells.
seed=42

In [5]:
# Load the molecule collection that every later cell works from
molecules = make_dataset.load()

In [6]:
# Build the features: the call yields the per-molecule feature list (Xfull) and
# `fpcat`, which is later handed to bitranking.getFragRanks
features_and_catalog = build_features.buildFragment(molecules)
Xfull, fpcat = features_and_catalog

Num functional groups: 39
Have 48716 fragments in library


In [14]:
# Build the targets: per-aptamer labels (yfull), aptamer names, and target names (tfull)
yfull, aptamers, tfull = build_targets.build_hitbyapt(molecules)
n_aptamers, n_targets = len(aptamers), len(tfull)
print("Loaded", n_aptamers, "aptamers and", n_targets, "targets.")

Column names are Target, H960-319, H960-940, H960-003, H960-650, H960-050, H960-616, H960-724, H960-354, H960-256, H960-920, H960-337, Amb-767, Amb-563, Amb-113, Amb-751, Amb-816, Amb-720, Amb-318, Amb-6319
Processed 960 targets.
['1A2', '1A3', '1A4', '1A5', '1A6', '1A7', '1A8', '1A9', '1A10', '1A11', '1B2', '1B3', '1B4', '1B5', '1B6', '1B7', '1B8', '1B9', '1B10', '1B11', '1C2', '1C3', '1C4', '1C5', '1C6', '1C7', '1C8', '1C9', '1C10', '1C11', '1D2', '1D3', '1D4', '1D5', '1D6', '1D7', '1D8', '1D9', '1D10', '1D11', '1E2', '1E3', '1E4', '1E5', '1E6', '1E7', '1E8', '1E9', '1E10', '1E11', '1F2', '1F3', '1F4', '1F5', '1F6', '1F7', '1F8', '1F9', '1F10', '1F11', '1G2', '1G3', '1G4', '1G5', '1G6', '1G7', '1G8', '1G9', '1G10', '1G11', '1H2', '1H3', '1H4', '1H5', '1H6', '1H7', '1H8', '1H9', '1H10', '1H11', '11A2', '11A3', '11A4', '11A5', '11A6', '11A7', '11A8', '11A9', '11A10', '11A11', '11B2', '11B3', '11B4', '11B5', '11B6', '11B7', '11B8', '11B9', '11B10', '11B11', '11C2', '11C3', '11C4', '11C5

In [8]:
# Pick the aptamer to model (an index into `aptamers`) and restrict the data to
# the targets whose label for it is finite, i.e. a definite hit/miss classification.
apt = 4
print("Testing", aptamers[apt])
# One Python bool per target: True when that target's label is finite
tsel = [bool(np.isfinite(label)) for label in yfull[apt]]
y = yfull[apt][tsel]
# Apply the same mask to the features and to the target names so all three stay aligned
X = [Xfull[idx] for idx, keep in enumerate(tsel) if keep]
targets = [tfull[idx] for idx, keep in enumerate(tsel) if keep]

Testing H960-050


In [9]:
# Create train/test sets
# NOTE(review): 0.4 looks like the test fraction (the captured output shows
# 358 train rows) -- confirm against split_data.split
split_result = split_data.split(0.4, X, y, seed)
X_train, X_test, y_train, y_test, ind_train, ind_test = split_result
# Map the returned row indices back to target names for reporting
tgt_train = [targets[idx] for idx in ind_train]
tgt_test = [targets[idx] for idx in ind_test]
    

Fraction hits: train: 0.017, test: 0.004


In [10]:
# Train model
# TODO: need to handle NaNs!
rfc = train_model.train_rfc(X_train, y_train, seed, n_estimators=10)
# Later cells iterate over `models`, so keep the single classifier in a list
models = [rfc]

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=10, n_jobs=None, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)
Cross-validation score = 0.993 += 0.014


In [11]:
# Test model: report each model on the train split, the test split, and with
# predict_model.predictLOO on the full selected data.
# The loop variable must stay named `model`: a later cell reads it after the loop.
for model in models:
    print("------", model, sep="\n")
    print("Train:")
    yp_train = predict_model.predict(model, X_train, y_train, tgt_train)
    print("Test:")
    yp_test = predict_model.predict(model, X_test, y_test, tgt_test)
    print("LOO:")
    yp_loo = predict_model.predictLOO(model, X, y, targets)

------
RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=10, n_jobs=None, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)
Train:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       352
         1.0       1.00      1.00      1.00         6

    accuracy                           1.00       358
   macro avg       1.00      1.00      1.00       358
weighted avg       1.00      1.00      1.00       358

[[352   0]
 [  0   6]]
ROC_AUC Score = 1.000
Test:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       239
     

KeyboardInterrupt: 

In [12]:
# Refit on every labelled target of the selected aptamer, then score the whole
# library and report molecules predicted as hits that are not among the tested targets.
#
# Bind the model explicitly instead of relying on the loop variable leaked by the
# evaluation cell (same object when that loop has run).
model = models[-1]
# NOTE: fit() refits this estimator in place, so models[-1] is no longer the
# train-split model after this cell runs.
model.fit(X, y)
# Keep predictions in their own variable. `yfull` holds the per-aptamer label
# matrix that other cells index as yfull[apt]; overwriting it here broke those
# cells until the targets cell was re-run.
ypred_full = model.predict(Xfull)
tested_names = set(targets)  # set membership instead of scanning the list for each hit
for i, pred in enumerate(ypred_full):
    if pred == 1:
        name = molecules[i].GetProp("NAME")
        if name not in tested_names:
            print(name, "may be an untested hit")

51F8 may be an untested hit
91G3 may be an untested hit


In [15]:
# Rank fragments based on observed activity.
# For every aptamer: keep the targets with a finite label, ask bitranking for the
# top-ranked fragment, then list which targets carry that fragment bit split by label.
#
# Loop-scoped names (apt_idx, y_apt, X_apt, targets_apt) are used on purpose so this
# cell does not overwrite the notebook-level apt / tsel / X / y / targets chosen in the
# single-aptamer cell; otherwise re-running the split/train cells afterwards would
# silently operate on the last aptamer.
ntop = 1
for apt_idx, apt_name in enumerate(aptamers):
    print("Testing", apt_name)
    labels_all = yfull[apt_idx]
    keep = [bool(np.isfinite(label)) for label in labels_all]
    y_apt = labels_all[keep]
    X_apt = [Xfull[i] for i, kept in enumerate(keep) if kept]
    targets_apt = [tfull[i] for i, kept in enumerate(keep) if kept]

    frag = bitranking.getFragRanks(fpcat, X_apt, y_apt, ntop=ntop)
    frag_bit = int(frag)  # hoisted: converted once instead of once per target per list
    has_frag = [fp.GetBit(frag_bit) for fp in X_apt]
    # y_apt, X_apt and targets_apt were filtered with the same mask, so they align row by row
    rows = list(zip(targets_apt, y_apt, has_frag))
    print('Positives      : ', [name for name, label, hit in rows if label == 1 and hit])
    print('False negatives: ', [name for name, label, hit in rows if label == 1 and not hit])
    print('False positives: ', [name for name, label, hit in rows if label == 0 and hit])


Testing H960-319
Num of targets active: 3, inactive: 934
41301 0.020 10 3 c<=N>cC<=O>
Postives       :  ['91B3', '91D2', '91E2']
False negatives:  []
False positives:  ['91B4', '91C3', '91D4', '91E3', '91E4', '91F2', '91F3', '91F4', '91G2', '91G4']
Testing H960-940
Num of targets active: 5, inactive: 934
41301 0.035 8 5 c<=N>cC<=O>
Postives       :  ['91C3', '91D3', '91F4', '91G2', '91H2']
False negatives:  []
False positives:  ['91A4', '91B4', '91D2', '91D4', '91E2', '91E3', '91F3', '91H3']
Testing H960-003
Num of targets active: 9, inactive: 740
41141 0.058 8 8 ccnc<=N>
Postives       :  ['91B2', '91B3', '91D2', '91D3', '91E2', '91F2', '91G2', '91H2']
False negatives:  ['41F7']
False positives:  ['91A4', '91B4', '91C3', '91C4', '91E3', '91E4', '91F3', '91H3']
Testing H960-650
Num of targets active: 6, inactive: 926
41391 0.044 0 5 C<=O>cc<=N>nCCC<-OMe>
Postives       :  ['91D2', '91E2', '91F2', '91G2', '91H2']
False negatives:  ['91B3']
False positives:  []
Testing H960-050
Num of ta

In [21]:
# Inspect one library molecule (index 648): add explicit hydrogens, then print
# its molecular formula and SMILES followed by its exact molecular weight.
m = molecules[648]
m2 = Chem.AddHs(m)
formula, smiles = CalcMolFormula(m2), Chem.MolToSmiles(m2)
print(formula, smiles)
# `Descriptors` is imported at the top of the notebook; same function as Chem.Descriptors.ExactMolWt
exact_mass = Descriptors.ExactMolWt(m2)
print(exact_mass)

C23H26N2O7S [H]/C(=C1\C(=O)N(C([H])([H])C([H])([H])C([H])([H])OC([H])([H])[H])C(C([H])([H])[H])=C1C(=O)OC([H])([H])[H])c1oc(C([H])([H])N([H])S(=O)(=O)c2c([H])c([H])c([H])c([H])c2[H])c([H])c1[H]
474.14607217200063


In [5]:
# NOTE(review): leftover debugging cell -- a shell escape that lists the local /tmp
# directory. Its output is not captured into any variable; consider deleting the cell
# (and its machine-specific output) before sharing the notebook.
!ls /tmp


[34m015C0934-09EE-49B4-9038-CBFCD2FEBDCC[m[m
[34m063A22AA-44F8-4A7E-917A-1ECB7085ABA3[m[m
[33m108CC535-64B3-4522-ACC4-71D438E6F80C_IN[m[m
[33m108CC535-64B3-4522-ACC4-71D438E6F80C_OUT[m[m
[34m1B08C95A-EC8C-4C4D-BBB2-6618D75EC074[m[m
[34m251ACD2A-1EEE-4D77-A8F8-C523D90EBA71[m[m
[34m296B0395-AA22-42EF-A013-8216528823A4[m[m
[34m2BC41AC3-6AC4-46F5-91B0-9A30CCABA3F7[m[m
[34m2C0BDF47-32FB-4C1C-8F44-6D863A170C77[m[m
[34m2F6DDABD-A1D1-44E2-A43F-F96030AFC204[m[m
[34m4CDC438B-A068-4307-96B2-FD98690588B1[m[m
[34m5AE263A2-CB77-4C9E-BE90-55524A8F881E[m[m
[35m5eaa258b67bc3[m[m
[34m845C7F54-4480-469B-B49D-696EEB082250[m[m
[34m85FF0CB7-BD27-4B72-9846-DF7881733D78[m[m
[34m9470531D-3B98-4D17-BE40-BF0B96A0D97B[m[m
[33m96138A0D-5E0B-4827-9208-A644432C88FF_IN[m[m
[33m96138A0D-5E0B-4827-9208-A644432C88FF_OUT[m[m
[33m9A9A1578-B008-4A9F-B580-2BC295913503_IN[m[m
[33m9A9A1578-B008-4A9F-B580-2BC295913503_OUT[m[m
[34mA01B7ADF-B7E4-4AE