In [1]:
import csv

import matplotlib.pyplot as plt
import rdkit as rd
import sklearn.metrics as sm
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score



In [2]:
# Read in the CDIV molecules
suppl = rd.Chem.SDMolSupplier('../data/raw/ChemDivFull.sdf')

In [3]:
# Extract only the 960 tested molecules
plates = ["CDIV%04d"%p for p in range(1,121,10)]
print(len(plates),plates)
tested = [x for x in suppl if x.GetProp("BATCH_PLATE") in plates]
print(len(suppl),len(tested))

12 ['CDIV0001', 'CDIV0011', 'CDIV0021', 'CDIV0031', 'CDIV0041', 'CDIV0051', 'CDIV0061', 'CDIV0071', 'CDIV0081', 'CDIV0091', 'CDIV0101', 'CDIV0111']
50000 960


In [4]:
# Setup fingerprints for all the molecules 
fp=[]
for m in tested:
    fp.append(AllChem.RDKFingerprint(m, fpSize=2048))
print(fp)

[<rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x1a243df030>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x1a243df080>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x1a243df120>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x1a243df170>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x1a243df1c0>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x1a243df260>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x1a243df2b0>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x1a243df300>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x1a243df350>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x1a243df3a0>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x1a243f2f30>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x1a243df3f0>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x1a243df440>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect ob

In [5]:
# Set NAME property of molecules to match data from Matlab
for mol in tested:
    if mol is None: continue
    plate=mol.GetProp("BATCH_PLATE")
    plate=int(plate[5:])
    well=mol.GetProp("BATCH_WELL")
    name="%d%s"%(plate,well.replace('0',''))
    print(plate,well,name)
    mol.SetProp("NAME",name)

1 A02 1A2
1 A03 1A3
1 A04 1A4
1 A05 1A5
1 A06 1A6
1 A07 1A7
1 A08 1A8
1 A09 1A9
1 A10 1A1
1 A11 1A11
1 B02 1B2
1 B03 1B3
1 B04 1B4
1 B05 1B5
1 B06 1B6
1 B07 1B7
1 B08 1B8
1 B09 1B9
1 B10 1B1
1 B11 1B11
1 C02 1C2
1 C03 1C3
1 C04 1C4
1 C05 1C5
1 C06 1C6
1 C07 1C7
1 C08 1C8
1 C09 1C9
1 C10 1C1
1 C11 1C11
1 D02 1D2
1 D03 1D3
1 D04 1D4
1 D05 1D5
1 D06 1D6
1 D07 1D7
1 D08 1D8
1 D09 1D9
1 D10 1D1
1 D11 1D11
1 E02 1E2
1 E03 1E3
1 E04 1E4
1 E05 1E5
1 E06 1E6
1 E07 1E7
1 E08 1E8
1 E09 1E9
1 E10 1E1
1 E11 1E11
1 F02 1F2
1 F03 1F3
1 F04 1F4
1 F05 1F5
1 F06 1F6
1 F07 1F7
1 F08 1F8
1 F09 1F9
1 F10 1F1
1 F11 1F11
1 G02 1G2
1 G03 1G3
1 G04 1G4
1 G05 1G5
1 G06 1G6
1 G07 1G7
1 G08 1G8
1 G09 1G9
1 G10 1G1
1 G11 1G11
1 H02 1H2
1 H03 1H3
1 H04 1H4
1 H05 1H5
1 H06 1H6
1 H07 1H7
1 H08 1H8
1 H09 1H9
1 H10 1H1
1 H11 1H11
11 A02 11A2
11 A03 11A3
11 A04 11A4
11 A05 11A5
11 A06 11A6
11 A07 11A7
11 A08 11A8
11 A09 11A9
11 A10 11A1
11 A11 11A11
11 B02 11B2
11 B03 11B3
11 B04 11B4
11 B05 11B5
11 B06 11B6
11 B07 11B7

In [6]:
def getMol(mols,plate,well):
    mol=[x for x in mols if x.GetProp("BATCH_PLATE")=="CDIV%04d"%plate and x.GetProp("BATCH_WELL")==well]
    assert(len(mol)==1)
    return mol[0]

In [7]:
# Set categorization of molecules
# Property, HIT, will be set to true iff we found an aptamer for the molecule
fold=[]
targets=[]
with open('../data/raw/fold.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            print(f'Column names are {", ".join(row)}')
            aptamers=row[1:]
            line_count += 1
        else:
            line_count += 1
            fold.append([float(x) for x in row[1:]])
            targets.append(row[0])
    print(f'Processed {line_count} targets.')
print(targets)
# Set fold property



Column names are Target, H960-266, H960-319, H960-850, H960-892, H960-735, H960-425, H960-940, H960-613, H960-251, H960-003, H960-875, H960-650, H960-172, H960-186, H960-050, H960-594, H960-228, H960-316, H960-488, H960-629, H960-668, H960-5050, H960-561, H960-156, H960-922, H960-843, H960-616, H960-617, H960-505, H960-072, H960-724, H960-315, H960-939, H960-256, H960-920, H960-869, H960-356, H960-337, H960-540, NSRef-630, NS-404, NS-231, NS-160, Amb-767, Amb-563, Amb-113, Amb-751, Amb-816, Amb-720, Amb-318, Amb-6319
Processed 102 targets.
['01A4', '01C5', '01C7', '01F7', '101B7', '101C7', '101D11', '101D7', '101D9', '101E10', '101E6', '101F11', '101F2', '101F7', '101G6', '101H6', '111E3', '111H2', '111H7', '11D3', '21G8', '31B11', '31B9', '31C10', '31C2', '31C3', '31C8', '31D4', '31D7', '31D8', '31E10', '31E3', '31E4', '31E7', '31E9', '31F10', '31H10', '41A11', '41C4', '41C5', '41C7', '41D10', '41D4', '41D7', '41E10', '41E2', '41E3', '41E4', '41E7', '41E9', '41F10', '41F2', '41F3', '4

In [8]:
# Setup ML input as X, output as Y
import numpy as np
X = np.array(list(fp))
y=[t.GetBoolProp("HIT") for t in tested]


KeyError: 'HIT'

In [None]:
# Create train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
print('Fraction hits: train: %.3f, test: %.3f'%(np.mean(y_train),np.mean(y_test)))


In [None]:
clf = RandomForestClassifier(n_estimators=15)
scores = cross_val_score(clf, X, y, cv=5)
np.mean(scores)

In [None]:
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
y_predict=rfc.predict(X_test)
y_train_predict=rfc.predict(X_train)

In [None]:
ax = plt.gca()
sm.roc_auc_score(y_test, y_predict)

In [None]:
print(sm.classification_report(y_test, y_predict))
print(sm.confusion_matrix(y_test, y_predict))
print(sm.classification_report(y_train, y_train_predict))
print(sm.confusion_matrix(y_train, y_train_predict))

In [None]:
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)
y_predict=lr.predict(X_test)
y_train_predict=lr.predict(X_train)

In [None]:
print(sm.classification_report(y_test, y_predict))
print(sm.confusion_matrix(y_test, y_predict))
print(sm.classification_report(y_train, y_train_predict))
print(sm.confusion_matrix(y_train, y_train_predict))

In [None]:
lr.coef_.shape
plt.hist(lr.coef_[0],100)
plt.show()
