Reference (source of the ECFP helper below): https://www.blopig.com/blog/2022/11/how-to-turn-a-smiles-string-into-an-extended-connectivity-fingerprint-using-rdkit/

In [3]:
import numpy as np
from rdkit.Chem import AllChem
from dataloader import DataLoader

# Create the project-local DataLoader over the CSV at ../data/test.csv.
# NOTE(review): the meaning of modelType="smile" and fiftyfifty=True is defined
# in dataloader.py, which is not visible here -- confirm against that module.
dl = DataLoader("../data/test.csv", modelType="smile",fiftyfifty=True)

/Users/aidan/Documents/caterpillar/model




In [4]:
# Adapted from the blopig.com post linked at the top of this notebook.
# Transforms a SMILES string into an extended-connectivity fingerprint (ECFP).
def ECFP_from_smiles(smiles,
                     R = 2,
                     L = 2**10,
                     use_features = False,
                     use_chirality = False):
    """
    Convert a SMILES string into a Morgan/ECFP bit-vector fingerprint.

    Inputs:

    - smiles ... SMILES string of input compound
    - R ... maximum radius of circular substructures
    - L ... fingerprint-length
    - use_features ... if false then use standard DAYLIGHT atom features, if true then use pharmacophoric atom features
    - use_chirality ... if true then append tetrahedral chirality flags to atom features

    Outputs:
    - np.array(feature_list) ... ECFP with length L and maximum radius R

    Raises:
    - ValueError ... if RDKit cannot parse `smiles` into a molecule
    """

    molecule = AllChem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None instead of
    # raising; fail here with a message that names the offending input rather
    # than letting the fingerprint call reject the None argument.
    if molecule is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    feature_list = AllChem.GetMorganFingerprintAsBitVect(molecule,
                                                         radius = R,
                                                         nBits = L,
                                                         useFeatures = use_features,
                                                         useChirality = use_chirality)
    return np.array(feature_list)

In [5]:
# Unpack the four values returned by the loader: SMILES inputs and labels for
# the train and test splits. Later cells index each SMILES entry with [0], so
# rows appear to be one-element containers -- TODO confirm in dataloader.py.
smile_train, y_train, smile_test, y_test = dl.getData()

[['COc1ccc2nccc([C@@H](O)[C@@H]3C[C@H]4CCN3C[C@@H]4C=C)c2c1']
 ['N[C@@H](C(=O)N[C@H]1[C@H]2SCC(=C(N2C1=O)C(O)=O)Cl)c3ccccc3']
 ['C1C(Nc2c1c(ccc2)CCN(CCC)CCC)=O']
 ['C3=C(C(OC1C2CCN(C1)CC2)=O)C=CC=C3']
 ['[C@]23([C@H]([C@H]1[C@]([C@](C(CO)=O)(C)[C@@H](C1)C)(C)C[C@@H]2O)CCC4=CC(=O)C=C[C@]34C)F']
 ['Clc1cccc(c1)N2CCN(CCCN3N=C4C=CC=CN4C3=O)CC2']
 ['C1=C(C(=NN=C1C2=CC=CC=C2)NCCN3CCOCC3)C']
 ['COc1cccc2C(=O)c3c(O)c4C[C@](O)(C[C@H](O[C@H]5C[C@H](N)[C@H](O[C@H]6CCCCO6)[C@H](C)O5)c4c(O)c3C(=O)c12)C(=O)CO']
 ['[C@H]23[C@@]([C@@]1(O[C@@H](O[C@@H]1C2)CCC)C(=O)CO)(C[C@H](O)[C@@]4(F)[C@H]3C[C@H](F)C5=CC(=O)CC[C@]45C)C']
 ['COC1=C(N3C(SC1)C(NC(=O)C(N)C2C=CCC=C2)C3=O)C(O)=O']
 ['CNC1C(O)C(OCC1(C)O)OC2C(O)C(OC3OC(CN)C(O)C(O)C3O)C(N)CC2NC(=O)C(O)CN']
 ['C(C1=C(C)[NH]C3=C1C(C(CN2CCOCC2)CC3)=O)C']
 ['O.O.CN(C)[C@H]1[C@@H]2[C@@H](O)[C@H]3C(=C(O)c4c(O)cccc4[C@@]3(C)O)C(=O)[C@]2(O)C(=O)\\C(=C(N)/O)C1=O']
 ['CC1=C(N2[C@H](SC1)[C@H](NC(=O)[C@H](N)c3ccc(O)c(Cl)c3)C2=O)C(O)=O']
 ['[H+].C3=C2\\C(C1=CC=CC=C1SCC2=C

In [12]:
# Sanity check: show the first training SMILES string (print() applies str() itself).
print(smile_train[0][0])

COc1ccc2nccc([C@@H](O)[C@@H]3C[C@H]4CCN3C[C@@H]4C=C)c2c1


In [13]:
# Featurize each split: every row holds its SMILES string at index 0, which is
# converted to a fingerprint with the default ECFP settings; the per-molecule
# vectors are then stacked into one array per split.
x_train = np.array([ECFP_from_smiles(str(row[0])) for row in smile_train])
x_test = np.array([ECFP_from_smiles(str(row[0])) for row in smile_test])



In [17]:
from keras import layers
import tensorflow as tf

# Fully connected binary classifier: five ReLU hidden layers of decreasing
# width followed by a single sigmoid output unit.
# NOTE(review): `layers` is imported from the standalone `keras` package while
# the container is `tf.keras.Sequential` -- confirm the two are compatible in
# the installed versions.
hidden_widths = (1000, 1000, 500, 200, 50)
spectral_model = tf.keras.Sequential(
    [layers.Dense(width, activation = 'relu') for width in hidden_widths]
    + [layers.Dense(1, activation = 'sigmoid')]
)

2023-12-04 14:15:54.715636: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
# Binary cross-entropy matches the single sigmoid output of the model.
spectral_model.compile(loss='binary_crossentropy', optimizer='adam')
# Train on the fingerprint matrix for 50 epochs; as the last expression in the
# cell, the returned History object is what the notebook displays.
spectral_model.fit(x=x_train, y=y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fec2a685c70>

In [19]:
# Score the held-out split: threshold the sigmoid outputs at 0.5 and report the
# fraction of predictions that match the labels.
y_hat = spectral_model.predict(x_test)

# Flatten both sides before comparing so an (n, 1) prediction column and an
# (n,) label vector are matched element by element instead of relying on the
# truth value of one-element arrays or on accidental broadcasting.
predicted_labels = (np.ravel(y_hat) > 0.5).astype(int)
true_labels = np.ravel(y_test)

if true_labels.size == 0:
    # The original divided by len(y_test) and would raise ZeroDivisionError here.
    raise ValueError("Cannot compute accuracy: the test set is empty")
if predicted_labels.shape != true_labels.shape:
    raise ValueError(
        f"Prediction/label count mismatch: {predicted_labels.shape[0]} predictions "
        f"vs {true_labels.shape[0]} labels"
    )

right = int(np.count_nonzero(predicted_labels == true_labels))
print(right / len(true_labels))

0.8125
