In [2]:
import pandas as pd
from pathlib import Path
from lightning import pytorch as pl
from chemprop import data, featurizers, models, nn

In [11]:
df_input = pd.read_csv('cell_based_assay_traindata_binary_smiles.csv')
df_input

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,CID,1159552,1159553,1159555,1224838,1224839,1224841,1224842,...,1920064,1920067,1920068,1963577,1963578,1963582,1963583,1963584,1963585,SMILES
0,0,19,196.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,C(CCC(=O)O)CC(=O)O
1,1,20,204.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,C1(C(=O)NC(=O)N1)NC(=O)N
2,2,21,227.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,C1=CC=C(C(=C1)C(=O)O)N
3,3,25,244.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,C1=CC=C(C=C1)CO
4,4,44,379.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,CCCCCCCC(=O)O
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,355,8347,135398739.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,C1C[C@@H](O[C@@H]1CO)N2C=NC3=C2N=CNC3=O
356,356,8348,135398740.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,C1=NC2=C(N1COC(CO)CO)N=C(NC2=O)N
357,357,8394,135508550.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,C[C@H]1C(=O)N[C@H](C(=O)N/C(=C\NC(=O)N)/C(=O)N...
358,358,8396,135512460.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,COC(=O)[C@H](CCCN/C(=N/[N+](=O)[O-])/N)N


In [12]:
smiles_column = 'SMILES'
smis = df_input.loc[:, smiles_column].values

In [13]:
target_columns= '1224842'
ys = df_input.loc[:, target_columns].values
smis[:5], ys[:5]

(array(['C(CCC(=O)O)CC(=O)O', 'C1(C(=O)NC(=O)N1)NC(=O)N',
        'C1=CC=C(C(=C1)C(=O)O)N', 'C1=CC=C(C=C1)CO', 'CCCCCCCC(=O)O'],
       dtype=object),
 array([0, 0, 0, 0, 0]))

In [14]:
all_data = [data.MoleculeDatapoint.from_smi(smi, y) for smi, y in zip(smis, ys)]

In [15]:
# available split types
list(data.SplitType.keys())

['CV_NO_VAL',
 'CV',
 'SCAFFOLD_BALANCED',
 'RANDOM_WITH_REPEATED_SMILES',
 'RANDOM',
 'KENNARD_STONE',
 'KMEANS']

In [16]:
mols = [d.mol for d in all_data]  # RDkit Mol objects are use for structure based splits
train_indices, val_indices, test_indices = data.make_split_indices(mols, "random", (0.8, 0.1, 0.1))
train_data, val_data, test_data = data.split_data_by_indices(
    all_data, train_indices, val_indices, test_indices
)

In [17]:
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

train_dset = data.MoleculeDataset(train_data[0], featurizer)
val_dset = data.MoleculeDataset(val_data[0], featurizer)
test_dset = data.MoleculeDataset(test_data[0], featurizer)

TypeError: 'MoleculeDatapoint' object is not iterable

In [3]:
df_input = pd.read_csv('single_assay_train_binary.csv')
df_input

Unnamed: 0.1,Unnamed: 0,AID,CID,Activity Outcome,Protein Accession,SMILES
0,2194047,1671197,155819421.0,1,AAV41877,C[C@H]1/C=C\C=C(/C(=O)NC2=CC(=C3C(=C2O)C(=C(C4...
1,2194049,1671197,57372628.0,0,AAV41877,C[N+](C)(CCOC1=CC=CC=C1)CC2=CC=CS2.[I-]
2,2194053,1671197,72187528.0,0,AAV41877,CC(C)C1=C(C=C2C(=C1)CC[C@@H]3[C@@]2(CCC[C@@]3(...
3,2194057,1671197,69155711.0,1,AAV41877,C[C@@H](C1=NC=NC=C1F)C(CN2C=NC=N2)(C3=C(C=C(C=...
4,2194059,1671197,9916275.0,1,AAV41877,CCN(CC)C1=CC2=C(C=C1)C(=C3C=CC(=[N+](CC)CC)C=C...
...,...,...,...,...,...,...
8120,2213083,1671197,6432394.0,1,AAV41877,CCOC(=O)C1(CCN(CC1)CCC(C#N)(C2=CC=CC=C2)C3=CC=...
8121,2213087,1671197,2723891.0,1,AAV41877,C(CC(=O)O)[C@@H](C(=O)O)N.Cl
8122,2213089,1671197,3317081.0,1,AAV41877,C1=CC=C(C(=C1)C2C(O2)(CN3C=NC=N3)C4=CC=C(C=C4)...
8123,2213091,1671197,6917787.0,0,AAV41877,CC1=CC=CC=C1N2CCN(CC2)CCC3=NN=C4N3CCCC4.Cl


In [4]:
columns = ['Activity Outcome', 'SMILES']
df_input = df_input[columns]
df_input

Unnamed: 0,Activity Outcome,SMILES
0,1,C[C@H]1/C=C\C=C(/C(=O)NC2=CC(=C3C(=C2O)C(=C(C4...
1,0,C[N+](C)(CCOC1=CC=CC=C1)CC2=CC=CS2.[I-]
2,0,CC(C)C1=C(C=C2C(=C1)CC[C@@H]3[C@@]2(CCC[C@@]3(...
3,1,C[C@@H](C1=NC=NC=C1F)C(CN2C=NC=N2)(C3=C(C=C(C=...
4,1,CCN(CC)C1=CC2=C(C=C1)C(=C3C=CC(=[N+](CC)CC)C=C...
...,...,...
8120,1,CCOC(=O)C1(CCN(CC1)CCC(C#N)(C2=CC=CC=C2)C3=CC=...
8121,1,C(CC(=O)O)[C@@H](C(=O)O)N.Cl
8122,1,C1=CC=C(C(=C1)C2C(O2)(CN3C=NC=N3)C4=CC=C(C=C4)...
8123,0,CC1=CC=CC=C1N2CCN(CC2)CCC3=NN=C4N3CCCC4.Cl


In [30]:
df_input.to_csv('single_assay_train_test.csv', header=True)

In [23]:
smiles_column = 'SMILES'
smis = df_input.loc[:, smiles_column].values

In [24]:
target_columns= 'Activity Outcome'
ys = df_input.loc[:, target_columns].values
smis[:5], ys[:5]

(array(['C[C@H]1/C=C\\C=C(/C(=O)NC2=CC(=C3C(=C2O)C(=C(C4=C3C(=O)[C@](O4)(OC=C[C@@H]([C@H]([C@H]([C@@H]([C@@H]([C@@H]([C@H]1O)C)O)C)OC(=O)C)C)OC)C)C)O)[O-])\\C.[Na+]',
        'C[N+](C)(CCOC1=CC=CC=C1)CC2=CC=CS2.[I-]',
        'CC(C)C1=C(C=C2C(=C1)CC[C@@H]3[C@@]2(CCC[C@@]3(C)C(=O)[O-])C)S(=O)(=O)O.[Na+]',
        'C[C@@H](C1=NC=NC=C1F)C(CN2C=NC=N2)(C3=C(C=C(C=C3)F)F)O',
        'CCN(CC)C1=CC2=C(C=C1)C(=C3C=CC(=[N+](CC)CC)C=C3O2)C4=C(C=C(C=C4)S(=O)(=O)[O-])S(=O)(=O)[O-].[Na+]'],
       dtype=object),
 array([1, 0, 0, 1, 1]))

In [25]:
all_data = [data.MoleculeDatapoint.from_smi(smi, y) for smi, y in zip(smis, ys)]



In [26]:
list(data.SplitType.keys())

['CV_NO_VAL',
 'CV',
 'SCAFFOLD_BALANCED',
 'RANDOM_WITH_REPEATED_SMILES',
 'RANDOM',
 'KENNARD_STONE',
 'KMEANS']

In [27]:
mols = [d.mol for d in all_data]  # RDkit Mol objects are use for structure based splits
train_indices, val_indices, test_indices = data.make_split_indices(mols, "random", (0.8, 0.1, 0.1))
train_data, val_data, test_data = data.split_data_by_indices(
    all_data, train_indices, val_indices, test_indices
)

In [28]:
from sklearn.model_selection import KFold

k_splits = KFold(n_splits=5)
k_train_indices, k_val_indices, k_test_indices = [], [], []
for fold in k_splits.split(mols):
    k_train_indices.append(fold[0])
    k_val_indices.append([])
    k_test_indices.append(fold[1])
k_train_data, _, k_test_data = data.split_data_by_indices(
    all_data, k_train_indices, None, k_test_indices
)

In [29]:
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

train_dset = data.MoleculeDataset(train_data[0], featurizer)
scaler = train_dset.normalize_targets()

val_dset = data.MoleculeDataset(val_data[0], featurizer)
val_dset.normalize_targets(scaler)

test_dset = data.MoleculeDataset(test_data[0], featurizer)

TypeError: 'MoleculeDatapoint' object is not iterable

In [5]:
pip list

Package                   Version
------------------------- -----------
aimsim_core               2.2.2
aiohappyeyeballs          2.4.4
aiohttp                   3.11.10
aiosignal                 1.3.1
anyio                     4.6.2
argon2-cffi               21.3.0
argon2-cffi-bindings      21.2.0
astartes                  1.3.0
asttokens                 2.0.5
async-lru                 2.0.4
attrs                     24.2.0
Babel                     2.11.0
beautifulsoup4            4.12.3
bleach                    6.2.0
Brotli                    1.0.9
certifi                   2024.8.30
cffi                      1.17.1
charset-normalizer        3.3.2
chemprop                  2.1.0
comm                      0.2.1
ConfigArgParse            1.7
debugpy                   1.6.7
decorator                 5.1.1
defusedxml                0.7.1
descriptastorus           2.8.0
dill                      0.3.9
executing                 0.8.3
fastjsonschema            2.20.0
filelock             