In [1]:
import os
import pytorch_lightning as pl
import pandas as pd

from rdkit import Chem # safe import before ccdc imports
from torch_geometric.loader import DataLoader
from torch.utils.data import ConcatDataset

from conf_ensemble_dataset_in_memory import ConfEnsembleDataset
from litschnet import LitSchNet
from molsize_model import MolSizeModel
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from tqdm import tqdm

In [2]:
pl.seed_everything(42, workers=True)

Global seed set to 42


42

# Data preparation

In [3]:
# run once to preprocess datasets and generate chunks
dataset = ConfEnsembleDataset()
# dataset = ConfEnsembleDataset(dataset='platinum')

Processing...
  0%|                                      | 23/12592 [00:09<1:43:29,  2.02it/s]RDKit ERROR: [17:26:17] UFFTYPER: Unrecognized hybridization for atom: 1
[17:26:17] UFFTYPER: Unrecognized hybridization for atom: 1
RDKit ERROR: [17:26:17] UFFTYPER: Unrecognized atom type: V_ (1)
RDKit ERROR: [17:26:17] UFFTYPER: Unrecognized hybridization for atom: 1
RDKit ERROR: [17:26:17] UFFTYPER: Unrecognized atom type: V_ (1)
[17:26:17] UFFTYPER: Unrecognized atom type: V_ (1)
[17:26:17] UFFTYPER: Unrecognized hybridization for atom: 1
[17:26:17] UFFTYPER: Unrecognized atom type: V_ (1)
  0%|▏                                       | 45/12592 [00:15<48:21,  4.32it/s]RDKit ERROR: [17:26:22] UFFTYPER: Unrecognized charge state for atom: 9
[17:26:22] UFFTYPER: Unrecognized charge state for atom: 9
RDKit ERROR: [17:26:22] UFFTYPER: Unrecognized charge state for atom: 9
[17:26:22] UFFTYPER: Unrecognized charge state for atom: 9
RDKit ERROR: [17:26:22] UFFTYPER: Unrecognized charge state for 

File error: Bad input file data/all_conf_ensembles/267.sdf
Error with smiles Cc1ccccc1S(=O)(=O)N(CCN(Cc1cncn1C)c1ccc(C#N)cc1)CC1CCN(C(=O)OC(C)(C)C)CC1
data/all_conf_ensembles/267.sdf is not ok
File error: Bad input file data/all_conf_ensembles/268.sdf
Error with smiles CO[C@@H]1O[C@@H](C)[C@@H](O)[C@@H](O)[C@@H]1O
data/all_conf_ensembles/268.sdf is not ok
File error: Bad input file data/all_conf_ensembles/269.sdf
Error with smiles CC(C)(C)c1cc(NC(=O)Nc2cccc(Cl)c2Cl)n(-c2cccc(CC(N)=O)c2)n1
data/all_conf_ensembles/269.sdf is not ok
File error: Bad input file data/all_conf_ensembles/270.sdf
Error with smiles CC(=S)N[C@H]1[C@H](O[C@@H]2[C@@H](CO)O[C@@H]3SC(C)=N[C@@H]3[C@H]2O)O[C@H](CO)[C@@H](O)[C@@H]1O
data/all_conf_ensembles/270.sdf is not ok
File error: Bad input file data/all_conf_ensembles/271.sdf
Error with smiles COc1ccc(S(=O)(=O)Nc2ccc3c(c2)CCC(=O)N3C)c(OC)c1
data/all_conf_ensembles/271.sdf is not ok
File error: Bad input file data/all_conf_ensembles/272.sdf
Error with smiles OC[C@@

  3%|█▏                                     | 397/12592 [02:57<55:50,  3.64it/s]RDKit ERROR: [17:29:05] UFFTYPER: Unrecognized charge state for atom: 11
[17:29:05] UFFTYPER: Unrecognized charge state for atom: 11
RDKit ERROR: [17:29:05] UFFTYPER: Unrecognized charge state for atom: 11
[17:29:05] UFFTYPER: Unrecognized charge state for atom: 11
RDKit ERROR: [17:29:05] UFFTYPER: Unrecognized charge state for atom: 11
[17:29:05] UFFTYPER: Unrecognized charge state for atom: 11
RDKit ERROR: [17:29:05] UFFTYPER: Unrecognized charge state for atom: 11
[17:29:05] UFFTYPER: Unrecognized charge state for atom: 11
RDKit ERROR: [17:29:05] UFFTYPER: Unrecognized charge state for atom: 11
[17:29:05] UFFTYPER: Unrecognized charge state for atom: 11
RDKit ERROR: [17:29:05] UFFTYPER: Unrecognized charge state for atom: 11
[17:29:05] UFFTYPER: Unrecognized charge state for atom: 11
RDKit ERROR: [17:29:05] UFFTYPER: Unrecognized charge state for atom: 11
[17:29:05] UFFTYPER: Unrecognized charge state fo

File error: Bad input file data/all_conf_ensembles/4872.sdf
Error with smiles N=CCNC(=O)C1(NC(=O)c2ccc(N)c(F)c2)CCCCC1.Nc1ccc(C(=O)NC2CCCCC2)cc1F
data/all_conf_ensembles/4872.sdf is not ok


 34%|████████████▎                       | 4301/12592 [32:00<2:23:48,  1.04s/it]RDKit ERROR: [17:58:08] UFFTYPER: Unrecognized atom type: B_1 (4)
RDKit ERROR: [17:58:08] UFFTYPER: Unrecognized atom type: B_1 (4)
RDKit ERROR: [17:58:08] UFFTYPER: Unrecognized atom type: B_1 (4)
RDKit ERROR: [17:58:08] UFFTYPER: Unrecognized atom type: B_1 (4)
RDKit ERROR: [17:58:08] UFFTYPER: Unrecognized atom type: B_1 (4)
RDKit ERROR: [17:58:08] UFFTYPER: Unrecognized atom type: B_1 (4)
RDKit ERROR: [17:58:08] UFFTYPER: Unrecognized atom type: B_1 (4)
RDKit ERROR: [17:58:08] UFFTYPER: Unrecognized atom type: B_1 (4)
RDKit ERROR: [17:58:08] UFFTYPER: Unrecognized atom type: B_1 (4)
RDKit ERROR: [17:58:08] UFFTYPER: Unrecognized atom type: B_1 (4)
[17:58:08] UFFTYPER: Unrecognized atom type: B_1 (4)
RDKit ERROR: [17:58:08] UFFTYPER: Unrecognized atom type: B_1 (4)
[17:58:08] UFFTYPER: Unrecognized atom type: B_1 (4)
[17:58:08] UFFTYPER: Unrecognized atom type: B_1 (4)
[17:58:08] UFFTYPER: Unrecognized a

File error: Bad input file data/all_conf_ensembles/5353.sdf
Error with smiles [NH3+]CCCC[C@H](NC(=O)[C@H]([NH3+])CO)C(=O)NCC=O.[NH3+][C@H]1CCCCNC(=O)[C@H](CO)NC(=O)[C@@H](CCC[N+](=O)C=O)NC1=O
data/all_conf_ensembles/5353.sdf is not ok


 38%|█████████████▌                      | 4747/12592 [36:02<1:00:27,  2.16it/s]RDKit ERROR: [18:02:10] UFFTYPER: Unrecognized atom type: B_1 (1)
[18:02:10] UFFTYPER: Unrecognized atom type: B_1 (1)
RDKit ERROR: [18:02:10] UFFTYPER: Unrecognized atom type: B_1 (1)
[18:02:10] UFFTYPER: Unrecognized atom type: B_1 (1)
RDKit ERROR: [18:02:10] UFFTYPER: Unrecognized atom type: B_1 (1)
[18:02:10] UFFTYPER: Unrecognized atom type: B_1 (1)
RDKit ERROR: [18:02:10] UFFTYPER: Unrecognized atom type: B_1 (1)
[18:02:10] UFFTYPER: Unrecognized atom type: B_1 (1)
RDKit ERROR: [18:02:10] UFFTYPER: Unrecognized atom type: B_1 (1)
[18:02:10] UFFTYPER: Unrecognized atom type: B_1 (1)
RDKit ERROR: [18:02:10] UFFTYPER: Unrecognized atom type: B_1 (1)
[18:02:10] UFFTYPER: Unrecognized atom type: B_1 (1)
RDKit ERROR: [18:02:10] UFFTYPER: Unrecognized atom type: B_1 (1)
[18:02:10] UFFTYPER: Unrecognized atom type: B_1 (1)
RDKit ERROR: [18:02:10] UFFTYPER: Unrecognized atom type: B_1 (1)
[18:02:10] UFFTYPER: 

In [4]:
splits = ['random', 'scaffold', 'protein']

In [5]:
def get_loaders(split, 
                iteration, 
                data_dir='data/',
                ) :
    
    pdbbind_chunks = [filename for filename in os.listdir(os.path.join(data_dir, 'processed')) if filename.startswith('pdbbind')]
    pdbbind_n_chunks = len(pdbbind_chunks)
    
    train_datasets = []
    val_datasets = []
    test_datasets = []
    
    if split in ['random', 'scaffold'] :
        
        with open(os.path.join(data_dir, f'ligand_{split}_splits', f'train_smiles_{split}_split_{iteration}.txt'), 'r') as f :
            train_smiles = f.readlines()
            train_smiles = [smiles.strip() for smiles in train_smiles]

        with open(os.path.join(data_dir, f'ligand_{split}_splits', f'val_smiles_{split}_split_{iteration}.txt'), 'r') as f :
            val_smiles = f.readlines()
            val_smiles = [smiles.strip() for smiles in val_smiles]

        with open(os.path.join(data_dir, f'ligand_{split}_splits', f'test_smiles_{split}_split_{iteration}.txt'), 'r') as f :
            test_smiles = f.readlines()
            test_smiles = [smiles.strip() for smiles in test_smiles]

        for chunk_number in tqdm(range(pdbbind_n_chunks)) :

            dataset = ConfEnsembleDataset(loaded_chunk=chunk_number,
                                          smiles_list=train_smiles)
            train_datasets.append(dataset)

            dataset = ConfEnsembleDataset(loaded_chunk=chunk_number,
                                          smiles_list=val_smiles)
            val_datasets.append(dataset)

            dataset = ConfEnsembleDataset(loaded_chunk=chunk_number,
                                          smiles_list=test_smiles)
            test_datasets.append(dataset)
            
    else : #protein split
        
        with open(os.path.join(data_dir, 'protein_similarity_splits', f'train_pdb_protein_similarity_split_{iteration}.txt'), 'r') as f :
            train_pdbs = f.readlines()
            train_pdbs = [pdb.strip() for pdb in train_pdbs]

        with open(os.path.join(data_dir, 'protein_similarity_splits', f'val_pdb_protein_similarity_split_{iteration}.txt'), 'r') as f :
            val_pdbs = f.readlines()
            val_pdbs = [pdb.strip() for pdb in val_pdbs]

        with open(os.path.join(data_dir, 'protein_similarity_splits', f'test_pdb_protein_similarity_split_{iteration}.txt'), 'r') as f :
            test_pdbs = f.readlines()
            test_pdbs = [pdb.strip() for pdb in test_pdbs]

        for chunk_number in tqdm(range(pdbbind_n_chunks)) :

            dataset = ConfEnsembleDataset(loaded_chunk=chunk_number,
                                          pdb_ids_list=train_pdbs)
            train_datasets.append(dataset)

            dataset = ConfEnsembleDataset(loaded_chunk=chunk_number,
                                          pdb_ids_list=val_pdbs)
            val_datasets.append(dataset)

            dataset = ConfEnsembleDataset(loaded_chunk=chunk_number,
                                          pdb_ids_list=test_pdbs)
            test_datasets.append(dataset)

    train_dataset = ConcatDataset(train_datasets)
    val_dataset = ConcatDataset(val_datasets)
    test_dataset = ConcatDataset(test_datasets)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=4)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4)
    
    return train_loader, val_loader, test_loader

In [6]:
#for split in splits :
for split in ['protein'] :
    
    for iteration in range(5) :
    
        train_loader, val_loader, test_loader = get_loaders(split, iteration)
        
        experiment_name = f'{split}_split_{iteration}'
        if not experiment_name in os.listdir('lightning_logs') :
            litschnet = LitSchNet()
            logger = TensorBoardLogger(save_dir=os.getcwd(), version=experiment_name, name="lightning_logs")
            trainer = pl.Trainer(logger=logger, callbacks=[EarlyStopping(monitor="val_loss", patience=5)], gpus=1)
            trainer.fit(litschnet, train_loader, val_loader)
            trainer.test(litschnet, test_loader)
            
        experiment_name = f'{split}_split_{iteration}_molsize'
        if not experiment_name in os.listdir('lightning_logs') :
            molsize_model = MolSizeModel()
            logger = TensorBoardLogger(save_dir=os.getcwd(), version=experiment_name, name="lightning_logs")
            trainer = pl.Trainer(logger=logger, callbacks=[EarlyStopping(monitor="val_loss", patience=5)], gpus=1)
            trainer.fit(molsize_model, train_loader, val_loader)
            trainer.test(molsize_model, test_loader)

100%|████████████████████████████████████████████| 3/3 [07:43<00:00, 154.64s/it]
100%|████████████████████████████████████████████| 3/3 [07:49<00:00, 156.41s/it]
100%|████████████████████████████████████████████| 3/3 [07:54<00:00, 158.31s/it]
100%|████████████████████████████████████████████| 3/3 [07:52<00:00, 157.53s/it]
100%|████████████████████████████████████████████| 3/3 [07:51<00:00, 157.30s/it]


In [7]:
from collections import defaultdict
n_bioactive_conformations = defaultdict(list)
n_conformations = defaultdict(list)
data_dir = 'data/'

for split in splits :
    
    for iteration in range(5) :
    
        pdbbind_chunks = [filename for filename in os.listdir(os.path.join(data_dir, 'processed')) if filename.startswith('pdbbind')]
        pdbbind_n_chunks = len(pdbbind_chunks)

        train_datasets = []
        val_datasets = []
        test_datasets = []

        if split in ['random', 'scaffold'] :

            with open(os.path.join(data_dir, f'ligand_{split}_splits', f'train_smiles_{split}_split_{iteration}.txt'), 'r') as f :
                train_smiles = f.readlines()
                train_smiles = [smiles.strip() for smiles in train_smiles]

            with open(os.path.join(data_dir, f'ligand_{split}_splits', f'val_smiles_{split}_split_{iteration}.txt'), 'r') as f :
                val_smiles = f.readlines()
                val_smiles = [smiles.strip() for smiles in val_smiles]

            with open(os.path.join(data_dir, f'ligand_{split}_splits', f'test_smiles_{split}_split_{iteration}.txt'), 'r') as f :
                test_smiles = f.readlines()
                test_smiles = [smiles.strip() for smiles in test_smiles]

            for chunk_number in tqdm(range(pdbbind_n_chunks)) :

                dataset = ConfEnsembleDataset(loaded_chunk=chunk_number,
                                              smiles_list=train_smiles)
                train_datasets.append(dataset)

                dataset = ConfEnsembleDataset(loaded_chunk=chunk_number,
                                              smiles_list=val_smiles)
                val_datasets.append(dataset)

                dataset = ConfEnsembleDataset(loaded_chunk=chunk_number,
                                              smiles_list=test_smiles)
                test_datasets.append(dataset)

        else : #protein split

            with open(os.path.join(data_dir, 'protein_similarity_splits', f'train_pdb_protein_similarity_split_{iteration}.txt'), 'r') as f :
                train_pdbs = f.readlines()
                train_pdbs = [pdb.strip() for pdb in train_pdbs]

            with open(os.path.join(data_dir, 'protein_similarity_splits', f'val_pdb_protein_similarity_split_{iteration}.txt'), 'r') as f :
                val_pdbs = f.readlines()
                val_pdbs = [pdb.strip() for pdb in val_pdbs]

            with open(os.path.join(data_dir, 'protein_similarity_splits', f'test_pdb_protein_similarity_split_{iteration}.txt'), 'r') as f :
                test_pdbs = f.readlines()
                test_pdbs = [pdb.strip() for pdb in test_pdbs]

            for chunk_number in tqdm(range(pdbbind_n_chunks)) :

                dataset = ConfEnsembleDataset(loaded_chunk=chunk_number,
                                              pdb_ids_list=train_pdbs)
                train_datasets.append(dataset)

                dataset = ConfEnsembleDataset(loaded_chunk=chunk_number,
                                              pdb_ids_list=val_pdbs)
                val_datasets.append(dataset)

                dataset = ConfEnsembleDataset(loaded_chunk=chunk_number,
                                              pdb_ids_list=test_pdbs)
                test_datasets.append(dataset)

        train_dataset = ConcatDataset(train_datasets)
        val_dataset = ConcatDataset(val_datasets)
        test_dataset = ConcatDataset(test_datasets)
        
        d = {
            'train' : train_dataset,
            'val' : val_dataset,
            'test' : test_dataset
        }
        for s, dataset in d.items() :
            n_conformations[s].append(len(dataset))
            n_bio = 0
            for data in dataset :
                if data.rmsd == 0 :
                    n_bio = n_bio + 1
            n_bioactive_conformations[s].append(n_bio)

 75%|█████████████████████████████████▊           | 3/4 [04:44<01:34, 94.75s/it]


IndexError: list index out of range

In [None]:
n_conformations

In [None]:
n_bioactive_conformations