In [1]:
import os
import pandas as pd

from rdkit import Chem # safe import before ccdc imports
from torch_geometric.loader import DataLoader
from torch.utils.data import ConcatDataset

from conf_ensemble_dataset_in_memory import ConfEnsembleDataset
from litschnet import LitSchNet
from molsize_model import MolSizeModel
from rmsd_predictor_evaluator import RMSDPredictorEvaluator
from tqdm import tqdm

# Data preparation

In [2]:
# run once to preprocess datasets and generate chunks
# NOTE(review): the global `dataset` bound here is not read by any later cell
# in this notebook view; the call appears to be kept only for the
# preprocessing it triggers inside ConfEnsembleDataset -- confirm.
dataset = ConfEnsembleDataset()
# dataset = ConfEnsembleDataset(dataset='platinum') # 16G

In [3]:
def _read_stripped_lines(path) :
    """Return every line of the text file at `path`, stripped of surrounding whitespace."""
    with open(path, 'r') as f :
        return [line.strip() for line in f]


def get_test_dataset(split, 
                    iteration, 
                    data_dir='data/',
                    ) :
    """Build the PDBbind test dataset and the training SMILES for one split.

    Parameters
    ----------
    split : str
        'random' or 'scaffold' select the ligand-based split files
        (`ligand_{split}_splits/`). Any other value is treated as the protein
        similarity split (`protein_similarity_splits/`).
    iteration : int
        Index of the split iteration, used in the split file names.
    data_dir : str
        Directory holding `processed/`, the split directories and
        `smiles_df.csv`.
        NOTE(review): `ConfEnsembleDataset` is not given `data_dir`, so it
        presumably reads chunks from its own default root -- confirm before
        using a non-default `data_dir`.

    Returns
    -------
    (test_dataset, train_smiles)
        `test_dataset` is a `ConcatDataset` over one `ConfEnsembleDataset` per
        file in `processed/` whose name starts with 'pdbbind', filtered by the
        test SMILES (ligand splits) or test PDB ids (protein split).
        `train_smiles` is a list of str for ligand splits, and a numpy array
        (the 'smiles' column of the rows whose 'id' is a training PDB id) for
        the protein split.
    """
    processed_dir = os.path.join(data_dir, 'processed')
    # Chunks are addressed by index 0..n-1, so only the count is needed.
    pdbbind_n_chunks = sum(filename.startswith('pdbbind')
                           for filename in os.listdir(processed_dir))

    if split in ['random', 'scaffold'] :

        split_dir = os.path.join(data_dir, f'ligand_{split}_splits')
        train_smiles = _read_stripped_lines(
            os.path.join(split_dir, f'train_smiles_{split}_split_{iteration}.txt'))
        test_smiles = _read_stripped_lines(
            os.path.join(split_dir, f'test_smiles_{split}_split_{iteration}.txt'))
        filter_kwargs = {'smiles_list' : test_smiles}

    else : #protein split

        split_dir = os.path.join(data_dir, 'protein_similarity_splits')
        train_pdbs = _read_stripped_lines(
            os.path.join(split_dir, f'train_pdb_protein_similarity_split_{iteration}.txt'))
        test_pdbs = _read_stripped_lines(
            os.path.join(split_dir, f'test_pdb_protein_similarity_split_{iteration}.txt'))

        # Honour `data_dir` here as well (this path used to be hard-coded to 'data/').
        smiles_df = pd.read_csv(os.path.join(data_dir, 'smiles_df.csv'))
        train_smiles = smiles_df[smiles_df['id'].isin(train_pdbs)]['smiles'].values
        filter_kwargs = {'pdb_ids_list' : test_pdbs}

    # One filtered dataset per chunk, concatenated into a single test set.
    test_datasets = [ConfEnsembleDataset(loaded_chunk=chunk_number, **filter_kwargs)
                     for chunk_number in tqdm(range(pdbbind_n_chunks))]

    test_dataset = ConcatDataset(test_datasets)
    
    return test_dataset, train_smiles

In [4]:
def evaluate_model(experiment_name,
                   test_dataset,
                   #platinum_dataset,
                   training_smiles,
                   tasks = ['all']) :
    """Load the checkpoint of an experiment and evaluate it on `test_dataset`.

    Parameters
    ----------
    experiment_name : str
        Name of the sub-directory of `lightning_logs/` holding a `checkpoints/`
        directory. If the name contains 'molsize' the checkpoint is loaded as
        a `MolSizeModel`, otherwise as a `LitSchNet`.
    test_dataset
        Dataset handed to `RMSDPredictorEvaluator.evaluate`.
    training_smiles
        SMILES of the training set, forwarded to the evaluator.
    tasks : list of str
        One evaluation report is produced per task. The default list is only
        iterated, never mutated.

    Raises
    ------
    FileNotFoundError
        If the checkpoint directory exists but contains no file.
    """
    checkpoint_dir = os.path.join('lightning_logs', experiment_name, 'checkpoints')
    # os.listdir order is arbitrary; sort so the chosen checkpoint is deterministic.
    checkpoint_names = sorted(os.listdir(checkpoint_dir))
    if not checkpoint_names :
        raise FileNotFoundError(f'No checkpoint found in {checkpoint_dir}')
    checkpoint_path = os.path.join(checkpoint_dir, checkpoint_names[0])

    if 'molsize' in experiment_name :
        model = MolSizeModel.load_from_checkpoint(checkpoint_path=checkpoint_path)
    else :
        model = LitSchNet.load_from_checkpoint(checkpoint_path=checkpoint_path)
    
    evaluation_name = experiment_name + '_pdbbind'
    # Use the argument, not the notebook-global `train_smiles`, so the function
    # does not depend on hidden kernel state.
    evaluator = RMSDPredictorEvaluator(model=model, 
                                       evaluation_name=evaluation_name,
                                       training_smiles=training_smiles)
    evaluator.evaluate(test_dataset)
    # evaluator.evaluate(test_dataset, overwrite=True)
    for task in tasks :
        evaluator.evaluation_report(task=task)

#     evaluation_name = experiment_name + '_platinum'
#     evaluator = RMSDPredictorEvaluator(model=model, 
#                                        evaluation_name=evaluation_name,
#                                        training_smiles=training_smiles)
#     evaluator.evaluate(platinum_dataset, overwrite=True)
#     for task in tasks :
#         evaluator.evaluation_report(task=task)

In [5]:
# Root data directory.
# NOTE(review): in this notebook view the variable is only referenced by the
# commented-out platinum code; the evaluation loop calls get_test_dataset
# without passing it, so that function's own default is what is used.
data_dir = 'data/'
# platinum_chunks = [filename for filename in os.listdir(os.path.join(data_dir, 'processed')) if filename.startswith('platinum')]
# platinum_n_chunks = len(platinum_chunks)

In [6]:
# platinum_datasets = []
# for chunk_number in tqdm(range(platinum_n_chunks)) :
#     dataset = ConfEnsembleDataset(dataset='platinum', loaded_chunk=chunk_number)
#     platinum_datasets.append(dataset)
# platinum_dataset = ConcatDataset(platinum_datasets)

In [7]:
# Split strategies to evaluate. Each name is passed to get_test_dataset and
# is also the prefix of the experiment name ('{split}_split_{iteration}').
splits = ['random', 'scaffold', 'protein']

In [8]:
%%time

# Evaluate the trained model of every split on its PDBbind test set.
# The loop variables are notebook globals: after the loop they hold the
# values of the last (split, iteration) pair processed.
for split in splits :
#for split in ['protein'] :

    # range(1): only iteration 0 of each split is evaluated.
    for iteration in range(1) :

        # Rebuilds the test dataset from the preprocessed chunks for this split.
        test_dataset, train_smiles = get_test_dataset(split, iteration)
        
        # Must match the directory name under lightning_logs/ used by evaluate_model.
        experiment_name = f'{split}_split_{iteration}'
        # evaluate_model(experiment_name, test_dataset, platinum_dataset, train_smiles)
        evaluate_model(experiment_name, test_dataset, train_smiles)
        
#         experiment_name = f'{split}_split_{iteration}_molsize'
#         evaluate_model(experiment_name, test_dataset, train_smiles)

100%|█████████████████████████████████████████████| 3/3 [01:31<00:00, 30.37s/it]


Computing training set fingerprints
Evaluation already done for given experiment random_split_0_pdbbind
Loading existing results


100%|█████████████████████████████████████████████| 3/3 [02:19<00:00, 46.46s/it]


Computing training set fingerprints
Evaluation already done for given experiment scaffold_split_0_pdbbind
Loading existing results


100%|█████████████████████████████████████████████| 3/3 [02:59<00:00, 59.98s/it]


Computing training set fingerprints
Evaluation already done for given experiment protein_split_0_pdbbind
Loading existing results
CPU times: user 20min 13s, sys: 16.9 s, total: 20min 30s
Wall time: 13min 8s


In [9]:
# Length of `test_dataset` as left by the evaluation loop, i.e. the dataset
# of the last split processed ('protein', iteration 0) when cells run in order.
len(test_dataset)

101283