In [1]:
import os
import pandas as pd
import torch

from rdkit import Chem # safe import before ccdc imports
from torch_geometric.loader import DataLoader
from torch.utils.data import ConcatDataset

from conf_ensemble_dataset_in_memory import ConfEnsembleDataset
from litschnet import LitSchNet
from molsize_model import MolSizeModel
from rmsd_predictor_evaluator import RMSDPredictorEvaluator
from tqdm import tqdm

# Data preparation

In [2]:
# Run once: per the original author's note, instantiating the dataset with its
# defaults preprocesses the datasets and generates the chunks.
dataset = ConfEnsembleDataset()
# Disabled variant for the 'platinum' dataset (author's note: 16G).
# dataset = ConfEnsembleDataset(dataset='platinum') # 16G

In [3]:
def _read_stripped_lines(path):
    """Return the non-empty, whitespace-stripped lines of the text file at `path`."""
    with open(path, 'r') as f:
        stripped = (line.strip() for line in f)
        # Blank lines would otherwise become empty SMILES / PDB ids.
        return [entry for entry in stripped if entry]


def get_train_smiles(split,
                     iteration,
                     data_dir='data/'):
    """Return the training-set SMILES of one split iteration.

    For the 'random' and 'scaffold' splits the SMILES are read from
    ``<data_dir>/ligand_<split>_splits/train_smiles_<split>_split_<iteration>.txt``.
    Any other `split` value is handled as the protein-similarity split: the
    training PDB ids are read from
    ``<data_dir>/protein_similarity_splits/train_pdb_protein_similarity_split_<iteration>.txt``
    and mapped to SMILES through the ``id`` and ``smiles`` columns of
    ``<data_dir>/smiles_df.csv``.

    Parameters
    ----------
    split : str
        'random', 'scaffold', or anything else for the protein split.
    iteration : int
        Index of the split iteration, used in the file names.
    data_dir : str
        Root directory containing the split files and ``smiles_df.csv``.

    Returns
    -------
    list of str
        Training SMILES (a plain list for every split type).

    Raises
    ------
    FileNotFoundError
        If the split file (or ``smiles_df.csv`` for the protein split) is missing.
    """
    if split in ['random', 'scaffold']:
        split_path = os.path.join(data_dir,
                                  f'ligand_{split}_splits',
                                  f'train_smiles_{split}_split_{iteration}.txt')
        return _read_stripped_lines(split_path)

    pdb_path = os.path.join(data_dir,
                            'protein_similarity_splits',
                            f'train_pdb_protein_similarity_split_{iteration}.txt')
    train_pdbs = _read_stripped_lines(pdb_path)

    # Resolve the lookup table relative to data_dir (it used to be hard-coded
    # to 'data/smiles_df.csv', silently ignoring the data_dir argument).
    smiles_df = pd.read_csv(os.path.join(data_dir, 'smiles_df.csv'))
    is_train = smiles_df['id'].isin(train_pdbs)
    # .tolist() keeps the return type consistent with the ligand splits.
    return smiles_df.loc[is_train, 'smiles'].tolist()

In [4]:
def get_test_dataset(split,
                     iteration,
                     data_dir='data/'):
    """Return ``(test_dataset, train_smiles)`` for one split iteration.

    The test dataset is cached at ``<data_dir>/test_dataset_<split>_<iteration>.p``.
    When that file exists it is loaded with ``torch.load``; otherwise one
    ``ConfEnsembleDataset`` is built per 'pdbbind' file found in
    ``<data_dir>/processed``, the chunks are concatenated and the result is
    written to the cache path with ``torch.save``.

    For the 'random' and 'scaffold' splits the chunks are filtered by the test
    SMILES file (``smiles_list``); for any other split value they are filtered
    by the test PDB-id file of the protein-similarity split (``pdb_ids_list``).
    """
    train_smiles = get_train_smiles(split, iteration, data_dir)

    cache_path = os.path.join(data_dir, f'test_dataset_{split}_{iteration}.p')
    if os.path.exists(cache_path):
        # NOTE(review): this unpickles a whole dataset object; newer PyTorch
        # releases default torch.load to weights-only loading - confirm this
        # still works with the installed version.
        return torch.load(cache_path), train_smiles

    # Number of chunks = number of processed files whose name starts with 'pdbbind'.
    processed_names = os.listdir(os.path.join(data_dir, 'processed'))
    n_chunks = sum(1 for name in processed_names if name.startswith('pdbbind'))

    # Choose the split file and the matching dataset filter keyword.
    if split in ('random', 'scaffold'):
        split_file = os.path.join(data_dir,
                                  f'ligand_{split}_splits',
                                  f'test_smiles_{split}_split_{iteration}.txt')
        filter_keyword = 'smiles_list'
    else:  # protein split
        split_file = os.path.join(data_dir,
                                  'protein_similarity_splits',
                                  f'test_pdb_protein_similarity_split_{iteration}.txt')
        filter_keyword = 'pdb_ids_list'

    with open(split_file, 'r') as f:
        test_entries = [line.strip() for line in f.readlines()]

    chunk_datasets = []
    for chunk_number in tqdm(range(n_chunks)):
        chunk = ConfEnsembleDataset(loaded_chunk=chunk_number,
                                    **{filter_keyword: test_entries})
        chunk_datasets.append(chunk)

    test_dataset = ConcatDataset(chunk_datasets)
    torch.save(test_dataset, cache_path)
    return test_dataset, train_smiles

In [5]:
def evaluate_model(experiment_name,
                   test_dataset,
                   #platinum_dataset,
                   training_smiles,
                   tasks=('all', 'easy', 'hard')):
    """Load the checkpoint of an experiment, evaluate it and report per task.

    The checkpoint is taken from ``lightning_logs/<experiment_name>/checkpoints``.
    Experiments whose name contains 'molsize' are loaded as ``MolSizeModel``,
    all others as ``LitSchNet``. Results are produced under the evaluation
    name ``<experiment_name>_pdbbind``.

    Parameters
    ----------
    experiment_name : str
        Name of the sub-directory of ``lightning_logs`` holding the checkpoint.
    test_dataset
        Dataset passed to ``RMSDPredictorEvaluator.evaluate``.
    training_smiles
        SMILES of the training set, forwarded to the evaluator.
    tasks : iterable of str
        Tasks for which ``evaluation_report`` is called.

    Raises
    ------
    FileNotFoundError
        If the checkpoint directory is missing or contains no file.
    """
    checkpoint_dir = os.path.join('lightning_logs', experiment_name, 'checkpoints')
    # Sort so the selected checkpoint is deterministic when several exist
    # (os.listdir returns entries in arbitrary order).
    checkpoint_names = sorted(os.listdir(checkpoint_dir))
    if not checkpoint_names:
        raise FileNotFoundError(f'No checkpoint found in {checkpoint_dir}')
    checkpoint_path = os.path.join(checkpoint_dir, checkpoint_names[0])

    model_class = MolSizeModel if 'molsize' in experiment_name else LitSchNet
    model = model_class.load_from_checkpoint(checkpoint_path=checkpoint_path)

    evaluation_name = experiment_name + '_pdbbind'
    # Use the training_smiles argument: this previously read a notebook-level
    # `train_smiles` global and only worked through hidden kernel state.
    evaluator = RMSDPredictorEvaluator(model=model,
                                       evaluation_name=evaluation_name,
                                       training_smiles=training_smiles)
    # NOTE(review): overwrite=True presumably recomputes results even when a
    # previous evaluation exists - confirm in RMSDPredictorEvaluator.
    evaluator.evaluate(test_dataset, overwrite=True)
    for task in tasks:
        evaluator.evaluation_report(task=task)

#     evaluation_name = experiment_name + '_platinum'
#     evaluator = RMSDPredictorEvaluator(model=model, 
#                                        evaluation_name=evaluation_name,
#                                        training_smiles=train_smiles)
#     evaluator.evaluate(platinum_dataset, overwrite=True)
#     for task in tasks :
#         evaluator.evaluation_report(task=task)

In [6]:
# Data root at notebook scope. In the visible cells it is only referenced by
# the disabled platinum code below; the helper functions use their own
# `data_dir` default.
data_dir = 'data/'
# platinum_chunks = [filename for filename in os.listdir(os.path.join(data_dir, 'processed')) if filename.startswith('platinum')]
# platinum_n_chunks = len(platinum_chunks)

In [7]:
# platinum_datasets = []
# for chunk_number in tqdm(range(platinum_n_chunks)) :
#     dataset = ConfEnsembleDataset(dataset='platinum', loaded_chunk=chunk_number)
#     platinum_datasets.append(dataset)
# platinum_dataset = ConcatDataset(platinum_datasets)

In [8]:
# Available split types and the split iterations (0 to 4) used by the
# evaluation loop.
splits = ['random', 'scaffold', 'protein']
iterations = range(5)

In [None]:
%%time

# Evaluate the checkpoint of every split iteration on its test dataset.
# Only the 'random' split is currently enabled; the loop over all `splits`
# is kept below as a disabled alternative.
# for split in splits :
for split in ['random'] :

    # for iteration in [0] :
    for iteration in iterations :

        # Builds the test dataset (or loads it from its cache file) and reads the training SMILES.
        test_dataset, train_smiles = get_test_dataset(split, iteration)
        
        # Must match the directory name under lightning_logs/ used by evaluate_model.
        experiment_name = f'{split}_split_{iteration}'
        # evaluate_model(experiment_name, test_dataset, platinum_dataset, train_smiles)
        evaluate_model(experiment_name, test_dataset, train_smiles)
        
        # Disabled: same evaluation for the '_molsize' experiment.
#         experiment_name = f'{split}_split_{iteration}_molsize'
#         evaluate_model(experiment_name, test_dataset, train_smiles)

Computing training set fingerprints
Grouping data by smiles


  0%|                                                  | 0/1258 [00:00<?, ?it/s]

Starting evaluation


100%|███████████████████████████████████████| 1258/1258 [01:08<00:00, 18.37it/s]
100%|█████████████████████████████████████████████| 3/3 [01:24<00:00, 28.05s/it]


Computing training set fingerprints
Grouping data by smiles


  0%|                                          | 2/1258 [00:00<01:13, 17.14it/s]

Starting evaluation


100%|███████████████████████████████████████| 1258/1258 [01:04<00:00, 19.40it/s]
100%|█████████████████████████████████████████████| 3/3 [01:23<00:00, 27.77s/it]


Computing training set fingerprints
Grouping data by smiles


  0%|                                          | 2/1258 [00:00<01:16, 16.39it/s]

Starting evaluation


100%|███████████████████████████████████████| 1258/1258 [01:06<00:00, 18.81it/s]
100%|█████████████████████████████████████████████| 3/3 [01:25<00:00, 28.54s/it]


Computing training set fingerprints
Grouping data by smiles


  0%|                                          | 2/1259 [00:00<01:04, 19.48it/s]

Starting evaluation


100%|███████████████████████████████████████| 1259/1259 [01:05<00:00, 19.16it/s]
100%|█████████████████████████████████████████████| 3/3 [01:26<00:00, 28.73s/it]


Computing training set fingerprints
Grouping data by smiles


  0%|                                          | 2/1257 [00:00<01:26, 14.56it/s]

Starting evaluation


100%|███████████████████████████████████████| 1257/1257 [01:08<00:00, 18.34it/s]


In [None]:
# Size of the test dataset left over from the last iteration of the evaluation loop.
len(test_dataset)

In [None]:
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

In [None]:
def _collect_bioactive_like_ranks(mol_results, rankers, task, active_ratio_threshold):
    """Return ``(rank, ranker_label)`` tuples for molecules passing the filters."""
    clean_ranker_names = {'model' : 'BioSchNet',
                          'model_random' : 'BioSchNet (random split)',
                          'model_scaffold' : 'BioSchNet (scaffold split)',
                          'model_protein' : 'BioSchNet (protein split)',
                          'energy' : 'UFF energy',
                          'random' : 'Random',
                          'ccdc' : 'CCDC'}
    rank_name = 'normalized_first_bioactive_like_rank'

    selected_results = []
    for mol_result in mol_results.values():
        n_generated = mol_result['n_generated']
        has_generated = n_generated > 1
        is_easy = n_generated < 100
        is_hard = n_generated == 100
        task_filter = (task == 'all') or (task == 'hard' and is_hard) or (task == 'easy' and is_easy)
        if not (task_filter and has_generated):
            continue
        if rank_name not in mol_result:
            continue
        # n_generated > 1 is guaranteed here, so the division is safe.
        active_ratio = mol_result['n_actives'] / n_generated
        if active_ratio < active_ratio_threshold:
            selected_results.append(mol_result)

    # Ranker-major order keeps the legend in the order of `rankers`.
    records = []
    for ranker in rankers:
        label = clean_ranker_names.get(ranker, ranker)
        for mol_result in selected_results:
            records.append((mol_result[rank_name][ranker], label))
    return records


def plot_ecdf_bioactive_like(active_ratio_threshold=0.1,
                             mol_results_path='results/random_split_0_pdbbind/mol_results.p',
                             task='all',
                             rankers=('model', 'energy', 'ccdc', 'random')):
    """Plot the ECDF of the normalized first bioactive-like rank per ranker.

    Only molecules with more than one generated conformation, matching `task`
    and whose ratio ``n_actives / n_generated`` is strictly below
    `active_ratio_threshold` are included.

    Parameters
    ----------
    active_ratio_threshold : float
        Exclusive upper bound on the ratio of actives of included molecules.
    mol_results_path : str
        Pickle file holding the per-molecule results dictionary.
    task : str
        'all', 'easy' (fewer than 100 generated) or 'hard' (exactly 100 generated).
    rankers : iterable of str
        Keys of the rankers to compare.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. When no molecule passes the
        filters a message is printed and nothing is plotted (this case used
        to fail with ``KeyError: 'Ranker'`` on the empty frame).
    """
    # NOTE: pickle.load can execute arbitrary code - only open trusted result files.
    with open(mol_results_path, 'rb') as f:
        mol_results = pickle.load(f)

    records = _collect_bioactive_like_ranks(mol_results, rankers, task,
                                            active_ratio_threshold)
    if not records:
        print(f'No molecule with ratio of actives < {active_ratio_threshold}; nothing to plot')
        return

    xlabel = 'Normalized rank of bioactive-like'
    # Build the frame once instead of concatenating in a loop (quadratic and
    # leaves duplicated index labels).
    master_df = pd.DataFrame(records, columns=[xlabel, 'Ranker'])

    with sns.plotting_context('talk', font_scale=0.7):
        fig, ax = plt.subplots()
        sns.ecdfplot(data=master_df, x=xlabel, hue='Ranker', ax=ax)
        # Set the title before tight_layout so the layout accounts for it.
        ax.set_title(f'Ratio of actives < {active_ratio_threshold}')
        fig.tight_layout()

    plt.show()
    # Release the figure: this function is called in a loop over many thresholds.
    plt.close(fig)

In [None]:
# Sweep the active-ratio cutoff: 0.02 first, then 0.05 to 1.00 in steps of 0.05.
thresholds = [0.02] + [percent / 100 for percent in range(5, 105, 5)]
for threshold in thresholds:
    plot_ecdf_bioactive_like(threshold)