In [1]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import importlib
import rmsd_predictor_evaluator
from tqdm import tqdm
from litschnet import LitSchNet
from rmsd_predictor_evaluator import RMSDPredictorEvaluator
from conf_ensemble_dataset_in_memory import ConfEnsembleDataset
from torch_geometric.loader import DataLoader
from torch.utils.data import ConcatDataset

In [2]:
# Configuration for the random ligand split.
data_dir = 'data/'
split = 'random'
dataset = 'pdbbind'
iteration = 0

# Directory holding the train/test SMILES files of this split type.
split_dir = os.path.join(data_dir, f'ligand_{split}_splits')

# Every line of a split file becomes one entry, with surrounding whitespace removed.
train_smiles_path = os.path.join(split_dir, f'train_smiles_{split}_split_{iteration}.txt')
with open(train_smiles_path, 'r') as f:
    train_smiles = [line.strip() for line in f]

test_smiles_path = os.path.join(split_dir, f'test_smiles_{split}_split_{iteration}.txt')
with open(test_smiles_path, 'r') as f:
    test_smiles = [line.strip() for line in f]

In [3]:
# Number of entries read from the test split file (one per line of that file).
len(test_smiles)

1530

In [4]:
# Collect the entries of the processed directory whose name starts with
# 'pdbbind'; their count is used below as the number of chunks to load.
processed_dir = os.path.join(data_dir, 'processed')
pdbbind_chunks = [name for name in os.listdir(processed_dir) if name.startswith('pdbbind')]
pdbbind_n_chunks = len(pdbbind_chunks)

In [5]:
# Task names. NOTE(review): no cell visible in this notebook reads `tasks` —
# confirm whether it is still needed.
tasks = ['all', 'easy', 'hard']

In [6]:
# Build one ConfEnsembleDataset per processed chunk, passing the test SMILES as
# `smiles_list`, then expose all chunks as a single concatenated dataset.
# A comprehension is used instead of a `for` loop so that no loop variable named
# `dataset` is left behind to overwrite the `dataset = 'pdbbind'` string set in
# the configuration cell.
test_datasets = [
    ConfEnsembleDataset(loaded_chunk=chunk_number, smiles_list=test_smiles)
    for chunk_number in tqdm(range(pdbbind_n_chunks))
]

test_dataset = ConcatDataset(test_datasets)

100%|█████████████████████████████████████████████| 4/4 [01:21<00:00, 20.26s/it]


In [11]:
# Re-execute the evaluator module so that edits made to its source file since it
# was imported are picked up, then rebind the class name to the reloaded class.
importlib.reload(rmsd_predictor_evaluator)
RMSDPredictorEvaluator = rmsd_predictor_evaluator.RMSDPredictorEvaluator

In [12]:
# Load the checkpoint trained on the current split and evaluate it on the
# concatenated PDBbind test dataset.
experiment_name = f'{split}_split_{iteration}_v2'
checkpoint_dir = os.path.join('lightning_logs', experiment_name, 'checkpoints')

# os.listdir() returns entries in arbitrary order, so sort the names to make the
# selected checkpoint reproducible when the directory holds more than one file.
# NOTE(review): this picks the lexicographically first name, not necessarily the
# best or latest checkpoint.
checkpoint_names = sorted(os.listdir(checkpoint_dir)) if os.path.isdir(checkpoint_dir) else []

if checkpoint_names:
    checkpoint_name = checkpoint_names[0]
    checkpoint_path = os.path.join(checkpoint_dir, checkpoint_name)
    litschnet = LitSchNet.load_from_checkpoint(checkpoint_path=checkpoint_path)

    evaluation_name = experiment_name + '_pdbbind'
    evaluator = RMSDPredictorEvaluator(model=litschnet,
                                       evaluation_name=evaluation_name,
                                       training_smiles=train_smiles)
    evaluator.evaluate(test_dataset)
    evaluator.evaluation_report()
else:
    # Report the skip explicitly: otherwise later cells would silently read
    # results produced by an earlier run.
    print(f'No checkpoint found in {checkpoint_dir}; evaluation skipped')

Computing training set fingerprints
Evaluation already done for given experiment random_split_0_v2_pdbbind
Loading existing results


In [13]:
# Configuration for the scaffold ligand split.
data_dir = 'data/'
split = 'scaffold'
dataset = 'pdbbind'
iteration = 0

# Directory holding the train/test SMILES files of this split type.
split_dir = os.path.join(data_dir, f'ligand_{split}_splits')

# Every line of a split file becomes one entry, with surrounding whitespace removed.
train_smiles_path = os.path.join(split_dir, f'train_smiles_{split}_split_{iteration}.txt')
with open(train_smiles_path, 'r') as f:
    train_smiles = [line.strip() for line in f]

test_smiles_path = os.path.join(split_dir, f'test_smiles_{split}_split_{iteration}.txt')
with open(test_smiles_path, 'r') as f:
    test_smiles = [line.strip() for line in f]

In [14]:
# Number of entries read from the scaffold test split file.
len(test_smiles)

1233

In [15]:
# Collect the entries of the processed directory whose name starts with
# 'pdbbind'; their count is used below as the number of chunks to load.
processed_dir = os.path.join(data_dir, 'processed')
pdbbind_chunks = [name for name in os.listdir(processed_dir) if name.startswith('pdbbind')]
pdbbind_n_chunks = len(pdbbind_chunks)

In [16]:
# Task names, re-declared with the same values as earlier in the notebook.
# NOTE(review): no cell visible in this notebook reads `tasks`.
tasks = ['all', 'easy', 'hard']

In [17]:
# Build one ConfEnsembleDataset per processed chunk, passing the scaffold-split
# test SMILES as `smiles_list`, then concatenate the chunks into one dataset.
# A comprehension avoids leaving a loop variable named `dataset` behind, which
# would overwrite the `dataset = 'pdbbind'` configuration string.
test_datasets = [
    ConfEnsembleDataset(loaded_chunk=chunk_number, smiles_list=test_smiles)
    for chunk_number in tqdm(range(pdbbind_n_chunks))
]

test_dataset = ConcatDataset(test_datasets)

100%|█████████████████████████████████████████████| 4/4 [01:22<00:00, 20.67s/it]


In [18]:
# Re-execute the evaluator module to pick up source edits and rebind the class.
# `importlib` and `rmsd_predictor_evaluator` are imported in the notebook's first
# cell, so they are not imported again here.
importlib.reload(rmsd_predictor_evaluator)
RMSDPredictorEvaluator = rmsd_predictor_evaluator.RMSDPredictorEvaluator

In [19]:
# Load the checkpoint trained on the current split and evaluate it on the
# concatenated PDBbind test dataset.
experiment_name = f'{split}_split_{iteration}_v2'
checkpoint_dir = os.path.join('lightning_logs', experiment_name, 'checkpoints')

# os.listdir() returns entries in arbitrary order, so sort the names to make the
# selected checkpoint reproducible when the directory holds more than one file.
# NOTE(review): this picks the lexicographically first name, not necessarily the
# best or latest checkpoint.
checkpoint_names = sorted(os.listdir(checkpoint_dir)) if os.path.isdir(checkpoint_dir) else []

if checkpoint_names:
    checkpoint_name = checkpoint_names[0]
    checkpoint_path = os.path.join(checkpoint_dir, checkpoint_name)
    litschnet = LitSchNet.load_from_checkpoint(checkpoint_path=checkpoint_path)

    evaluation_name = experiment_name + '_pdbbind'
    evaluator = RMSDPredictorEvaluator(model=litschnet,
                                       evaluation_name=evaluation_name,
                                       training_smiles=train_smiles)
    # NOTE(review): overwrite=True is passed for this split only; presumably it
    # forces a fresh evaluation instead of loading stored results — confirm
    # that this is still intended.
    evaluator.evaluate(test_dataset, overwrite=True)
    evaluator.evaluation_report()
else:
    # Report the skip explicitly: otherwise later cells would silently read
    # results produced by an earlier run.
    print(f'No checkpoint found in {checkpoint_dir}; evaluation skipped')

Computing training set fingerprints
Grouping data by smiles


  0%|                                                  | 0/1091 [00:00<?, ?it/s]

Starting evaluation


100%|███████████████████████████████████████| 1091/1091 [01:00<00:00, 17.97it/s]


In [20]:
# Configuration for the protein-similarity split, which is defined by PDB ids
# rather than by SMILES.
data_dir = 'data/'
split = 'protein_similarity'
dataset = 'pdbbind'
iteration = 0

# Directory holding the train/test PDB id files of this split type.
split_dir = os.path.join(data_dir, f'{split}_splits')

# Every line of a split file becomes one entry, with surrounding whitespace removed.
train_pdbs_path = os.path.join(split_dir, f'train_pdb_{split}_split_{iteration}.txt')
with open(train_pdbs_path, 'r') as f:
    train_pdbs = [line.strip() for line in f]

test_pdbs_path = os.path.join(split_dir, f'test_pdb_{split}_split_{iteration}.txt')
with open(test_pdbs_path, 'r') as f:
    test_pdbs = [line.strip() for line in f]

In [21]:
import pandas as pd

# Translate the PDB ids of each partition into SMILES using the `id` and
# `smiles` columns of the lookup table. Both results are NumPy arrays.
smiles_df = pd.read_csv('data/smiles_df.csv')
in_train = smiles_df['id'].isin(train_pdbs)
in_test = smiles_df['id'].isin(test_pdbs)
train_smiles = smiles_df.loc[in_train, 'smiles'].values
test_smiles = smiles_df.loc[in_test, 'smiles'].values

In [22]:
# Number of rows of the SMILES table whose `id` is among the test PDB ids.
len(test_smiles)

1733

In [23]:
# Collect the entries of the processed directory whose name starts with
# 'pdbbind'; their count is used below as the number of chunks to load.
processed_dir = os.path.join(data_dir, 'processed')
pdbbind_chunks = [name for name in os.listdir(processed_dir) if name.startswith('pdbbind')]
pdbbind_n_chunks = len(pdbbind_chunks)

In [24]:
# Task names, re-declared with the same values as earlier in the notebook.
# NOTE(review): no cell visible in this notebook reads `tasks`.
tasks = ['all', 'easy', 'hard']

In [25]:
# Build one ConfEnsembleDataset per processed chunk, passing the test PDB ids
# as `pdb_ids_list` (this split is defined by PDB id, not by SMILES), then
# concatenate the chunks into one dataset.
# A comprehension avoids leaving a loop variable named `dataset` behind, which
# would overwrite the `dataset = 'pdbbind'` configuration string.
test_datasets = [
    ConfEnsembleDataset(loaded_chunk=chunk_number, pdb_ids_list=test_pdbs)
    for chunk_number in tqdm(range(pdbbind_n_chunks))
]

test_dataset = ConcatDataset(test_datasets)

100%|█████████████████████████████████████████████| 4/4 [01:49<00:00, 27.42s/it]


In [26]:
# Re-execute the evaluator module to pick up source edits and rebind the class.
# `importlib` and `rmsd_predictor_evaluator` are imported in the notebook's first
# cell, so they are not imported again here.
importlib.reload(rmsd_predictor_evaluator)
RMSDPredictorEvaluator = rmsd_predictor_evaluator.RMSDPredictorEvaluator

In [27]:
# The split files are named with 'protein_similarity', but the experiment name
# is built from the short form 'protein', so `split` is rebound before use.
split = 'protein'
experiment_name = f'{split}_split_{iteration}_v2'
checkpoint_dir = os.path.join('lightning_logs', experiment_name, 'checkpoints')

# os.listdir() returns entries in arbitrary order, so sort the names to make the
# selected checkpoint reproducible when the directory holds more than one file.
# NOTE(review): this picks the lexicographically first name, not necessarily the
# best or latest checkpoint.
checkpoint_names = sorted(os.listdir(checkpoint_dir)) if os.path.isdir(checkpoint_dir) else []

if checkpoint_names:
    checkpoint_name = checkpoint_names[0]
    checkpoint_path = os.path.join(checkpoint_dir, checkpoint_name)
    litschnet = LitSchNet.load_from_checkpoint(checkpoint_path=checkpoint_path)

    evaluation_name = experiment_name + '_pdbbind'
    evaluator = RMSDPredictorEvaluator(model=litschnet,
                                       evaluation_name=evaluation_name,
                                       training_smiles=train_smiles)
    evaluator.evaluate(test_dataset)
    evaluator.evaluation_report()
else:
    # Report the skip explicitly: otherwise later cells would silently read
    # results produced by an earlier run.
    print(f'No checkpoint found in {checkpoint_dir}; evaluation skipped')

Computing training set fingerprints
Evaluation already done for given experiment protein_split_0_v2_pdbbind
Loading existing results


# Produce EF figures across splits

In [67]:
# For each evaluation dataset, gather the enrichment-factor tables of all
# splits into one frame and save a line plot with one curve per ranker.
splits = ['random', 'scaffold', 'protein']
datasets = ['pdbbind', 'platinum']
ef_column = 'Enrichment factor of the top 10% closest to bioactive'

for dataset in datasets:
    # Collect the per-split frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every iteration.
    ef_frames = []
    for split in splits:
        # NOTE(review): other cells of this notebook use result directories
        # named '{split}_split_0_v2_pdbbind'; this path has no '_v2' — confirm
        # that it points at the intended results.
        evaluation_name = f'{split}_split_0_{dataset}'
        ef_df_path = os.path.join('results/', evaluation_name, 'ef_df.csv')
        ef_df = pd.read_csv(ef_df_path, index_col=0)
        # Tag the model's rows with the split it was trained on so the three
        # models appear as separate curves.
        ef_df['ranker'] = ef_df['ranker'].replace({'model': f'model_{split}'})
        ef_frames.append(ef_df)
    df = pd.concat(ef_frames, ignore_index=True)
    df = df.rename({'Enrichment factor': ef_column}, axis=1)

    sns.lineplot(data=df, x='Fraction', y=ef_column, hue='ranker')
    plt.title(f'Generated conformation ranking evaluation ({dataset})')
    fig_path = os.path.join('figures/', f'efs_{dataset}.png')
    plt.savefig(fig_path, dpi=300)
    # Close the figure so the next dataset starts from an empty canvas.
    plt.close()

  mask |= (ar1 == a)


In [63]:
# Gather the enrichment-factor tables of all splits for the platinum dataset.
splits = ['random', 'scaffold', 'protein']

# Collect the per-split frames and concatenate once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
ef_frames = []
for split in splits:
    evaluation_name = f'{split}_split_0_platinum'
    ef_df_path = os.path.join('results/', evaluation_name, 'ef_df.csv')
    ef_df = pd.read_csv(ef_df_path, index_col=0)
    # Tag the model's rows with the split it was trained on.
    ef_df['ranker'] = ef_df['ranker'].replace({'model': f'model_{split}'})
    ef_frames.append(ef_df)

df = pd.concat(ef_frames, ignore_index=True)
df = df.rename({'Enrichment factor': 'Enrichment factor of the top 10% closest to bioactive'}, axis=1)

In [64]:
# Plot the enrichment-factor curves currently held in `df`, one line per
# ranker, and write the figure to disk.
ef_column = 'Enrichment factor of the top 10% closest to bioactive'
sns.lineplot(data=df, x='Fraction', y=ef_column, hue='ranker')
plt.title('Generated conformation ranking evaluation')

fig_path = os.path.join('figures/', 'efs.png')
plt.savefig(fig_path, dpi=300)
plt.close()

# Produce rigid-ligand docking figures across splits

In [43]:
# Gather the rigid-ligand docking recall tables ("successful only" variant) of
# all splits into one frame.
splits = ['random', 'scaffold', 'protein']

# Collect the per-split frames and concatenate once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
recall_frames = []
for split in splits:
    evaluation_name = f'{split}_split_0_v2_pdbbind'
    recall_df_path = os.path.join('results/',
                                  evaluation_name,
                                  'rigid_ligand_docking_recall_successful_only.csv')
    recall_df = pd.read_csv(recall_df_path, index_col=0)
    # Tag the model's rows with the split it was trained on.
    recall_df['ranker'] = recall_df['ranker'].replace({'model': f'model_{split}'})
    recall_frames.append(recall_df)

df = pd.concat(recall_frames, ignore_index=True)

In [44]:
# Preview the first rows of the combined recall frame.
df.head()

Unnamed: 0,Conformation rank,Recall,metric,ranker
0,0,0.075542,score,model_random
1,1,0.103964,score,model_random
2,2,0.142109,score,model_random
3,3,0.167539,score,model_random
4,4,0.190726,score,model_random


In [45]:
# Mean and standard deviation of Recall per ranker, restricted to rows with
# Conformation rank 19 and metric 'ligand_rmsd'. The model_* rankers show a NaN
# std in the output because each was renamed per split when the frame was built,
# so each group contains a single row.
df[(df['Conformation rank'] == 19) & (df['metric'] == 'ligand_rmsd')].groupby('ranker').agg({'Recall' : ['mean', 'std']})

Unnamed: 0_level_0,Recall,Recall
Unnamed: 0_level_1,mean,std
ranker,Unnamed: 1_level_2,Unnamed: 2_level_2
CCDC,0.262278,0.01634
energy,0.316484,0.007794
model_protein,0.332944,
model_random,0.577412,
model_scaffold,0.402421,
random,0.206594,0.022923
score,0.77397,0.004849


In [46]:
# List the distinct metric names available in the combined recall frame.
df['metric'].unique()

array(['score', 'ligand_rmsd', 'overlay_rmsd', 'docking_power',
       'correct_conf'], dtype=object)

In [47]:
# Plot Recall against Conformation rank for one metric, one line per ranker,
# and save the figure under a name derived from the metric.
metric = 'ligand_rmsd'
metric_df = df.loc[df['metric'] == metric]
sns.lineplot(data=metric_df, x='Conformation rank', y='Recall', hue='ranker')

# Metrics with a dedicated title; any other metric gets the generic one.
titles_by_metric = {
    'docking_power': 'Docking power',
    'ligand_rmsd': 'Retrieval of closest conformation to bioactive',
}
title = titles_by_metric.get(metric, f'Retrieval of top {metric}')

plt.title(title)
fig_path = os.path.join('figures/', f'{metric}_rigid_docking.png')
plt.savefig(fig_path, dpi=300)
plt.close()

In [48]:
# Same plot as the full-range ligand_rmsd figure, but with the x axis limited
# to 0-20 and saved under a '_truncated' file name.
metric = 'ligand_rmsd'
metric_df = df.loc[df['metric'] == metric]
sns.lineplot(data=metric_df, x='Conformation rank', y='Recall', hue='ranker')

# Metrics with a dedicated title; any other metric gets the generic one.
titles_by_metric = {
    'docking_power': 'Docking power',
    'ligand_rmsd': 'Retrieval of closest conformation to bioactive',
}
title = titles_by_metric.get(metric, f'Retrieval of top {metric}')

plt.title(title)
plt.xlim(0, 20)
fig_path = os.path.join('figures/', f'{metric}_rigid_docking_truncated.png')
plt.savefig(fig_path, dpi=300)
plt.close()

In [60]:
# Hard-coded reference value per split, keyed by split name. A later cell uses
# these as the Recall threshold for each split's docking_power rows.
# NOTE(review): presumably measured in a separate flexible-docking experiment —
# record where the numbers come from.
flexible_docking_powers = {
    'random' : 0.57,
    'scaffold' : 0.48,
    'protein' : 0.53
}

# Second set of hard-coded per-split values.
# NOTE(review): not read by any cell visible in this notebook — confirm use.
generation_powers = {
    'random' : 0.76,
    'scaffold' : 0.70,
    'protein' : 0.79
}

In [62]:
# Gather the rigid-ligand docking recall tables ("all" variant) of all splits
# into one frame, printing for each split the first docking_power row per
# ranker whose Recall reaches that split's flexible-docking reference value.
splits = ['random', 'scaffold', 'protein']

# Collect the per-split frames and concatenate once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
recall_frames = []
for split in splits:
    evaluation_name = f'{split}_split_0_v2_pdbbind'
    recall_df_path = os.path.join('results/',
                                  evaluation_name,
                                  'rigid_ligand_docking_recall_all.csv')
    recall_df = pd.read_csv(recall_df_path, index_col=0)
    # Tag the model's rows with the split it was trained on.
    recall_df['ranker'] = recall_df['ranker'].replace({'model': f'model_{split}'})

    flexible_docking_power = flexible_docking_powers[split]
    reaches_reference = ((recall_df['Recall'] >= flexible_docking_power)
                         & (recall_df['metric'] == 'docking_power'))
    # drop_duplicates keeps the first matching row of each ranker, in the
    # order the rows appear in the CSV.
    print(recall_df[reaches_reference].drop_duplicates(subset='ranker'))

    recall_frames.append(recall_df)

df = pd.concat(recall_frames, ignore_index=True)

      Conformation rank    Recall         metric        ranker
1505                  5  0.587933  docking_power  model_random
1617                 17  0.572529  docking_power        energy
1701                  1  0.591142  docking_power         score
1812                 12  0.575096  docking_power        random
1916                 16  0.573813  docking_power          CCDC
      Conformation rank    Recall         metric          ranker
1504                  4  0.500000  docking_power  model_scaffold
1607                  7  0.486284  docking_power          energy
1700                  0  0.567332  docking_power           score
1806                  6  0.495012  docking_power          random
1906                  6  0.493766  docking_power            CCDC
      Conformation rank    Recall         metric         ranker
1507                  7  0.534722  docking_power  model_protein
1608                  8  0.542659  docking_power         energy
1700                  0  0.589286  docki

In [50]:
# Mean and standard deviation of Recall per ranker, restricted to rows with
# Conformation rank 99 and metric 'docking_power'. The model_* rankers have a
# single row each (one per split), hence the NaN std in the output.
df[(df['Conformation rank'] == 99) & (df['metric'] == 'docking_power')].groupby('ranker').agg({'Recall' : ['mean', 'std']})

Unnamed: 0_level_0,Recall,Recall
Unnamed: 0_level_1,mean,std
ranker,Unnamed: 1_level_2,Unnamed: 2_level_2
CCDC,0.815619,0.014264
energy,0.815619,0.014264
model_protein,0.825397,
model_random,0.822208,
model_scaffold,0.799252,
random,0.815619,0.014264
score,0.815619,0.014264


In [58]:
# First docking_power row per ranker (in frame order) whose Recall is at least
# 0.60. NOTE(review): 0.60 matches none of the flexible_docking_powers values —
# document why this threshold was chosen.
df[(df['Recall'] >= 0.60) & (df['metric'] == 'docking_power')].drop_duplicates(subset='ranker')

Unnamed: 0,Conformation rank,Recall,metric,ranker
1506,6,0.603338,docking_power,model_random
1621,21,0.605905,docking_power,energy
1702,2,0.618742,docking_power,score
1815,15,0.605263,docking_power,random
1919,19,0.602696,docking_power,CCDC
4009,9,0.604738,docking_power,model_scaffold
6511,11,0.608135,docking_power,model_protein


In [51]:
# First docking_power row per ranker (in frame order) whose Recall is at least
# 0.57, the value stored in flexible_docking_powers['random'].
df[(df['Recall'] >= 0.57) & (df['metric'] == 'docking_power')].drop_duplicates(subset='ranker')# .groupby('ranker').agg({'Conformation rank' : ['mean', 'std']})

Unnamed: 0,Conformation rank,Recall,metric,ranker
1505,5,0.587933,docking_power,model_random
1617,17,0.572529,docking_power,energy
1701,1,0.591142,docking_power,score
1812,12,0.575096,docking_power,random
1916,16,0.573813,docking_power,CCDC
4008,8,0.586035,docking_power,model_scaffold
6509,9,0.573413,docking_power,model_protein


In [56]:
# First docking_power row per ranker (in frame order) whose Recall is at least
# 0.48, the value stored in flexible_docking_powers['scaffold'].
df[(df['Recall'] >= 0.48) & (df['metric'] == 'docking_power')].drop_duplicates(subset='ranker')# .groupby('ranker').agg({'Conformation rank' : ['mean', 'std']})

Unnamed: 0,Conformation rank,Recall,metric,ranker
1502,2,0.494865,docking_power,model_random
1609,9,0.486521,docking_power,energy
1700,0,0.537227,docking_power,score
1807,7,0.489089,docking_power,random
1908,8,0.482028,docking_power,CCDC
4004,4,0.5,docking_power,model_scaffold
6505,5,0.502976,docking_power,model_protein


In [57]:
# First docking_power row per ranker (in frame order) whose Recall is at least
# 0.53, the value stored in flexible_docking_powers['protein'].
df[(df['Recall'] >= 0.53) & (df['metric'] == 'docking_power')].drop_duplicates(subset='ranker')# .groupby('ranker').agg({'Conformation rank' : ['mean', 'std']})

Unnamed: 0,Conformation rank,Recall,metric,ranker
1503,3,0.535302,docking_power,model_random
1613,13,0.53466,docking_power,energy
1700,0,0.537227,docking_power,score
1809,9,0.533376,docking_power,random
1912,12,0.530809,docking_power,CCDC
4006,6,0.549875,docking_power,model_scaffold
6507,7,0.534722,docking_power,model_protein


In [54]:
# Plot Recall against Conformation rank for the docking_power metric, one line
# per ranker, with the y axis fixed to 0-1, and save the figure.
metric = 'docking_power'
metric_df = df.loc[df['metric'] == metric]
sns.lineplot(data=metric_df, x='Conformation rank', y='Recall', hue='ranker')

# Metrics with a dedicated title; any other metric gets the generic one.
titles_by_metric = {
    'docking_power': 'Docking power',
    'ligand_rmsd': 'Retrieval of closest conformation to bioactive',
}
title = titles_by_metric.get(metric, f'Retrieval of top {metric}')

plt.title(title)
plt.ylim(0,1)
fig_path = os.path.join('figures/', f'{metric}_rigid_docking.png')
plt.savefig(fig_path, dpi=300)
plt.close()

In [55]:
# Plot Recall against Conformation rank for the docking_power metric, one line
# per ranker, with the x axis limited to 0-20, and save the figure.
metric = 'docking_power'
metric_df = df[df['metric'] == metric]
sns.lineplot(data=metric_df, x='Conformation rank', y='Recall', hue='ranker')

if metric == 'docking_power' :
    title = 'Docking power'
elif metric == 'ligand_rmsd' :
    title = 'Retrieval of closest conformation to bioactive'
else :
    title = f'Retrieval of top {metric}'

plt.title(title)
plt.xlim(0, 20)
# File name suffix corrected from 'trucated' to 'truncated' so it matches the
# truncated ligand_rmsd figure written earlier in the notebook.
fig_path = os.path.join('figures/',
                        f'{metric}_rigid_docking_truncated.png')
plt.savefig(fig_path, dpi=300)
plt.close()