In [1]:
import json, os, tqdm
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from collections import defaultdict

## Table 1 - Characteristic timings

In [2]:
# Per-stage timing estimates, keyed by stage name. Every run that measures a
# stage contributes one entry ("vote") to that stage's list.
timing_vote_log = defaultdict(list)


def log_timing_vote(key, time):
    """Append one timing estimate `time` to the list of votes stored under `key`."""
    votes_for_stage = timing_vote_log[key]
    votes_for_stage.append(time)

In [3]:
# T-SCORING RUNS: each timings.csv contributes one vote per stage.
for path in (
    '../paper_results/scoring/ESF/TS_pdbbind/timings.csv',
    '../paper_results/scoring/ESF/TS_pdbbind_esmfold/timings.csv',
    '../paper_results/scoring/ESF_N/TS_pdbbind/timings.csv',
    '../paper_results/scoring/ESF_N/TS_pdbbind_esmfold/timings.csv',
):
    # The first 10 rows are discarded (presumably warm-up iterations — confirm).
    timings = pd.read_csv(path).iloc[10:]

    # Protein-side stages are logged as plain column means.
    protein_model_mean = timings['protein_model'].mean()
    log_timing_vote('protein model', protein_model_mean)
    protein_fft_mean = timings['protein_fft'].mean()
    log_timing_vote('protein fft', protein_fft_mean)

    # Ligand-side stages are normalised to a single unit of work.
    ligand_model_mean = timings['ligand_model'].mean()
    log_timing_vote('ligand model', ligand_model_mean / 32)  # number of conformers
    ligand_fft_total = timings['ligand_coeffs'] + timings['ligand_fft']
    log_timing_vote('ligand fft', ligand_fft_total.mean() / 32**2)  # number of conformers x rotations

    scoring_mean = timings['scoring'].mean()
    log_timing_vote('T-scoring', scoring_mean / 32**3)  # number of poses

In [4]:
# R-SCORING RUNS: each timings.csv contributes one vote per stage.
for path in (
    '../paper_results/scoring/ESF/RS_pdbbind/timings.csv',
    '../paper_results/scoring/ESF/RS_pdbbind_esmfold/timings.csv',
    '../paper_results/scoring/ESF_N/RS_pdbbind/timings.csv',
    '../paper_results/scoring/ESF_N/RS_pdbbind_esmfold/timings.csv',
):
    # The first 10 rows are discarded (presumably warm-up iterations — confirm).
    timings = pd.read_csv(path).iloc[10:]

    protein_model_mean = timings['protein_model'].mean()
    log_timing_vote('protein model', protein_model_mean)
    protein_coeffs_mean = timings['protein_coeffs'].mean()
    log_timing_vote('protein coeffs', protein_coeffs_mean / 32)  # number of translations

    ligand_model_mean = timings['ligand_model'].mean()
    log_timing_vote('ligand model', ligand_model_mean / 32)  # number of conformers
    ligand_coeffs_mean = timings['ligand_coeffs'].mean()
    log_timing_vote('ligand coeffs', ligand_coeffs_mean / 32)  # number of conformers

    scoring_mean = timings['scoring'].mean()
    log_timing_vote('R-scoring', scoring_mean / 32**3)  # number of poses

In [5]:
# T-OPT RUNS: each docking JSON contributes one vote per stage.
for path in [
    '../paper_results/docking/ESF/pdbbind/T_grid2_scaling1.json',
    '../paper_results/docking/ESF/pdbbind_esmfold/T_grid2_scaling1.json',
    '../paper_results/docking/ESF_N/pdbbind/T_grid2_scaling1.json',
    '../paper_results/docking/ESF_N/pdbbind_esmfold/T_grid2_scaling1.json'
]:
    # Use a context manager so the JSON file handle is closed after parsing.
    with open(path) as f:
        # 'timings' is transposed so that each row is one entry of the JSON mapping;
        # the first 10 rows are discarded (presumably warm-up iterations — confirm).
        timings = pd.DataFrame(json.load(f)['timings']).T.iloc[10:]
    log_timing_vote('protein model', timings['protein_model'].mean())
    log_timing_vote('protein fft', timings['protein_fft'].mean())
    # log_timing_vote('ligand model', timings['ligand_model'].mean()) # not reliable since ligands aren't batched
    log_timing_vote('ligand fft', timings['ligand_fft'].mean() / 4608) # number of rotations
    log_timing_vote('T-FFT', timings['cross_correlate'].mean() / 4608) # number of rotations

In [6]:
# R-OPT RUNS: each docking JSON contributes one vote per stage.
for path in [
    '../paper_results/docking/ESF/pdbbind/R_box9_lmax25.json',
    '../paper_results/docking/ESF/pdbbind_esmfold/R_box9_lmax25.json',
    '../paper_results/docking/ESF_N/pdbbind/R_box9_lmax25.json',
    '../paper_results/docking/ESF_N/pdbbind_esmfold/R_box9_lmax25.json',
]:
    # Use a context manager so the JSON file handle is closed after parsing.
    with open(path) as f:
        # 'timings' is transposed so that each row is one entry of the JSON mapping;
        # the first 10 rows are discarded (presumably warm-up iterations — confirm).
        timings = pd.DataFrame(json.load(f)['timings']).T.iloc[10:]
    log_timing_vote('protein model', timings['protein_model'].mean())
    log_timing_vote('protein coeffs', timings['protein_coeffs'].mean() / 729) # number of translations
    # log_timing_vote('ligand model', timings['ligand_model'].mean()) # not reliable since ligands aren't batched
    # log_timing_vote('ligand coeffs', timings['ligand_coeffs'].mean()) # not reliable since ligands aren't batched
    log_timing_vote('R-FFT', timings['fft'].mean() / 729) # number of translations

In [7]:
## Ordered according to Table 1 in the paper
## We eyeball a mean after dropping outliers
for key in (
    'protein model', 'protein fft', 'protein coeffs',
    'ligand model', 'ligand coeffs', 'ligand fft',
    'T-FFT', 'R-FFT', 'T-scoring', 'R-scoring',
):
    # Sorting places the extreme votes at either end of the printed row.
    votes_sorted = sorted(timing_vote_log[key])
    formatted_votes = ' '.join(f'{vote:.3}' for vote in votes_sorted)
    print(key.ljust(20), formatted_votes)

protein model        59.9 59.9 62.5 62.9 63.6 64.4 65.3 66.1 66.3 66.5 66.6 66.9 68.9 72.3 72.9 1.1e+02
protein fft          3.87 3.94 7.01 7.04 7.13 7.2 8.39 11.6
protein coeffs       79.4 80.8 80.9 85.0 86.0 87.3 90.5 91.8
ligand model         3.65 4.34 4.34 4.35 4.35 4.37 4.39 6.88
ligand coeffs        16.6 16.6 16.7 17.8
ligand fft           0.838 0.839 1.23 1.62 1.62 1.63 1.64 2.26
T-FFT                0.155 0.158 0.161 0.167
R-FFT                0.644 0.653 0.653 0.654
T-scoring            0.000962 0.000963 0.00135 0.0026
R-scoring            0.00816 0.00817 0.00817 0.00817


## Table 2 - decoy scoring results

In [8]:
def auroc(csv, key, thresh=2.):
    """Compute the AUROC of column `key` for separating poses with rmsd < `thresh`.

    The first row of `csv` is excluded from both labels and scores
    (presumably it holds the reference pose — confirm).

    Returns a one-entry dict keyed by f'auroc<{thresh}'.
    """
    is_below_thresh = csv['rmsd'].iloc[1:] < thresh
    pose_scores = csv[key].iloc[1:]
    metric = roc_auc_score(is_below_thresh, pose_scores)
    return {f'auroc<{thresh}': metric}
def top1(csv, keys):
    """Top-1 statistics for one score column of a pose table.

    Parameters
    ----------
    csv : DataFrame with an `rmsd` column and the score column to evaluate.
    keys : name of the score column (a single column name, despite the plural;
        the parameter name is kept for backward compatibility).

    Returns
    -------
    dict with
        'top1_rmsd': rmsd of the highest-scoring pose,
        'toprank':   number of poses scoring at least as high as the
                     lowest-rmsd pose (1 means the best pose is ranked first).

    The first row of `csv` is excluded (presumably the reference pose — confirm).
    """
    # Previously the body read the *global* `key` instead of its own argument,
    # so it only worked when the caller's loop variable happened to be named `key`.
    rmsds = np.array(csv.rmsd[1:])
    scores = np.array(csv[keys][1:])
    return {
        'top1_rmsd': rmsds[np.argmax(scores)],
        'toprank': (scores >= scores[np.argmin(rmsds)]).sum(),
    }

### PDBBind scoring

In [38]:
# Directories holding our per-complex score arrays (<name>.npy) and timings.csv.
paths = {
    'T': "../paper_results/scoring/ESF/TS_pdbbind/", # shared a GPU so runtime is an anomaly...
    'R': "../paper_results/scoring/ESF/RS_pdbbind/",
    'noise_T': "../paper_results/scoring/ESF_N/TS_pdbbind/",
    'noise_R': "../paper_results/scoring/ESF_N/RS_pdbbind/",
}
keys = ['vina', 'gnina'] + list(paths.keys())
df = pd.read_csv('../splits/timesplit_test.csv')
pdbbind_dir = '../data/PDBBind_processed'

# method name -> list of per-complex metric dicts
method_dfs = defaultdict(list)
for name in tqdm.tqdm(df.name):
    try:
        # Read the baseline logs through context managers so the handles are
        # closed immediately instead of being left open for every complex.
        with open(f"../paper_results/scoring/vina/pdbbind/{name}.out") as f:
            vina_out = f.read().split('\n')
        # Vina affinities are negated so that, like the other columns, higher is better.
        vina_scores = [-float(line.split()[1]) for line in vina_out if 'Affinity' in line]

        with open(f"../paper_results/scoring/gnina/pdbbind/{name}.out") as f:
            gnina_out = f.read().split('\n')
        gnina_scores = [float(line.split()[1]) for line in gnina_out if 'CNNscore' in line]

        csv = pd.read_csv(f"{pdbbind_dir}/{name}/{name}_poses.csv")
        csv['gnina'] = gnina_scores
        csv['vina'] = vina_scores
        for key in paths:
            our_scores = np.load(f"{paths[key]}/{name}.npy")
            csv[key] = our_scores
        for key in keys:
            method_dfs[key].append({'name': name} | auroc(csv, key) | top1(csv, key))
    except Exception as e:
        # Best effort: complexes with missing or malformed inputs are reported and skipped.
        print('Error', name, e)

 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋                                              | 251/363 [00:31<00:10, 10.26it/s]

Error 6a73 [Errno 2] No such file or directory: '../data/PDBBind_processed/6a73/6a73_poses.csv'


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 363/363 [00:45<00:00,  7.96it/s]


In [39]:
# TankBind
# Adds per-complex TankBind metrics and timings to `method_dfs`; complexes
# without a result file are collected in `errors` and skipped.
key = 'tankbind'
errors = []
for name in tqdm.tqdm(df.name):
    npz_path = f"../paper_results/scoring/tankbind/pdbbind/{name}.npz"
    if not os.path.exists(npz_path):
        errors.append(name)
        continue
    # np.load on an .npz returns a lazily-read archive that keeps the file open;
    # the context manager closes it once the needed arrays have been extracted.
    with np.load(npz_path) as tankbind_out:
        csv = pd.read_csv(f"{pdbbind_dir}/{name}/{name}_poses.csv")
        csv[key] = tankbind_out['scores']
        timing_entry = {
            'per_pose': tankbind_out['scoring_time'] / (32**3 - 1),
            'per_complex': tankbind_out['total_time'],
        }
    method_dfs[key].append({'name': name} | auroc(csv, key) | top1(csv, key) | timing_entry)
print(errors)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 363/363 [00:09<00:00, 37.81it/s]

['6qtr', '6qto', '6qtq', '6qts', '6qsz', '6qtx', '6m7h', '6rtn', '6a73', '6qtw', '6qtm', '6t6a']





In [40]:
# DiffDock
# Adds per-complex DiffDock metrics and timings to `method_dfs`; complexes
# without a result file are collected in `errors` and skipped.
errors = []
key = 'diffdock'
# `all_timings` was appended to without ever being defined in this notebook,
# which raises NameError on a fresh kernel; define it here.
all_timings = []
for name in tqdm.tqdm(df.name):
    npz_path = f"../paper_results/scoring/diffdock/pdbbind/{name}.npz"
    if not os.path.exists(npz_path):
        errors.append(name)
        continue
    # The context manager closes the .npz archive once the arrays are extracted.
    with np.load(npz_path) as diffdock_out:
        total_time = diffdock_out['timing'].sum()
        all_timings.append(total_time)
        csv = pd.read_csv(f"{pdbbind_dir}/{name}/{name}_poses.csv")
        # Only the first column of 'confidence' is used as the score.
        csv[key] = diffdock_out['confidence'][:,0]
    method_dfs[key].append({'name': name} | auroc(csv, key) | top1(csv, key) | \
                          {'per_pose': total_time / (32**3 - 1), 'per_complex': total_time})
print(errors)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 363/363 [00:09<00:00, 36.88it/s]

['6o0h', '6a73']





In [41]:
# One summary row per method: medians of the per-complex metrics, the fraction
# of complexes whose top-1 pose has rmsd < 2, and per-pose / per-complex timings.
summary_rows = []
for key in method_dfs:
    method_df = pd.DataFrame(method_dfs[key])
    row = {'method': key}
    row |= dict(method_df.median(numeric_only=True))
    row['rmsd<2'] = (method_df['top1_rmsd'] < 2).mean()
    if key in paths:
        # Our methods: timings come from the run's timings.csv.
        timing_csv = pd.read_csv(f'{paths[key]}/timings.csv')
        row['per_pose'] = timing_csv['scoring'].mean() / (32**3 - 1)
        row['per_complex'] = timing_csv['all'].mean()
    elif key in ('vina', 'gnina'):
        # Baseline times are scaled by 1000 (presumably seconds -> milliseconds — confirm).
        timing_csv = pd.read_csv(f'../paper_results/scoring/{key}/pdbbind.csv')
        row['per_pose'] = 1000 * timing_csv['time'].mean() / (32**3 - 1)
        row['per_complex'] = 1000 * timing_csv['time'].mean()
    elif key in ('tankbind', 'diffdock'):
        # These timings were stored per complex; overwrite the medians with scaled means.
        row['per_pose'] = 1000 * method_df.per_pose.mean()
        row['per_complex'] = 1000 * method_df.per_complex.mean()
    summary_rows.append(row)
analysis_df = pd.DataFrame(summary_rows)
# Fraction of the per-complex time accounted for by scoring the poses.
analysis_df['pct'] = analysis_df['per_pose'] * (32**3 - 1) / analysis_df['per_complex']
analysis_df

Unnamed: 0,method,auroc<2.0,top1_rmsd,toprank,rmsd<2,per_pose,per_complex,pct
0,vina,0.930783,0.535812,2.0,0.914365,3.350479,109785.2,1.0
1,gnina,0.901632,0.589409,3.0,0.834254,13.438713,440346.3,1.0
2,T,0.866294,0.594187,3.0,0.870166,0.001355,3855.353,0.011514
3,R,0.867013,0.626216,3.0,0.845304,0.008167,5736.527,0.046647
4,noise_T,0.917205,0.691733,4.0,0.809392,0.000962,3220.698,0.009788
5,noise_R,0.915764,0.748279,5.0,0.803867,0.008168,5745.759,0.046579
6,tankbind,0.687665,4.02176,6813.0,0.096866,0.001064,62.26732,0.559851
7,diffdock,0.964906,0.659582,3.0,0.867036,62.278209,2040670.0,1.0


### ESMFold scoring

In [42]:
# Directories holding our per-complex score arrays (<name>.npy) and timings.csv.
paths = {
    'T': "../paper_results/scoring/ESF/TS_pdbbind_esmfold/",
    'R': "../paper_results/scoring/ESF/RS_pdbbind_esmfold/",
    'noise_T': "../paper_results/scoring/ESF_N/TS_pdbbind_esmfold/",
    'noise_R': "../paper_results/scoring/ESF_N/RS_pdbbind_esmfold/",
}
keys = ['vina', 'gnina'] + list(paths.keys())
df = pd.read_csv('../splits/timesplit_test.csv')
pdbbind_dir = '../data/PDBBind_processed'

# method name -> list of per-complex metric dicts
method_dfs = defaultdict(list)
for name in tqdm.tqdm(df.name):
    try:
        # Read the baseline logs through context managers so the handles are
        # closed immediately instead of being left open for every complex.
        with open(f"../paper_results/scoring/vina/pdbbind_esmfold/{name}.out") as f:
            vina_out = f.read().split('\n')
        # Vina affinities are negated so that, like the other columns, higher is better.
        vina_scores = [-float(line.split()[1]) for line in vina_out if 'Affinity' in line]

        with open(f"../paper_results/scoring/gnina/pdbbind_esmfold/{name}.out") as f:
            gnina_out = f.read().split('\n')
        gnina_scores = [float(line.split()[1]) for line in gnina_out if 'CNNscore' in line]

        csv = pd.read_csv(f"{pdbbind_dir}/{name}/{name}_poses.csv")
        csv['gnina'] = gnina_scores
        csv['vina'] = vina_scores
        for key in paths:
            our_scores = np.load(f"{paths[key]}/{name}.npy")
            csv[key] = our_scores
        for key in keys:
            method_dfs[key].append({'name': name} | auroc(csv, key) | top1(csv, key))
    except Exception as e:
        # Best effort: complexes with missing or malformed inputs are reported and skipped.
        print('Error', name, e)

 61%|███████████████████████████████████████████████████████████████████████████████████████████▋                                                          | 222/363 [01:58<01:25,  1.65it/s]

Error 6rtn Length of values (0) does not match length of index (32768)


 69%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                                               | 249/363 [02:12<01:01,  1.86it/s]

Error 6a73 [Errno 2] No such file or directory: '../data/PDBBind_processed/6a73/6a73_poses.csv'


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 363/363 [03:12<00:00,  1.89it/s]


In [43]:
# TankBind
# Adds per-complex TankBind metrics and timings to `method_dfs`; complexes
# without a result file are collected in `errors` and skipped.
key = 'tankbind'
errors = []
for name in tqdm.tqdm(df.name):
    npz_path = f"../paper_results/scoring/tankbind/pdbbind_esmfold/{name}.npz"
    if not os.path.exists(npz_path):
        errors.append(name)
        continue
    # np.load on an .npz returns a lazily-read archive that keeps the file open;
    # the context manager closes it once the needed arrays have been extracted.
    with np.load(npz_path) as tankbind_out:
        csv = pd.read_csv(f"{pdbbind_dir}/{name}/{name}_poses.csv")
        csv[key] = tankbind_out['scores']
        timing_entry = {
            'per_pose': tankbind_out['scoring_time'] / (32**3 - 1),
            'per_complex': tankbind_out['total_time'],
        }
    method_dfs[key].append({'name': name} | auroc(csv, key) | top1(csv, key) | timing_entry)
print(errors)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 363/363 [00:09<00:00, 36.48it/s]

['6rtn', '6a73']





In [44]:
# DiffDock
# Adds per-complex DiffDock metrics and timings to `method_dfs`; complexes
# without a result file are collected in `errors` and skipped.
errors = []
key = 'diffdock'
# `all_timings` was appended to without ever being defined in this notebook,
# which raises NameError on a fresh kernel; define it here.
all_timings = []
for name in tqdm.tqdm(df.name):
    npz_path = f"../paper_results/scoring/diffdock/pdbbind_esmfold/{name}.npz"
    if not os.path.exists(npz_path):
        errors.append(name)
        continue
    # The context manager closes the .npz archive once the arrays are extracted.
    with np.load(npz_path) as diffdock_out:
        total_time = diffdock_out['timing'].sum()
        all_timings.append(total_time)
        csv = pd.read_csv(f"{pdbbind_dir}/{name}/{name}_poses.csv")
        # Only the first column of 'confidence' is used as the score.
        csv[key] = diffdock_out['confidence'][:,0]
    method_dfs[key].append({'name': name} | auroc(csv, key) | top1(csv, key) | \
                          {'per_pose': total_time / (32**3 - 1), 'per_complex': total_time})
print(errors)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 363/363 [00:10<00:00, 36.14it/s]

['6o0h', '6rtn', '6a73']





In [46]:
# One summary row per method: medians of the per-complex metrics, the fraction
# of complexes whose top-1 pose has rmsd < 2, and per-pose / per-complex timings.
summary_rows = []
for key in method_dfs:
    method_df = pd.DataFrame(method_dfs[key])
    row = {'method': key}
    row |= dict(method_df.median(numeric_only=True))
    row['rmsd<2'] = (method_df['top1_rmsd'] < 2).mean()
    if key in paths:
        # Our methods: timings come from the run's timings.csv.
        timing_csv = pd.read_csv(f'{paths[key]}/timings.csv')
        row['per_pose'] = timing_csv['scoring'].mean() / (32**3 - 1)
        row['per_complex'] = timing_csv['all'].mean()
    elif key in ('vina', 'gnina'):
        # Baseline times are scaled by 1000 (presumably seconds -> milliseconds — confirm).
        timing_csv = pd.read_csv(f'../paper_results/scoring/{key}/pdbbind_esmfold.csv')
        row['per_pose'] = 1000 * timing_csv['time'].mean() / (32**3 - 1)
        row['per_complex'] = 1000 * timing_csv['time'].mean()
    elif key in ('tankbind', 'diffdock'):
        # These timings were stored per complex; overwrite the medians with scaled means.
        row['per_pose'] = 1000 * method_df.per_pose.mean()
        row['per_complex'] = 1000 * method_df.per_complex.mean()
    summary_rows.append(row)
analysis_df = pd.DataFrame(summary_rows)
# Fraction of the per-complex time accounted for by scoring the poses.
analysis_df['pct'] = analysis_df['per_pose'] * (32**3 - 1) / analysis_df['per_complex']
analysis_df

Unnamed: 0,method,auroc<2.0,top1_rmsd,toprank,rmsd<2,per_pose,per_complex,pct
0,vina,0.856654,2.427288,419.0,0.429363,3.267835,107077.1,1.0
1,gnina,0.837391,2.190598,1110.0,0.462604,13.284013,435277.3,1.0
2,T,0.818127,1.38971,24.0,0.567867,0.000963,3169.801,0.009956
3,R,0.816608,1.751995,22.0,0.534626,0.008159,6061.82,0.044105
4,noise_T,0.867949,1.643191,22.0,0.542936,0.002606,4917.672,0.017365
5,noise_R,0.868098,1.740755,26.0,0.529086,0.00817,5692.995,0.047026
6,tankbind,0.638461,4.218317,8538.0,0.091413,0.001085,67.65363,0.525457
7,diffdock,0.890674,2.012072,142.5,0.497222,62.387745,2044259.0,1.0


### Timings

In [13]:
# T-SCORING RUNS: print the per-pose scoring time and the mean of the 'all' column.
for path in (
    '../paper_results/scoring/ESF/TS_pdbbind/timings.csv',
    '../paper_results/scoring/ESF/TS_pdbbind_esmfold/timings.csv',
    '../paper_results/scoring/ESF_N/TS_pdbbind/timings.csv',
    '../paper_results/scoring/ESF_N/TS_pdbbind_esmfold/timings.csv',
):
    # The first 10 rows are discarded (presumably warm-up iterations — confirm).
    timings = pd.read_csv(path).iloc[10:]
    scoring_per_pose = timings['scoring'].mean() / 32**3
    mean_all = timings['all'].mean()
    print(path.ljust(80), scoring_per_pose, mean_all)

../paper_results/scoring/ESF/TS_pdbbind/timings.csv                              0.0013513534282868602 3800.5023700269785
../paper_results/scoring/ESF/TS_pdbbind_esmfold/timings.csv                      0.0009628568394202474 3115.613517598209
../paper_results/scoring/ESF_N/TS_pdbbind/timings.csv                            0.0009618077947379788 3164.313866333528
../paper_results/scoring/ESF_N/TS_pdbbind_esmfold/timings.csv                    0.002603903936522023 4854.7292849277155


In [14]:
# R-SCORING RUNS: print the per-pose scoring time and the mean of the 'all' column.
for path in (
    '../paper_results/scoring/ESF/RS_pdbbind/timings.csv',
    '../paper_results/scoring/ESF/RS_pdbbind_esmfold/timings.csv',
    '../paper_results/scoring/ESF_N/RS_pdbbind/timings.csv',
    '../paper_results/scoring/ESF_N/RS_pdbbind_esmfold/timings.csv',
):
    # The first 10 rows are discarded (presumably warm-up iterations — confirm).
    timings = pd.read_csv(path).iloc[10:]
    scoring_per_pose = timings['scoring'].mean() / 32**3
    mean_all = timings['all'].mean()
    print(path.ljust(80), scoring_per_pose, mean_all)

../paper_results/scoring/ESF/RS_pdbbind/timings.csv                              0.00816645910857584 5678.557356650179
../paper_results/scoring/ESF/RS_pdbbind_esmfold/timings.csv                      0.008158907763914675 6002.632096961693
../paper_results/scoring/ESF_N/RS_pdbbind/timings.csv                            0.008167465368222276 5693.4107027270575
../paper_results/scoring/ESF_N/RS_pdbbind_esmfold/timings.csv                    0.008170008738963013 5629.246268177304


## Table 3 - rigid conformer docking

In [15]:
def analyze(path):
    """Load one docking-result JSON and return its RMSD and timing tables.

    Parameters
    ----------
    path : path to a JSON file with top-level 'rmsds' and 'timings' mappings.

    Returns
    -------
    (df, tim)
        df  : transposed 'rmsds' table with two derived columns,
              'rot_rmsd'    = sqrt(rmsd**2 - tr_rmsd**2) and
              'grid_offset' = sqrt(tr_grid_offset**2 + rot_grid_offset**2).
        tim : transposed 'timings' table.
    """
    # Context manager so the file handle is closed as soon as parsing is done.
    with open(path) as f:
        js = json.load(f)
    df = pd.DataFrame(js['rmsds']).T
    df['rot_rmsd'] = (df['rmsd']**2 - df['tr_rmsd']**2)**0.5
    df['grid_offset'] = (df['tr_grid_offset']**2 + df['rot_grid_offset']**2)**0.5
    tim = pd.DataFrame(js['timings']).T
    return df, tim

def analyze_pde10a(path):
    """Run `analyze(path)` and build a one-row summary of the result.

    Returns (df, tim, entry), where `entry` holds the column medians of `df` plus
        'pct<2': fraction of rows with rmsd < 2,
        'time' : mean 'cross_correlate' time + mean 'readout' time,
        'all'  : sum of every timing entry divided by (len(df) - 1).
    NOTE(review): the `- 1` presumably discounts one non-complex row — confirm
    that `df` (rather than `tim`) is the table that contains it.
    """
    df, tim = analyze(path)
    entry = dict(df.median())
    entry['pct<2'] = (df.rmsd < 2).mean()
    entry['time'] = tim['cross_correlate'].mean() + tim['readout'].mean()
    entry['all'] = tim.sum().sum() / (len(df) - 1)
    return df, tim, entry

### PDBBind

In [16]:
# Evaluate only complexes for which the gnina default run reports an RMSD.
df = pd.read_csv('../paper_results/docking/gnina/pdbbind/default.csv')
pdb_ids = df.name[df.rmsd.notna()]
print(len(pdb_ids))
paths = {
    'T': '../paper_results/docking/ESF/pdbbind/T_grid2_scaling1.json',
    'R': '../paper_results/docking/ESF/pdbbind/R_box9_lmax25.json',
    'noise_T': '../paper_results/docking/ESF_N/pdbbind/T_grid2_scaling1.json',
    'noise_R': '../paper_results/docking/ESF_N/pdbbind/R_box9_lmax25.json'
}
entries = []
for key, json_path in paths.items():
    df, tim = analyze(json_path)
    df = df.loc[pdb_ids]
    # The first 10 timing rows are discarded (presumably warm-up — confirm).
    tim = tim.loc[pdb_ids].iloc[10:]
    # Sum the mean time of whichever search stages this run recorded.
    stage_means = [tim[col].mean() for col in ('fft', 'cross_correlate', 'readout') if col in tim]
    unit_time = sum(stage_means)
    entries.append({
        'name': key,
        'pct<2': (df.rmsd < 2).mean(),
        'rmsd': df['rmsd'].median(),
        'grid_offset': df['grid_offset'].median(),
        'time': unit_time,
        'time_all': tim['all'].mean(),
    })

csvs = {
    'default': '../paper_results/docking/gnina/pdbbind/default.csv',
    'score_only': '../paper_results/docking/gnina/pdbbind/score_only.csv',
    'gnina': '../paper_results/docking/gnina/pdbbind/gnina.csv',
}
# NOTE(review): unlike the rows above, these statistics use every row of the
# CSV rather than only `pdb_ids` — confirm this is intended.
for csv, csv_path in csvs.items():
    df = pd.read_csv(csv_path)
    entries.append({
        'name': csv,
        'pct<2': (df.rmsd < 2).mean(),
        'rmsd': df.rmsd.median(),
        'time': 1000 * df.time.mean(),
    })
entries = pd.DataFrame(entries)
entries

333


Unnamed: 0,name,pct<2,rmsd,grid_offset,time,time_all
0,T,0.6997,1.125495,0.666174,780.263848,8320.452813
1,R,0.705706,0.965665,0.522667,478.516265,66749.398061
2,noise_T,0.720721,1.104596,0.667066,723.806405,8166.544446
3,noise_R,0.72973,0.997074,0.534659,477.719273,67695.861123
4,default,0.793388,0.318332,,21332.200317,
5,score_only,0.0,37.036648,,929.515594,
6,gnina,0.774105,0.327086,,23871.171836,


### PDBBind ESMFold

In [17]:
# Evaluate only complexes for which the gnina default run reports an RMSD.
df = pd.read_csv('../paper_results/docking/gnina/pdbbind_esmfold/default.csv')
pdb_ids = df.name[df.rmsd.notna()]
print(len(pdb_ids))
paths = {
    'T': '../paper_results/docking/ESF/pdbbind_esmfold/T_grid2_scaling1.json',
    'R': '../paper_results/docking/ESF/pdbbind_esmfold/R_box9_lmax25.json',
    'noise_T': '../paper_results/docking/ESF_N/pdbbind_esmfold/T_grid2_scaling1.json',
    'noise_R': '../paper_results/docking/ESF_N/pdbbind_esmfold/R_box9_lmax25.json'
}
entries = []
for key, json_path in paths.items():
    df, tim = analyze(json_path)
    df = df.loc[pdb_ids]
    # The first 10 timing rows are discarded (presumably warm-up — confirm).
    tim = tim.loc[pdb_ids].iloc[10:]
    # Sum the mean time of whichever search stages this run recorded.
    stage_means = [tim[col].mean() for col in ('fft', 'cross_correlate', 'readout') if col in tim]
    unit_time = sum(stage_means)
    entries.append({
        'name': key,
        'pct<2': (df.rmsd < 2).mean(),
        'rmsd': df['rmsd'].median(),
        'grid_offset': df['grid_offset'].median(),
        'time': unit_time,
        'time_all': tim['all'].mean(),
    })

csvs = {
    'default': '../paper_results/docking/gnina/pdbbind_esmfold/default.csv',
    'score_only': '../paper_results/docking/gnina/pdbbind_esmfold/score_only.csv',
    'gnina': '../paper_results/docking/gnina/pdbbind_esmfold/gnina.csv',
}
# NOTE(review): unlike the rows above, these statistics use every row of the
# CSV rather than only `pdb_ids` — confirm this is intended.
for csv, csv_path in csvs.items():
    df = pd.read_csv(csv_path)
    entries.append({
        'name': csv,
        'pct<2': (df.rmsd < 2).mean(),
        'rmsd': df.rmsd.median(),
        'time': 1000 * df.time.mean(),
    })
entries = pd.DataFrame(entries)
entries

333


Unnamed: 0,name,pct<2,rmsd,grid_offset,time,time_all
0,T,0.312312,4.575101,0.653368,734.345484,8136.157242
1,R,0.315315,4.377143,0.508446,470.80473,62818.092969
2,noise_T,0.456456,2.857305,0.656374,748.187767,8196.442644
3,noise_R,0.465465,3.008733,0.523161,477.635502,64459.533702
4,default,0.236915,6.149646,,15676.110242,
5,score_only,0.0,37.036648,,1898.002729,
6,gnina,0.283747,5.900586,,16615.527507,


### PDE10A

In [18]:
paths = {
    'T': '../paper_results/docking/ESF/pde10a/T_grid2_scaling1.json',
    'R': '../paper_results/docking/ESF/pde10a/R_box9_lmax25.json',
    'noise_T': '../paper_results/docking/ESF_N/pde10a/T_grid2_scaling1.json',
    'noise_R': '../paper_results/docking/ESF_N/pde10a/R_box9_lmax25.json'
}
# Evaluate only complexes for which the gnina default run reports an RMSD.
df = pd.read_csv('../paper_results/docking/gnina/pde10a/default.csv')
pdb_ids = df.name[df.rmsd.notna()]
print(len(pdb_ids))
entries = []
for key, json_path in paths.items():
    df, tim = analyze(json_path)
    df = df.loc[pdb_ids]
    # Keep the 'preprocess' timing row together with the selected complexes.
    tim = tim.loc[['preprocess', *pdb_ids]]

    unit_time = tim['cross_correlate'].mean() + tim['readout'].mean()
    # Total recorded time divided by the row count minus one (the 'preprocess' row).
    time_all = tim.sum().sum() / (len(tim) - 1)
    entries.append({
        'name': key,
        'pct<2': (df.rmsd < 2).mean(),
        'rmsd': df['rmsd'].median(),
        'grid_offset': df['grid_offset'].median(),
        'time': unit_time,
        'time_all': time_all,
    })


csvs = {
    'default': '../paper_results/docking/gnina/pde10a/default.csv',
    'score_only': '../paper_results/docking/gnina/pde10a/score_only.csv',
    'gnina': '../paper_results/docking/gnina/pde10a/gnina.csv',
}
# Baseline rows are restricted to the same `pdb_ids` as the rows above.
for csv, csv_path in csvs.items():
    df = pd.read_csv(csv_path).set_index('name').loc[pdb_ids]
    entries.append({
        'name': csv,
        'pct<2': (df.rmsd < 2).mean(),
        'rmsd': df.rmsd.median(),
        'time': 1000 * df.time.mean(),
    })
entries = pd.DataFrame(entries)
entries

70


Unnamed: 0,name,pct<2,rmsd,grid_offset,time,time_all
0,T,0.671429,1.201335,0.693207,952.949657,7055.453798
1,R,0.728571,0.822772,0.525635,477.739434,1494.846699
2,noise_T,0.642857,1.113944,0.686324,985.233527,7175.569025
3,noise_R,0.7,1.004207,0.514965,466.527057,1463.952908
4,default,0.742857,0.747402,,7514.434167,
5,score_only,0.0,57.171319,,1444.79912,
6,gnina,0.728571,0.770874,,7442.702508,
