# CASP 13 metrics

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from proteins.metrics import rmse, pearson, kendalltau, spearmanr, first_rank_loss

# Keep wide DataFrames on one line instead of wrapping their columns across several blocks.
pd.options.display.expand_frame_repr = False

## GDT_TS

### Ground truth scores

In [2]:
# Ground-truth scores: only the first two whitespace-separated columns are read,
# the decoy identifier and its GDT_TS value.
# The separator is a raw string because '\s' in a plain literal is an invalid escape sequence.
df_true = pd.read_csv(
    'GDT_TS/data-all.tsv',
    header=None,
    sep=r'\s+',
    names=['target_decoy', 'gdtts'],
    usecols=[0, 1],
)
# Divide by 100 -- presumably the file stores GDT_TS as a percentage while the
# predictions are fractions in [0, 1]; TODO confirm against the data source.
df_true['gdtts'] = df_true['gdtts'] / 100

# IDs like T0950TS004_1      are split as T0950   TS004_1
# IDs like T0953s1TS004_1    are split as T0953s1 TS004_1
# IDs like T0953s1TS004_1-D1 are split as T0953s1 TS004_1
df_true[['target', 'decoy']] = df_true['target_decoy'].str.extract(r'(T\d{4}(?:s\d)?)([TS\d_]+)(?:\-D1)?')
# Reassign instead of dropping in place so the frame's lineage stays explicit.
df_true = df_true.drop(columns='target_decoy')

print(f'Targets {df_true.target.nunique()}',f'Decoys {len(df_true)}', sep='\n')
# Preview: the first two decoys of each target, limited to ten rows overall.
df_true.groupby('target').head(2).head(10)

Targets 80
Decoys 31519


Unnamed: 0,gdtts,target,decoy
0,0.531,T0949,TS004_1
1,0.5291,T0949,TS004_2
395,0.1089,T0950,TS004_1
396,0.1762,T0950,TS004_2
578,0.9126,T0951,TS004_1
579,0.9192,T0951,TS004_2
733,0.2836,T0953s1,TS004_1
734,0.4105,T0953s1,TS004_2
1152,0.0867,T0953s2,TS004_1
1153,0.0887,T0953s2,TS004_2


### Other papers' predictions

In [3]:
# Prediction files of the other methods, keyed by the method name used in the results table.
others = {
    '3D CNN': 'GDT_TS/3DCNN-all.tsv',
    'Ornate': 'GDT_TS/Ornate-all.tsv',
    'ProQ3D': 'GDT_TS/ProQ3D-all.tsv',
    'ProQ4': 'GDT_TS/ProQ4-all.tsv',
    'VoroMQA': 'GDT_TS/VoroMQA-all.tsv',
}
# Correlation metrics, each computed over all merged decoys and as a mean of per-target values.
correlations = {'R': pearson, 'τ': kendalltau, 'ρ': spearmanr}
scores_by_method = {}

for other_name, other_file in others.items():
    # Read with the same columns and ID-splitting regex as the ground-truth file.
    # The separator is a raw string because '\s' in a plain literal is an invalid escape sequence.
    df = pd.read_csv(other_file, header=None, sep=r'\s+', names=['target_decoy', 'gdtts'], usecols=[0, 1])
    df[['target', 'decoy']] = df['target_decoy'].str.extract(r'(T\d{4}(?:s\d)?)([TS\d_]+)(?:\-D1)?')
    df = df.drop(columns='target_decoy')

    # Default inner join: only decoys with both a true and a predicted score are evaluated.
    df_merge = pd.merge(
        df_true,
        df,
        on=['target', 'decoy'],
        suffixes=('_true', '_pred')
    )
    print(
        other_name,
        f'Targets {df.target.nunique()}',
        f'Decoys {len(df)}',
        f'Merged targets {df_merge.target.nunique()}',
        f'Merged decoys {len(df_merge)}',
        df_merge.head().to_string(index=False), sep='\n', end='\n\n'
    )

    scores = {
        'scored_targets': df_merge.target.nunique(),
        'scored_decoys': len(df_merge),
        'RMSE': rmse(preds=df_merge['gdtts_pred'], true=df_merge['gdtts_true']),
        'First Rank Loss': first_rank_loss(preds=df_merge['gdtts_pred'], true=df_merge['gdtts_true']),
    }

    for name, func in correlations.items():
        scores[name] = func(preds=df_merge['gdtts_pred'], true=df_merge['gdtts_true'])
        # Select the two score columns explicitly so `apply` never touches the grouping
        # column (applying over grouping columns is deprecated in recent pandas releases).
        scores[f'{name} per target'] = (
            df_merge
            .groupby('target')[['gdtts_pred', 'gdtts_true']]
            .apply(lambda group: func(preds=group['gdtts_pred'], true=group['gdtts_true']))
            .mean()
        )

    scores_by_method[other_name] = scores

# One row per method, one column per metric.
results = pd.DataFrame(scores_by_method).transpose()
# Building the frame from mixed int/float values upcasts everything to float;
# restore integer dtype for BOTH count columns (previously only `scored_targets` was cast).
results = results.astype({'scored_targets': int, 'scored_decoys': int})

3D CNN
Targets 57
Decoys 8539
Merged targets 57
Merged decoys 8539
 gdtts_true target    decoy  gdtts_pred
     0.7415  T0962  TS004_1    0.647932
     0.7867  T0962  TS004_2    0.652553
     0.5014  T0962  TS004_3    0.585919
     0.4915  T0962  TS004_4    0.581748
     0.4675  T0962  TS004_5    0.584404

Ornate
Targets 71
Decoys 10637
Merged targets 71
Merged decoys 10632
 gdtts_true target    decoy  gdtts_pred
     0.5310  T0949  TS004_1    0.627190
     0.5291  T0949  TS004_2    0.629923
     0.4961  T0949  TS004_3    0.687469
     0.5330  T0949  TS004_4    0.678921
     0.5233  T0949  TS004_5    0.597810

ProQ3D
Targets 79
Decoys 11788
Merged targets 79
Merged decoys 11783
 gdtts_true target    decoy  gdtts_pred
     0.5310  T0949  TS004_1      0.3107
     0.5291  T0949  TS004_2      0.2706
     0.4961  T0949  TS004_3      0.2886
     0.5330  T0949  TS004_4      0.3589
     0.5233  T0949  TS004_5      0.2895

ProQ4
Targets 80
Decoys 11992
Merged targets 80
Merged decoys 11987
 gdt

In [4]:
# Summary table: metrics rounded to three decimals, with the index labelled 'Method'.
results.round(3).rename_axis(index='Method')

Unnamed: 0_level_0,scored_targets,scored_decoys,RMSE,First Rank Loss,R,R per target,τ,τ per target,ρ,ρ per target
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3D CNN,57,8539.0,0.218,0.664,0.538,0.566,0.423,0.395,0.599,0.544
Ornate,71,10632.0,0.356,0.038,0.381,0.644,0.26,0.471,0.383,0.642
ProQ3D,79,11783.0,0.144,0.2,0.816,0.652,0.615,0.453,0.808,0.614
ProQ4,80,11987.0,0.176,0.186,0.751,0.711,0.571,0.512,0.76,0.662
VoroMQA,80,11982.0,0.204,0.063,0.689,0.642,0.501,0.44,0.694,0.596


The next cell prints the results above as comma-separated rows, in the format used by `other_results.csv`.

In [5]:
# Dataset label printed next to each method in the CSV rows below
# (presumably the data each method was trained on -- TODO confirm).
datasets = {
    '3D CNN': 'CASP 7+8+9+10+11+12',
    'Ornate': 'CASP 7+8+9+10+11+12',
    'ProQ3D': 'CASP 10+11+12',
    'ProQ4': 'CASP 10+11+12',
    'VoroMQA': 'PDB',
}

# Only the metric columns are exported; the two count columns are left out.
metric_table = results.drop(columns=['scored_targets', 'scored_decoys'])

for method, metrics in metric_table.to_dict(orient='index').items():
    # Fields shared by every row of this method.
    row_prefix = ('Ours', method, datasets[method], 'CASP 13', 'Global GDT_TS')
    for metric_name, metric_value in metrics.items():
        print(*row_prefix, metric_name, metric_value, sep=',')
    # Blank line between methods.
    print()

Ours,3D CNN,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,RMSE,0.21775760940235467
Ours,3D CNN,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,First Rank Loss,0.6638
Ours,3D CNN,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,R,0.5378206847097099
Ours,3D CNN,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,R per target,0.565756889277539
Ours,3D CNN,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,τ,0.4229519587066553
Ours,3D CNN,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,τ per target,0.3953209049887292
Ours,3D CNN,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,ρ,0.5994595168281719
Ours,3D CNN,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,ρ per target,0.5444447058597801

Ours,Ornate,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,RMSE,0.3562729438498173
Ours,Ornate,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,First Rank Loss,0.03759999999999997
Ours,Ornate,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,R,0.3813603229716177
Ours,Ornate,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,R per target,0.6436224836059586
Ours,Ornate,CASP 7+8+9+10+11+12,

## LDDT

TODO