# CASP 13 metrics

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from proteins.metrics import rmse, pearson, kendalltau, spearmanr, first_rank_loss

# Notebook-wide pandas display option: turn off `display.expand_frame_repr`, i.e. do not
# wrap the text repr of wide DataFrames across several blocks of lines.
pd.set_option("display.expand_frame_repr", False)

## GDT_TS

### Ground truth scores

Only the targets for which we currently have structures are used; more will be added later.

In [2]:
# Ground-truth GDT_TS table. The file is whitespace-separated with no header row; only the
# first two columns (combined target+decoy id, score) are read.
# To use the full set instead, point this at 'GDT_TS/data-all.tsv'.
true_scores_file = 'GDT_TS/data-20.tsv'
df_true = pd.read_csv(
    true_scores_file,
    header=None,
    sep=r'\s+',  # raw string: '\s' is an invalid escape sequence in a plain string literal
    names=['target_decoy', 'gdtts'],
    usecols=[0, 1],
)
# Divide by 100 so the scores are on the 0-1 scale (presumably the file stores percentages).
df_true['gdtts'] = df_true['gdtts'] / 100

# IDs like T0950TS004_1      are split as T0950   TS004_1
# IDs like T0953s1TS004_1    are split as T0953s1 TS004_1
# IDs like T0953s1TS004_1-D1 are split as T0953s1 TS004_1
# Group 1 is the target (with an optional "sN" suffix), group 2 is the decoy id;
# a trailing "-D1" is matched but never captured.
df_true[['target', 'decoy']] = df_true['target_decoy'].str.extract(r'(T\d{4}(?:s\d)?)([TS\d_]+)(?:\-D1)?')
# Reassign instead of mutating in place; the combined id is no longer needed.
df_true = df_true.drop(columns='target_decoy')

# Summary: number of distinct targets, number of decoy rows, then the list of targets.
print(f'Targets {df_true.target.nunique()}', f'Decoys {len(df_true)}', sep='\n', end='\n\n')
print('Targets:', *df_true.target.unique(), sep='\n- ', end='\n\n')
# Preview: first two decoys of each target, capped at ten rows.
df_true.groupby('target').head(2).head(10)

Targets 20
Decoys 7838

Targets:
- T0950
- T0951
- T0953s1
- T0953s2
- T0954
- T0955
- T0957s1
- T0957s2
- T0958
- T0960
- T0963
- T0966
- T0968s1
- T0968s2
- T1003
- T1005
- T1008
- T1009
- T1011
- T1016



Unnamed: 0,gdtts,target,decoy
0,0.1089,T0950,TS004_1
1,0.1762,T0950,TS004_2
183,0.9126,T0951,TS004_1
184,0.9192,T0951,TS004_2
338,0.2836,T0953s1,TS004_1
339,0.4105,T0953s1,TS004_2
757,0.0867,T0953s2,TS004_1
758,0.0887,T0953s2,TS004_2
1169,0.5863,T0954,TS004_1
1170,0.5737,T0954,TS004_2


### Other papers' predictions

In [3]:
# Prediction files published by other methods, keyed by the method's display name.
others = {
    '3D CNN': 'GDT_TS/3DCNN-all.tsv',
    'Ornate': 'GDT_TS/Ornate-all.tsv',
    'ProQ3D': 'GDT_TS/ProQ3D-all.tsv',
    'ProQ4': 'GDT_TS/ProQ4-all.tsv',
    'VoroMQA': 'GDT_TS/VoroMQA-all.tsv',
}

# Metrics are accumulated in a plain dict; `results` is bound only once, to the final
# DataFrame, so the name never refers to two different kinds of object.
metrics_by_method = {method: {} for method in others}
correlation_funcs = {'R': pearson, 'τ': kendalltau, 'ρ': spearmanr}

for other_name, other_file in others.items():
    # Same layout and id-splitting as the ground-truth table: whitespace-separated,
    # no header, first two columns = combined target+decoy id and predicted score.
    df = pd.read_csv(
        other_file,
        header=None,
        sep=r'\s+',  # raw string: '\s' is an invalid escape sequence in a plain string literal
        names=['target_decoy', 'gdtts'],
        usecols=[0, 1],
    )
    df[['target', 'decoy']] = df['target_decoy'].str.extract(r'(T\d{4}(?:s\d)?)([TS\d_]+)(?:\-D1)?')
    df = df.drop(columns='target_decoy')

    # Default (inner) merge: keep only decoys present in both the ground truth and this
    # method's predictions. The two score columns become gdtts_true / gdtts_pred.
    df_merge = pd.merge(
        df_true,
        df,
        on=['target', 'decoy'],
        suffixes=['_true', '_pred']
    )
    print(
        other_name,
        f'Targets {df.target.nunique()}',
        f'Decoys {len(df)}',
        f'Merged targets {df_merge.target.nunique()}',
        f'Merged decoys {len(df_merge)}',
        df_merge.head().to_string(index=False), sep='\n', end='\n\n'
    )

    method_metrics = metrics_by_method[other_name]
    scores_pred = df_merge['gdtts_pred']
    scores_true = df_merge['gdtts_true']

    method_metrics['scored_targets'] = df_merge.target.nunique()
    method_metrics['scored_decoys'] = len(df_merge)
    method_metrics['RMSE'] = rmse(preds=scores_pred, true=scores_true)
    # NOTE(review): this is computed on all merged decoys pooled across targets, unlike the
    # "per target" correlations below - confirm that is what first_rank_loss expects.
    method_metrics['First Rank Loss'] = first_rank_loss(preds=scores_pred, true=scores_true)

    for name, func in correlation_funcs.items():
        # Correlation over all merged decoys pooled together ...
        method_metrics[name] = func(preds=scores_pred, true=scores_true)
        # ... and the mean of the correlations computed separately within each target.
        # Selecting the two score columns keeps the grouping column out of the applied
        # frame (operating on it is deprecated in recent pandas); the lambda never used it.
        per_target = (
            df_merge
            .groupby('target')[['gdtts_pred', 'gdtts_true']]
            .apply(lambda group: func(preds=group['gdtts_pred'], true=group['gdtts_true']))
        )
        method_metrics[f'{name} per target'] = per_target.mean()

# One row per method, one column per metric.
results = pd.DataFrame(metrics_by_method).transpose()
# Both counts are integers; without the cast they are displayed as floats (e.g. 900.0).
results = results.astype({'scored_targets': int, 'scored_decoys': int})

3D CNN
Targets 57
Decoys 8539
Merged targets 6
Merged decoys 900
 gdtts_true target    decoy  gdtts_pred
     0.7874  T1003  TS004_1     0.51988
     0.8906  T1003  TS004_2     0.51960
     0.8836  T1003  TS004_3     0.51216
     0.6976  T1003  TS004_4     0.52058
     0.6912  T1003  TS004_5     0.51424

Ornate
Targets 71
Decoys 10637
Merged targets 18
Merged decoys 2694
 gdtts_true target    decoy  gdtts_pred
     0.1089  T0950  TS004_1    0.644755
     0.1762  T0950  TS004_2    0.731465
     0.1594  T0950  TS023_4    0.726716
     0.1820  T0950  TS023_5    0.741695
     0.1498  T0950  TS041_1    0.648748

ProQ3D
Targets 79
Decoys 11788
Merged targets 20
Merged decoys 2983
 gdtts_true target    decoy  gdtts_pred
     0.1089  T0950  TS004_1      0.2508
     0.1762  T0950  TS004_2      0.2629
     0.1594  T0950  TS023_4      0.3723
     0.1820  T0950  TS023_5      0.3321
     0.1498  T0950  TS041_1      0.3004

ProQ4
Targets 80
Decoys 11992
Merged targets 20
Merged decoys 2994
 gdtts_tr

In [4]:
# Display the metrics table rounded to three decimals, with the index labelled "Method".
results.round(3).rename_axis(index='Method')

Unnamed: 0_level_0,scored_targets,scored_decoys,RMSE,First Rank Loss,R,R per target,τ,τ per target,ρ,ρ per target
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3D CNN,6,900.0,0.192,0.135,0.661,0.753,0.457,0.487,0.632,0.662
Ornate,18,2694.0,0.352,0.283,0.533,0.646,0.36,0.467,0.522,0.642
ProQ3D,20,2983.0,0.129,0.16,0.849,0.671,0.625,0.458,0.81,0.619
ProQ4,20,2994.0,0.182,0.147,0.671,0.733,0.492,0.508,0.645,0.668
VoroMQA,20,2994.0,0.177,0.024,0.767,0.665,0.571,0.443,0.765,0.606


The next cell prints the results as comma-separated rows in the format used by `other_results.csv`.

In [5]:
# Value printed in the third field of each row for a given method
# (presumably the data the method was trained on - confirm).
datasets = {
    '3D CNN': 'CASP 7+8+9+10+11+12',
    'Ornate': 'CASP 7+8+9+10+11+12',
    'ProQ3D': 'CASP 10+11+12',
    'ProQ4': 'CASP 10+11+12',
    'VoroMQA': 'PDB',
}

# {method: {metric: value}} without the two bookkeeping count columns.
metrics_per_method = results.drop(columns=['scored_targets', 'scored_decoys']).to_dict(orient='index')

# Emit one comma-separated row per (method, metric), with a blank line after each method.
for method, metric_values in metrics_per_method.items():
    for metric_name, metric_value in metric_values.items():
        print('Ours', method, datasets[method], 'CASP 13', 'Global GDT_TS', metric_name, metric_value, sep=',')
    print()

Ours,3D CNN,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,RMSE,0.19242343373375523
Ours,3D CNN,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,First Rank Loss,0.13549999999999995
Ours,3D CNN,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,R,0.6606190745228988
Ours,3D CNN,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,R per target,0.752984162395777
Ours,3D CNN,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,τ,0.4566757062907246
Ours,3D CNN,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,τ per target,0.4871131927764349
Ours,3D CNN,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,ρ,0.6315529838298138
Ours,3D CNN,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,ρ per target,0.6620721054238546

Ours,Ornate,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,RMSE,0.35156237152131875
Ours,Ornate,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,First Rank Loss,0.2831
Ours,Ornate,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,R,0.5330911712769
Ours,Ornate,CASP 7+8+9+10+11+12,CASP 13,Global GDT_TS,R per target,0.6459532521734549
Ours,Ornate,CASP 7+8+9+10+11+12,CA

## LDDT

TODO