In [1]:
import sys
# Path prefix used by the cells below to locate the `data/` directory
# (every read is built as home_dir + "data/...").
home_dir = "../"
# Also put that directory on the module search path.
# NOTE(review): presumably so project-local modules can be imported; no such
# import is visible in this part of the notebook - confirm it is still needed.
sys.path.append(home_dir)

import pandas as pd

### About SCOPe

In [2]:
# SCOPe: for every filtering threshold, report the number of rows and the
# number of distinct values in the "fold", "superfamily" and "family" columns.
print("th\tn_datapoints\tn_unique_folds\tn_unique_superfamilies\tn_unique_families")
for th in (10, 20, 30, 40, 70, 95):
    data_df = pd.read_csv(f"{home_dir}data/SCOPe/processed_at_th/th_{th}.tsv", sep="\t")
    total = len(data_df)
    n_folds = len(data_df["fold"].unique())
    n_superfamilies = len(data_df["superfamily"].unique())
    n_families = len(data_df["family"].unique())
    # One tab-separated row per threshold, matching the header above.
    print(th, total, n_folds, n_superfamilies, n_families, sep="\t")

th	n_datapoints	n_unique_folds	n_unique_superfamilies	n_unique_families
10	6784	1220	1985	3999
20	7547	1229	1995	4116
30	10370	1229	1997	4310
40	14280	1230	1998	4521
70	23349	1230	1998	4791
95	33771	1230	1998	4826


In [3]:
# SCOPe: number of distinct folds, superfamilies and families within each
# class label, for the dataset filtered at threshold `th`.
th = 10
df = pd.read_csv(f"{home_dir}data/SCOPe/processed_at_th/th_{th}.tsv", sep="\t")
# Keep this as the final expression so the notebook renders the table.
(
    df[["cls_label", "fold", "superfamily", "family"]]
    .groupby(by="cls_label")
    .nunique()
    .reset_index()
)

Unnamed: 0,cls_label,fold,superfamily,family
0,a,282,510,896
1,b,173,334,747
2,c,140,237,749
3,d,392,570,1109
4,e,70,70,100
5,f,66,126,160
6,g,97,138,238


### About SCOP

In [5]:
# SCOP: for every filtering threshold, report the number of rows and the
# number of distinct values in the "CF" (counted as folds), "SF" (counted as
# superfamilies) and "FA" (counted as families) columns.
print("th\tn_datapoints\tn_unique_folds\tn_unique_superfamilies\tn_unique_families")
for th in (10, 20, 30, 40, 70, 95):
    data_df = pd.read_csv(f"{home_dir}data/SCOP/processed_at_th/th_{th}.tsv", sep="\t")
    total = len(data_df)
    n_folds = len(data_df["CF"].unique())
    n_superfamilies = len(data_df["SF"].unique())
    n_families = len(data_df["FA"].unique())
    # One tab-separated row per threshold, matching the header above.
    print(th, total, n_folds, n_superfamilies, n_families, sep="\t")

th	n_datapoints	n_unique_folds	n_unique_superfamilies	n_unique_families
10	9843	1488	2589	5122
20	10073	1488	2589	5127
30	11852	1488	2589	5131
40	15481	1488	2589	5131
70	23059	1489	2589	5135
95	27563	1489	2589	5135


In [6]:
# SCOP class specific data distribution at th: number of distinct "CF", "SF"
# and "FA" values within each "CL" value, for the dataset filtered at `th`.
th = 95
df = pd.read_csv(f"{home_dir}data/SCOP/processed_at_th/th_{th}.tsv", sep="\t")
# Keep this as the final expression so the notebook renders the table.
(
    df[["CL", "CF", "SF", "FA"]]
    .groupby(by="CL")
    .nunique()
    .reset_index()
)

Unnamed: 0,CL,CF,SF,FA
0,1000000,461,947,1404
1,1000001,240,458,1062
2,1000002,165,261,881
3,1000003,519,724,1456
4,1000004,104,205,333


### Results

In [7]:
# Data-specific results. For each metric, print a header line followed by one
# row per model (in the insertion order of `model_names_dict`) with one
# tab-separated column per threshold in `ths`. Every value is taken from the
# row of the results file whose "index" column equals "mean".
data_name = "SCOP" # SCOPe, SCOP
remote_homology_level = "fold"  # superfamily, fold
results_type = "weighted" # non_weighted, weighted
ranking_results = "ranking_results_random_repseq" # ranking_results, ranking_results_cdhit_repseq, ranking_results_random_repseq

# Loop-invariant configuration: defined once instead of on every metric pass.
model_names_dict = {"random": "Random", "tapebert":"TAPE-BERT", "proteinbert": "ProteinBERT", "esm2_t33_650M_UR50D":"ESM2", "esm1b_t33_650M_UR50S":"ESM1b", 
                    "prottrans_bert_bfd": "Prottrans-BERT", "prottrans_albert_bfd":"Prottrans-ALBERT", "prottrans_t5_bfd":"Prottrans-T5"}
ths = [10, 20, 30, 40, 70, 95]
metric_names = ["auroc", "auprc", "hit1", "hit10"]

# The results path does not depend on the metric, so each file is read exactly
# once per (model, threshold) and its "mean" row is reused for all metrics.
mean_perf_by_model_th = {}
for model_name_key in model_names_dict:
    for th in ths:
        results_path = home_dir+f"data/{data_name}/{ranking_results}/{model_name_key}/{remote_homology_level}/th_{th}/{results_type}_results.tsv"
        results_df = pd.read_csv(results_path, sep="\t")
        mean_perf = results_df[results_df["index"]=="mean"]
        if mean_perf.empty:
            # Name the offending file instead of failing later with an opaque IndexError.
            raise ValueError(f"no 'mean' row found in {results_path}")
        mean_perf_by_model_th[(model_name_key, th)] = mean_perf

for metric_name in metric_names:

    print(data_name, remote_homology_level, results_type, metric_name)

    for model_name_key in model_names_dict:
        row_values = [mean_perf_by_model_th[(model_name_key, th)][metric_name].values[0] for th in ths]
        # Tab between thresholds, newline after the last one.
        print("\t".join(f"{metric:.2f}" for metric in row_values))

SCOP fold weighted auroc
0.49	0.49	0.50	0.50	0.50	0.50
0.72	0.72	0.73	0.73	0.74	0.74
0.71	0.71	0.72	0.74	0.75	0.76
0.63	0.62	0.63	0.63	0.63	0.63
0.64	0.64	0.64	0.64	0.64	0.64
0.60	0.60	0.61	0.60	0.59	0.59
0.62	0.62	0.62	0.61	0.61	0.61
0.73	0.73	0.73	0.73	0.73	0.74
SCOP fold weighted auprc
0.00	0.00	0.00	0.00	0.00	0.00
0.03	0.02	0.03	0.02	0.02	0.02
0.02	0.02	0.02	0.02	0.02	0.02
0.04	0.03	0.04	0.03	0.03	0.03
0.05	0.04	0.04	0.04	0.04	0.04
0.02	0.02	0.02	0.02	0.02	0.02
0.02	0.02	0.02	0.01	0.01	0.01
0.06	0.05	0.05	0.05	0.04	0.04
SCOP fold weighted hit1
0.00	0.00	0.00	0.00	0.00	0.00
0.05	0.05	0.05	0.05	0.05	0.05
0.04	0.04	0.04	0.04	0.04	0.04
0.10	0.09	0.10	0.09	0.09	0.09
0.12	0.12	0.12	0.12	0.12	0.12
0.05	0.05	0.05	0.05	0.05	0.04
0.06	0.06	0.05	0.05	0.05	0.05
0.12	0.11	0.11	0.11	0.11	0.11
SCOP fold weighted hit10
0.02	0.02	0.02	0.02	0.02	0.02
0.14	0.14	0.14	0.15	0.14	0.13
0.13	0.12	0.12	0.12	0.12	0.12
0.22	0.20	0.21	0.21	0.19	0.19
0.24	0.24	0.23	0.22	0.22	0.21
0.13	0.13	0.13	0.13	0.12	0.11
0